Community
cancel
Showing results for 
Search instead for 
Did you mean: 
Altera_Forum
Honored Contributor I
1,392 Views

Error: Optimizer FAILED

Hi,

 

 

I am quite new to OpenCL.

I have downloaded the following code (only a portion is shown here) by Vladimir Antonenko, which compresses an image to a JPEG.

The program was originally meant to run on a GPU, but I intend to implement it on an Altera FPGA.

I ran it through the Altera OpenCL compiler for FPGA and encountered the following behavior:

 

 

In compilation for Emulation no problem is encountered and after compilation (also host part) the program runs without any

problem and indeed compresses an image.

 

 

When compiling the code for the device (no emulation), the compiler stops with an "Optimizer FAILED" error.

The strange part is that there is no single line or segment that causes the problem but combinations of parts of the code.

 

 

1: the whole function v4_rgb2yuv420() causes "Optimizer FAILED" and does not finish the first compilation stage.

2: If I comment out segment A, or only the 8 occurrences of the line "t = as_uint4(shuffle(as_uchar16(t), mask));"

in segment A (marked with "//####"), then the compiler finishes the first stage successfully (I don't wait for the whole compilation because it takes hours).

3: If the above 8 occurrences of the line are kept but segment B is commented out, then the compiler finishes the first stage successfully.

 

 

So, to summarize: if either segment A or segment B is commented out, the compiler succeeds; if both A and B are kept, the optimizer fails.

 

The program and the resulting compiler output are attached here:

 

 

 

The code (only a portion that displays the problem):

================================================== ==

 

 

 

 

// jpeg.cl

 

 

/////////////////////////////////////////////////////////////////////////////

// Converts eight R/G/B sample triples into eight Y bytes.
//
// The weights 19595, 38470 and 7471 sum to 65536, so the result is a
// weighted average in 16.16 fixed point; 32768 is the 0.5 rounding term
// that is added before the 16-bit right shift.
//
// R0, G0, B0: per-channel samples, one lane per pixel.
// Returns:    the low byte of each lane's shifted sum (plain convert_uchar8,
//             i.e. no saturation is applied).
inline uchar8 RGB2Y2(const ushort8 R0, const ushort8 G0, const ushort8 B0)
{
    // Products are formed in 32-bit unsigned lanes rather than 16-bit ones.
    const uint8 red_term   = 19595 * convert_uint8(R0);
    const uint8 green_term = 38470 * convert_uint8(G0);
    const uint8 blue_term  = 7471 * convert_uint8(B0);

    const uint8 acc = 32768 + red_term + green_term + blue_term;
    return convert_uchar8(acc >> 16);
}

/////////////////////////////////////////////////////////////////////////////

// Converts eight R/G/B triples into eight Cb bytes.
//
// The constant 8421376 equals 128.5 * 65536 (a +128 offset plus 0.5 for
// rounding in 16.16 fixed point). It is multiplied by 4 and the sum is
// shifted by 18 instead of 16, so the result is additionally divided by 4.
// In the v4_rgb2yuv420 kernel the arguments are sums over 2x2 pixels.
//
// R0, G0, B0: per-channel values, one lane per output sample.
// Returns:    the low byte of each lane's shifted sum (plain convert_uchar8,
//             i.e. no saturation is applied).
inline uchar8 RGB2Cb2(const short8 R0, const short8 G0, const short8 B0)
{
    // Accumulate in 32-bit signed lanes, in the same order as a single
    // left-to-right expression: bias - r - g + b.
    int8 acc = 8421376 * 4 - 11058 * convert_int8(R0);
    acc -= 21709 * convert_int8(G0);
    acc += 32767 * convert_int8(B0);

    return convert_uchar8(acc >> 18);
}

/////////////////////////////////////////////////////////////////////////////

// Converts eight R/G/B triples into eight Cr bytes.
//
// The constant 8421376 equals 128.5 * 65536 (a +128 offset plus 0.5 for
// rounding in 16.16 fixed point). It is multiplied by 4 and the sum is
// shifted by 18 instead of 16, so the result is additionally divided by 4.
// In the v4_rgb2yuv420 kernel the arguments are sums over 2x2 pixels.
//
// R0, G0, B0: per-channel values, one lane per output sample.
// Returns:    the low byte of each lane's shifted sum (plain convert_uchar8,
//             i.e. no saturation is applied).
inline uchar8 RGB2Cr2(const short8 R0, const short8 G0, const short8 B0)
{
    // Accumulate in 32-bit signed lanes, in the same order as a single
    // left-to-right expression: bias + r - g - b.
    int8 acc = 8421376 * 4 + 32767 * convert_int8(R0);
    acc -= 27438 * convert_int8(G0);
    acc -= 5329 * convert_int8(B0);

    return convert_uchar8(acc >> 18);
}

 

 

/////////////////////////////////////////////////////////////////////////////

 

 

// Adds adjacent lane pairs of an 8-lane vector, producing 4 lanes:
// (s0+s1, s2+s3, s4+s5, s6+s7). Each sum is kept to 16 bits.
inline ushort4 hsum2(const ushort8 t)
{
    // .even selects lanes s0,s2,s4,s6 and .odd selects s1,s3,s5,s7, so one
    // vector addition yields the four pairwise sums.
    return t.even + t.odd;
}

 

 

/////////////////////////////////////////////////////////////////////////////

// BGR to YUV conversion + chroma subsampling 4:2:0

// Input: 2D], dimension [SIZEX/16,SIZEY/2]

__kernel void v4_rgb2yuv420

(

__global const uint *bgr,

__global uchar8 *yuv

)

{

const uchar16 mask = (uchar16)(2,5,8,11, 1,4,7,10, 0,3,6,9, 0,0,0,0);

const size_t x = get_global_id(0);

size_t y = get_global_id(1)*2;

size_t i = (y*get_global_size(0) + x)*4;

size_t o = ((y/8)*get_global_size(0) + x)*16 + (y%8);

uint2 rgb[3][2][2];

 

 

//======= start segment A

 

 

{

uint4 t = (uint4)(vload3(i+0,bgr), 0);

t = as_uint4(shuffle(as_uchar16(t), mask)); //####

rgb[0][0][0].lo = t.s0;

rgb[1][0][0].lo = t.s1;

rgb[2][0][0].lo = t.s2;

}

 

{

uint4 t = (uint4)(vload3(i+1,bgr), 0);

t = as_uint4(shuffle(as_uchar16(t), mask)); //####

rgb[0][0][0].hi = t.s0;

rgb[1][0][0].hi = t.s1;

rgb[2][0][0].hi = t.s2;

}

{

uint4 t = (uint4)(vload3(i+2,bgr), 0);

t = as_uint4(shuffle(as_uchar16(t), mask)); //####

rgb[0][0][1].lo = t.s0;

rgb[1][0][1].lo = t.s1;

rgb[2][0][1].lo = t.s2;

}

 

 

{

uint4 t = (uint4)(vload3(i+3,bgr), 0);

t = as_uint4(shuffle(as_uchar16(t), mask)); //####

rgb[0][0][1].hi = t.s0;

rgb[1][0][1].hi = t.s1;

rgb[2][0][1].hi = t.s2;

}

 

 

 

 

// second line

y += 1;

i = (y*get_global_size(0) + x)*4;

 

 

 

 

{

uint4 t = (uint4)(vload3(i+0,bgr), 0);

t = as_uint4(shuffle(as_uchar16(t), mask)); //####

rgb[0][1][0].lo = t.s0;

rgb[1][1][0].lo = t.s1;

rgb[2][1][0].lo = t.s2;

 

 

}

{

uint4 t = (uint4)(vload3(i+1,bgr), 0);

t = as_uint4(shuffle(as_uchar16(t), mask)); //####

rgb[0][1][0].hi = t.s0;

rgb[1][1][0].hi = t.s1;

rgb[2][1][0].hi = t.s2;

}

 

{

uint4 t = (uint4)(vload3(i+2,bgr), 0);

t = as_uint4(shuffle(as_uchar16(t), mask)); //####

rgb[0][1][1].lo = t.s0;

rgb[1][1][1].lo = t.s1;

rgb[2][1][1].lo = t.s2;

}

{

uint4 t = (uint4)(vload3(i+3,bgr), 0);

t = as_uint4(shuffle(as_uchar16(t), mask)); //####

rgb[0][1][1].hi = t.s0;

rgb[1][1][1].hi = t.s1;

rgb[2][1][1].hi = t.s2;

}

//======= end segment A

 

 

//======= start segment B

 

 

 

 

{

ushort8 R = convert_ushort8(as_uchar8(rgb[0][0][0]));

rgb[0][0][0] = as_uint2(hsum2(R));

ushort8 G = convert_ushort8(as_uchar8(rgb[1][0][0]));

rgb[1][0][0] = as_uint2(hsum2(G));

ushort8 B = convert_ushort8(as_uchar8(rgb[2][0][0]));

rgb[2][0][0] = as_uint2(hsum2(B));

yuv[o] = RGB2Y2(R,G,B);

}

{

ushort8 R = convert_ushort8(as_uchar8(rgb[0][0][1]));

rgb[0][0][1] = as_uint2(hsum2(R));

ushort8 G = convert_ushort8(as_uchar8(rgb[1][0][1]));

rgb[1][0][1] = as_uint2(hsum2(G));

ushort8 B = convert_ushort8(as_uchar8(rgb[2][0][1]));

rgb[2][0][1] = as_uint2(hsum2(B));

yuv[o+8] = RGB2Y2(R,G,B);

}

o = ((y/8)*get_global_size(0) + x)*16 + (y%8);

{

ushort8 R = convert_ushort8(as_uchar8(rgb[0][1][0]));

rgb[0][1][0] = as_uint2(hsum2(R));

ushort8 G = convert_ushort8(as_uchar8(rgb[1][1][0]));

rgb[1][1][0] = as_uint2(hsum2(G));

ushort8 B = convert_ushort8(as_uchar8(rgb[2][1][0]));

rgb[2][1][0] = as_uint2(hsum2(B));

yuv[o] = RGB2Y2(R,G,B);

}

{

ushort8 R = convert_ushort8(as_uchar8(rgb[0][1][1]));

rgb[0][1][1] = as_uint2(hsum2(R));

ushort8 G = convert_ushort8(as_uchar8(rgb[1][1][1]));

rgb[1][1][1] = as_uint2(hsum2(G));

ushort8 B = convert_ushort8(as_uchar8(rgb[2][1][1]));

rgb[2][1][1] = as_uint2(hsum2(B));

yuv[o+8] = RGB2Y2(R,G,B);

}

 

 

 

{

const size_t offset_cb = get_global_size(1)*get_global_size(0)*4;

const size_t offset_cr = offset_cb + get_global_size(1)*get_global_size(0);

 

 

y = get_global_id(1);

o = ((y/8)*get_global_size(0) + x)*8 + (y%8);

 

 

// RGB addition of the pair in vertical direction

short8 R = ((short8)(as_short4(rgb[0][0][0]),as_short4(rgb[0][0][1])))

+ ((short8)(as_short4(rgb[0][1][0]),as_short4(rgb[0][1][1])));

 

 

short8 G = ((short8)(as_short4(rgb[1][0][0]),as_short4(rgb[1][0][1])))

+ ((short8)(as_short4(rgb[1][1][0]),as_short4(rgb[1][1][1])));

 

 

short8 B = ((short8)(as_short4(rgb[2][0][0]),as_short4(rgb[2][0][1])))

+ ((short8)(as_short4(rgb[2][1][0]),as_short4(rgb[2][1][1])));

 

 

yuv[offset_cb+o] = RGB2Cb2(R,G,B);

yuv[offset_cr+o] = RGB2Cr2(R,G,B);

 

//======= end segment B

}

 

 

 

 See attached for printing

0 Kudos
3 Replies
Altera_Forum
Honored Contributor I
92 Views

Compile optimization crashes like this cannot be fixed by the user, you have to directly open a service request with Altera for that. What you should expect is that they will probably tell you this code is too complex for the specific architecture of an FPGA and you would have to rewrite it in a way that it better matches the underlying FPGA architecture. What I can tell you is that even if the code compiles without issues, it will perform terribly because an FPGA's architecture is completely different from a GPU; you will eventually have to rewrite it if you want to get proper performance. 

 

P.S. You should probably try compiling the code using the latest Quartus v16.1.2 and against one of Altera's reference BSPs first before reporting; they might have already fixed the issue in the latest version.
Altera_Forum
Honored Contributor I
92 Views

Problem solved.

Altera_Forum
Honored Contributor I
92 Views

Shaile, 

 

Please share what the solution to the problem was.
Reply