Intel® Quartus® Prime Software
Intel® Quartus® Prime Design Software, Design Entry, Synthesis, Simulation, Verification, Timing Analysis, System Design (Platform Designer, formerly Qsys)
16606 Discussions

multiple kernel in a .cl file

Altera_Forum
Honored Contributor II
1,128 Views

I have a question about the single .cl file appended below.

1. When the CL file below contains only clip_8b_kernel, ITransformOne_kernel and GetSSE_kernel, placing ITransformOne_kernel before GetSSE_kernel produces the error shown below. If the kernels are instead placed in the order clip_8b_kernel, GetSSE_kernel, ITransformOne_kernel, the file compiles successfully.

2. This CL file contains multiple kernels. Compiling any one of these kernels on its own produces no error, but ITransform_C_kernel and ITransformOne_kernel cannot exist in the same file: if both are present, compilation fails with the error shown below.

 

 

error content: 

aoc: Linking with IP library ... 

/build/swbuild/SJ/nightly/17.0/290/l64/p4/acl/llvm/include/acl/Analysis/LocalMemSizing.h(83) ******* 

Requested memsize for unavailable aspace 

 

 

m_local_mem_size_per_workgroup.count(aspace) FAILED 

0 libLLVM-3.0.so 0x00007f348ee2846f 

1 libLLVM-3.0.so 0x00007f348ee2a3e2 

2 libpthread.so.0 0x00007f348da905e0 

3 libLLVM-3.0.so 0x00007f348f617ef9 acl::get_bits(llvm::Value const*, llvm::TargetData*, acl::LocalMemSizing*, bool, bool, bool) + 7289 

4 libLLVM-3.0.so 0x00007f348e37a880 grif::Pass_AreaReport::add_private_var_resources() + 8800 

5 libLLVM-3.0.so 0x00007f348e37c7e1 grif::Pass_AreaReport::runPass(grif::XNode*) + 913 

6 libLLVM-3.0.so 0x00007f348e34c58a acl::DSDKGenerate::runOnModule(llvm::Module&) + 8010 

7 libLLVM-3.0.so 0x00007f348f049fc1 llvm::MPPassManager::runOnModule(llvm::Module&) + 577 

8 libLLVM-3.0.so 0x00007f348f04a16b llvm::PassManagerImpl::run(llvm::Module&) + 187 

9 aocl-llc 0x000000000040be40 main + 5360 

10 libc.so.6 0x00007f348ca9ec05 __libc_start_main + 245 

11 aocl-llc 0x00000000004098e9 

Stack dump: 

0. Program arguments:  

/home/****/inteldevstack/intelFPGA_pro/hld/linux64/bin/aocl-llc -march=griffin -board /home/****/inteldevstack/a10_gx_pac_ias_1_0_prq/opencl/opencl_bsp/hardware/pac_a10/board_spec.xml -dbg-info-enabled test.bc -o test.v  

1. Running pass 'Generate DSDK netlist and HDL' on module 'test.bc'. 

Error: Verilog generator FAILED. 

Refer to test/test.log for details. 

 

cl file: 

 

/*
 * Scalar aliases used by the code below.
 * NOTE: despite their names, uint16_t and uint32_t alias the SIGNED types
 * short and int.
 */
typedef uchar uint8_t;
typedef short uint16_t;
typedef int uint32_t;

/*
 * Per-argument length constants. The *_in_*, *_inout_* and *_out_* values
 * size the staging arrays declared inside the kernel wrappers.
 */
#define clip_8b_const_v_LEN 1
#define clip_8b_out_clip_8b_return_LEN 1
#define ITransformOne_in_ref_LEN 200
#define ITransformOne_in_in_LEN 200
#define ITransformOne_inout_dst_LEN 200
#define GetSSE_in_a_LEN 200
#define GetSSE_in_b_LEN 200
#define GetSSE_const_w_LEN 1
#define GetSSE_const_h_LEN 1
#define GetSSE_out_GetSSE_return_LEN 1
#define ITransform_C_in_ref_LEN 200
#define ITransform_C_in_in_LEN 200
#define ITransform_C_inout_dst_LEN 200
#define ITransform_C_const_do_two_LEN 1

/****** VP8Calc -- VP8Transform *************************************************/
#define WEBP_INLINE inline
/* Row stride, in elements, used by STORE and GetSSE to step between rows. */
#define BPS 32

 

 

/*
 * Clamps v to the range [0, 255].
 *
 * Returns v unchanged when it already fits in 8 bits, 0 when v is negative,
 * and 255 otherwise.
 */
static WEBP_INLINE uchar clip_8b(int v) {
  /* (v & ~0xff) == 0 means no bit above the low 8 is set. */
  return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
}

 

/*
 * Stores clip_8b(ref[x, y] + (v >> 3)) into dst[x, y], where element (x, y)
 * lives at index x + y * BPS. Expects `ref` and `dst` to be in scope at the
 * point of use.
 */
#define STORE(x, y, v) \
  dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))

#define kC1 (20091 + (1 << 16)) // DF: NEW
#define kC2 35468 // DF: NEW

/* Multiplies a by b and shifts the product right by 16 bits. */
#define MUL(a, b) (((a) * (b)) >> 16)

 

 

/*
 * Runs a two-pass transform over the 16 coefficients in[0..15] and combines
 * the result with a 4x4 block of `ref` to produce a 4x4 block of `dst`.
 *
 * ref: source block, read at index x + y * BPS for x, y in [0, 3].
 * in:  16 coefficients.
 * dst: destination block, written at index x + y * BPS for x, y in [0, 3];
 *      every value is clamped through clip_8b by the STORE macro.
 */
static WEBP_INLINE void ITransformOne(const uint8_t * ref, const uint16_t* in,
                                      uint8_t* dst) {
  int C[4 * 4], *tmp;
  int i;

  tmp = C;
  for (i = 0; i < 4; ++i) { // vertical pass
    const int a = in[0] + in[8];
    const int b = in[0] - in[8];
    const int c = MUL(in[4], kC2) - MUL(in[12], kC1);
    const int d = MUL(in[4], kC1) + MUL(in[12], kC2);
    tmp[0] = a + d;
    tmp[1] = b + c;
    tmp[2] = b - c;
    tmp[3] = a - d;
    tmp += 4; /* next group of four intermediate values */
    in++;     /* next input column */
  }

  tmp = C;
  for (i = 0; i < 4; ++i) { // horizontal pass
    /* The +4 is added before STORE shifts right by 3. */
    const int dc = tmp[0] + 4;
    const int a = dc + tmp[8];
    const int b = dc - tmp[8];
    const int c = MUL(tmp[4], kC2) - MUL(tmp[12], kC1);
    const int d = MUL(tmp[4], kC1) + MUL(tmp[12], kC2);
    STORE(0, i, a + d);
    STORE(1, i, b + c);
    STORE(2, i, b - c);
    STORE(3, i, a - d);
    tmp++;
  }
}

 

 

/*
 * Transforms one 4x4 block, and a second one when do_two is non-zero.
 *
 * The second block uses ref + 4, in + 16 and dst + 4, i.e. the next four
 * columns of ref/dst and the next 16 coefficients of in.
 */
void ITransform_C(const uint8_t * ref,const uint16_t * in,uint8_t * dst, int do_two) {
  ITransformOne(ref, in, dst);
  if (do_two) {
    ITransformOne(ref + 4, in + 16, dst + 4);
  }
}

 

 

 

 

/*
 * Returns the sum of squared differences between two w x h regions.
 *
 * a, b: the regions to compare; consecutive rows are BPS elements apart.
 * w, h: width and height of the region, in elements.
 */
static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
                              int w, int h) {
  int count = 0;
  int y, x;
  for (y = 0; y < h; ++y) {
    // DF: TBC, may need to unroll this
    for (x = 0; x < w; ++x) {
      const int diff = (int)a[x] - b[x];
      count += diff * diff;
    }
    /* Advance both pointers to the start of the next row. */
    a += BPS;
    b += BPS;
  }
  return count;
}

 

 

/*
 * Kernel wrapper around clip_8b.
 *
 * v:              value to clamp.
 * clip_8b_return: output buffer; element 0 receives clip_8b(v).
 */
__kernel void clip_8b_kernel( int v,__global uchar* restrict clip_8b_return){
  printf("start func:clip_8b\n");

  uchar clip_8b_return_[clip_8b_out_clip_8b_return_LEN];

  /*
   * Put the result in the staging array so the copy-out loop publishes it.
   * Writing it straight to the output buffer would let the loop below
   * overwrite it with uninitialised staging data.
   */
  clip_8b_return_[0] = clip_8b(v);

  for(int i = 0;i<clip_8b_out_clip_8b_return_LEN;i++){
    clip_8b_return[i] = clip_8b_return_[i];
  }
}

/*
 * Kernel wrapper around ITransformOne.
 *
 * Copies ref, in and dst into staging arrays, runs ITransformOne on the
 * copies, then writes the whole dst staging array back to the dst buffer.
 * Each buffer must hold at least the matching ITransformOne_*_LEN elements.
 */
__kernel void ITransformOne_kernel(__global uint8_t * restrict ref,__global uint16_t* restrict in,__global uint8_t* restrict dst){
  printf("start func:ITransformOne\n");

  uint8_t ref_[ITransformOne_in_ref_LEN];
  uint16_t in_[ITransformOne_in_in_LEN];
  uint8_t dst_[ITransformOne_inout_dst_LEN];

  for(int i = 0;i<ITransformOne_in_ref_LEN;i++){
    ref_[i] = ref[i];
  }
  for(int i = 0;i<ITransformOne_in_in_LEN;i++){
    in_[i] = in[i];
  }
  /* dst is in/out: elements ITransformOne does not write keep their input value. */
  for(int i = 0;i<ITransformOne_inout_dst_LEN;i++){
    dst_[i] = dst[i];
  }

  ITransformOne(ref_,in_,dst_);

  for(int i = 0;i<ITransformOne_inout_dst_LEN;i++){
    dst[i] = dst_[i];
  }
}

/*
 * Kernel wrapper around GetSSE.
 *
 * Copies a and b into staging arrays, runs GetSSE on the copies, and writes
 * the result to GetSSE_return[0].
 *
 * NOTE(review): the staging arrays hold GetSSE_in_a_LEN / GetSSE_in_b_LEN
 * elements, while GetSSE reads up to index (h - 1) * BPS + (w - 1).
 * Presumably callers keep w and h small enough to stay in range; confirm.
 */
__kernel void GetSSE_kernel(__global uint8_t* restrict a,__global uint8_t* restrict b, int w, int h,__global int* restrict GetSSE_return){
  printf("start func:GetSSE\n");

  uint8_t a_[GetSSE_in_a_LEN];
  uint8_t b_[GetSSE_in_b_LEN];
  int GetSSE_return_[GetSSE_out_GetSSE_return_LEN];

  for(int i = 0;i<GetSSE_in_a_LEN;i++){
    a_[i] = a[i];
  }
  for(int i = 0;i<GetSSE_in_b_LEN;i++){
    b_[i] = b[i];
  }

  /*
   * Put the result in the staging array so the copy-out loop publishes it.
   * Writing it straight to the output buffer would let the loop below
   * overwrite it with uninitialised staging data.
   */
  GetSSE_return_[0] = GetSSE(a_,b_,w,h);

  for(int i = 0;i<GetSSE_out_GetSSE_return_LEN;i++){
    GetSSE_return[i] = GetSSE_return_[i];
  }
}

/*
 * Kernel wrapper around ITransform_C.
 *
 * Copies ref, in and dst into staging arrays, runs ITransform_C on the
 * copies (forwarding do_two), then writes the whole dst staging array back
 * to the dst buffer. Each buffer must hold at least the matching
 * ITransform_C_*_LEN elements.
 */
__kernel void ITransform_C_kernel(__global uint8_t * restrict ref,__global uint16_t * restrict in,__global uint8_t * restrict dst, int do_two){
  printf("start func:ITransform_C\n");

  uint8_t ref_[ITransform_C_in_ref_LEN];
  uint16_t in_[ITransform_C_in_in_LEN];
  uint8_t dst_[ITransform_C_inout_dst_LEN];

  for(int i = 0;i<ITransform_C_in_ref_LEN;i++){
    ref_[i] = ref[i];
  }
  for(int i = 0;i<ITransform_C_in_in_LEN;i++){
    in_[i] = in[i];
  }
  /* dst is in/out: elements ITransform_C does not write keep their input value. */
  for(int i = 0;i<ITransform_C_inout_dst_LEN;i++){
    dst_[i] = dst[i];
  }

  ITransform_C(ref_,in_,dst_,do_two);

  for(int i = 0;i<ITransform_C_inout_dst_LEN;i++){
    dst[i] = dst_[i];
  }
}

 

 

 

 

 

 

Can anyone help me? Thanks in advance.
0 Kudos
1 Reply
Altera_Forum
Honored Contributor II
373 Views

Well, yes, your code crashes the compiler from v17.0 all the way to v18.0. However, it compiles correctly with v16.1.2. You have a lot of issues in your code reported by the v16.1.2 compiler. If you fix these, it will likely also compile with the newer versions: 

 

Compiler Warning: Auto-unrolled loop at /nfshome/hrz/test.cl:40
Compiler Warning: Auto-unrolled loop at /nfshome/hrz/test.cl:40
Compiler Warning: Auto-unrolled loop at /nfshome/hrz/test.cl:40
/nfshome/hrz/test.cl:130: Compiler Warning: Aggressive compiler optimization: removing unnecessary storage to local memory
/nfshome/hrz/test.cl:133: Compiler Warning: Aggressive compiler optimization: removing unnecessary storage to local memory
Warning: Kernel 'clip_8b_kernel' has unused argument 'v'
Warning: Kernel 'clip_8b_kernel' has unused argument 'clip_8b_return'
Warning: Kernel 'GetSSE_kernel' has unused argument 'a'
Warning: Kernel 'GetSSE_kernel' has unused argument 'b'
Warning: Kernel 'GetSSE_kernel' has unused argument 'w'
Warning: Kernel 'GetSSE_kernel' has unused argument 'h'
Warning: Kernel 'GetSSE_kernel' has unused argument 'GetSSE_return'

 

Specifically, the unnecessary usage of local memory in lines 130 and 133 is very likely the cause of the crash with the new versions of the compiler. If you remove the local memory usage in those lines, it will likely allow the kernel to be compiled correctly.
0 Kudos
Reply