Solved: Multiple reflexive access from channel

KYENS · ‎10-01-2019

My program works very well while emulating,

but when it comes to compiling the kernel to get the html report,

the compiler gives me this error

Compiler Error: Multiple reflexive accesses from channel c0[0][0]

Error: Optimizer FAILED.

Is there any way to resolve this problem?

HRZ · ‎10-08-2019

Multiple channel call sites are now allowed in the compiler (since v17.1 I think), even though I would personally avoid them. However, I think the problem in your case is that you might have data circling in the chain of reads and writes in the merge_tree kernel, which, coupled with the fact that you also have multiple call sites per channel, makes the design impossible to implement on hardware. If you can avoid multiple call sites per channel altogether, that would likely fix the problem. However, as it is, it is difficult to pinpoint the problem considering the complexity of the channel operations. One point I can add is that you can probably use a struct to merge the two writes in lines 17 and 18, and the reads in 35 and 36, into one pair of write and read, and maybe even use a set of different channels for those altogether. As long as the channel depth is small (<16 indexes), you do not need to worry about the channel's area overhead since it will be implemented using registers (instead of Block RAM), and you can use as many channels as necessary to avoid reusing the same channel with multiple call sites.

View solution in original post

MEIYAN_L_Intel · ‎10-02-2019

Hi,

May I know what version of OpenCL compiler/ BSP are you using ?

Can you share your kernel code here to look further into it?

Thanks

KYENS · ‎10-02-2019

I am currently using the OpenCL compiler of version 17.1 from the acceleration stack for development of arria10 GX

and here is my code

#pragma OPENCL EXTENSION cl_intel_channels : enable
channel uint4 c0[31];
uint compare(uint4 A,uint4 B,uint4 C);
uint8  merger(uint4 A,	uint4 B);
void swap(uint* A,uint*B);
uint4  read(uint channel_id);
void write(uint channel_id,uint4 write_out_data);
__kernel void read_in_kernel(__global uint* input,__global const uint *restrict lengthG){  //work size=8
	uint4 A,B;
	__const uint len=lengthG[0];
	__const uint id=get_global_id(0);
	__const uint offset_0=2*id,offset_1=2*id+1;
	__const uint arr_idx_0=offset_0*len,arr_idx_1=offset_1*len;
	for(int i=0;i<len;i+=4){
		A=(uint4) (input[arr_idx_0+i],input[arr_idx_0+i+1],input[arr_idx_0+i+2],input[arr_idx_0+i+3]);
		B=(uint4) (input[arr_idx_1+i],input[arr_idx_1+i+1],input[arr_idx_1+i+2],input[arr_idx_1+i+3]);
		write(offset_0,A);//write_channel_intel(c0[offset_0],A);
		write(offset_1,B);//write_channel_intel(c0[offset_1],B);
	}
}
 
 
__kernel void merge_tree(__global const uint *restrict lengthG) {
	__private uint4 A,B,regist;
	__private uint8 merge_result=(uint8) (0,0,0,0,0,0,0,0);
	__private uint counter_a=0,counter_b=0;
	__private	uint largest=2;
	
	__const uint id=get_global_id(0);			
	__const uint length=lengthG[0];		
	__const uint dim=get_global_size(0); 
	__const uint index_a=2*id+32-dim*4;
	__const uint index_b=2*id+33-dim*4;
	__const uint index_out=id+32-dim*2;
	A=read(index_a);//read_channel_intel(c0[index_a]);
	B=read(index_b);//read_channel_intel(c0[index_b]);
	counter_a+=4;
	counter_b+=4;
	merge_result=merger(A,B);
	write(index_out,merge_result.lo);//write_channel_intel(c0[index_out],merge_result.lo);
	regist=merge_result.hi;
	while(counter_a<length&&counter_b<length){
		if(largest==1){
			A=read(index_a);//read_channel_intel(c0[index_a]);
		}
		else if(largest==0){
			B=read(index_b);//read_channel_intel(c0[index_b]);
		}
		else if(largest==2){
			A=read(index_a);//read_channel_intel(c0[index_a]);
			B=read(index_b);//read_channel_intel(c0[index_b]);
		}
		largest=compare(A,B,regist);
		if(largest==0){
			merge_result=merger(B,regist);
			counter_b+=4;}
		else if(largest==1){
			merge_result=merger(A,regist);
			counter_a+=4;}
		else if(largest==2){
			merge_result=merger(A,B);
			counter_a+=4;
			counter_b+=4;}	
		write(index_out,merge_result.lo);//write_channel_intel(c0[index_out],merge_result.lo);
		regist=merge_result.hi;		
	}
	if(counter_a==(length)&&counter_b!=(length))
	{  // flush others
		merge_result=merger(B,regist);
		counter_b+=4;
		regist=merge_result.hi;
		write(index_out,merge_result.lo);//write_channel_intel(c0[index_out],merge_result.lo);
		while(counter_b<length){
			B=read(index_b);//read_channel_intel(c0[index_b]);
			counter_b+=4;
			merge_result=merger(B,regist);
			regist=merge_result.hi;
			write(index_out,merge_result.lo);//write_channel_intel(c0[index_out],merge_result.lo);
		}
	}
	else if(counter_a!=(length)&&counter_b==(length)){
		merge_result=merger(A,regist);
		regist=merge_result.hi;
		write(index_out,merge_result.lo);//write_channel_intel(c0[index_out],merge_result.lo);
		counter_a+=4;
		while(counter_a<length){
			A=read(index_a);//read_channel_intel(c0[index_a]);
			counter_a+=4;
			merge_result=merger(A,regist);
			regist=merge_result.hi;
			write(index_out,merge_result.lo);//write_channel_intel(c0[index_out],merge_result.lo); 
		}
	}
	write(index_out,merge_result.lo);//write_channel_intel(c0[index_out],regist);
}
__kernel void write_back(  __global uint* sorted_array) {
	int index=0;
	uint4 A;
	for(int i=0;i<4096;++i){
		A=read(30);//read_channel_intel(c0[30]);
		sorted_array[index++]=A.x;
		sorted_array[index++]=A.y;
		sorted_array[index++]=A.z;
		sorted_array[index++]=A.w;
	}
}
uint compare(uint4 A,uint4 B,uint4 C){
	if		(A.x>=B.x&&A.x>=C.x) return 0;
	else if(B.x>=A.x&&B.x>=C.x) return 1;
	else if(C.x>=A.x&&C.x>=B.x) return 2;
	return 0;
}
 
uint8  merger(uint4 A,	uint4 B){
	uint end=0;
	uint AB[8]={A.x,A.y,A.z,A.w,B.x,B.y,B.z,B.w};
 
	for(int i=0;i<4;i++){
		if(AB[i]>AB[7-i])	swap(&AB[i],&AB[7-i]);
	}
	for(int i=0;i<8;i+=4){
		for(int j=0;j<2;j++)
			if(AB[i+j]>AB[i+j+2])	swap(&AB[i+j],&AB[i+j+2]);
	}
	for(int i=0;i<8;i+=2){
		if(AB[i]>AB[i+1])	swap(&AB[i],&AB[i+1]);
	}
	uint8 merge_result=(uint8) (AB[0],AB[1],AB[2],AB[3],AB[4],AB[5],AB[6],AB[7]);
	return merge_result;                                           
}
 
void swap(uint* A,uint*B){
	uint temp;
	temp=*A;
	*A=*B;
	*B=temp;}
 
uint4  read(uint channel_id){
	uint4 data;
	switch(channel_id){
		case 0:  data=read_channel_intel(c0[0]); break;
		case 1:  data=read_channel_intel(c0[1]); break;
		case 2:  data=read_channel_intel(c0[2]); break;
		case 3:  data=read_channel_intel(c0[3]); break;
		case 4:  data=read_channel_intel(c0[4]); break;
		case 5:  data=read_channel_intel(c0[5]); break;
		case 6:  data=read_channel_intel(c0[6]); break;
		case 7:  data=read_channel_intel(c0[7]); break;
		case 8:  data=read_channel_intel(c0[8]); break;
		case 9:  data=read_channel_intel(c0[9]); break;
		case 10: data=read_channel_intel(c0[10]); break;
		case 11: data=read_channel_intel(c0[11]); break;
		case 12: data=read_channel_intel(c0[12]); break;
		case 13: data=read_channel_intel(c0[13]); break;
		case 14: data=read_channel_intel(c0[14]); break;
		case 15: data=read_channel_intel(c0[15]); break;
		case 16: data=read_channel_intel(c0[16]); break;
		case 17: data=read_channel_intel(c0[17]); break;
		case 18: data=read_channel_intel(c0[18]); break;
		case 19: data=read_channel_intel(c0[19]); break;
		case 20: data=read_channel_intel(c0[20]); break;
		case 21: data=read_channel_intel(c0[21]); break;
		case 22: data=read_channel_intel(c0[22]); break;
		case 23: data=read_channel_intel(c0[23]); break;
		case 24: data=read_channel_intel(c0[24]); break;
		case 25: data=read_channel_intel(c0[25]); break;
		case 26: data=read_channel_intel(c0[26]); break;
		case 27: data=read_channel_intel(c0[27]); break;
		case 28: data=read_channel_intel(c0[28]); break;
		case 29: data=read_channel_intel(c0[29]); break;
		case 30: data=read_channel_intel(c0[30]); break;
	}                                             
	return data;                                  
}
void write(uint channel_id,uint4 write_out_data){
	switch(channel_id){
		case 0:  write_channel_intel(c0[0],write_out_data);break;
		case 1:  write_channel_intel(c0[1],write_out_data);break;
		case 2:  write_channel_intel(c0[2],write_out_data);break;
		case 3:  write_channel_intel(c0[3],write_out_data);break;
		case 4:  write_channel_intel(c0[4],write_out_data);break;
		case 5:  write_channel_intel(c0[5],write_out_data);break;
		case 6:  write_channel_intel(c0[6],write_out_data); break;       
		case 7:  write_channel_intel(c0[7],write_out_data); break;       
		case 8:  write_channel_intel(c0[8],write_out_data); break;
		case 9:  write_channel_intel(c0[9],write_out_data); break;
		case 10: write_channel_intel(c0[10],write_out_data); break;
		case 11: write_channel_intel(c0[11],write_out_data); break;
		case 12: write_channel_intel(c0[12],write_out_data); break;
		case 13: write_channel_intel(c0[13],write_out_data); break;
		case 14: write_channel_intel(c0[14],write_out_data); break;
		case 15: write_channel_intel(c0[15],write_out_data); break;
		case 16: write_channel_intel(c0[16],write_out_data); break;
		case 17: write_channel_intel(c0[17],write_out_data); break;
		case 18: write_channel_intel(c0[18],write_out_data); break;
		case 19: write_channel_intel(c0[19],write_out_data); break;
		case 20: write_channel_intel(c0[20],write_out_data); break;
		case 21: write_channel_intel(c0[21],write_out_data); break;
		case 22: write_channel_intel(c0[22],write_out_data); break;
		case 23: write_channel_intel(c0[23],write_out_data); break;
		case 24: write_channel_intel(c0[24],write_out_data); break;
		case 25: write_channel_intel(c0[25],write_out_data); break;
		case 26: write_channel_intel(c0[26],write_out_data); break;
		case 27: write_channel_intel(c0[27],write_out_data); break;
		case 28: write_channel_intel(c0[28],write_out_data); break;
		case 29: write_channel_intel(c0[29],write_out_data); break;
		case 30: write_channel_intel(c0[30],write_out_data); break;
	}
}

HRZ · ‎10-07-2019

Can you edit your post and re-post your kernel using the "code snippet" option provided by the forum with correct indentation? It is impossible to read your code like this.

You likely have a chain of channels with multiple call-sites in your kernel.

KYENS · ‎10-07-2019

Sorry for the inconvenience,

I have re-posted my kernel code above.

Should I should be improper to access a channel from multiple call sites in the kernel and reduce access to a single site

to cope with this error code is that right?

HRZ · ‎10-08-2019

Multiple channel call sites are now allowed in the compiler (since v17.1 I think), even though I would personally avoid them. However, I think the problem in your case is that you might have data circling in the chain of reads and writes in the merge_tree kernel, which, coupled with the fact that you also have multiple call sites per channel, makes the design impossible to implement on hardware. If you can avoid multiple call sites per channel altogether, that would likely fix the problem. However, as it is, it is difficult to pinpoint the problem considering the complexity of the channel operations. One point I can add is that you can probably use a struct to merge the two writes in lines 17 and 18, and the reads in 35 and 36, into one pair of write and read, and maybe even use a set of different channels for those altogether. As long as the channel depth is small (<16 indexes), you do not need to worry about the channel's area overhead since it will be implemented using registers (instead of Block RAM), and you can use as many channels as necessary to avoid reusing the same channel with multiple call sites.

MEIYAN_L_Intel · ‎10-04-2019

Hi,

Can you provide the host code and kernel code attached as file here, so that I can run it on my side for further investigation.

Thanks.

KYENS · ‎10-04-2019

HI,

here is the host code used,

the code is supposed to implement a merged tree composed by mergers.

The tree sorts 16 streams of sorted data of length 1024.

and the read_in kernel reads the data from global memory and then transfers it to separate branches through channel.

Thank you very much for your support.

KYENS · ‎10-04-2019

And By the way this is the test_case that I used.

As for the correct sorted sequence for reference, the host code will generate the reference answer.