Community
cancel
Showing results for 
Search instead for 
Did you mean: 
Altera_Forum
Honored Contributor I
837 Views

Obtaining different results while executing a kernel on GPU or FPGA

Hello, 

 

I have got another question concerning OpenCL. 

 

My problem: 

When I execute the same kernel on GPU and on FPGA (pre-compiled binary), I get different results when I read the buffer afterwards. 

 

Are there any device specific operations that can result in a different output? 

 

The kernel code: 

# pragma OPENCL EXTENSION cl_amd_printf : enable struct __attribute__ ((packed)) gm_component { float w; float4 m; float16 P; }; struct __attribute__ ((packed)) gm_component_survive { float w; float4 m; float16 P; float2 eta; float4 S; float8 K; }; //4x4 matrix multiplication float16 matrixMult4x4f(float16 M, float16 N,float4 unit4) { //return M*N float4 a=M.s0123; float4 b=M.s4567; float4 c=M.s89ab; float4 d=M.scdef; float4 e=N.s048c; float4 f=N.s159d; float4 g=N.s26ae; float4 h=N.s37bf; float16 tmp = (float16){dot(a*e,unit4),dot(a*f,unit4),dot(a*g,unit4),dot(a*h,unit4), dot(b*e,unit4),dot(b*f,unit4),dot(b*g,unit4),dot(b*h,unit4), dot(c*e,unit4),dot(c*f,unit4),dot(c*g,unit4),dot(c*h,unit4), dot(d*e,unit4),dot(d*f,unit4),dot(d*g,unit4),dot(d*h,unit4)}; return tmp; } // OpenCL Kernel to compute multiplication and addition __kernel void update(__global struct gm_component_survive * restrict predict_mixture, float8 Hk, __global float2 *Zk, __global struct gm_component * restrict update_mixture,int size, float pr_dk, int updateMixtureSize) { int zk_index=get_global_id(0); int survive_index = get_global_id(1); if(zk_index<size && survive_index<updateMixtureSize){ __global struct gm_component *um = &update_mixture; __global struct gm_component_survive *pm = &predict_mixture; float2 unit2={1.f,1.f}; float4 unit4={1.f,1.f,1.f,1.f}; //Multivarate guassian calculation //calculate miu float2 miu = Zk -pm->eta; //Inverse of covarience.. start float4 inv_covariance = pm->S; //calculate denominator float determenent = inv_covariance.s3*inv_covariance.s0 - inv_covariance.s2*inv_covariance.s1; inv_covariance =(float4){inv_covariance.s3,-inv_covariance.s1,-inv_covariance.s2,inv_covariance.s0}; inv_covariance = inv_covariance/determenent; //inverse calculation end //multiplication of miu.T*covariance*miu float2 number = { dot(miu*inv_covariance.even,unit2),dot(miu*inv_covariance.odd,unit2)}; number=number*miu; //Calculate denominator :pow(2*M_PI, 2)* determenent 39.4784 float denom = 39.4784* determenent; denom=sqrt(denom); //calculate weight um->w = pr_dk*pm->w*native_exp(-0.5f * dot(number,unit2)) / denom; //calculate mean number = (float2){dot(Hk.lo*pm->m,unit4) , dot(Hk.hi*pm->m,unit4)}; number = Zk-number; inv_covariance =(float4){dot(pm->K.lo.lo*number,unit2),dot(pm->K.lo.hi*number,unit2),dot(pm->K.hi.lo*number,unit2),dot(pm->K.hi.hi*number,unit2)}; um->m = pm->m+inv_covariance; //calculate covarince float16 temp1 = (float16){1-dot(pm->K.lo.lo*Hk.s04,unit2),-dot(pm->K.lo.lo*Hk.s15,unit2),-dot(pm->K.lo.lo*Hk.s26,unit2),-dot(pm->K.lo.lo*Hk.s37,unit2), -dot(pm->K.lo.hi*Hk.s04,unit2),1-dot(pm->K.lo.hi*Hk.s15,unit2),-dot(pm->K.lo.hi*Hk.s26,unit2),-dot(pm->K.lo.hi*Hk.s37,unit2), -dot(pm->K.hi.lo*Hk.s04,unit2),-dot(pm->K.hi.lo*Hk.s15,unit2),1-dot(pm->K.hi.lo*Hk.s26,unit2),-dot(pm->K.hi.lo*Hk.s37,unit2), -dot(pm->K.hi.hi*Hk.s04,unit2),-dot(pm->K.hi.hi*Hk.s15,unit2),-dot(pm->K.hi.hi*Hk.s26,unit2),1-dot(pm->K.hi.hi*Hk.s37,unit2)}; um->P = matrixMult4x4f(temp1, pm->P,unit4);; } }  

 

Any hints will be apprecciated. 

 

Tobias
0 Kudos
6 Replies
Altera_Forum
Honored Contributor I
52 Views

How different are the results? If the results are only different in the last few digits of the numbers, then it is probably caused by some rounding difference. Note that if you use --fpc or --fp-relaxed for FPGA compilation, output of floating-point operations will be different.

Altera_Forum
Honored Contributor I
52 Views

Thank you for your answer! 

 

The output is somehow very different. As example look at the following outputs obtained by GPU and FPGA: 

 

 

FPGA: 

 

w: 0 

m: 0.1 51410 0.1 0  

P: 8.25 13.5 0 0  

-879.562 -1435.38 -0.0762125 -0.124711  

0 0 8.25 13.5  

0 0 13.5 26  

 

 

w: 0 

m: 0.1 56227.3 0.1 0  

P: 8.25 13.5 0 0  

-879.562 -1435.38 -0.0762125 -0.124711  

0 0 8.25 13.5  

0 0 13.5 26  

 

 

w: 0 

m: 0.1 95197.5 0.1 0  

P: 8.25 13.5 0 0  

-879.562 -1435.38 -0.0762125 -0.124711  

0 0 8.25 13.5  

0 0 13.5 26  

 

 

w: 0 

m: 0.1 85130.8 0.1 0  

P: 8.25 13.5 0 0  

-879.562 -1435.38 -0.0762125 -0.124711  

0 0 8.25 13.5  

0 0 13.5 26  

 

 

w: 0 

m: 0.1 68243.1 0.1 0  

P: 8.25 13.5 0 0  

-879.562 -1435.38 -0.0762125 -0.124711  

0 0 8.25 13.5  

0 0 13.5 26  

 

 

w: 0 

m: 0 0 0 0  

P: 0 0 0 0  

0 0 0 0  

0 0 0 0  

0 0 0 0 

 

 

GPU: 

 

w: 0 

m: 4.48707 0 2.19607 0  

P: 8.17379 13.3753 0 0  

13.5 26 0 0  

0 0 8.17379 13.3753  

0 0 13.5 26  

 

 

w: 0 

m: 4.89815 0 2.33002 0  

P: 8.17379 13.3753 0 0  

13.5 26 0 0  

0 0 8.17379 13.3753  

0 0 13.5 26  

 

 

w: 0 

m: 8.22379 0 2.51016 0  

P: 8.17379 13.3753 0 0  

13.5 26 0 0  

0 0 8.17379 13.3753  

0 0 13.5 26  

 

 

w: 0 

m: 7.36467 0 3.0552 0  

P: 8.17379 13.3753 0 0  

13.5 26 0 0  

0 0 8.17379 13.3753  

0 0 13.5 26  

 

 

w: 0 

m: 5.92356 0 2.4224 0  

P: 8.17379 13.3753 0 0  

13.5 26 0 0  

0 0 8.17379 13.3753  

0 0 13.5 26  

 

 

w: 0 

m: 0 0 0 0  

P: 0 0 0 0  

0 0 0 0  

0 0 0 0  

0 0 0 0
Altera_Forum
Honored Contributor I
52 Views

Thank you for your answer! 

 

The results are quite different. A sample is shown below: 

 

FPGA: w: 0 m: 0.1 51410 0.1 0 P: 8.25 13.5 0 0 -879.562 -1435.38 -0.0762125 -0.124711 0 0 8.25 13.5 0 0 13.5 26 w: 0 m: 0.1 56227.3 0.1 0 P: 8.25 13.5 0 0 -879.562 -1435.38 -0.0762125 -0.124711 0 0 8.25 13.5 0 0 13.5 26 w: 0 m: 0.1 95197.5 0.1 0 P: 8.25 13.5 0 0 -879.562 -1435.38 -0.0762125 -0.124711 0 0 8.25 13.5 0 0 13.5 26 w: 0 m: 0.1 85130.8 0.1 0 P: 8.25 13.5 0 0 -879.562 -1435.38 -0.0762125 -0.124711 0 0 8.25 13.5 0 0 13.5 26 w: 0 m: 0.1 68243.1 0.1 0 P: 8.25 13.5 0 0 -879.562 -1435.38 -0.0762125 -0.124711 0 0 8.25 13.5 0 0 13.5 26 w: 0 m: 0 0 0 0 P: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 GPU: w: 0 m: 4.48707 0 2.19607 0 P: 8.17379 13.3753 0 0 13.5 26 0 0 0 0 8.17379 13.3753 0 0 13.5 26 w: 0 m: 4.89815 0 2.33002 0 P: 8.17379 13.3753 0 0 13.5 26 0 0 0 0 8.17379 13.3753 0 0 13.5 26 w: 0 m: 8.22379 0 2.51016 0 P: 8.17379 13.3753 0 0 13.5 26 0 0 0 0 8.17379 13.3753 0 0 13.5 26 w: 0 m: 7.36467 0 3.0552 0 P: 8.17379 13.3753 0 0 13.5 26 0 0 0 0 8.17379 13.3753 0 0 13.5 26 w: 0 m: 5.92356 0 2.4224 0 P: 8.17379 13.3753 0 0 13.5 26 0 0 0 0 8.17379 13.3753 0 0 13.5 26 w: 0 m: 0 0 0 0 P: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Altera_Forum
Honored Contributor I
52 Views

Thank you for your answer! 

 

The outputs are in fact very different.
Altera_Forum
Honored Contributor I
52 Views

One example: 

 

FPGA: 

 

m: 0.1 51410 0.1 0  

 

GPU: 

 

m: 4.48707 0 2.19607 0
Altera_Forum
Honored Contributor I
52 Views

That is certainly not because of rounding difference. Are you using the exact same kernel and host (minus the clCreateProgram difference) code for both? Have you tried Altera's emulator?

Reply