Obtaining different results while executing a kernel on GPU or FPGA

Altera_Forum · ‎09-08-2017

Hello,

I have got another question concerning OpenCL.

My problem:

When I execute the same kernel on GPU and on FPGA (pre-compiled binary), I get different results when I read the buffer afterwards.

Are there any device specific operations that can result in a different output?

The kernel code:

# pragma OPENCL EXTENSION cl_amd_printf : enable
struct __attribute__ ((packed)) gm_component {
           float w;
           float4 m;
           float16 P;
       };
struct __attribute__ ((packed)) gm_component_survive {
        float w;
        float4 m;
        float16 P;
        float2 eta;
        float4 S;
        float8 K;
    };    
       
//4x4 matrix multiplication
 float16 matrixMult4x4f(float16 M, float16 N,float4 unit4)
{
    //return M*N
float4 a=M.s0123;
float4 b=M.s4567;
float4 c=M.s89ab;
float4 d=M.scdef;
float4 e=N.s048c;
float4 f=N.s159d;
float4 g=N.s26ae;
float4 h=N.s37bf;  
float16 tmp = (float16){dot(a*e,unit4),dot(a*f,unit4),dot(a*g,unit4),dot(a*h,unit4),
              dot(b*e,unit4),dot(b*f,unit4),dot(b*g,unit4),dot(b*h,unit4),
              dot(c*e,unit4),dot(c*f,unit4),dot(c*g,unit4),dot(c*h,unit4),
              dot(d*e,unit4),dot(d*f,unit4),dot(d*g,unit4),dot(d*h,unit4)};
return tmp;
}
// OpenCL Kernel to compute multiplication and addition
__kernel void
update(__global struct gm_component_survive * restrict predict_mixture, 
           float8 Hk, __global float2 *Zk,
          __global struct gm_component * restrict update_mixture,int size, float pr_dk, int updateMixtureSize)
{
  int zk_index=get_global_id(0);
  
  int survive_index = get_global_id(1);
 
if(zk_index<size && survive_index<updateMixtureSize){
   __global struct gm_component *um = &update_mixture; 
 
 __global struct gm_component_survive *pm = &predict_mixture; 
 
  float2 unit2={1.f,1.f};
  float4 unit4={1.f,1.f,1.f,1.f};
  
  //Multivarate guassian calculation
  //calculate miu
  float2 miu = Zk -pm->eta;
       
  //Inverse of covarience.. start
  float4 inv_covariance = pm->S;
  //calculate denominator 
  float determenent = inv_covariance.s3*inv_covariance.s0 - inv_covariance.s2*inv_covariance.s1;
  
  inv_covariance =(float4){inv_covariance.s3,-inv_covariance.s1,-inv_covariance.s2,inv_covariance.s0};
  inv_covariance = inv_covariance/determenent;               //inverse calculation end
  
  //multiplication of miu.T*covariance*miu
  float2 number = { dot(miu*inv_covariance.even,unit2),dot(miu*inv_covariance.odd,unit2)};
  number=number*miu;
  
  //Calculate denominator :pow(2*M_PI, 2)* determenent   39.4784
  float denom = 39.4784* determenent;
  
  denom=sqrt(denom);  
 
 //calculate weight
  um->w = pr_dk*pm->w*native_exp(-0.5f * dot(number,unit2)) / denom;
  
 //calculate mean 
  number = (float2){dot(Hk.lo*pm->m,unit4) , dot(Hk.hi*pm->m,unit4)};
 
  number = Zk-number;    
  inv_covariance =(float4){dot(pm->K.lo.lo*number,unit2),dot(pm->K.lo.hi*number,unit2),dot(pm->K.hi.lo*number,unit2),dot(pm->K.hi.hi*number,unit2)};
  um->m = pm->m+inv_covariance;
  //calculate covarince
  
  float16 temp1 = (float16){1-dot(pm->K.lo.lo*Hk.s04,unit2),-dot(pm->K.lo.lo*Hk.s15,unit2),-dot(pm->K.lo.lo*Hk.s26,unit2),-dot(pm->K.lo.lo*Hk.s37,unit2),
                          -dot(pm->K.lo.hi*Hk.s04,unit2),1-dot(pm->K.lo.hi*Hk.s15,unit2),-dot(pm->K.lo.hi*Hk.s26,unit2),-dot(pm->K.lo.hi*Hk.s37,unit2),
              -dot(pm->K.hi.lo*Hk.s04,unit2),-dot(pm->K.hi.lo*Hk.s15,unit2),1-dot(pm->K.hi.lo*Hk.s26,unit2),-dot(pm->K.hi.lo*Hk.s37,unit2),
                          -dot(pm->K.hi.hi*Hk.s04,unit2),-dot(pm->K.hi.hi*Hk.s15,unit2),-dot(pm->K.hi.hi*Hk.s26,unit2),1-dot(pm->K.hi.hi*Hk.s37,unit2)}; 
  um->P = matrixMult4x4f(temp1, pm->P,unit4);;  
  }
}

Any hints will be apprecciated.

Tobias

Altera_Forum · ‎09-08-2017

How different are the results? If the results are only different in the last few digits of the numbers, then it is probably caused by some rounding difference. Note that if you use --fpc or --fp-relaxed for FPGA compilation, output of floating-point operations will be different.

Altera_Forum · ‎09-08-2017

Thank you for your answer!

The output is somehow very different. As example look at the following outputs obtained by GPU and FPGA:

FPGA:

w: 0

m: 0.1 51410 0.1 0

P: 8.25 13.5 0 0

-879.562 -1435.38 -0.0762125 -0.124711

0 0 8.25 13.5

0 0 13.5 26

w: 0

m: 0.1 56227.3 0.1 0

P: 8.25 13.5 0 0

-879.562 -1435.38 -0.0762125 -0.124711

0 0 8.25 13.5

0 0 13.5 26

w: 0

m: 0.1 95197.5 0.1 0

P: 8.25 13.5 0 0

-879.562 -1435.38 -0.0762125 -0.124711

0 0 8.25 13.5

0 0 13.5 26

w: 0

m: 0.1 85130.8 0.1 0

P: 8.25 13.5 0 0

-879.562 -1435.38 -0.0762125 -0.124711

0 0 8.25 13.5

0 0 13.5 26

w: 0

m: 0.1 68243.1 0.1 0

P: 8.25 13.5 0 0

-879.562 -1435.38 -0.0762125 -0.124711

0 0 8.25 13.5

0 0 13.5 26

w: 0

m: 0 0 0 0

P: 0 0 0 0

0 0 0 0

GPU:

w: 0

m: 4.48707 0 2.19607 0

P: 8.17379 13.3753 0 0

13.5 26 0 0

0 0 8.17379 13.3753

0 0 13.5 26

w: 0

m: 4.89815 0 2.33002 0

P: 8.17379 13.3753 0 0

13.5 26 0 0

0 0 8.17379 13.3753

0 0 13.5 26

w: 0

m: 8.22379 0 2.51016 0

P: 8.17379 13.3753 0 0

13.5 26 0 0

0 0 8.17379 13.3753

0 0 13.5 26

w: 0

m: 7.36467 0 3.0552 0

P: 8.17379 13.3753 0 0

13.5 26 0 0

0 0 8.17379 13.3753

0 0 13.5 26

w: 0

m: 5.92356 0 2.4224 0

P: 8.17379 13.3753 0 0

13.5 26 0 0

0 0 8.17379 13.3753

0 0 13.5 26

w: 0

m: 0 0 0 0

P: 0 0 0 0

0 0 0 0

Altera_Forum · ‎09-09-2017

Thank you for your answer!

The results are quite different. A sample is shown below:


FPGA:
w: 	0
m: 	0.1	51410	0.1	0	
P: 	8.25	13.5	0	0	
	-879.562	-1435.38	-0.0762125	-0.124711	
	0	0	8.25	13.5	
	0	0	13.5	26	
	
w: 	0
m: 	0.1	56227.3	0.1	0	
P: 	8.25	13.5	0	0	
	-879.562	-1435.38	-0.0762125	-0.124711	
	0	0	8.25	13.5	
	0	0	13.5	26	
	
w: 	0
m: 	0.1	95197.5	0.1	0	
P: 	8.25	13.5	0	0	
	-879.562	-1435.38	-0.0762125	-0.124711	
	0	0	8.25	13.5	
	0	0	13.5	26	
	
w: 	0
m: 	0.1	85130.8	0.1	0	
P: 	8.25	13.5	0	0	
	-879.562	-1435.38	-0.0762125	-0.124711	
	0	0	8.25	13.5	
	0	0	13.5	26	
	
w: 	0
m: 	0.1	68243.1	0.1	0	
P: 	8.25	13.5	0	0	
	-879.562	-1435.38	-0.0762125	-0.124711	
	0	0	8.25	13.5	
	0	0	13.5	26	
	
w: 	0
m: 	0	0	0	0	
P: 	0	0	0	0	
	0	0	0	0	
	0	0	0	0	
	0	0	0	0
GPU:
w: 	0
m: 	4.48707	0	2.19607	0	
P: 	8.17379	13.3753	0	0	
	13.5	26	0	0	
	0	0	8.17379	13.3753	
	0	0	13.5	26	
	
w: 	0
m: 	4.89815	0	2.33002	0	
P: 	8.17379	13.3753	0	0	
	13.5	26	0	0	
	0	0	8.17379	13.3753	
	0	0	13.5	26	
	
w: 	0
m: 	8.22379	0	2.51016	0	
P: 	8.17379	13.3753	0	0	
	13.5	26	0	0	
	0	0	8.17379	13.3753	
	0	0	13.5	26	
	
w: 	0
m: 	7.36467	0	3.0552	0	
P: 	8.17379	13.3753	0	0	
	13.5	26	0	0	
	0	0	8.17379	13.3753	
	0	0	13.5	26	
	
w: 	0
m: 	5.92356	0	2.4224	0	
P: 	8.17379	13.3753	0	0	
	13.5	26	0	0	
	0	0	8.17379	13.3753	
	0	0	13.5	26	
	
w: 	0
m: 	0	0	0	0	
P: 	0	0	0	0	
	0	0	0	0	
	0	0	0	0	
	0	0	0	0

Altera_Forum · ‎09-09-2017

Thank you for your answer!

The outputs are in fact very different.

Altera_Forum · ‎09-09-2017

One example:

FPGA:

m: 0.1 51410 0.1 0

GPU:

m: 4.48707 0 2.19607 0

Altera_Forum · ‎09-10-2017

That is certainly not because of rounding difference. Are you using the exact same kernel and host (minus the clCreateProgram difference) code for both? Have you tried Altera's emulator?