Hello,
As the NDA seems to be over, I have few questions about HD4000 and OpenCL.
I am testing performances of HD4000 embedded into IVB (i7 3720QM), with OpenCL GPU. I try to find the best efficient scheme of kernel for picture analysis, which is used for h264 encoding.
The operations I want to compute are very simple, it's basicly made of differences of grayscale pixels values between 2 different frames, or into one same frame. And accumulate the result over HD pictures (1920*1080).
My experimentations led me to use image2d obj. I read 2 pixels values from global memory, make the operation and store result into local memory. And finally I do the reduction into the kernel, and write the result into global memory.
It appears that the reduction is the most important part in term of execution time, which leads to quite "bad" result to my opinion.
My optimized version of kernel for GPU hardly beats the CPU version. And it's quite far from OpenMP performances.
As there is no tool such as GPA which supports HD4000 with OpenCL, and the offline compiler does not provide the assembly code, it is not easy to understand the behaviour and optimize.
Then I am wondering if I could expect to get much more performances from the HD4000 ?
Here is my kernel:
__kernel void ker1_MIX_c_img2d (
const int stride,
read_only image2d_t pix,
read_only image2d_t pix1,
read_only image2d_t pix2,
__global uint* p_fladIntra_sum,
__global uint* p_fladInter1_sum,
__global uint* p_fladInter2_sum,
__global uint* p_fldc_sum,
__local int4* localmem_fladIntra,
__local int4* localmem_fladInter1,
__local int4* localmem_fladInter2,
__local int4* localmem_fldc,
const int localSize,
const int rest_x,
const int rest_y)
{
const sampler_t samplerA = CLK_NORMALIZED_COORDS_FALSE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int4 tmp_Intra;
int4 tmp_Inter1;
int4 tmp_Inter2;
int4 tmp_Fldc;
int g_i= get_global_id(1);
int g_j = get_global_id(0);
int l_i= get_local_id(1);
int l_j = get_local_id(0);
int gid_i= get_group_id(1);
int gid_j= get_group_id(0);
int wg_i=get_num_groups (1);
int wg_j=get_num_groups (0);
int l_size_i = get_local_size(1);
int l_size_j = get_local_size(0);
int g_size_i = get_global_size(1);
int g_size_j = get_global_size(0);
//Load data and perform FLAD
if(g_j%2==0 && g_j
localmem_fladIntra[(l_j/2)*l_size_i+l_i]=convert_int4(abs(read_imagei(pix, samplerA, (int2)(g_i, g_j)) - read_imagei(pix, samplerA, (int2)(g_i, g_j+1))));
localmem_fladInter1[l_j*l_size_i+l_i]=convert_int4(abs(read_imagei(pix, samplerA, (int2)(g_i, g_j)) - read_imagei(pix1, samplerA, (int2)(g_i, g_j))));
localmem_fladInter2[l_j*l_size_i+l_i]=convert_int4(abs(read_imagei(pix, samplerA, (int2)(g_i, g_j)) - read_imagei(pix2, samplerA, (int2)(g_i, g_j))));
localmem_fldc[l_j*l_size_i+l_i]=convert_int4(abs(read_imagei(pix, samplerA, (int2)(g_i, g_j)) ));
#ifndef SKIP_REDUCTION
/* ########################################################################*/
barrier(CLK_LOCAL_MEM_FENCE);
/* #######################################################################*/
// repeat reduction in local memory
for(int s = localSize>>1; s > 1; s >>= 1)
{
//skip non-valide values from partially filled workgroups (last WG of each dimension
if(l_j*l_size_i+l_i< s)
{
if(s <= localSize>>2)
localmem_fladIntra[l_j*l_size_i+l_i] += localmem_fladIntra[l_j*l_size_i+l_i+ s];
localmem_fladInter1[l_j*l_size_i+l_i] += localmem_fladInter1[l_j*l_size_i+l_i+ s];
localmem_fladInter2[l_j*l_size_i+l_i] += localmem_fladInter2[l_j*l_size_i+l_i+ s];
localmem_fldc[l_j*l_size_i+l_i] += localmem_fldc[l_j*l_size_i+l_i+ s];
}
// keep barrier outside conditional
/* ###################################################################*/
barrier(CLK_LOCAL_MEM_FENCE);
/* ####################################################################*/
}
#endif //SKIP_REDUCTION
// write result to global memory
if (l_i== 0 && l_j==0) //&& (gid_i < (wg_i-1) || rest_x==0) && (gid_j < (wg_j-1) || rest_y==0))
{
tmp_Intra=(localmem_fladIntra[0] + localmem_fladIntra[1]);
tmp_Inter1=(localmem_fladInter1[0] + localmem_fladInter1[1]);
tmp_Inter2=(localmem_fladInter2[0] + localmem_fladInter2[1]);
tmp_Fldc=(localmem_fldc[0] + localmem_fldc[1]);
p_fladIntra_sum[gid_j*wg_i + gid_i] =tmp_Intra.s0 + tmp_Intra.s1 + tmp_Intra.s2 + tmp_Intra.s3;
p_fladInter1_sum[gid_j*wg_i + gid_i] =tmp_Inter1.s0 + tmp_Inter1.s1 + tmp_Inter1.s2 + tmp_Inter1.s3;
p_fladInter2_sum[gid_j*wg_i + gid_i] =tmp_Inter2.s0 + tmp_Inter2.s1 + tmp_Inter2.s2 + tmp_Inter2.s3;
p_fldc_sum[gid_j*wg_i + gid_i] =tmp_Fldc.s0 + tmp_Fldc.s1 + tmp_Fldc.s2 + tmp_Fldc.s3;
}
}
I use workgroup size of 128 x 2, which is the best after testing different sizes.
The execution time is about 2.25ms for this kernel. I do not take into account the time for data transfert which is about 2ms.
The reduction uses 1.5ms of GPU time.
Also, as I am measuring it from a desktop computer which has no other graphic card, is there some king of configuration I should use during the measurements, no to disturb the GPU?
Thank you.
Chris