Problems with reduction done in CPU

Diego_G_ · ‎05-29-2015

Hi all. I have been trying to code reductions for CPU and GPU. The kernels attached below work really

well for Intel GPU's and Nvidia GPU. But, when I compile for CPU (Intel). The results are not consistent.

Sometimes, the result is right sometimes the result is wrong. There are two kernels: reduction_vector

is called many times by the host. When, the global_size is reduced to local_size. I issue complete_vector to finalize

the reduction.

__kernel void reduction_vector(__global int* data, __local int* partial_sums)

{

int lid = get_local_id(0);

int group_size = get_local_size(0);

partial_sums[lid] = data[get_global_id(0)];

barrier(CLK_LOCAL_MEM_FENCE);

for (int i = 1; i < group_size; i <<= 1) {

int mask = (i << 1) - 1;

if ((lid & mask) == 0) {

partial_sums[lid] += partial_sums[lid + i];

}

barrier(CLK_LOCAL_MEM_FENCE);

}

if(lid == 0) {

data[get_group_id(0)] = partial_sums[0];

}

__kernel void reduction_complete(__global int* data,

__local int* partial_sums, __global int *sum) {

int lid = get_local_id(0);

int group_size = get_local_size(0);

partial_sums[lid] = data[get_local_id(0)];

barrier(CLK_LOCAL_MEM_FENCE);

for (int i = 1; i < group_size; i <<= 1) {

int mask = (i << 1) - 1;

if ((lid & mask) == 0) {

partial_sums[lid] += partial_sums[lid + i];

}

barrier(CLK_LOCAL_MEM_FENCE);

}

if(lid == 0) {

*sum = partial_sums[0];

}

This is the host code

local_size = 128;

/* Create data buffer */

data_buffer = clCreateBuffer(oclobjects.context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_int)* ARRAY_SIZE, data, &err);

sum_buffer = clCreateBuffer(oclobjects.context, CL_MEM_WRITE_ONLY,sizeof(cl_int), NULL, &err);

if(err < 0) {

perror("Couldn't create a buffer");

exit(1);

};

clEnqueueWriteBuffer(oclobjects.queue, data_buffer, CL_TRUE, 0, sizeof(cl_int) * ARRAY_SIZE, data, 0, NULL, NULL);

clFinish(oclobjects.queue);

/* Set arguments for vector kernel */

err = clSetKernelArg(vector_kernel, 0, sizeof(cl_mem), &data_buffer);

err |= clSetKernelArg(vector_kernel, 1, local_size * sizeof(cl_int), NULL);

/* Set arguments for complete kernel */

err = clSetKernelArg(complete_kernel, 0, sizeof(cl_mem), &data_buffer);

err |= clSetKernelArg(complete_kernel, 1, local_size * sizeof(cl_int), NULL);

err |= clSetKernelArg(complete_kernel, 2, sizeof(cl_mem), &sum_buffer);

if(err < 0) {

perror("Couldn't create a kernel argument");

exit(1);

}

/* Enqueue kernels */

global_size = ARRAY_SIZE;

err = clEnqueueNDRangeKernel(oclobjects.queue, vector_kernel, 1, NULL, &global_size,

&local_size, 0, NULL, NULL);

if(err < 0) {

perror("Couldn't enqueue the kernel");

exit(1);

}

printf("Global size = %lu\n", global_size);

/* Perform successive stages of the reduction */

while(global_size/local_size > local_size) {

global_size = global_size/local_size;

err = clEnqueueNDRangeKernel(oclobjects.queue, vector_kernel, 1, NULL, &global_size,

&local_size, 0, NULL, NULL);

printf("Global size = %lu\n", global_size);

if(err < 0) {

perror("Couldn't enqueue the kernel");

exit(1);

}

global_size = global_size/(local_size);

local_size = global_size;

err = clEnqueueNDRangeKernel(oclobjects.queue, complete_kernel, 1, NULL, &global_size,

&local_size, 0, NULL, NULL);

printf("Global size = %lu\n", global_size);

/* Read the result */

err = clEnqueueReadBuffer(oclobjects.queue, sum_buffer, CL_TRUE, 0, sizeof(cl_int), &sum, 0, NULL, NULL);

clFinish(oclobjects.queue);

if (err < 0) {

perror("Couldn't read the buffer");

exit(1);

}

/* Finish processing the queue and get profiling information */

clFinish(oclobjects.queue);

It does look to me that this Intel's bug in the CPU runtime. Notice, I tried two runtimes:

1. Runtime 14.2 x64

2. Runtime 15.1.x64

Thanks, for your help....

Diego

Robert_I_Intel · ‎05-29-2015

Diego,

Could you please provide more details about your system, e.g. processor, OS, graphics and CPU OpenCL driver versions?

Thanks!

Diego_G_ · ‎05-30-2015

Thank you Robert, this is the system:

Windows 8.1
Microsoft Visual Studio 2013
Code Builder for OpenCL Applications Version 1.3.0.92
OpenCL runtime for Intel Core and Xeon Processors 5.0.0.57
Intel(R) Core(TM) i3-3220 CPU@3.30GHz

I have attached the file SumRedux.rar. It has the whole project. Notice, that I use

Intel OpenCL C++ wrappers. Like, the Intel OpenCL demos. The code runs fine

with Intel HD Graphics 4400 and Nvidia GPU. Try to run the code multiple times

sometimes it works sometimes it fails. Larger is the data set bigger instances of failure

will happen. The code is small... Thank, you for your help....

Robert_I_Intel · ‎06-01-2015

Hi Diego,

I was able to reproduce the issue on my end. I suspect there is a subtle race condition that affects CPU part since CPU executes things one work item at a time, but I couldn't find one. Sent to our in-house expert for review.

One more thing: for some reason, when I split

if (lid < stride) {

partial_sums[lid] += partial_sums[lid + i];

}

which does both local reads and local writes in the same statement into two separate pieces (one doing only reads then barrier then only write) - CPU version ALWAYS fails.

Diego_G_ · ‎06-01-2015

Thank you Robert, it is a race condition at CPU code generator. Let us wait for the expert, He may have a simple fix.

By the way, other type of reductions runs fine in the CPU. I tried min and argmin with OpenCL intrinsics (select, isless). It just works fine

using the same address mode as the accumulator code.

Best regards,

Diego

Robert_I_Intel · ‎06-01-2015

Hi Diego,

What about the following reduction:

__kernel void 
reduce( __global float* input, __global float* output, __local float* localmem)
{
    int lid= get_local_id(0);
    int gid= get_global_id(0);

    // Phase 1: First addition + Load data to local memory
    localmem[lid] = input[gid*2] + input[gid*2+1];

    barrier(CLK_LOCAL_MEM_FENCE);

    // Phases 2,3: repeat reduction in local memory
    float tmp;
    for (int s = get_local_size(0)/2; s>1; s >>= 1)
    {
        if (lid < s)  // only do work on “remaining” work items
            tmp = localmem[lid*2] + localmem[lid*2+1];

        // All threads must reach barrier
        barrier(CLK_LOCAL_MEM_FENCE);

        if (lid < s)
            localmem[lid] = tmp;

        // All threads must reach barrier
        barrier(CLK_LOCAL_MEM_FENCE);    
    }

    // Phase 4: write result to global memory
    if (lid == 0)
        output[get_group_id(0)] = localmem[0] + localmem[1];

}

You might want to modify this one to fit your needs: shouldn't have a race on the CPU.

Diego_G_ · ‎06-02-2015

Thanks Robert, I will try your suggestion.

Diego

Robert_I_Intel · ‎06-03-2015

Hi Diego, your code does have a race and Yuri Kulakov, our OpenCL expert found it. Here is his explanation:

The problem is in the kernel code, at line 18:

data[get_group_id(0)] = partial_sums[0];

Basically, this logic expects that work-groups are executed sequentially, but this is not the case (for CPU at least). What happens is that some work-group with a higher index might be executed earlier and so it rewrites the data that has not been processed yet by the lower index (say 0) work-group.

One obvious solution would be to write temporary reduction sums to indexes above the current global size (of course allocating necessary amount of space first) and adjust offsets for each iteration of reduction.

Diego_G_ · ‎06-03-2015

Robert, the explanation from Yuri makes sense. The GPU always serialize that. I thought that the same would be true for the CPU.

The deal that I am confused why the attached code always seems to work in any condition.It works in CPU's and GPU's...

I have coded below a maxredux code

where, I am trying to find out the max and argmax of the array under study. Notice, that I am doing things in place (destroy original data)

because, I am dealing with video data (I am a video signal processing guy).

__kernel void reduction_vector(global float* data, global uint *index, local float* partial_max, local uint* partial_index)
{

   int greatere_than;
   int lid = get_local_id(0);
   int lid1;
   int gid = get_global_id(0);
   int grid = get_group_id(0);
   int group_size = get_local_size(0);
   int global_size = get_global_size(0);

   partial_max[lid] = data[gid];
   partial_index[lid] = index[gid];
   barrier(CLK_LOCAL_MEM_FENCE);

   for (int i = group_size >>1; i > 0; i >>= 1) {

       if (lid < i) {
           lid1 = lid + i;
           greatere_than = isgreaterequal(partial_max[lid],partial_max[lid1]);
           partial_max[lid] = select(partial_max[lid1], partial_max[lid], greatere_than);
           partial_index[lid] = select(partial_index[lid1], partial_index[lid], greatere_than);
       }
       // Synchronize all work-items so we know all writes to partial's have ocurred
       barrier(CLK_LOCAL_MEM_FENCE);
   }
   // Write to nth position in global output array n=work_group_id
   if(lid == 0) {
       data[grid] = partial_max[0];
       index[grid] = partial_index[0];
   }
}

I cannot get this code to fail ! According, to Yuri this should fail. Anyway, because of performance I may decide to this with AVX.

Sometimes, OpenCL becomes really deceiving. I have attached the project... Thank you so much for your help... Sorry to bother...

Diego

Diego_G_ · ‎06-05-2015

I would like to thanks Robert & Yuri. My reduction code for argmax and max fails for the CPU (Earlier post). I did a gigantic loop of iterations.

It sometimes fail. I did follow Yuri comments for the race condition. My solution is to serialize the CPU code in the following way:

__kernel void reduction_cpu_vector(global float8* data, global uint8 *index, local float8* partial_max, local uint8* partial_index)

{

int8 greatere_than;

int lid = get_local_id(0);

int lid1;

int gid = get_global_id(0);

int grid = get_group_id(0);

int group_size = get_local_size(0);

int global_size = get_global_size(0);

partial_max[lid] = data[gid];

partial_index[lid] = index[gid];

barrier(CLK_LOCAL_MEM_FENCE);

if (lid == 0)

{

for (int i = 0; i < group_size; i++) {

lid1 = lid + i;

greatere_than = isgreaterequal(partial_max[lid],partial_max[lid1]);

partial_max[lid] = select(partial_max[lid1], partial_max[lid], greatere_than);

partial_index[lid] = select(partial_index[lid1], partial_index[lid], greatere_than);

}

// Write to nth position in global output array n=work_group_id

data[grid] = partial_max[0];

index[grid] = partial_index[0];

}

I am using one work-item to perform a max operation within a work-group. I do take advantage of the SIMD side of the story. This works as

AVX code. It is faster than scalar code. It seems that this is the right way to do it for CPU's....

Diego