Undefined behavior in matrix multiplication kernel

Richard_S_7 · ‎06-30-2022

Hi,

I have a generalized matrix multiplication kernel and am getting incorrect results. This seems to stem from undefined behavior, as the result is correct when I do one of the following:

add a barrier inside the `step` loop
remove the last pair of parentheses in the index calculation of the `c_glb` access in the last loop nest

I have tried the following so far to find the error but with no luck:

traced every read and write access from every work-item: they access the correct array elements and there should be no data races
I found that the incorrect results are produced in the last loop nest in line `c_glb[...] += a_glb[...] * b_lcl[...];`. When tracing the memory accesses of a work-item I can see that the first accumulation produces the correct result in `c_glb`, but the second iteration overwrites the previously written value to `c_glb` instead of accumulating the results. I have checked that no other work-item tries to overwrite that array element.
execute with oclgrind: no warnings shown
translate to CUDA and execute on GPU: no undefined behavior and no warnings/errors when executing with cuda-memcheck

Do you by any chance see anything that could be causing undefined behavior in this kernel or do you have any recommendations for other tools/methods I could use to further narrow down where this problem comes from?

Many thanks in advance!

__kernel void matmul( __global float const * const __restrict__ a_glb, __global float const * const __restrict__ b_glb, __global float * const __restrict__ _, __global float * const __restrict__ c_glb) {
    __local float b_lcl[(4 * 1 * 1) * (2 * 2 * 2) * (1 * 1 * 2) * (2 * 1 * 1)];
    const size_t wi_3 = get_local_id(0) / (2 * 4 * 1) % (2);
    const size_t wi_5 = get_local_id(0) / (2) % (4);
    const size_t wi_6 = get_local_id(0) % (2);

    // init result memory
    for (size_t lcl_6 = 0; lcl_6 < 2; ++lcl_6) {
    for (size_t prv_6 = 0; prv_6 < 2; ++prv_6) {
        c_glb[wi_3 * 32 + wi_5 * 8 + wi_6 * 4 + lcl_6 * 2 + prv_6] = 0.0f;
    }}

    // cache b_glb
    #pragma unroll
    for (size_t step = 0; step < (((4 * 1 * 1 ) * (2 * 2 * 2 ) * (1 * 1 * 2 ) * (2 * 1 * 1 )) / 1) / (1 * 1 * 2 * 1 * 4 * 2 * 1); ++step) {
        b_lcl[step * (1 * 1 * 2 * 1 * 4 * 2 * 1) + get_local_id(0)] =
        b_glb[((0 * 1 * 4 * 1 * 1 + 0 * 4 * 1 * 1 ) + ((step * (1 * 1 * 2 * 1 * 4 * 2 * 1) + get_local_id(0)) / ((2 * 2 * 2 ) * (1 * 1 * 2 ) * ((2 * 1 * 1 ) / 1)) % ((4 * 1 * 1 )))) * (8) * (2) * ((2 / 1)) + ((0 * 1 * 2 * 2 * 2 + 0 * 2 * 2 * 2 ) + ((step * (1 * 1 * 2 * 1 * 4 * 2 * 1) + get_local_id(0)) / ((1 * 1 * 2 ) * ((2 * 1 * 1 ) / 1)) % ((2 * 2 * 2 )))) * (2) * ((2 / 1)) + ((0 * 1 * 1 * 1 * 2 + 0 * 1 * 1 * 2 ) + ((step * (1 * 1 * 2 * 1 * 4 * 2 * 1) + get_local_id(0)) / ((2 * 1 * 1 ) / 1) % ((1 * 1 * 2 )))) * ((2 / 1)) + (((0 * 1 * 2 * 1 * 1 + 0 * 2 * 1 * 1 ) / 1) + ((step * (1 * 1 * 2 * 1 * 4 * 2 * 1) + get_local_id(0)) % (((2 * 1 * 1 ) / 1)))) ];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // process tile
    for (size_t lcl_6 = 0; lcl_6 < 2; ++lcl_6) {
    for (size_t prv_7 = 0; prv_7 < 2; ++prv_7) {
    for (size_t prv_6 = 0; prv_6 < 2; ++prv_6) {
        c_glb[(wi_3) * (1) * (4) * (8) + (wi_5) * (8) + (wi_6 * 2 * 2 + lcl_6 * 2 + prv_6)] +=
        a_glb[prv_7] *
        b_glb[(wi_5) * (2 * 2 * 2) * (1 * 1 * 2) * (2 * 1 * 1) + (wi_6 * 2 * 2 + lcl_6 * 2 + prv_6) * (1 * 1 * 2) * (2 * 1 * 1) + (prv_7) * (2 * 1 * 1) + (wi_3) ];
    }}}
}

Undefined behavior in matrix multiplication kernel

Runtime error