Solved: OpenCL FPGA: actual results differ from emulation results

jackgreen · ‎02-22-2021

Platform: DE10-nano soc, Intel FPGA SDK for OpenCL 18.1

I am designing a matrix multiplication kernel similar to this one: https://cnugteren.github.io/tutorial/pages/page8.html

It uses 3D work items to basically multiply many sets of two-matrix pairs and output the results.

The emulation passes, while the actual design on-chip didn't. When running on FPGA, only the first few digits match with correct results.

I am thinking maybe it has something to do with the way the emulator emulates multiple work items. But I add barriers whenever I load values to local memory.

Could anyone provide some insights on the difference between multiple work items implementation in emulation and actual design?

#include "config.h"

uint8_t gf_mu_x86(uint8_t a, uint8_t b) {
	uint8_t p = 0; /* the product of the multiplication */
    #pragma unroll
	for (int i=0;i<8;i++){
            // if (!(a && b)){
            //         break;
            //     }
            if (b & 1) /* if b is odd, then add the corresponding a to p (final product = sum of all a's corresponding to odd b's) */
                p ^= a; /* since we're in GF(2^m), addition is an XOR */

            if (a & 0x80) /* GF modulo: if a >= 128, then it will overflow when shifted left, so reduce */
                a = (a << 1) ^ 0x11D; /* XOR with the primitive polynomial x^8 + x^4 + x^3 + x + 1 (0b1_0001_1011) – you can change it but it must be irreducible */
            else
                a <<= 1; /* equivalent to a*2 */
            b >>= 1; /* equivalent to b // 2 */
            
	}
	return p;
}

int address_interpretor(int x, int y, int offset, __global const uint8_t* restrict sample_idx){
    // use x to find index of required packet (file space) in sample_idx    
    uint8_t file_pkt_idx = sample_idx[offset+x];
    // calculate idx of required data in file space
    return file_pkt_idx*PKT_SIZE + y;
}

// Use 2D register blocking (further increase in work per thread)
__kernel 
// __attribute__((num_compute_units(CMP_UNIT)))
// __attribute__((max_work_group_size(256))) 
__attribute__((reqd_work_group_size(TSM/WPTM, TSN/WPTN, 1)))  // 8, 1, 1
void myGEMM6(
            __global const uint8_t* restrict A,
            __global const uint8_t* restrict B,
            __global uint8_t* restrict C,
            __global const uint8_t* restrict DEGREE_,
            __global const uint8_t* restrict sample_idx // cached
      
                      ) {
                

    // Thread identifiers
    const int tidm = get_local_id(0); // Local row ID (max: TSM/WPTM == RTSM)
    const int tidn = get_local_id(1); // Local col ID (max: TSN/WPTN == RTSN)
    const int offsetM = TSM*get_group_id(0); // Work-group offset
    const int offsetN = TSN*get_group_id(1); // Work-group offset
    const int batch_id = get_global_id(2); // max: N_BATCH

    // Local memory to fit a tile of A and B
    __local uint8_t Asub[TSK][TSM];
    __local uint8_t Bsub[TSN][TSK+2];
    __local uint8_t degrees[MAX_NUM_BATCH];
    // Allocate register space
    uint8_t Areg;
    uint8_t Breg[WPTN];
    uint8_t acc[WPTM][WPTN];
    int deg_offset = 0;
    uint8_t my_deg;

    // Initialise the accumulation registers
    #pragma unroll
    for (int wm=0; wm<WPTM; wm++) {
        #pragma unroll
        for (int wn=0; wn<WPTN; wn++) {
            acc[wm][wn] = 0;
        }
    }
    
    // load degrees and calculate offsets                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               
    if(tidm == 0 && tidn == 0){
        #pragma unroll
        for(int i=0;i<MAX_NUM_BATCH;i++){
            degrees[i] = DEGREE_[i];
        }
    }                                                                                     
    barrier(CLK_LOCAL_MEM_FENCE);                                                                                                                                                                                                                                      
    for(int i=0;i<batch_id;i++){                                                                                             
        deg_offset += degrees[i];                                                                                            
    }
    my_deg = degrees[batch_id];                                                                                                                        
    
    // Loop over all tiles
    const int numTiles = my_deg/TSK;
    barrier(CLK_LOCAL_MEM_FENCE);
  
    for(int t=0;t<numTiles;t++){

        // Load one tile of A and B into local memory
        // #pragma unroll
        for (int la=0; la<LPTA; la++) {
            int tid = tidn*RTSM + tidm;
            int id = la*RTSN*RTSM + tid;
            int row = MOD2(id,TSM);
            int col = DIV2(id,TSM);
            // float row_ = MOD2(id,TSM);
            // float col_ = DIV2(id,TSM);
            // printf("%f,%f\n",row_,col_);
            int tiledIndex = TSK*t + col;
            int A_vec = address_interpretor(tiledIndex, offsetM + row, deg_offset,sample_idx);
            // Asub[col][row] = A[tiledIndex*PKT_SIZE + offsetM + row];
            Asub[col][row] = A[A_vec];
            Bsub[row][col]= B[tiledIndex*BATCH_SIZE + offsetN + row + deg_offset*BATCH_SIZE];
        }

        // Synchronise to make sure the tile is loaded
        barrier(CLK_LOCAL_MEM_FENCE);

        // Loop over the values of a single tile
        // #pragma unroll
        for (int k=0; k<TSK; k++) {
            // Cache the values of Bsub in registers
            #pragma unroll
            for (int wn=0; wn<WPTN; wn++) {
                int col = tidn + wn*RTSN;
                Breg[wn] = Bsub[col][k];
            }

            // Perform the computation
            #pragma unroll
            for (int wm=0; wm<WPTM; wm++) {
                int row = tidm + wm*RTSM;
                Areg = Asub[k][row];
                #pragma unroll
                for (int wn=0; wn<WPTN; wn++) {
                    acc[wm][wn] ^= gf_mu_x86(Areg , Breg[wn]);
                }
            }
        }

        // Synchronise before loading the next tile
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // Store the final results in C
    // #pragma unroll
    for (int wm=0; wm<WPTM; wm++) {
        int globalRow = offsetM + tidm + wm*RTSM;
        #pragma unroll
        for (int wn=0; wn<WPTN; wn++) {
            int globalCol = offsetN + tidn + wn*RTSN; 
            C[globalCol*PKT_SIZE + globalRow + batch_id*PKT_SIZE*BATCH_SIZE] = acc[wm][wn];
          
        }
    }
    
}

jackgreen · ‎02-23-2021

I got this problem solved. The access of local memory B_sub is out of range when loading value to it. It works well when I increase the size of B_sub.

It's strange though the emulator didn't complain any run-time error.

View solution in original post

jackgreen · ‎02-23-2021

I got this problem solved. The access of local memory B_sub is out of range when loading value to it. It works well when I increase the size of B_sub.

It's strange though the emulator didn't complain any run-time error.

BoonBengT_Intel · ‎05-18-2021

Hi @jackgreen,

Thank you for posting in Intel community forum, hope this message find you well and apologies for the delayed in response due to some technical issues from our platform.
Good to know that you managed to figure this out and sharing the finding on the community.
This thread will be transitioned to community support. If you have new queries, please feel free to open a new thread to get support from Intel experts, and we would be right with you.
Otherwise, the community users will further help you with doubts in this thread.

Best Wishes
BB

OpenCL FPGA: actual results differ from emulation results

FPGA Target

Runtime error