Platform: DE10-nano soc, Intel FPGA SDK for OpenCL 18.1
I am designing a matrix multiplication kernel similar to this one: https://cnugteren.github.io/tutorial/pages/page8.html
It uses 3D work items to basically multiply many sets of two-matrix pairs and output the results.
The emulation passes, while the actual design on-chip didn't. When running on FPGA, only the first few digits match with correct results.
I am thinking maybe it has something to do with the way the emulator emulates multiple work items. But I add barriers whenever I load values to local memory.
Could anyone provide some insights on the difference between multiple work items implementation in emulation and actual design?
#include "config.h"
uint8_t gf_mu_x86(uint8_t a, uint8_t b) {
uint8_t p = 0; /* the product of the multiplication */
#pragma unroll
for (int i=0;i<8;i++){
// if (!(a && b)){
// break;
// }
if (b & 1) /* if b is odd, then add the corresponding a to p (final product = sum of all a's corresponding to odd b's) */
p ^= a; /* since we're in GF(2^m), addition is an XOR */
if (a & 0x80) /* GF modulo: if a >= 128, then it will overflow when shifted left, so reduce */
a = (a << 1) ^ 0x11D; /* XOR with the primitive polynomial x^8 + x^4 + x^3 + x + 1 (0b1_0001_1011) – you can change it but it must be irreducible */
else
a <<= 1; /* equivalent to a*2 */
b >>= 1; /* equivalent to b // 2 */
}
return p;
}
int address_interpretor(int x, int y, int offset, __global const uint8_t* restrict sample_idx){
// use x to find index of required packet (file space) in sample_idx
uint8_t file_pkt_idx = sample_idx[offset+x];
// calculate idx of required data in file space
return file_pkt_idx*PKT_SIZE + y;
}
// Use 2D register blocking (further increase in work per thread)
__kernel
// __attribute__((num_compute_units(CMP_UNIT)))
// __attribute__((max_work_group_size(256)))
__attribute__((reqd_work_group_size(TSM/WPTM, TSN/WPTN, 1))) // 8, 1, 1
void myGEMM6(
__global const uint8_t* restrict A,
__global const uint8_t* restrict B,
__global uint8_t* restrict C,
__global const uint8_t* restrict DEGREE_,
__global const uint8_t* restrict sample_idx // cached
) {
// Thread identifiers
const int tidm = get_local_id(0); // Local row ID (max: TSM/WPTM == RTSM)
const int tidn = get_local_id(1); // Local col ID (max: TSN/WPTN == RTSN)
const int offsetM = TSM*get_group_id(0); // Work-group offset
const int offsetN = TSN*get_group_id(1); // Work-group offset
const int batch_id = get_global_id(2); // max: N_BATCH
// Local memory to fit a tile of A and B
__local uint8_t Asub[TSK][TSM];
__local uint8_t Bsub[TSN][TSK+2];
__local uint8_t degrees[MAX_NUM_BATCH];
// Allocate register space
uint8_t Areg;
uint8_t Breg[WPTN];
uint8_t acc[WPTM][WPTN];
int deg_offset = 0;
uint8_t my_deg;
// Initialise the accumulation registers
#pragma unroll
for (int wm=0; wm<WPTM; wm++) {
#pragma unroll
for (int wn=0; wn<WPTN; wn++) {
acc[wm][wn] = 0;
}
}
// load degrees and calculate offsets
if(tidm == 0 && tidn == 0){
#pragma unroll
for(int i=0;i<MAX_NUM_BATCH;i++){
degrees[i] = DEGREE_[i];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
for(int i=0;i<batch_id;i++){
deg_offset += degrees[i];
}
my_deg = degrees[batch_id];
// Loop over all tiles
const int numTiles = my_deg/TSK;
barrier(CLK_LOCAL_MEM_FENCE);
for(int t=0;t<numTiles;t++){
// Load one tile of A and B into local memory
// #pragma unroll
for (int la=0; la<LPTA; la++) {
int tid = tidn*RTSM + tidm;
int id = la*RTSN*RTSM + tid;
int row = MOD2(id,TSM);
int col = DIV2(id,TSM);
// float row_ = MOD2(id,TSM);
// float col_ = DIV2(id,TSM);
// printf("%f,%f\n",row_,col_);
int tiledIndex = TSK*t + col;
int A_vec = address_interpretor(tiledIndex, offsetM + row, deg_offset,sample_idx);
// Asub[col][row] = A[tiledIndex*PKT_SIZE + offsetM + row];
Asub[col][row] = A[A_vec];
Bsub[row][col]= B[tiledIndex*BATCH_SIZE + offsetN + row + deg_offset*BATCH_SIZE];
}
// Synchronise to make sure the tile is loaded
barrier(CLK_LOCAL_MEM_FENCE);
// Loop over the values of a single tile
// #pragma unroll
for (int k=0; k<TSK; k++) {
// Cache the values of Bsub in registers
#pragma unroll
for (int wn=0; wn<WPTN; wn++) {
int col = tidn + wn*RTSN;
Breg[wn] = Bsub[col][k];
}
// Perform the computation
#pragma unroll
for (int wm=0; wm<WPTM; wm++) {
int row = tidm + wm*RTSM;
Areg = Asub[k][row];
#pragma unroll
for (int wn=0; wn<WPTN; wn++) {
acc[wm][wn] ^= gf_mu_x86(Areg , Breg[wn]);
}
}
}
// Synchronise before loading the next tile
barrier(CLK_LOCAL_MEM_FENCE);
}
// Store the final results in C
// #pragma unroll
for (int wm=0; wm<WPTM; wm++) {
int globalRow = offsetM + tidm + wm*RTSM;
#pragma unroll
for (int wn=0; wn<WPTN; wn++) {
int globalCol = offsetN + tidn + wn*RTSN;
C[globalCol*PKT_SIZE + globalRow + batch_id*PKT_SIZE*BATCH_SIZE] = acc[wm][wn];
}
}
}
Link Copied
I got this problem solved. The access of local memory B_sub is out of range when loading value to it. It works well when I increase the size of B_sub.
It's strange though the emulator didn't complain any run-time error.
For more complete information about compiler optimizations, see our Optimization Notice.