Bidirectional OpenCL Channel Stalls (both at read and write insts)

SChun19 · ‎05-30-2020

Hi, I am experiencing an issue where there are stalls at both the read and the write instructions of a channel in a feed-forward design. I am not sure how to fix this, or even how to interpret it, since stalls on both sides of the same channel seem contradictory. I have attached the profiler output and will paste the code below. Thanks.

#pragma OPENCL EXTENSION cl_intel_channels : enable
 
// Tile sizes shared by every kernel in this file.
// Tc: tile width along xx (output columns, Q).
#define Tc 4
// Tm: tile size along ax1 (output features, K).
#define Tm 16
// Tn: tile size along rc (input feature maps, C).
#define Tn 8
 
// Payload carried on loadAChannel: one row of Tc input values plus two
// control flags that the consumer uses to track the rc reduction.
struct input_features {
  float input_buf[Tc];  // Tc input values for one rc index
  bool rc_zero;         // set by the producer when rc == 0 (first rc tile)
  bool flush;           // set by the producer on the last rc tile (rc + Tn >= rc_bound)
};
 
// Payload carried on loadBChannel: Tn weights for a single output feature.
struct filter_weights {
  float weight_buf[Tn];  // one weight per rc index within the current tile
};
 
// Payload carried on storeCChannel: a full Tm x Tc output tile.
struct outputs {
  float output_buf[Tm][Tc];  // indexed [output feature][output column]
};
 
// Channels connecting the kernels; each is declared with a requested depth of 64.
// loadA -> monolithic
channel struct input_features loadAChannel __attribute__((depth(64)));
// loadB -> monolithic
channel struct filter_weights loadBChannel __attribute__((depth(64)));
// monolithic -> storeC
channel struct outputs storeCChannel __attribute__((depth(64)));
 
/*
 * Producer for loadAChannel.
 *
 * For every (ax1 tile, yy, xx tile, rc tile) it writes Tn packets, one per
 * rc index inside the tile. Each packet holds Tc synthetic float values
 * built from an integer bit pattern (0x3F800000 is the IEEE-754 encoding of
 * 1.0f) rather than values read from global memory. The global pointer
 * arguments are not referenced here.
 *
 * The loop nest must have the same trip counts as loadB and storeC so that
 * the number of packets produced matches what monolithic/storeC consume.
 */
kernel void loadA(global float* restrict compute, global float* volatile restrict input0, global float* restrict input1, global float* restrict T_clip, global float* restrict input2, global float* restrict input3,
        int ax1_bound, int yy_bound, int xx_bound, int rc_bound) {
    for (int ax1 = 0; ax1 < ax1_bound; ax1 += Tm) {
      for (int yy = 0; yy < yy_bound; yy++) {
        // Column tiles are bounded by xx_bound (previously yy_bound, which
        // left xx_bound unused).
        for (int xx = 0; xx < xx_bound; xx += Tc) {
          for (int rc = 0; rc < rc_bound; rc += Tn) {
            struct input_features i_local;
            // Flags are constant for all Tn packets of this rc tile.
            i_local.rc_zero = (rc == 0);
            i_local.flush = (rc + Tn) >= rc_bound;

            for (int tii = 0; tii < Tn; tii++) {
              #pragma unroll
              for (int tcc = 0; tcc < Tc; tcc++) {
                // Reinterpret the integer bit pattern as a float.
                uint tmp = (0x3F800000 + tcc) + ((rc * 1 + tii) & 0xFFFF);
                i_local.input_buf[tcc] = *(float *) &tmp;
              }
              write_channel_intel(loadAChannel, i_local);
            }
          }
        }
      }
    }
}

/*
 * Producer for loadBChannel.
 *
 * For every (ax1 tile, yy, xx tile, rc tile) it writes Tm packets, one per
 * output feature inside the tile. Each packet holds Tn synthetic float
 * weights built from an integer bit pattern. The global pointer arguments
 * are not referenced here.
 */
kernel void loadB(global float* restrict compute, global float* restrict input0, global float* volatile restrict input1, global float* restrict T_clip, global float* restrict input2, global float* restrict input3,
        int ax1_bound, int yy_bound, int xx_bound, int rc_bound) {
    for (int ax1 = 0; ax1 < ax1_bound; ax1 += Tm) {
      for (int yy = 0; yy < yy_bound; yy++) {
        // Column tiles are bounded by xx_bound (previously yy_bound); this
        // must match loadA and storeC.
        for (int xx = 0; xx < xx_bound; xx += Tc) {
          for (int rc = 0; rc < rc_bound; rc += Tn) {
            struct filter_weights w_local;

            for (int too = 0; too < Tm; too++) { // ax1
              #pragma unroll
              for (int tii = 0; tii < Tn; tii++) { // rc
                // Reinterpret the integer bit pattern as a float.
                uint tmp = (0x3F800000 + too) + ((rc * 1 + tii) & 0xFFFF);
                w_local.weight_buf[tii] = *(float *) &tmp;
              }
              write_channel_intel(loadBChannel, w_local);
            }

          }
        }
      }
    }
}

/*
 * Autorun consumer/producer sitting between the load kernels and storeC.
 *
 * Each iteration of the endless loop reads Tn packets from loadAChannel,
 * then Tm packets from loadBChannel, and, when the flush flag of the last
 * A packet is set, writes one Tm x Tc tile to storeCChannel.
 *
 * NOTE(review): the two read loops run one after the other, so
 * loadBChannel is not read while loadAChannel is being drained and vice
 * versa. This is a plausible source of stalls on the producers' channel
 * writes - confirm against the profiler.
 * NOTE(review): output_buf is never written in the code shown (the compute
 * step is elided), and resetsum is assigned but not used.
 */
__attribute__((max_global_work_dim(0)))
__attribute__((autorun))
kernel void monolithic() {
    float __attribute__((memory)) output_buf[Tm][Tc];
    float __attribute__((memory)) weight_buf[Tm][Tn];
    float __attribute__((memory)) input_buf[Tn][Tc];


    while (1) {
        struct outputs out;
        bool resetsum, flush;

        // Stage 1: receive Tn input rows; the flags of the last packet win.
        for (int tii = 0; tii < Tn; tii++) { // rc - input feature maps (C)
          struct input_features valA = read_channel_intel(loadAChannel);
          resetsum = valA.rc_zero;
          flush = valA.flush;
          #pragma unroll
          for (int tcc = 0; tcc < Tc; tcc++) { // xx - output columns (Q)
            input_buf[tii][tcc] = valA.input_buf[tcc];
          }
        }
        // Stage 2: receive Tm weight rows.
        for (int too = 0; too < Tm; too++) { // ax1 - output features (K)
          struct filter_weights valB = read_channel_intel(loadBChannel);
          #pragma unroll
          for (int tii = 0; tii < Tn; tii++) { // rc - input feature maps (C)
            weight_buf[too][tii] = valB.weight_buf[tii];
          }
        }

        /* compute here */

        // Stage 3: on the last rc tile, forward the accumulated tile.
        if (flush) {
            #pragma unroll
            for (int too = 0; too < Tm; too++) { // ax1 - output features (K)
              #pragma unroll
              for (int tcc = 0; tcc < Tc; tcc++) { // xx - output columns (Q)
                out.output_buf[too][tcc] = output_buf[too][tcc];
              }
            }
            write_channel_intel(storeCChannel, out);
        }
    }
}

/*
 * Consumer for storeCChannel.
 *
 * Reads one Tm x Tc tile per (ax1 tile, yy, xx tile) and adds 1 to every
 * element of the local copy. The global pointer arguments are not
 * referenced here.
 *
 * NOTE(review): the modified tile is not written anywhere in the code
 * shown, so the increment has no visible effect.
 */
kernel void storeC(global float* restrict compute, global float* restrict input0, global float* restrict input1, global float* restrict T_clip, global float* restrict input2, global float* restrict input3,
        int ax1_bound, int yy_bound, int xx_bound, int rc_bound) {
    for (int ax1 = 0; ax1 < ax1_bound; ax1 += Tm) {
      for (int yy = 0; yy < yy_bound; yy++) {
        // Column tiles are bounded by xx_bound (previously yy_bound); this
        // must match loadA and loadB so one tile is read per flush.
        for (int xx = 0; xx < xx_bound; xx += Tc) {
          struct outputs out_local = read_channel_intel(storeCChannel);

          #pragma unroll
          for (int too = 0; too < Tm; too++) {
              #pragma unroll
              for (int tcc = 0; tcc < Tc; tcc++) {
                out_local.output_buf[too][tcc] += 1;
              }
          }
        }
      }
    }
}

HRZ · ‎05-30-2020

It is actually not very surprising to see stalls on both sides of a channel. Stalls, especially if they come from external memory, tend to propagate all the way through the pipeline. It is very much possible that the stalls on the channel write are coming from the external memory reads supplying those channel writes, and stalls on channel reads come from the channel being empty due to channel writes being stalled, i.e. external memory read stalls are propagating all the way down the pipeline. However, I do not see any external memory accesses in your design, nor do I see any access to external memory ports being instantiated by the compiler when compiling your kernel; hence, I am not sure what exactly it is doing. What is the point of the pointer arithmetic on lines 38 and 60?

SChun19 · ‎05-30-2020

I intentionally removed all external memory accesses and replaced them with a pseudorandom generator on 38 and 60 rather than pointer arithmetic so that I can rule that out. There are no global memory accesses here.

(that pseudorandom generation technique is from the matrix multiplication example from Quartus 19.1)

AnilErinch_A_Intel · ‎06-09-2020

Hi ,

Can you increase the depth of the channel from 64 and let us know the results?

Thanks and Regards

Anil