Compiler doesn't vectorize even with simd directive

luca_l_ · ‎03-31-2017

I have this function taken from [here][1]:

    // Resamples `im` into `res` with bilinear interpolation.
    //
    // For the output sample at offsets (i, j) from the centre, the source
    // position read from `im` is
    //     wx = ofsx + i * a11 + j * a12
    //     wy = ofsy + i * a21 + j * a22
    // Results are written one after another through a pointer obtained from
    // res.ptr<float>(0): (2*halfWidth + 1) values for each of the
    // (2*halfHeight + 1) rows of the loop.
    //
    // Returns true if at least one source position fell outside the area that
    // can be interpolated safely (those outputs are set to 0), false otherwise.
    //
    // NOTE(review): both matrices are accessed as float; presumably they are
    // single-channel float images -- confirm against the callers.
    // NOTE(review): the sequential writes look like they assume `res` is stored
    // contiguously and has odd cols/rows (so that 2*half + 1 equals the real
    // size) -- confirm.
    bool interpolate(const Mat &im, float ofsx, float ofsy, float a11, float a12, float a21, float a22, Mat &res)
    {         
       bool ret = false;
       // input size (-1 for the safe bilinear interpolation)
       const int width = im.cols-1;
       const int height = im.rows-1;
       // output size
       const int halfWidth  = res.cols >> 1;
       const int halfHeight = res.rows >> 1;
       // write cursor, advanced by one float per output sample
       float *out = res.ptr<float>(0);
       for (int j=-halfHeight; j<=halfHeight; ++j)
       {
          // contribution of the row offset j; constant over the inner loop
          const float rx = ofsx + j * a12;
          const float ry = ofsy + j * a22;
          for(int i=-halfWidth; i<=halfWidth; ++i)
          {
             // source position for this output sample
             float wx = rx + i * a11;
             float wy = ry + i * a21;
             // top-left corner of the 2x2 neighbourhood used for interpolation
             const int x = (int) floor(wx);
             const int y = (int) floor(wy);
             // x+1 and y+1 are read below, hence the strict test against cols-1 / rows-1
             if (x >= 0 && y >= 0 && x < width && y < height)
             {
                // compute weights
                wx -= x; wy -= y;
                // bilinear interpolation
                *out++ = 
                   (1.0f - wy) * ((1.0f - wx) * im.at<float>(y,x)   + wx * im.at<float>(y,x+1)) +
                   (       wy) * ((1.0f - wx) * im.at<float>(y+1,x) + wx * im.at<float>(y+1,x+1));
             } else {
                // outside the interpolable area: emit 0 and remember it for the return value
                *out++ = 0;
                ret =  true; // touching boundary of the input            
             }
          }
       }
       return ret;
    }

As suggested by [Intel Advisor][2], I added:

#pragma omp simd
for(int i=-halfWidth; i<=halfWidth; ++i)

However, while compiling I got:

warning #15552: loop was not vectorized with "simd"

Googling it, I found [this][3], but it's still not clear to me how I could solve this and vectorize this loop.

[1]: https://github.com/perdoch/hesaff/blob/master/helpers.cpp
[2]: https://www.google.it/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&uact=8&ved=0ahUKEwing__AqoDTAhUGwxQKHYvjDDgQFggaMAA&url=https%3A%2F%2Fsoftware.intel.com%2Fen-us%2Fintel-advisor-xe&usg=AFQjCNGmyci64qb8BQs66l_dLoXdF332dA&sig2=bpIr36nSicjl8sGzZKeF2A&bvm=bv.151325232,d.d24
[3]: https://software.intel.com/en-us/articles/fdiag13379

jimdempseyatthecove · ‎03-31-2017

For a loop to vectorize efficiently, if at all, the iteration space must be "contiguous" (sequential) and there must be a sufficient number of iterations. Your computation of x and y, as well as the boundary check, makes it impossible to make this determination at compile time. The best the compiler can do is to generate scalar instructions (with conditional branching).

Depending on the size of the array, it may be effective to construct 6 temporary arrays:

bool interpolate(const Mat &im, float ofsx, float ofsy, float a11, float a12, float a21, float a22, Mat &res)
{         
   bool ret = false;
   float _wx[im.cols*im.rows];
   float _wy[im.cols*im.rows];
   float _t1[im.cols*im.rows];
   float _t2[im.cols*im.rows];
   float _t3[im.cols*im.rows];
   float _t4[im.cols*im.rows];
   // input size (-1 for the safe bilinear interpolation)
   const int width = im.cols-1;
   const int height = im.rows-1;
   // output size
   const int halfWidth  = res.cols >> 1;
   const int halfHeight = res.rows >> 1;
   float *out = res.ptr<float>(0);
   int iOut = 0;
   for (int j=-halfHeight; j<=halfHeight; ++j)
   {
      const float rx = ofsx + j * a12;
      const float ry = ofsy + j * a22;
      for(int i=-halfWidth; i<=halfWidth; ++i)
      {
         float wx = rx + i * a11;
         float wy = ry + i * a21;
         const int x = (int) floor(wx);
         const int y = (int) floor(wy);
         if (x >= 0 && y >= 0 && x < width && y < height)
         {
            // compute weights
            wx -= x; wy -= y;
            _wx[iOut] = wx;
            _wy[iOut] = wy;
            _t1[iOut] = im.at<float>(y,x);
            _t2[iOut] = im.at<float>(y,x+1);
            _t3[iOut] = im.at<float>(y+1,x);
            _t4[iOut] = im.at<float>(y+1,x+1);
         } else {
            _wy[iOut] = 0.0f;
            _wx[iOut] = 0.0f;
            _t1[iOut] = 0.0f;
            _t2[iOut] = 0.0f;
            _t3[iOut] = 0.0f;
            _t4[iOut] = 0.0f;
            ret =  true; // touching boundary of the input            
         }
         ++iOut;
      }
   }
   for (int i=0; i<iOut; ++i)
   {
      // bilinear interpolation
      out = 
               (1.0f - _wy) * ((1.0f - _wx) * _t1 + _wx * _t2) +
               (       _wy) * ((1.0f - _wx) * _t3 + _wx * _t4);
      }
   }
   return ret;
}
The above is untested code.

What remains unknown is whether the cost of populating the temporary arrays is recovered by the gains from vectorization. You will have to test this (as well as correct any typographical errors above).

Jim Dempsey