>>I execute that sample code

mmmmm__hamed · ‎02-15-2017

Hi

I use Intel compiler 15 and I have code that includes a huge number of maximum operations.

I wrote vectorized code for that max operation and the performance of my application dropped dramatically.

My previous code is:

// Scalar reference version. Repeats a full backward sweep over i while
// cycles < MAX_CYCLE. Each sweep step reads inz[illr + 1] and
// inx[illr .. illr + 2] and writes eight values to tempab[0..7], each being
// the maximum of four candidates.
// Every variable except i and max4 is declared outside this listing.
// max_float is also defined elsewhere; presumably it returns the larger of
// its two float arguments -- confirm against its definition.
// NOTE(review): tempab[0..7] is overwritten on every pass and the beta_*
// values are not modified in this listing, so the code that consumes tempab
// has presumably been omitted from the post.
while (cycles < MAX_CYCLE)
{

       // Walk the indices from len - 1 down to 0.
       for (int i = len - 1; i >= 0; i--)
       {
  // Offset of entry i: consecutive entries are llr_height elements apart.
  illr = i*llr_height;

           // Load the four inputs used by this step.
           BG8 = inz[illr + 1];
           BG17 = inx[illr];
           BG18 = inx[illr + 1];
           BG3 = inx[illr + 2];
           // Add BG8 once; these sums are reused by several candidates below.
           BG17_8 = BG17 + BG8;
           BG18_8 = BG18 + BG8;
           BG3_8 = BG3 + BG8;

// tempab[0]: the first candidate is the constant 0.
float max4 = 0;//gamma[0];

max4 = max_float(max4,BG17_8 + beta_7);
max4 = max_float(max4, BG18_8 + beta_4);
max4 = max_float(max4, BG3 + beta_3);
tempab[0] = max4;

// tempab[1]: max of beta_4 and the three sums below.
max4 = beta_4;
max4 = max_float(max4, BG17_8 + beta_3);
max4 = max_float(max4, BG18_8);
max4 = max_float(max4, BG3 + beta_7);
tempab[1] = max4;

// tempab[2]
max4 = BG8 + beta_1;
max4 = max_float(max4, BG17 + beta_6);
max4 = max_float(max4, BG18 + beta_5);
max4 = max_float(max4, BG3_8 + beta_2);
tempab[2] = max4;

// tempab[3]
max4 = BG8 + beta_5;
max4 = max_float(max4, BG17 + beta_2);
max4 = max_float(max4, BG18 + beta_1);
max4 = max_float(max4, BG3_8 + beta_6);
tempab[3] = max4;

// tempab[4]
max4 = BG8 + beta_6;
max4 = max_float(max4, BG17 + beta_1);
max4 = max_float(max4, BG18 + beta_2);
max4 = max_float(max4, BG3_8 + beta_5);
tempab[4] = max4;

// tempab[5]
max4 = BG8 + beta_2;
max4 = max_float(max4, BG17 + beta_5);
max4 = max_float(max4, BG18 + beta_6);
max4 = max_float(max4, BG3_8 + beta_1);
tempab[5] = max4;

// tempab[6]
max4 = beta_7;
max4 = max_float(max4, BG17_8);
max4 = max_float(max4, BG18_8 + beta_3);
max4 = max_float(max4, BG3 + beta_4);
tempab[6] = max4;

// tempab[7]
max4 = beta_3;
max4 = max_float(max4, BG17_8 + beta_4);
max4 = max_float(max4, BG18_8 + beta_7);
max4 = max_float(max4, BG3);
tempab[7] = max4;
              }

// One full sweep completed.
cycles++;
}

and the new vectorized code is:

// Hand-vectorized version of the same computation. The four candidates of
// each of the eight outputs are first written element by element into the
// float arrays V1..V4 (index k holds the candidates of output k), the arrays
// are loaded into 256-bit vectors, and three _mm256_max_ps calls reduce them
// to the eight per-lane maxima.
// NOTE(review): Vec8 is declared but never used in this listing.
__m256 Vec1,Vec2,Vec3,Vec4,Vec5,Vec6,Vec7,Vec8;

// Staging buffers for the four candidate vectors.
// NOTE(review): these arrays carry no alignment specifier, but they are read
// below with _mm256_load_ps, which requires a 32-byte aligned address. Either
// align them to 32 bytes or use _mm256_loadu_ps.
float V1[8];float V2[8];float V3[8];float V4[8];

while (cycles < MAX_CYCLE)
{

for (int i = len - 1; i >= 0; i--) // calculate beta[] based on beta[][i+1]
{

           // Offset of entry i: consecutive entries are llr_height elements apart.
           illr = i*llr_height;
           BG8 = inz[illr + 1];
           //float BG24 = 0;
           //basegamma[16] = basegamma[8];
           BG17 = inx[illr];
           BG18 = inx[illr + 1];
           BG3 = inx[illr + 2];
           // Add BG8 once; these sums are reused by several candidates below.
           BG17_8 = BG17 + BG8;
           BG18_8 = BG18 + BG8;
           BG3_8 = BG3 + BG8;
           // NOTE(review): iplus1 is not used anywhere in this listing.
           int iplus1 = i + 1;

// Fill the candidates one float at a time (32 scalar stores per iteration).
// NOTE(review): writing a buffer with narrow scalar stores and then reloading
// it as one 256-bit vector typically defeats store-to-load forwarding, which
// is a likely reason this version is slower than the scalar one. Confirm
// with a profiler.
V1[0] = 0;

           V2[0] = BG17_8 + beta_7;
           V3[0] = BG18_8 + beta_4;
           V4[0] = BG3 + beta_3;

           V1[1] = beta_4;
           V2[1] = BG17_8 + beta_3;
           V3[1] = BG18_8;
           V4[1] = BG3 + beta_7;

           V1[2] = BG8 + beta_1;
           V2[2] = BG17 + beta_6;
           V3[2] = BG18 + beta_5;
           V4[2] = BG3_8 + beta_2;

           V1[3] = BG8 + beta_5;
           V2[3] = BG17 + beta_2;
           V3[3] = BG18 + beta_1;
           V4[3] = BG3_8 + beta_6;

           V1[4] = BG8 + beta_6;
           V2[4] = BG17 + beta_1;
           V3[4] = BG18 + beta_2;
           V4[4] = BG3_8 + beta_5;

           V1[5] = BG8 + beta_2;
           V2[5] = BG17 + beta_5;
           V3[5] = BG18 + beta_6;
           V4[5] = BG3_8 + beta_1;

           V1[6] = beta_7;
           V2[6] = BG17_8;
           V3[6] = BG18_8 + beta_3;
           V4[6] = BG3 + beta_4;

           V1[7] = beta_3;
           V2[7] = BG17_8 + beta_4;
           V3[7] = BG18_8 + beta_7;
           V4[7] = BG3;

           // Reload the four staged arrays as 8-lane vectors (aligned loads).
           Vec1 = _mm256_load_ps(V1);
           Vec2 = _mm256_load_ps(V2);
           Vec3 = _mm256_load_ps(V3);
           Vec4 = _mm256_load_ps(V4);

           // Per-lane maximum of the four candidate vectors.
           Vec5=_mm256_max_ps(Vec1, Vec2);
           Vec6 = _mm256_max_ps(Vec5, Vec3);
           Vec7 = _mm256_max_ps(Vec6, Vec4);

// The eight maxima are written back into V1 (unaligned store).
// NOTE(review): the scalar listing stores its results in tempab[0..7]; here
// they land in V1 instead -- confirm this is intended.
_mm256_storeu_ps(V1, Vec7);

}//for (int i = len - 1; i >= 0; i--)

       // One full sweep completed.
       cycles++;
   }

I would appreciate it if someone could tell me why my performance dropped and what I can do to correct it.

jimdempseyatthecove · ‎02-15-2017

Your code is not how you would vectorize. To vectorize using AVX intrinsics, it would look something like this (*** untested code ***):

// Lane-oriented AVX version: the candidate vectors are built directly in
// registers with _mm256_set_ps instead of being staged through float arrays.
// Lane k of every vector holds one candidate for output k (tempab[k]).
// All variables other than i and the __m256 locals are declared outside
// this listing.
while (cycles < MAX_CYCLE)
     {

        // Walk the indices from len - 1 down to 0.
        for (int i = len - 1; i >= 0; i--)  
         {
             // Offset of entry i: consecutive entries are llr_height elements apart.
             illr = i*llr_height;

            BG8 = inz[illr + 1];
            BG17 = inx[illr];
            BG18 = inx[illr + 1];
            BG3 = inx[illr + 2];
            // Add BG8 once; these sums are reused by several lanes below.
            BG17_8 = BG17 + BG8;
            BG18_8 = BG18 + BG8;
            BG3_8 = BG3 + BG8;
 
// _mm256_set_ps takes its arguments from the highest lane (7) down to lane 0,
// so the rightmost column below is the candidate set of tempab[0].
//  __m256 float lanes:                lane7          lane6         lane5         lane4         lane3         lane2          lane1          lane0            
__m256 V_max4  = _mm256_set_ps(       beta_3,        beta_7,   BG8+beta_2,   BG8+beta_6,   BG8+beta_5,   BG8+beta_1,        beta_4,          0.0f);
__m256 V_test1 = _mm256_set_ps(BG17_8+beta_4,        BG17_8,  BG17+beta_5,  BG17+beta_1,  BG17+beta_2,  BG17+beta_6, BG17_8+beta_3, BG17_8+beta_7); // 2)
__m256 V_test2 = _mm256_set_ps(BG18_8+beta_7, BG18_8+beta_3,  BG18+beta_6,  BG18+beta_2,  BG18+beta_1,  BG18+beta_5,        BG18_8, BG18_8+beta_4);
__m256 V_test3 = _mm256_set_ps(          BG3,    BG3+beta_4, BG3_8+beta_1, BG3_8+beta_5, BG3_8+beta_6, BG3_8+beta_2,    BG3+beta_7,    BG3+beta_3);

// Per-lane select: GTmask is all-ones in lanes where V_max4 > V_testN.
// (mask & V_max4) | (~mask & V_testN) keeps the running maximum in those
// lanes and takes V_testN in the others.
// NOTE(review): each compare/and/andnot/or group appears equivalent to a
// single _mm256_max_ps(V_max4, V_testN), which would cut the instruction
// count -- confirm against the intrinsic's documented NaN/zero behavior.
__m256 GTmask  = _mm256_cmp_ps(V_max4, V_test1, _CMP_GT_OQ); // 2)
       V_max4  = _mm256_or_ps(_mm256_and_ps(GTmask, V_max4), _mm256_andnot_ps(GTmask, V_test1)); // 2)
       GTmask  = _mm256_cmp_ps(V_max4, V_test2, _CMP_GT_OQ); // 2)
       V_max4  = _mm256_or_ps(_mm256_and_ps(GTmask, V_max4), _mm256_andnot_ps(GTmask, V_test2)); // 2)
       GTmask  = _mm256_cmp_ps(V_max4, V_test3, _CMP_GT_OQ); // 2)
       V_max4  = _mm256_or_ps(_mm256_and_ps(GTmask, V_max4), _mm256_andnot_ps(GTmask, V_test3)); // 2)

       // Aligned store of all eight results.
       // NOTE(review): _mm256_store_ps requires tempab to be 32-byte aligned;
       // its declaration is not shown here -- otherwise use _mm256_storeu_ps.
       _mm256_store_ps(tempab, V_max4); // assuming you want to include write to memory
          }

        // One full sweep completed.
        cycles++;
     }

Think in terms of lanes.

Jim Dempsey

TimP · ‎02-15-2017

You may not need SIMD intrinsics for vectorization. Older versions of icpc needed std::max for optimization. Newer ones are more flexible, e.g. the ?: (conditional) operator should work, sometimes needing #pragma vector always.

SergeyKostrov · ‎02-15-2017

You've declared a set of variables VecX of type __m256: ... __m256 Vec1, Vec2, Vec3, Vec4, Vec5, Vec6, Vec7, Vec8; ... A set of variables Vx of type float: ... float V1[8];float V2[8];float V3[8];float V4[8]; ... and initialized variables VecX using _mm256_load_ps intrinsic function: ... Vec1 = _mm256_load_ps( V1 ); Vec2 = _mm256_load_ps( V2 ); Vec3 = _mm256_load_ps( V3 ); Vec4 = _mm256_load_ps( V4 ); ... Initialization could be done in a different way using a memory "mapping" technique without using _mm256_load_ps intrinsic function: ... __m256 *pVec1 = NULL, *pVec2 = NULL, *pVec3 = NULL, *pVec4 = NULL; ... float V1[8]; float V2[8]; float V3[8]; float V4[8]; ... // Initialization of V-variables ... pVec1 = ( __m256 * )&V1[0]; pVec2 = ( __m256 * )&V2[0]; pVec3 = ( __m256 * )&V3[0]; pVec4 = ( __m256 * )&V4[0]; ...

SergeyKostrov · ‎02-15-2017

Another tip is: Your main for-loop is too big to be completely auto-vectorized / simd-vectorized and you're mixing integer operations with floating point operations. In cases like yours a for-loop could be divided into two parts: - A for-loop #1 that does all integer operations, for example calculation of all indexes ( a helper array to store indexes is needed ) - A for-loop #2 that does all floating point operations

jimdempseyatthecove · ‎02-15-2017

Sergey,

Hamed's original code (listed as "previous code") did not use a mixture of float and int other than for index and loop control. The candidate intrinsic code I presented uses a mixture of float and boolean (internally as int32), but there is no conversion between formats. The int32s are used to mask and select which lanes of the vector to include/exclude/combine for placing the max value of the respective lanes. If (when) AVX512F is available, the alternate __mmask16 _mm256_cmp_ps_mask etc. intrinsics could be used to reduce the number of instructions further.

TimP,

It would be interesting to find out if the newest compiler (targeting AVX2 or KNL) would vectorize the code in a similar manner as post #2.

Jim Dempsey

SergeyKostrov · ‎02-15-2017

>>...It would be interesting to find out if the newest compiler (targeting AVX2 or KNL) would vectorize the code in a similar manner as >>post #2... I don't think it could be auto-vectorized because the complexity of the original for-loop ( 1st example in Post #1 ) is very high. If auto-vectorization is selected, then it is always a good thing to simplify such for-loops to make the job easier for a C++ compiler. Also, I recently checked a vectorization report of a C/C++ project ( ~200,000 code lines ) and more than 50% of all for-loops are not auto-vectorized because these for-loops could be rated as too complex. Some of them have hundreds of lines of code, mixed data types, calls to different functions, etc.

mmmmm__hamed · ‎02-16-2017

Dear jimdempseyatthecove, first of all, I appreciate what you have done for me and the considerable time you have spent on my problem. I executed the sample code that you wrote for me. Its speed is much better than the vectorized code that I had written, but it is equal to or slightly lower than that of the non-vectorized code I mentioned before under the name "my previous code". It is as if the compiler generates more efficient code than our hand-vectorized code. What is your opinion? Thanks.

TimP · ‎02-16-2017

I'm suspecting that when you say "vectorize code" you mean use of simd intrinsics. It's been over 5 years since any compiler was put on the market without auto-vectorization, which helps a great deal in avoiding the need for the intrinsics.

You might compare the opt-report for the non-intrinsics and intrinsics versions and see whether in fact you have added (or removed) some vectorization.

SergeyKostrov · ‎02-17-2017

>>I execute that sample code that you wrote for me. it's speed is much better than vectorize code that I had written but it's speed is >>equal or slighty lower than non-vectorize code as I mentioned befor with name "my previous code". You have at least three options: - Analyze vectorization reports - Analyze performance with VTune - Analyze codes and performance with Advisor One more thing. This is not a forum on how to solve vectorization problems and the forum is dedicated to problems with Intel C++ compilers for Windows, Linux and Mac operating systems.

vectorize operator decrease performance