Intel® C++ Compiler
Community support and assistance for creating C++ code that runs on platforms based on Intel® processors.

Vectorized operation decreases performance

mmmmm__hamed
Beginner
631 Views

Hi 

I use Intel compiler 15, and I have code that includes a huge number of maximum operations.

I wrote vectorized code for that max operation, and the performance of my application dropped dramatically.

My previous code is:

while (cycles < MAX_CYCLE)
    {

        for (int i = len - 1; i >= 0; i--)  
        {
            illr = i*llr_height;

            BG8 = inz[illr + 1];
            BG17 = inx[illr];
            BG18 = inx[illr + 1];
            BG3 = inx[illr + 2];
            BG17_8 = BG17 + BG8;
            BG18_8 = BG18 + BG8;
            BG3_8 = BG3 + BG8;
           

            float max4 = 0;//gamma[0];

            max4 = max_float(max4,BG17_8 + beta_7);
            max4 = max_float(max4, BG18_8 + beta_4);
            max4 = max_float(max4, BG3 + beta_3);
            tempab[0] = max4;
            
            max4 = beta_4;
            max4 = max_float(max4, BG17_8 + beta_3);
            max4 = max_float(max4, BG18_8);
            max4 = max_float(max4, BG3 + beta_7);
            tempab[1] = max4;
            
            max4 = BG8 + beta_1;
            max4 = max_float(max4, BG17 + beta_6);
            max4 = max_float(max4, BG18 + beta_5);
            max4 = max_float(max4,  BG3_8 + beta_2);
            tempab[2] = max4;
            
            max4 = BG8 + beta_5;
            max4 = max_float(max4, BG17 + beta_2);
            max4 = max_float(max4, BG18 + beta_1);
            max4 = max_float(max4, BG3_8 + beta_6);
            tempab[3] = max4;
            
            max4 = BG8 + beta_6;
            max4 = max_float(max4,  BG17 + beta_1);
            max4 = max_float(max4, BG18 + beta_2);
            max4 = max_float(max4, BG3_8 + beta_5);
            tempab[4] = max4;
            
            max4 = BG8 + beta_2;
            max4 = max_float(max4,  BG17 + beta_5);
            max4 = max_float(max4, BG18 + beta_6);
            max4 = max_float(max4, BG3_8 + beta_1);
            tempab[5] = max4;
            
            max4 = beta_7;
            max4 = max_float(max4, BG17_8);
            max4 = max_float(max4, BG18_8 + beta_3);
            max4 = max_float(max4, BG3 + beta_4);
            tempab[6] = max4;
            
            max4 = beta_3;
            max4 = max_float(max4, BG17_8 + beta_4);
            max4 = max_float(max4, BG18_8 + beta_7);
            max4 = max_float(max4, BG3);
            tempab[7] = max4;
              }

        cycles++;
    }

 

and the new vectorized code is:

// 256-bit registers: Vec1..Vec4 receive the four candidate sets, Vec5..Vec7 the running maximum.
__m256 Vec1,Vec2,Vec3,Vec4,Vec5,Vec6,Vec7,Vec8;

// Staging buffers. Element k of V1..V4 holds the four candidates whose maximum
// the scalar version stores in tempab[k].
float V1[8];float V2[8];float V3[8];float V4[8];

while (cycles < MAX_CYCLE)
    {

        for (int i = len - 1; i >= 0; i--)  // calculate beta[] based on beta[][i+1]
        {

            // Offset of row i in inx/inz.
            illr = i*llr_height;
            BG8 = inz[illr + 1];
            BG17 = inx[illr];
            BG18 = inx[illr + 1];
            BG3 = inx[illr + 2];
            // Sums with BG8 shared by several candidates.
            BG17_8 = BG17 + BG8;
            BG18_8 = BG18 + BG8;
            BG3_8 = BG3 + BG8;

            V1[0] = 0;

            V2[0] = BG17_8 + beta_7;
            V3[0] = BG18_8 + beta_4;
            V4[0] = BG3 + beta_3;

            V1[1] = beta_4;
            V2[1] = BG17_8 + beta_3;
            V3[1] = BG18_8;
            V4[1] = BG3 + beta_7;

            V1[2] = BG8 + beta_1;
            V2[2] = BG17 + beta_6;
            V3[2] = BG18 + beta_5;
            V4[2] = BG3_8 + beta_2;

            V1[3] = BG8 + beta_5;
            V2[3] = BG17 + beta_2;
            V3[3] = BG18 + beta_1;
            V4[3] = BG3_8 + beta_6;

            V1[4] = BG8 + beta_6;
            V2[4] = BG17 + beta_1;
            V3[4] = BG18 + beta_2;
            V4[4] = BG3_8 + beta_5;

            V1[5] = BG8 + beta_2;
            V2[5] = BG17 + beta_5;
            V3[5] = BG18 + beta_6;
            V4[5] = BG3_8 + beta_1;

            V1[6] = beta_7;
            V2[6] = BG17_8;
            V3[6] = BG18_8 + beta_3;
            V4[6] = BG3 + beta_4;

            V1[7] = beta_3;
            V2[7] = BG17_8 + beta_4;
            V3[7] = BG18_8 + beta_7;
            V4[7] = BG3;

            // Unaligned loads: V1..V4 are plain float arrays with no 32-byte
            // alignment guarantee, which _mm256_load_ps would require.
            // NOTE(review): writing 32 scalars and immediately reloading them as
            // 256-bit vectors probably defeats store-to-load forwarding; building
            // the vectors with _mm256_set_ps would avoid the memory round trip.
            Vec1 = _mm256_loadu_ps(V1);
            Vec2 = _mm256_loadu_ps(V2);
            Vec3 = _mm256_loadu_ps(V3);
            Vec4 = _mm256_loadu_ps(V4);

            // Lane-wise maximum of the four candidate vectors.
            Vec5=_mm256_max_ps(Vec1, Vec2);
            Vec6 = _mm256_max_ps(Vec5, Vec3);
            Vec7 = _mm256_max_ps(Vec6, Vec4);

            // NOTE(review): the eight maxima land in V1 and are overwritten on the
            // next iteration; the scalar version writes them to tempab -- confirm
            // which destination is intended.
            _mm256_storeu_ps(V1, Vec7);

        }//for (int i = len - 1; i >= 0; i--)



        cycles++;
    }

I would appreciate it if someone could tell me why my performance drops and what I can do to correct it.

0 Kudos
9 Replies
jimdempseyatthecove
Honored Contributor III
631 Views

Your code is not how you would vectorize. To vectorize using AVX intrinsics it would look something like this (*** untested code ***):

while (cycles < MAX_CYCLE)
     {

        for (int i = len - 1; i >= 0; i--)
         {
             // Offset of row i in inx/inz.
             illr = i*llr_height;

            BG8 = inz[illr + 1];
            BG17 = inx[illr];
            BG18 = inx[illr + 1];
            BG3 = inx[illr + 2];
            // Sums with BG8 shared by several candidates.
            BG17_8 = BG17 + BG8;
            BG18_8 = BG18 + BG8;
            BG3_8 = BG3 + BG8;

// Each vector holds eight floats; lane k carries one candidate for tempab[k].
// _mm256_set_ps takes its arguments from lane 7 down to lane 0.
//  __m256 float lanes:                lane7          lane6         lane5         lane4         lane3         lane2          lane1          lane0
__m256 V_max4  = _mm256_set_ps(       beta_3,        beta_7,   BG8+beta_2,   BG8+beta_6,   BG8+beta_5,   BG8+beta_1,        beta_4,          0.0f);
__m256 V_test1 = _mm256_set_ps(BG17_8+beta_4,        BG17_8,  BG17+beta_5,  BG17+beta_1,  BG17+beta_2,  BG17+beta_6, BG17_8+beta_3, BG17_8+beta_7);
__m256 V_test2 = _mm256_set_ps(BG18_8+beta_7, BG18_8+beta_3,  BG18+beta_6,  BG18+beta_2,  BG18+beta_1,  BG18+beta_5,        BG18_8, BG18_8+beta_4);
__m256 V_test3 = _mm256_set_ps(          BG3,    BG3+beta_4, BG3_8+beta_1, BG3_8+beta_5, BG3_8+beta_6, BG3_8+beta_2,    BG3+beta_7,    BG3+beta_3);

       // Lane-wise running maximum. _mm256_max_ps(a, b) yields a where a > b and
       // b otherwise, which is exactly what an explicit compare-and-blend
       // (_mm256_cmp_ps with _CMP_GT_OQ, then and/andnot/or on the mask) selects,
       // but in a single instruction per step.
       V_max4  = _mm256_max_ps(V_max4, V_test1);
       V_max4  = _mm256_max_ps(V_max4, V_test2);
       V_max4  = _mm256_max_ps(V_max4, V_test3);

       // Unaligned store: nothing visible here guarantees that tempab is 32-byte
       // aligned, which _mm256_store_ps would require.
       _mm256_storeu_ps(tempab, V_max4); // assuming you want to include write to memory
          }

        cycles++;
     }

Think in terms of lanes.

Jim Dempsey

0 Kudos
TimP
Honored Contributor III
631 Views
You may not need simd intrinsic for vectorization. Older versions of icpc needed std::max for optimization. Newer ones are more flexible, e.g. ? operation should work, sometimes needing #pragma vector always.
0 Kudos
SergeyKostrov
Valued Contributor II
631 Views
You've declared a set of variables VecX of type __m256: ... __m256 Vec1, Vec2, Vec3, Vec4, Vec5, Vec6, Vec7, Vec8; ... A set of variables Vx of type float: ... float V1[8];float V2[8];float V3[8];float V4[8]; ... and initialized variables VecX using _mm256_load_ps intrinsic function: ... Vec1 = _mm256_load_ps( V1 ); Vec2 = _mm256_load_ps( V2 ); Vec3 = _mm256_load_ps( V3 ); Vec4 = _mm256_load_ps( V4 ); ... Initialization could be done in a different way using a memory "mapping" technique without using _mm256_load_ps intrinsic function: ... __m256 *pVec1 = NULL, *pVec2 = NULL, *pVec3 = NULL, *pVec4 = NULL; ... float V1[8]; float V2[8]; float V3[8]; float V4[8]; ... // Initialization of V-variables ... pVec1 = ( __m256 * )&V1[0]; pVec2 = ( __m256 * )&V2[0]; pVec3 = ( __m256 * )&V3[0]; pVec4 = ( __m256 * )&V4[0]; ...
0 Kudos
SergeyKostrov
Valued Contributor II
631 Views
Another tip is: Your main for-loop is too big to be completely auto-vectorized / simd-vectorized and you're mixing integer operations with floating point operations. In cases like yours a for-loop could be divided into two parts: - A for-loop #1 that does all integer operations, for example calculation of all indexes ( a helper array to store indexes is needed ) - A for-loop #2 that does all floating point operations
0 Kudos
jimdempseyatthecove
Honored Contributor III
631 Views

Sergey,

Hamed's original code (listed as previous code) did not use a mixture of float and int other than index and loop control. The candidate intrinsic code I presented uses a mixture of float and boolean (internally as int32) but there is no conversion between formats. The int32's are used to mask and select which lanes of the vector to include/exclude/combine for placing the max value of the respective lanes. If (when) AVX512F (together with AVX512VL for 256-bit vectors) is available the alternate __mmask8 _mm256_cmp_ps_mask etc... intrinsics could be used to reduce the number of instructions further.

TimP,

It would be interesting to find out if the newest compiler (targeting AVX2 or KNL) would vectorize the code in a similar manner as post #2.

Jim Dempsey

0 Kudos
SergeyKostrov
Valued Contributor II
631 Views
>>...It would be interesting to find out if the newest compiler (targeting AVX2 or KNL) would vectorize the code in a similar manner as >>post #2... I don't think it could be auto-vectorized because a complexity of the original for-loop ( 1st example in the Post # 1 ) is very high. If auto-vectorization is selected then it is always a good thing to simplify such for-loops to make a job for a C++ compiler easier. Also, I recently checked a vectorization report of a C/C++ project ( ~200,000 code lines ) and more than 50% of all for-loops are not auto-vectorized because these for-loops could be rated as too complex. Some of them have hundreds of lines of code, mixed data types, calls to different functions, etc.
0 Kudos
mmmmm__hamed
Beginner
631 Views
Dear jimdempseyatthecove, first of all, thank you for what you have done for me and for spending so much time on my problem. I ran the sample code that you wrote for me. Its speed is much better than the vectorized code that I had written, but it is equal to or slightly lower than the non-vectorized code that I mentioned before under the name "my previous code". It seems as if the compiler generates more efficient code than our vectorized code. What is your idea? Thanks.
0 Kudos
TimP
Honored Contributor III
631 Views

I'm suspecting that when you say "vectorize code" you mean use of simd intrinsics.  It's been over 5 years since any compiler was put on the market without auto-vectorization, which helps a great deal in avoiding the need for the intrinsics.

You might compare the opt-report for the non-intrinsics and intrinsics versions and see whether in fact you have added (or removed) some vectorization.

0 Kudos
SergeyKostrov
Valued Contributor II
631 Views
>>I execute that sample code that you wrote for me. it's speed is much better than vectorize code that I had written but it's speed is >>equal or slighty lower than non-vectorize code as I mentioned befor with name "my previous code". You have at least three options: - Analyze vectorization reports - Analyze performance with VTune - Analyze codes and performance with Advisor One more thing. This is not a forum on how to solve vectorization problems and the forum is dedicated to problems with Intel C++ compilers for Windows, Linux and Mac operating systems.
0 Kudos
Reply