- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi
I use the Intel compiler 15, and I have code that includes a huge number of maximum operations.
I wrote vectorized code for that max operation, and the performance of my application dramatically went down.
My previous code is:
/*
 * Scalar reference implementation: for each position i (processed backwards),
 * tempab[0..7] receives the maximum of four candidate metrics, each formed
 * from a branch term (BG8/BG17/BG18/BG3 and their pre-summed combinations)
 * plus one of the previous state metrics beta_1..beta_7.
 *
 * NOTE(review): inz/inx, llr_height, len, cycles, MAX_CYCLE, beta_1..beta_7,
 * tempab and max_float are declared in the enclosing scope -- assumed here;
 * max_float presumably returns the larger of its two float arguments (TODO
 * confirm against its definition).
 */
while (cycles < MAX_CYCLE)
{
for (int i = len - 1; i >= 0; i--)
{
illr = i*llr_height; /* base offset of position i in the metric arrays */
BG8 = inz[illr + 1];
BG17 = inx[illr];
BG18 = inx[illr + 1];
BG3 = inx[illr + 2];
/* Pre-sum the terms that are reused with BG8 several times below. */
BG17_8 = BG17 + BG8;
BG18_8 = BG18 + BG8;
BG3_8 = BG3 + BG8;
/* tempab[0] = max(0, BG17_8+beta_7, BG18_8+beta_4, BG3+beta_3) */
float max4 = 0;//gamma[0];
max4 = max_float(max4,BG17_8 + beta_7);
max4 = max_float(max4, BG18_8 + beta_4);
max4 = max_float(max4, BG3 + beta_3);
tempab[0] = max4;
/* tempab[1] = max(beta_4, BG17_8+beta_3, BG18_8, BG3+beta_7) */
max4 = beta_4;
max4 = max_float(max4, BG17_8 + beta_3);
max4 = max_float(max4, BG18_8);
max4 = max_float(max4, BG3 + beta_7);
tempab[1] = max4;
/* tempab[2] = max(BG8+beta_1, BG17+beta_6, BG18+beta_5, BG3_8+beta_2) */
max4 = BG8 + beta_1;
max4 = max_float(max4, BG17 + beta_6);
max4 = max_float(max4, BG18 + beta_5);
max4 = max_float(max4, BG3_8 + beta_2);
tempab[2] = max4;
/* tempab[3] = max(BG8+beta_5, BG17+beta_2, BG18+beta_1, BG3_8+beta_6) */
max4 = BG8 + beta_5;
max4 = max_float(max4, BG17 + beta_2);
max4 = max_float(max4, BG18 + beta_1);
max4 = max_float(max4, BG3_8 + beta_6);
tempab[3] = max4;
/* tempab[4] = max(BG8+beta_6, BG17+beta_1, BG18+beta_2, BG3_8+beta_5) */
max4 = BG8 + beta_6;
max4 = max_float(max4, BG17 + beta_1);
max4 = max_float(max4, BG18 + beta_2);
max4 = max_float(max4, BG3_8 + beta_5);
tempab[4] = max4;
/* tempab[5] = max(BG8+beta_2, BG17+beta_5, BG18+beta_6, BG3_8+beta_1) */
max4 = BG8 + beta_2;
max4 = max_float(max4, BG17 + beta_5);
max4 = max_float(max4, BG18 + beta_6);
max4 = max_float(max4, BG3_8 + beta_1);
tempab[5] = max4;
/* tempab[6] = max(beta_7, BG17_8, BG18_8+beta_3, BG3+beta_4) */
max4 = beta_7;
max4 = max_float(max4, BG17_8);
max4 = max_float(max4, BG18_8 + beta_3);
max4 = max_float(max4, BG3 + beta_4);
tempab[6] = max4;
/* tempab[7] = max(beta_3, BG17_8+beta_4, BG18_8+beta_7, BG3) */
max4 = beta_3;
max4 = max_float(max4, BG17_8 + beta_4);
max4 = max_float(max4, BG18_8 + beta_7);
max4 = max_float(max4, BG3);
tempab[7] = max4;
}
cycles++;
}
and the new vectorized code is:
/*
 * Vectorized form: each of the eight tempab[] entries is the maximum of four
 * candidates, so lane k of the staging arrays V1..V4 holds the four
 * candidates for tempab[k], and three vertical _mm256_max_ps operations
 * reduce them.
 *
 * NOTE(review): staging every candidate through memory and reloading it each
 * iteration serializes on store-to-load forwarding and is the likely cause
 * of the slowdown; building the vectors with _mm256_set_ps (or keeping data
 * in registers across iterations) avoids the round trip.
 *
 * NOTE(review): inz/inx, llr_height, len, cycles, MAX_CYCLE, beta_1..beta_7,
 * tempab[8] and the BG* variables are declared in the enclosing scope.
 */
__m256 Vec1, Vec2, Vec3, Vec4, VecMax; /* unused Vec5..Vec8 removed */
float V1[8]; float V2[8]; float V3[8]; float V4[8]; /* NOT 32-byte aligned */
while (cycles < MAX_CYCLE)
{
    for (int i = len - 1; i >= 0; i--) /* calculate beta[] based on beta[][i+1] */
    {
        illr = i*llr_height; /* base offset of position i in the metric arrays */
        BG8 = inz[illr + 1];
        BG17 = inx[illr];
        BG18 = inx[illr + 1];
        BG3 = inx[illr + 2];
        /* Pre-sum the terms that are reused with BG8 several times below. */
        BG17_8 = BG17 + BG8;
        BG18_8 = BG18 + BG8;
        BG3_8 = BG3 + BG8;
        /* Lane k of V1..V4 holds the four candidates for tempab[k]
           (same table as the scalar code). */
        V1[0] = 0;
        V2[0] = BG17_8 + beta_7;
        V3[0] = BG18_8 + beta_4;
        V4[0] = BG3 + beta_3;
        V1[1] = beta_4;
        V2[1] = BG17_8 + beta_3;
        V3[1] = BG18_8;
        V4[1] = BG3 + beta_7;
        V1[2] = BG8 + beta_1;
        V2[2] = BG17 + beta_6;
        V3[2] = BG18 + beta_5;
        V4[2] = BG3_8 + beta_2;
        V1[3] = BG8 + beta_5;
        V2[3] = BG17 + beta_2;
        V3[3] = BG18 + beta_1;
        V4[3] = BG3_8 + beta_6;
        V1[4] = BG8 + beta_6;
        V2[4] = BG17 + beta_1;
        V3[4] = BG18 + beta_2;
        V4[4] = BG3_8 + beta_5;
        V1[5] = BG8 + beta_2;
        V2[5] = BG17 + beta_5;
        V3[5] = BG18 + beta_6;
        V4[5] = BG3_8 + beta_1;
        V1[6] = beta_7;
        V2[6] = BG17_8;
        V3[6] = BG18_8 + beta_3;
        V4[6] = BG3 + beta_4;
        V1[7] = beta_3;
        V2[7] = BG17_8 + beta_4;
        V3[7] = BG18_8 + beta_7;
        V4[7] = BG3;
        /* BUG FIX: V1..V4 are plain float arrays with no 32-byte alignment
         * guarantee, so the aligned _mm256_load_ps was undefined behavior
         * (it faults on an unaligned address); use the unaligned loads. */
        Vec1 = _mm256_loadu_ps(V1);
        Vec2 = _mm256_loadu_ps(V2);
        Vec3 = _mm256_loadu_ps(V3);
        Vec4 = _mm256_loadu_ps(V4);
        /* Per-lane 4-way maximum, balanced as two independent pairs. */
        VecMax = _mm256_max_ps(_mm256_max_ps(Vec1, Vec2),
                               _mm256_max_ps(Vec3, Vec4));
        /* BUG FIX: the original stored the result back into the scratch
         * array V1; the scalar code writes tempab[0..7], so store there. */
        _mm256_storeu_ps(tempab, VecMax);
    }//for (int i = len - 1; i >= 0; i--)
    cycles++;
}
I would appreciate it if someone could tell me why my performance dropped and what I can do to correct it.
Link Copied
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Your code is not how you would vectorize. To vectorize using AVX intrinsics it would look something like this (*** untested code ***):
while (cycles < MAX_CYCLE) { for (int i = len - 1; i >= 0; i--) { illr = i*llr_height; BG8 = inz[illr + 1]; BG17 = inx[illr]; BG18 = inx[illr + 1]; BG3 = inx[illr + 2]; BG17_8 = BG17 + BG8; BG18_8 = BG18 + BG8; BG3_8 = BG3 + BG8; // __m256 float lanes: lane7 lane6 lane5 lane4 lane3 lane2 lane1 lane0 __m256 V_max4 = _mm256_set_ps( beta_3, beta_7, BG8+beta_2, BG8+beta_6, BG8+beta_5, BG8+beta_1, beta_4, 0.0f); __m256 V_test1 = _mm256_set_ps(BG17_8+beta_4, BG17_8, BG17+beta_5, BG17+beta_1, BG17+beta_2, BG17+beta_6, BG17_8+beta_3, BG17_8+beta_7); // 2) __m256 V_test2 = _mm256_set_ps(BG18_8+beta_7, BG18_8+beta_3, BG18+beta_6, BG18+beta_2, BG18+beta_1, BG18+beta_5, BG18_8, BG18_8+beta_4); __m256 V_test3 = _mm256_set_ps( BG3, BG3+beta_4, BG3_8+beta_1, BG3_8+beta_5, BG3_8+beta_6, BG3_8+beta_2, BG3+beta_7, BG3+beta_3); __m256 GTmask = _mm256_cmp_ps(V_max4, V_test1, _CMP_GT_OQ); // 2) V_max4 = _mm256_or_ps(_mm256_and_ps(GTmask, V_max4), _mm256_andnot_ps(GTmask, V_test1)); // 2) GTmask = _mm256_cmp_ps(V_max4, V_test2, _CMP_GT_OQ); // 2) V_max4 = _mm256_or_ps(_mm256_and_ps(GTmask, V_max4), _mm256_andnot_ps(GTmask, V_test2)); // 2) GTmask = _mm256_cmp_ps(V_max4, V_test3, _CMP_GT_OQ); // 2) V_max4 = _mm256_or_ps(_mm256_and_ps(GTmask, V_max4), _mm256_andnot_ps(GTmask, V_test3)); // 2) _mm256_store_ps(tempab, V_max4); // assuming you want to include write to memory } cycles++; }
Think in terms of lanes.
Jim Dempsey
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Sergey,
Hamed's original code (listed as previous code) did not use a mixture of float and int other than for indexing and loop control. The candidate intrinsic code I presented uses a mixture of float and boolean (internally int32) values, but there is no conversion between formats. The int32s are used to mask and select which lanes of the vector to include/exclude/combine when placing the max value in the respective lanes. If (when) AVX-512F is available, the alternate __mmask16 _mm256_cmp_ps_mask etc. intrinsics could be used to reduce the number of instructions further.
TimP,
It would be interesting to find out if the newest compiler (targeting AVX2 or KNL) would vectorize the code in a similar manner as post #2.
Jim Dempsey
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
I'm suspecting that when you say "vectorize code" you mean use of simd intrinsics. It's been over 5 years since any compiler was put on the market without auto-vectorization, which helps a great deal in avoiding the need for the intrinsics.
You might compare the opt-report for the non-intrinsics and intrinsics versions and see whether in fact you have added (or removed) some vectorization.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page