I'm trying to compute a geometric mean, sqrt(a * b), using AVX intrinsics, but it runs slower than molasses!
#include <immintrin.h>

__m128i g_data[8]; // sink for results so the stores aren't optimized away

int main()
{
    int count = 0;
    for (int i = 0; i < 100000000; ++i)
    {
        __m128i v8n_a = _mm_set1_epi16((++count) % 16),
                v8n_b = _mm_set1_epi16((++count) % 16);
        __m128i v8n_0 = _mm_set1_epi16(0);
        __m256i temp1, temp2;
        __m256 v8f_a = _mm256_cvtepi32_ps(temp1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_unpacklo_epi16(v8n_a, v8n_0)), _mm_unpackhi_epi16(v8n_a, v8n_0), 1)),
               v8f_b = _mm256_cvtepi32_ps(temp2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_unpacklo_epi16(v8n_b, v8n_0)), _mm_unpackhi_epi16(v8n_b, v8n_0), 1));
        __m256i v8n_meanInt32 = _mm256_cvtps_epi32(_mm256_sqrt_ps(_mm256_mul_ps(v8f_a, v8f_b)));
        __m128i v4n_meanLo = _mm256_castsi256_si128(v8n_meanInt32),
                v4n_meanHi = _mm256_extractf128_si256(v8n_meanInt32, 1);
        g_data[i % 8] = v4n_meanLo;
        g_data[(i + 1) % 8] = v4n_meanHi;
    }
    return 0;
}
The key to this mystery is that I'm using Intel ICC 11, and it's only slow when compiled with icc -O3 sqrt.cpp. If I compile with icc -O3 -xavx sqrt.cpp, it runs 10x faster.
But it's not obvious that any emulation is happening: I measured with performance counters, and the number of instructions executed is roughly 4G for both versions:
Performance counter stats for 'a.out':
16867.119538 task-clock # 0.999 CPUs utilized
37 context-switches # 0.000 M/sec
8 CPU-migrations # 0.000 M/sec
281 page-faults # 0.000 M/sec
35,463,758,996 cycles # 2.103 GHz
23,690,669,417 stalled-cycles-frontend # 66.80% frontend cycles idle
20,846,452,415 stalled-cycles-backend # 58.78% backend cycles idle
4,023,012,964 instructions # 0.11 insns per cycle
# 5.89 stalled cycles per insn
304,385,109 branches # 18.046 M/sec
42,636 branch-misses # 0.01% of all branches
16.891160582 seconds time elapsed
-----------------------------------with -xavx----------------------------------------
Performance counter stats for 'a.out':
1288.423505 task-clock # 0.996 CPUs utilized
3 context-switches # 0.000 M/sec
2 CPU-migrations # 0.000 M/sec
279 page-faults # 0.000 M/sec
2,708,906,702 cycles # 2.102 GHz
1,608,134,568 stalled-cycles-frontend # 59.36% frontend cycles idle
798,177,722 stalled-cycles-backend # 29.46% backend cycles idle
3,803,270,546 instructions # 1.40 insns per cycle
# 0.42 stalled cycles per insn
300,601,809 branches # 233.310 M/sec
15,167 branch-misses # 0.01% of all branches
1.293986790 seconds time elapsed
Is there some kind of internal emulation going on in the processor? I know that for denormal numbers, adds end up being 64 times slower than normal.
Maybe your program has AVX-to-SSE transition penalties.
unclejoe wrote:
I'm trying to compute a geometric mean, sqrt(a * b), using AVX intrinsics, but it runs slower than molasses! [...] Is there some kind of internal emulation going on in the processor?
I'd advise dumping the ASM with both flags and comparing the inner loops.
Another idea: are you sure the alignment of your g_data array is the same in both tests? It can make a big difference if it's aligned to a 32B boundary in one case and not in the other.
It also looks like this code can be simplified quite a lot. For example, you can get rid of temp1 and temp2, and replace the last four lines (the two 128-bit stores) with a single _mm256_storeu_si256; the compiler will take care of the proper optimization: two 128-bit moves with /QxAVX, a single 256-bit move with /QxCORE-AVX2. A sketch of what I mean is below.
If you post the simplified version I'll provide more advice.
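For illustration, the simplification could look something like this (a rough sketch, untested; it assumes g_data gets one element of padding so the unaligned 256-bit store starting at g_data[i % 8] can never run past the end of the array):

__m128i g_data[9]; // 8 slots + 1 element of padding for the 256-bit store

// inside the loop, replacing the cast/extract and the two 128-bit stores:
__m256i v8n_meanInt32 = _mm256_cvtps_epi32(_mm256_sqrt_ps(_mm256_mul_ps(v8f_a, v8f_b)));
_mm256_storeu_si256((__m256i *)&g_data[i % 8], v8n_meanInt32); // one 256-bit store covering both halves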
Maybe your program has AVX-to-SSE transition penalties.
Thanks, you're right. I've confirmed with performance counters:
perf stat -e r10c1,r20c1 ./a.out   (r10c1 = OTHER_ASSISTS.AVX_TO_SSE, r20c1 = OTHER_ASSISTS.SSE_TO_AVX)
Performance counter stats for 'a.out':
200,000,003 r10c1
200,001,376 r20c1
16.883746025 seconds time elapsed
That's 200M assists in each direction, i.e. exactly two transitions each way per iteration of the 100M-iteration loop. I just think the penalty of saving/restoring all the YMM registers is too much to hide from the programmer; there should be some kind of warning.
Please ignore those inefficiencies you mentioned; this is just a test program. The dead values temp1 and temp2 are for debugging and get eliminated anyway. The store to g_data is there to prevent the values I want from being marked as dead. I would like to know if there's a better, less expensive way to mark values as live for benchmarking purposes. I was thinking of something like
if (++count == 123456789)
    printf("%d", keepAliveValue);
where the compiler won't know any better than to compute keepAliveValue, but the printf() will never actually execute.
It would be interesting if you could post the disassembled code.
>>>I just think the penalty of saving/restoring all YMM registers is too much to hide from the programmer - there should be some kind of warning>>>
The warning is given by performance counters and/or by the Intel Software Development Emulator.
unclejoe wrote:
Maybe your program has AVX to SSE transition penalties.
Thanks, you're right. I've confirmed with performance counters:
congrats to iliyapolak!
unclejoe wrote:
I just think the penalty of saving/restoring all YMM registers is too much to hide from the programmer - there should be some kind of warning.
there was a discussion about this in another thread the other day; even if coding only with intrinsics, it is advised to use the /QxAVX flag, since the compiler takes care of the transitions for you by inserting the proper VZEROUPPER instructions where required
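For reference, _mm256_zeroupper() is the intrinsic form of VZEROUPPER, so if you ever do have to hand control from VEX-encoded AVX code to legacy SSE code, you can insert the guard yourself. A minimal sketch (legacy_sse_routine is a hypothetical function compiled without VEX encoding):

#include <immintrin.h>

void legacy_sse_routine(float *p); // hypothetical non-VEX SSE code

void process(float *p)
{
    __m256 v = _mm256_loadu_ps(p);
    _mm256_storeu_ps(p, _mm256_mul_ps(v, v));
    _mm256_zeroupper(); // zero the upper halves of all YMM registers so the
                        // following legacy SSE code pays no transition penalty
    legacy_sse_routine(p);
}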
unclejoe wrote:
The storing to g_data is to prevent the values I want from being marked as dead. I would like to know if there's a better, less expensive way to mark values as live for benchmarking purposes.
What I'll typically do in such cases is declare, outside the loop, something like this:
__m256 keepAlive = _mm256_setzero_ps();
then, in the loop body, something like:
keepAlive = _mm256_or_ps(keepAlive, _mm256_castsi256_ps(v8n_meanInt32));
This way you'll have minimal impact on performance (keepAlive stays in a YMM register, and VORPS has a low reciprocal throughput), unlike with your stores.
To fool the compiler, simply save the contents of keepAlive once at the end of the routine.
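Put together, the pattern could look like this minimal sketch (the _mm256_set1_epi32(i) is just a placeholder for the real computation under test):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    __m256 keepAlive = _mm256_setzero_ps();
    for (int i = 0; i < 100000000; ++i)
    {
        __m256i v8n_meanInt32 = _mm256_set1_epi32(i); // placeholder for the work being benchmarked
        // one register-to-register VORPS per iteration keeps the result live
        keepAlive = _mm256_or_ps(keepAlive, _mm256_castsi256_ps(v8n_meanInt32));
    }
    float sink[8];
    _mm256_storeu_ps(sink, keepAlive); // a single store at the end fools the compiler
    printf("%f\n", sink[0]);
    return 0;
}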
It's always a good idea to have a look at the ASM dumps, though, to be certain of what you're measuring.
Here's the assembly from the innermost loop. Also, I won't need to mix the two instruction types, since I can recompile everything.
----------------------------------------------------------------------no VEX prefix---------------------------------------------------------------------
..B1.9: # Preds ..B1.7 ..B1.10
movdqa %xmm3, %xmm2 #16.94
punpckhwd %xmm0, %xmm3 #16.129
punpcklwd %xmm0, %xmm2 #16.94
movd %esi, %xmm1 #13.61
punpcklwd %xmm1, %xmm1 #13.61
pshufd $0, %xmm1, %xmm5 #13.61
movdqa %xmm5, %xmm4 #17.89
punpckhwd %xmm0, %xmm5 #17.124
movl %ecx, %esi #21.16
punpcklwd %xmm0, %xmm4 #17.89
andl $7, %esi #21.16
movslq %esi, %rsi #21.5
shlq $4, %rsi #21.5
incl %ecx #22.17
movl %ecx, %edi #22.22
andl $7, %edi #22.22
vinsertf128 $1, %xmm3, %ymm2, %ymm3 #16.47
vcvtdq2ps %ymm3, %ymm7 #16.20
movslq %edi, %rdi #22.5
shlq $4, %rdi #22.5
cmpl $100000000, %ecx #11.23
vinsertf128 $1, %xmm5, %ymm4, %ymm6 #17.42
vcvtdq2ps %ymm6, %ymm8 #17.15
vmulps %ymm8, %ymm7, %ymm9 #18.63
vsqrtps %ymm9, %ymm10 #18.48
vcvtps2dq %ymm10, %ymm11 #18.29
movaps %xmm11, g_data(%rsi) #21.5
vextractf128 $1, %ymm11, g_data(%rdi) #22.5
jl ..B1.2 # Prob 100% #11.23
-----------------------------------------------------------------------with VEX prefix----------------------------------------------------------------------
..B1.9: # Preds ..B1.7 ..B1.10
vpunpcklwd %xmm0, %xmm4, %xmm3 #16.94
vmovd %esi, %xmm1 #13.61
movl %ecx, %esi #21.16
andl $7, %esi #21.16
movslq %esi, %rsi #21.5
shlq $4, %rsi #21.5
vpunpcklwd %xmm1, %xmm1, %xmm2 #13.61
incl %ecx #22.17
movl %ecx, %edi #22.22
andl $7, %edi #22.22
vpshufd $0, %xmm2, %xmm7 #13.61
movslq %edi, %rdi #22.5
shlq $4, %rdi #22.5
vpunpckhwd %xmm0, %xmm4, %xmm4 #16.129
vinsertf128 $1, %xmm4, %ymm3, %ymm5 #16.47
vcvtdq2ps %ymm5, %ymm10 #16.20
vpunpcklwd %xmm0, %xmm7, %xmm6 #17.89
vpunpckhwd %xmm0, %xmm7, %xmm8 #17.124
vinsertf128 $1, %xmm8, %ymm6, %ymm9 #17.42
vcvtdq2ps %ymm9, %ymm11 #17.15
vmulps %ymm11, %ymm10, %ymm12 #18.63
vsqrtps %ymm12, %ymm13 #18.48
vcvtps2dq %ymm13, %ymm14 #18.29
vmovaps %xmm14, g_data(%rsi) #21.5
vextractf128 $1, %ymm14, g_data(%rdi) #22.5
cmpl $100000000, %ecx #11.23
jl ..B1.2 # Prob 100% #11.23
It seems there are indeed two transitions in the mixed non-VEX SSE and AVX code:
1. punpcklwd %xmm0, %xmm4 #17.89 to vinsertf128 $1, %xmm3, %ymm2, %ymm3 #16.47
2. movaps %xmm11, g_data(%rsi) #21.5 to vextractf128 $1, %ymm11, g_data(%rdi) #22.5
>>>congrats to iliyapolak!>>>
Thank you very much:)
>>>It seems there are indeed two transitions in the mixed non-VEX SSE and AVX code.
1. punpcklwd %xmm0, %xmm4 #17.89 to vinsertf128 $1, %xmm3, %ymm2, %ymm3 #16.47
2. movaps %xmm11, g_data(%rsi) #21.5 to vextractf128 $1, %ymm11, g_data(%rdi) #22.5>>>
Yes, it seems these instructions are responsible for the transition penalty.