- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

I'm trying to write a geometric mean sqrt(a * b) using AVX intrinsics, but it runs slower than molasses!

int main()

{

int count = 0;

for (int i = 0; i < 100000000; ++i)

{

__m128i v8n_a = _mm_set1_epi16((++count) % 16),

v8n_b = _mm_set1_epi16((++count) % 16);

__m128i v8n_0 = _mm_set1_epi16(0);

__m256i temp1, temp2;

__m256 v8f_a = _mm256_cvtepi32_ps(temp1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_unpacklo_epi16(v8n_a, v8n_0)), _mm_unpackhi_epi16(v8n_a, v8n_0), 1)),

v8f_b = _mm256_cvtepi32_ps(temp2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_unpacklo_epi16(v8n_b, v8n_0)), _mm_unpackhi_epi16(v8n_b, v8n_0), 1));

__m256i v8n_meanInt32 = _mm256_cvtps_epi32(_mm256_sqrt_ps(_mm256_mul_ps(v8f_a, v8f_b)));

__m128i v4n_meanLo = _mm256_castsi256_si128(v8n_meanInt32),

v4n_meanHi = _mm256_extractf128_si256(v8n_meanInt32, 1);

g_data[i % 8] = v4n_meanLo;

g_data[(i + 1) % 8] = v4n_meanHi;

}

return 0;

}

The key to this mystery is that I'm using Intel ICC 11 and it's only slow when compiling with icc -O3 sqrt.cpp. If I compile with icc -O3 -xavx sqrt.cpp, then it runs 10x faster.

But it's not obvious if there's emulation happening because I used performance counters and the number of instructions executed for both versions is roughly 4G:

Performance counter stats for 'a.out':

16867.119538 task-clock # 0.999 CPUs utilized

37 context-switches # 0.000 M/sec

8 CPU-migrations # 0.000 M/sec

281 page-faults # 0.000 M/sec

35,463,758,996 cycles # 2.103 GHz

23,690,669,417 stalled-cycles-frontend # 66.80% frontend cycles idle

20,846,452,415 stalled-cycles-backend # 58.78% backend cycles idle

4,023,012,964 instructions # 0.11 insns per cycle

# 5.89 stalled cycles per insn

304,385,109 branches # 18.046 M/sec

42,636 branch-misses # 0.01% of all branches

16.891160582 seconds time elapsed

-----------------------------------with -xavx----------------------------------------

Performance counter stats for 'a.out':

1288.423505 task-clock # 0.996 CPUs utilized

3 context-switches # 0.000 M/sec

2 CPU-migrations # 0.000 M/sec

279 page-faults # 0.000 M/sec

2,708,906,702 cycles # 2.102 GHz

1,608,134,568 stalled-cycles-frontend # 59.36% frontend cycles idle

798,177,722 stalled-cycles-backend # 29.46% backend cycles idle

3,803,270,546 instructions # 1.40 insns per cycle

# 0.42 stalled cycles per insn

300,601,809 branches # 233.310 M/sec

15,167 branch-misses # 0.01% of all branches

1.293986790 seconds time elapsed

Is there some kind of processor internal emulation going on? I know for denormal numbers, adds end up being 64 times slower than normal.

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

Link Copied

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

Maybe your program has AVX to SSE transition penalties.

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

unclejoe wrote:

I'm trying to write a geometric mean sqrt(a * b) using AVX intrinsics, but it runs slower than molasses!

int main()

{

int count = 0;

for (int i = 0; i < 100000000; ++i)

{

__m128i v8n_a = _mm_set1_epi16((++count) % 16),

v8n_b = _mm_set1_epi16((++count) % 16);

__m128i v8n_0 = _mm_set1_epi16(0);

__m256i temp1, temp2;

__m256 v8f_a = _mm256_cvtepi32_ps(temp1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_unpacklo_epi16(v8n_a, v8n_0)), _mm_unpackhi_epi16(v8n_a, v8n_0), 1)),

v8f_b = _mm256_cvtepi32_ps(temp2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_unpacklo_epi16(v8n_b, v8n_0)), _mm_unpackhi_epi16(v8n_b, v8n_0), 1));

__m256i v8n_meanInt32 = _mm256_cvtps_epi32(_mm256_sqrt_ps(_mm256_mul_ps(v8f_a, v8f_b)));

__m128i v4n_meanLo = _mm256_castsi256_si128(v8n_meanInt32),

v4n_meanHi = _mm256_extractf128_si256(v8n_meanInt32, 1);

g_data[i % 8] = v4n_meanLo;

g_data[(i + 1) % 8] = v4n_meanHi;

}

return 0;

}

The key to this mystery is that I'm using Intel ICC 11 and it's only slow when compiling with icc -O3 sqrt.cpp. If I compile with icc -O3 -xavx sqrt.cpp, then it runs 10x faster.

But it's not obvious if there's emulation happening because I used performance counters and the number of instructions executed for both versions is roughly 4G:

Performance counter stats for 'a.out':

16867.119538 task-clock # 0.999 CPUs utilized

37 context-switches # 0.000 M/sec

8 CPU-migrations # 0.000 M/sec

281 page-faults # 0.000 M/sec

35,463,758,996 cycles # 2.103 GHz

23,690,669,417 stalled-cycles-frontend # 66.80% frontend cycles idle

20,846,452,415 stalled-cycles-backend # 58.78% backend cycles idle

4,023,012,964 instructions # 0.11 insns per cycle

# 5.89 stalled cycles per insn

304,385,109 branches # 18.046 M/sec

42,636 branch-misses # 0.01% of all branches

16.891160582 seconds time elapsed

-----------------------------------with -xavx----------------------------------------

Performance counter stats for 'a.out':

1288.423505 task-clock # 0.996 CPUs utilized

3 context-switches # 0.000 M/sec

2 CPU-migrations # 0.000 M/sec

279 page-faults # 0.000 M/sec

2,708,906,702 cycles # 2.102 GHz

1,608,134,568 stalled-cycles-frontend # 59.36% frontend cycles idle

798,177,722 stalled-cycles-backend # 29.46% backend cycles idle

3,803,270,546 instructions # 1.40 insns per cycle

# 0.42 stalled cycles per insn

300,601,809 branches # 233.310 M/sec

15,167 branch-misses # 0.01% of all branches

1.293986790 seconds time elapsed

Is there some kind of processor internal emulation going on? I know for denormal numbers, adds end up being 64 times slower than normal.

I'll advise to output the ASM dump with both flags and compare the inner loops

Another idea: are you sure the alignment of your g_data array is the same in both tests? It can make a big difference if in one case it's aligned to 32-byte boundaries and not in the other case.

It looks like this code can be simplified quite a lot. For example, you can get rid of temp1 and temp2, and replace the last 4 lines (the 2x 128-bit stores) with a single _mm256_storeu_si256; the compiler will take care of the proper optimization: 2x 128-bit moves with /QxAVX, a single 256-bit move with /QxCORE-AVX2.

if you post the simplified version I'll provide more advice

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

Maybe your program has AVX to SSE transition penalties.

Thanks, you're right. I've confirmed with performance counters:

perf stat -e r10c1,r20c1 a.out (OTHER_ASSISTS.AVX_TO_SSE, OTHER_ASSISTS.SSE_TO_AVX)

Performance counter stats for 'a.out':

200,000,003 r10c1

200,001,376 r20c1

16.883746025 seconds time elapsed

I just think the penalty of saving/restoring all YMM registers is too much to hide from the programmer - there should be some kind of warning.

Please ignore those inefficiencies you mentioned. This is just a test program. The dead values temp1, temp2 are for debugging and are eliminated anyways. The storing to g_data is to prevent the values I want from being marked as dead. I would like to know if there's a better, less expensive way to mark values as live for benchmarking purposes. I was thinking of something like

if (++count == 123456789)

printf("%d", keepAliveValue);

where the compiler won't know any better than to compute keepAliveValue, but won't ever actually execute the printf().

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

It would be interesting if you could post diassembled code?

>>>I just think the penalty of saving/restoring all YMM registers is too much to hide from the programmer - there should be some kind of warning>>>

Warning is given by performance counters and/or by Intel software development emulator.

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

unclejoe wrote:

Maybe your program has AVX to SSE transition penalties.

Thanks, you're right. I've confirmed with performance counters:

congrats to iliyapolak!

unclejoe wrote:

I just think the penalty of saving/restoring all YMM registers is too much to hide from the programmer - there should be some kind of warning.

there was a discussion about it in another thread the other day; even if coding only with intrinsics, it is advised to use the /QxAVX flag, since the compiler takes care of the transitions for you by inserting proper VZEROUPPER instructions where required

unclejoe wrote:

anyways. The storing to g_data is to prevent the values I want from being marked as dead. I would like to know if there's a better, less expensive way to mark values as live for benchmarking purposes.

what I will typically do in such cases is to declare, out of the loop, something like this:

__m256 keepAlive = _mm256_setzero_ps();

then in the loop body, something like:

keepAlive = _mm256_or_ps(keepAlive,_mm256_castsi256_ps(v8n_meanInt32));

this way you'll have minimal impact on performance (keepAlive will be in a YMM register and VORPS is low reciprocal throughput), unlike with your stores

to fool the compiler simply save the content of keepAlive once at the end of the routine

it's always a good idea to have a look at ASM dumps though, to be certain of what you're measuring

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

Here's the assembly from the inner most loop. Also, I won't need to mix the 2 instruction types since I can recompile everything.

----------------------------------------------------------------------no VEX prefix---------------------------------------------------------------------

..B1.9: # Preds ..B1.7 ..B1.10

movdqa %xmm3, %xmm2 #16.94

punpckhwd %xmm0, %xmm3 #16.129

punpcklwd %xmm0, %xmm2 #16.94

movd %esi, %xmm1 #13.61

punpcklwd %xmm1, %xmm1 #13.61

pshufd $0, %xmm1, %xmm5 #13.61

movdqa %xmm5, %xmm4 #17.89

punpckhwd %xmm0, %xmm5 #17.124

movl %ecx, %esi #21.16

punpcklwd %xmm0, %xmm4 #17.89

andl $7, %esi #21.16

movslq %esi, %rsi #21.5

shlq $4, %rsi #21.5

incl %ecx #22.17

movl %ecx, %edi #22.22

andl $7, %edi #22.22

vinsertf128 $1, %xmm3, %ymm2, %ymm3 #16.47

vcvtdq2ps %ymm3, %ymm7 #16.20

movslq %edi, %rdi #22.5

shlq $4, %rdi #22.5

cmpl $100000000, %ecx #11.23

vinsertf128 $1, %xmm5, %ymm4, %ymm6 #17.42

vcvtdq2ps %ymm6, %ymm8 #17.15

vmulps %ymm8, %ymm7, %ymm9 #18.63

vsqrtps %ymm9, %ymm10 #18.48

vcvtps2dq %ymm10, %ymm11 #18.29

movaps %xmm11, g_data(%rsi) #21.5

vextractf128 $1, %ymm11, g_data(%rdi) #22.5

jl ..B1.2 # Prob 100% #11.23

-----------------------------------------------------------------------with VEX prefix----------------------------------------------------------------------

..B1.9: # Preds ..B1.7 ..B1.10

vpunpcklwd %xmm0, %xmm4, %xmm3 #16.94

vmovd %esi, %xmm1 #13.61

movl %ecx, %esi #21.16

andl $7, %esi #21.16

movslq %esi, %rsi #21.5

shlq $4, %rsi #21.5

vpunpcklwd %xmm1, %xmm1, %xmm2 #13.61

incl %ecx #22.17

movl %ecx, %edi #22.22

andl $7, %edi #22.22

vpshufd $0, %xmm2, %xmm7 #13.61

movslq %edi, %rdi #22.5

shlq $4, %rdi #22.5

vpunpckhwd %xmm0, %xmm4, %xmm4 #16.129

vinsertf128 $1, %xmm4, %ymm3, %ymm5 #16.47

vcvtdq2ps %ymm5, %ymm10 #16.20

vpunpcklwd %xmm0, %xmm7, %xmm6 #17.89

vpunpckhwd %xmm0, %xmm7, %xmm8 #17.124

vinsertf128 $1, %xmm8, %ymm6, %ymm9 #17.42

vcvtdq2ps %ymm9, %ymm11 #17.15

vmulps %ymm11, %ymm10, %ymm12 #18.63

vsqrtps %ymm12, %ymm13 #18.48

vcvtps2dq %ymm13, %ymm14 #18.29

vmovaps %xmm14, g_data(%rsi) #21.5

vextractf128 $1, %ymm14, g_data(%rdi) #22.5

cmpl $100000000, %ecx #11.23

jl ..B1.2 # Prob 100% #11.23

It seems there are indeed 2 transitions in the mixed non-VEX SSE and AVX.

1. punpcklwd %xmm0, %xmm4 #17.89 to vinsertf128 $1, %xmm3, %ymm2, %ymm3 #16.47

2. movaps %xmm11, g_data(%rsi) #21.5 to vextractf128 $1, %ymm11, g_data(%rdi) #22.5

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

>>>congrats to iliyapolak!>>>

Thank you very much:)

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

>>>It seems there are indeed 2 transitions in the mixed non-VEX SSE and AVX.

1. punpcklwd %xmm0, %xmm4 #17.89 to vinsertf128 $1, %xmm3, %ymm2, %ymm3 #16.47

2. movaps %xmm11, g_data(%rsi) #21.5 to vextractf128 $1, %ymm11, g_data(%rdi) #22.5>>>

Yes it seems that these instructions are responsible for transition penalty.

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page