Intel® ISA Extensions

Need help: Why is my AVX code slower than my SSE code?

Chen_S_
Beginner

The code is compiled with MSVC 2010 SP1 using /arch:AVX, and the AVX version is slightly (5~10%) slower than the SSE version. I am using a Xeon E3-1230 V2 processor with 16 GB of dual-channel DDR3-1600 memory.

Both functions read 416 bytes from memory (nine 8-float vectors plus another four 8-float vectors) and return a float value; there are no memory stores. The compiled SSE version has 111 instructions and the AVX version has 67. All memory accesses are aligned (16-byte for SSE, 32-byte for AVX). The only difference between the two versions is that the SSE version processes 4 floats per instruction, so it needs two instructions for each 8-element vector, while the AVX version processes 8 floats per instruction.

The AVX version should be at least as fast as the SSE version even if the program is memory-bound, but it turns out to be slower. The code is the core of an image-processing program: the SSE version processes an image in ~180 ms, while the AVX version takes ~200 ms. The function is called about 2M times per image, with different inputs.

The code is as follows.

SSE version:

float _SURFEvalProjection_2C2R_Fast(SURFWeakClassifier * ptrWeak, int pixOffset)
{

    SURFPixData * ptr_x0y0 = ptrWeak->Feature.ptrPtOffsets[0] + pixOffset;
    SURFPixData * ptr_x0y1 = ptrWeak->Feature.ptrPtOffsets[1] + pixOffset;
    SURFPixData * ptr_x0y2 = ptrWeak->Feature.ptrPtOffsets[2] + pixOffset;

    SURFPixData * ptr_x1y0 = ptrWeak->Feature.ptrPtOffsets[3] + pixOffset;
    SURFPixData * ptr_x1y1 = ptrWeak->Feature.ptrPtOffsets[4] + pixOffset;
    SURFPixData * ptr_x1y2 = ptrWeak->Feature.ptrPtOffsets[5] + pixOffset;

    SURFPixData * ptr_x2y0 = ptrWeak->Feature.ptrPtOffsets[6] + pixOffset;
    SURFPixData * ptr_x2y1 = ptrWeak->Feature.ptrPtOffsets[7] + pixOffset;
    SURFPixData * ptr_x2y2 = ptrWeak->Feature.ptrPtOffsets[8] + pixOffset;
    
    __m128 dp_4, dp_8;
    __m128 tmp40, tmp41, tmp42, tmp43, tmp80, tmp81, tmp82, tmp83;
    tmp40 = _mm_sub_ps(_mm_add_ps(ptr_x1y1->datam128[0], ptr_x0y0->datam128[0]), _mm_add_ps(ptr_x1y0->datam128[0], ptr_x0y1->datam128[0]));
    tmp41 = _mm_sub_ps(_mm_add_ps(ptr_x2y1->datam128[0], ptr_x1y0->datam128[0]), _mm_add_ps(ptr_x2y0->datam128[0], ptr_x1y1->datam128[0]));
    tmp42 = _mm_sub_ps(_mm_add_ps(ptr_x1y2->datam128[0], ptr_x0y1->datam128[0]), _mm_add_ps(ptr_x1y1->datam128[0], ptr_x0y2->datam128[0]));
    tmp43 = _mm_sub_ps(_mm_add_ps(ptr_x2y2->datam128[0], ptr_x1y1->datam128[0]), _mm_add_ps(ptr_x2y1->datam128[0], ptr_x1y2->datam128[0]));
    tmp80 = _mm_sub_ps(_mm_add_ps(ptr_x1y1->datam128[1], ptr_x0y0->datam128[1]), _mm_add_ps(ptr_x1y0->datam128[1], ptr_x0y1->datam128[1]));
    tmp81 = _mm_sub_ps(_mm_add_ps(ptr_x2y1->datam128[1], ptr_x1y0->datam128[1]), _mm_add_ps(ptr_x2y0->datam128[1], ptr_x1y1->datam128[1]));
    tmp82 = _mm_sub_ps(_mm_add_ps(ptr_x1y2->datam128[1], ptr_x0y1->datam128[1]), _mm_add_ps(ptr_x1y1->datam128[1], ptr_x0y2->datam128[1]));
    tmp83 = _mm_sub_ps(_mm_add_ps(ptr_x2y2->datam128[1], ptr_x1y1->datam128[1]), _mm_add_ps(ptr_x2y1->datam128[1], ptr_x1y2->datam128[1]));

    // Calculate the inner product, add eps, and rsqrt.
    dp_4 = _mm_add_ps(_mm_add_ps(_mm_dp_ps(tmp40, tmp40, 255), _mm_dp_ps(tmp41, tmp41, 255)),
        _mm_add_ps(_mm_dp_ps(tmp42, tmp42, 255), _mm_dp_ps(tmp43, tmp43, 255)));
    dp_8 = _mm_add_ps(_mm_add_ps(_mm_dp_ps(tmp80, tmp80, 255), _mm_dp_ps(tmp81, tmp81, 255)),
        _mm_add_ps(_mm_dp_ps(tmp82, tmp82, 255), _mm_dp_ps(tmp83, tmp83, 255)));
    dp_4 = _mm_add_ps(dp_4, dp_8);

    __m128 m128_eps = _mm_set1_ps(1e-8f);
    dp_4 = _mm_add_ps(dp_4, m128_eps);
    dp_4 = _mm_rsqrt_ps(dp_4);

    // Normalize and inner prod with the projections.
    __m128 res0 = _mm_mul_ps(_mm_load_ps(ptrWeak->Projection), tmp40);
    __m128 res1 = _mm_mul_ps(_mm_load_ps(ptrWeak->Projection+8), tmp41);
    __m128 res2 = _mm_mul_ps(_mm_load_ps(ptrWeak->Projection+16), tmp42);
    __m128 res3 = _mm_mul_ps(_mm_load_ps(ptrWeak->Projection+24), tmp43);

    __m128 res4 = _mm_mul_ps(_mm_load_ps(ptrWeak->Projection+4), tmp80);
    __m128 res5 = _mm_mul_ps(_mm_load_ps(ptrWeak->Projection+12), tmp81);
    __m128 res6 = _mm_mul_ps(_mm_load_ps(ptrWeak->Projection+20), tmp82);
    __m128 res7 = _mm_mul_ps(_mm_load_ps(ptrWeak->Projection+28), tmp83);

    res0 = _mm_add_ps(_mm_add_ps(res0, res1), _mm_add_ps(res2, res3));
    res1 = _mm_add_ps(_mm_add_ps(res4, res5), _mm_add_ps(res6, res7));
    res0 = _mm_add_ps(res0, res1);
    res0 = _mm_mul_ps(res0, dp_4);
    
    // Two horizontal adds accumulate the four elements; the total ends up in element 3.
    __m128 m128_zero = _mm_setzero_ps();
    res0 = _mm_hadd_ps(m128_zero, res0);
    res0 = _mm_hadd_ps(m128_zero, res0);

    return res0.m128_f32[3];
}

AVX version:

float _SURFEvalProjection_2C2R_Fast_AVX(SURFWeakClassifier * ptrWeak, int pixOffset)
{

    SURFPixData * ptr_x0y0 = ptrWeak->Feature.ptrPtOffsets[0] + pixOffset;
    SURFPixData * ptr_x0y1 = ptrWeak->Feature.ptrPtOffsets[1] + pixOffset;
    SURFPixData * ptr_x0y2 = ptrWeak->Feature.ptrPtOffsets[2] + pixOffset;

    SURFPixData * ptr_x1y0 = ptrWeak->Feature.ptrPtOffsets[3] + pixOffset;
    SURFPixData * ptr_x1y1 = ptrWeak->Feature.ptrPtOffsets[4] + pixOffset;
    SURFPixData * ptr_x1y2 = ptrWeak->Feature.ptrPtOffsets[5] + pixOffset;

    SURFPixData * ptr_x2y0 = ptrWeak->Feature.ptrPtOffsets[6] + pixOffset;
    SURFPixData * ptr_x2y1 = ptrWeak->Feature.ptrPtOffsets[7] + pixOffset;
    SURFPixData * ptr_x2y2 = ptrWeak->Feature.ptrPtOffsets[8] + pixOffset;
    
    __m256 dp;  // Dot product.
    __m256 tmp0, tmp1, tmp2, tmp3;

    tmp0 = _mm256_sub_ps(_mm256_add_ps(ptr_x1y1->datam256, ptr_x0y0->datam256), _mm256_add_ps(ptr_x1y0->datam256, ptr_x0y1->datam256));
    tmp1 = _mm256_sub_ps(_mm256_add_ps(ptr_x2y1->datam256, ptr_x1y0->datam256), _mm256_add_ps(ptr_x2y0->datam256, ptr_x1y1->datam256));
    tmp2 = _mm256_sub_ps(_mm256_add_ps(ptr_x1y2->datam256, ptr_x0y1->datam256), _mm256_add_ps(ptr_x1y1->datam256, ptr_x0y2->datam256));
    tmp3 = _mm256_sub_ps(_mm256_add_ps(ptr_x2y2->datam256, ptr_x1y1->datam256), _mm256_add_ps(ptr_x2y1->datam256, ptr_x1y2->datam256));
    
    dp = _mm256_add_ps(_mm256_add_ps(_mm256_dp_ps(tmp0, tmp0, 255), _mm256_dp_ps(tmp1, tmp1, 255)),
        _mm256_add_ps(_mm256_dp_ps(tmp2, tmp2, 255), _mm256_dp_ps(tmp3, tmp3, 255)));
    dp = _mm256_add_ps(dp, _mm256_permute2f128_ps(dp, dp, 3 | 0<<4)); // _mm256_dp_ps works per 128-bit lane, so swap the lanes and add to get the full sum in every element.

    __m256 m256_eps = _mm256_set1_ps(1e-8f);
    dp = _mm256_add_ps(dp, m256_eps);
    dp = _mm256_rsqrt_ps(dp);

    // Normalize and inner prod with the projections.
    __m256 res0 = _mm256_mul_ps(_mm256_load_ps(ptrWeak->Projection), tmp0);
    __m256 res1 = _mm256_mul_ps(_mm256_load_ps(ptrWeak->Projection+8), tmp1);
    __m256 res2 = _mm256_mul_ps(_mm256_load_ps(ptrWeak->Projection+16), tmp2);
    __m256 res3 = _mm256_mul_ps(_mm256_load_ps(ptrWeak->Projection+24), tmp3);

    res0 = _mm256_add_ps(_mm256_add_ps(res0, res1), _mm256_add_ps(res2, res3));
    res0 = _mm256_mul_ps(res0, dp);

    // The horizontal adds also work per 128-bit lane, so combine the two lanes at the end.
    __m256 m256_zero = _mm256_setzero_ps();
    res0 = _mm256_hadd_ps(m256_zero, res0);
    res0 = _mm256_hadd_ps(m256_zero, res0);
    res0 = _mm256_add_ps(res0, _mm256_permute2f128_ps(res0, res0, 3 | 0<<4));

    return res0.m256_f32[7];
}

 

1 Solution
McCalpinJohn
Honored Contributor III

It is not at all unusual for AVX code on Sandy Bridge & Ivy Bridge to be slightly slower than SSE code for data that is not contained in the L1 cache.  There are some known causes, but it appears that you are avoiding the most obvious ones.

Last week I did some testing with the STREAM benchmark configured to operate on data at various levels of the cache hierarchy (including data in memory -- the standard STREAM configuration), and found the following (a sketch of a cache-resident kernel of this kind appears right after the list):

  • For L1-contained data, AVX vector code was by far the fastest, followed by SSE vector code, followed by scalar code.  (Scalar SSE and scalar AVX code gave very similar performance.)
  • For L2-contained data, the SSE vector code was 2%-5% faster than the AVX vector code (as I expected), but the *scalar* code was 10% to 40% faster than the SSE vector code and 15%-60% faster than the AVX vector code.  (NOTE: See corrected scalar results in a later post!)
  • For L3-contained data (using a single threaded benchmark test), the SSE vector code was 3% to 14% faster than the AVX vector code, but again the *scalar* code was fastest:  50%-60% faster than the SSE vector code and 60%-80% faster than the AVX vector code. (NOTE: See corrected scalar results in a later post!)
  • For data in local memory (using a single-threaded benchmark test), the SSE vector code was 1%-3% faster than the AVX vector code, while the *scalar* code was 4%-9% faster than the SSE vector code and 7%-12% faster than the AVX vector code.
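
For concreteness, a minimal sketch of the kind of cache-resident kernel used in such a test -- not the actual STREAM source; the array size assumes Ivy Bridge's 256 KB per-core L2 -- looks like this:

#include <stddef.h>

#define N 8192                      /* 3 * 8192 * 8 B = 192 KB: bigger than the
                                       32 KB L1D, but inside a 256 KB L2 */
double a[N], b[N], c[N];

void triad_l2(double scalar, int repetitions)
{
    /* Repeat the kernel many times so the timing is dominated by the cache
       level the arrays fit in rather than by the initial misses. */
    for (int r = 0; r < repetitions; r++)
        for (size_t i = 0; i < N; i++)
            a[i] = b[i] + scalar * c[i];    /* STREAM-style triad */
}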

I trust the L1-contained results and the memory-contained results -- the L1-contained results are close to the hardware specifications, while the memory-contained results are consistent with many experiments I have run over the last 2 years.   The L2-contained and L3-contained results are quite a bit more dramatic than I expected, so I will go back and re-test these with a range of array alignments to see if this is a side effect of bank conflicts in the L1 cache. (NOTE: These L2-contained and L3-contained scalar results were wrong!  See corrected scalar results in a later post!)

My interpretation of all these results is that the differences have to do with the startup of the L1 D cache streaming prefetcher.  This prefetcher is said to operate based on streams of loads, and it is pretty clear that instructions with smaller load payloads will define "streams" faster than instructions with larger load payloads. 

8 Replies
jimdempseyatthecove
Honored Contributor III

>>This prefetcher is said to operate based on streams of loads, and it is pretty clear that instructions with smaller load payloads will define "streams" faster than instructions with larger load payloads.

You would think that the hardware prefetcher would adapt to the cache-line-order fetch pattern. With SSE you would have four accesses to the same cache line before advancing to the next line; if the prefetcher has a counter that weights the number of accesses in addition to their order, that might account for the difference.

I wonder what would happen if you were able to insert a dummy read-and-discard (e.g. where the SSE code reads at X, you insert mov rax,[X+8] or something like that). Not seeing the code, it is hard for me to guess what the most effective dummy-move offsets would be (4, 8, 12, ...).
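
A minimal sketch of such a dummy read -- the helper name and the 64-byte offsets are assumptions; the effective offsets would have to be found experimentally -- might look like this:

#include <stdint.h>

/* Read one byte of a cache line and discard it; the volatile qualifier keeps
   the compiler from optimizing the dead load away. */
static inline void touch_line(const void *p)
{
    (void)*(const volatile uint8_t *)p;
}

/* Usage sketch: touch each 64-byte line of a block just before the wide AVX
   loads of the same data, so the prefetcher sees a stream of small accesses:
       touch_line(block);
       touch_line((const uint8_t *)block + 64);                              */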

Jim Dempsey

Chen_S_
Beginner

Thank you John and Jim. 

I have added several _mm_prefetch calls to the code, and the time improves from 180 ms to 125 ms for SSE and from 200 ms to 170 ms for AVX. Both versions improve, but the gap between them widens. The code with prefetch looks like the following:

    SURFPixData * ptr_x0y0 = ptrWeak->Feature.ptrPtOffsets[0] + pixOffset;
    _mm_prefetch((char*)(ptr_x0y0->data), _MM_HINT_T0);
    SURFPixData * ptr_x0y1 = ptrWeak->Feature.ptrPtOffsets[1] + pixOffset;
    _mm_prefetch((char*)(ptr_x0y1->data), _MM_HINT_T0);
    ...

In my function, 416 bytes (nine 8-float vectors and one 32-float vector) must be read from 10 different memory locations on each call. Given the context of my code, the 32-float vector is probably already in the cache when the function is called, but the nine 8-float vectors are not likely to be cached and usually sit on different cache lines. Between two adjacent calls, the pointers to the nine 8-float vectors move by 64 bytes (it is a sliding-window detection program, and I move the window with a step of 2), so none of the nine vectors is already in the cache the next time the function is called, but there is indeed a streaming pattern: nine streams with a 64-byte stride (perhaps too many streams for the CPU to track?).

The hardware prefetcher clearly does help: if I increase the window step from 2 to 3, so that the pointers move by 96 bytes between adjacent calls and there are fewer calls overall, performance drops by about a third, and with a step of 4 (128 bytes) it drops by almost half. Furthermore, an even window step (pointers moving by n*64 bytes) is always better than an odd one (pointers moving by (2n+1)*32 bytes), which is probably related to the 64-byte cache-line size.
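
For reference, a simplified outline of the sliding-window scan (the names width, height, rowStride, step and threshold are illustrative, not the real code) looks roughly like this:

/* With step == 2 and sizeof(SURFPixData) == 32 bytes, each of the nine
   per-cell pointers inside the call advances by 2 * 32 = 64 bytes per window
   position, producing the nine 64-byte-stride load streams described above. */
void scan_image(SURFWeakClassifier *ptrWeak, int width, int height,
                int rowStride, int step, float threshold)
{
    for (int y = 0; y < height; y += step)
        for (int x = 0; x < width; x += step)
        {
            float v = _SURFEvalProjection_2C2R_Fast_AVX(ptrWeak, y * rowStride + x);
            if (v > threshold)
            {
                /* record a candidate detection at (x, y) */
            }
        }
}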

In addition, my profiler (MSVC 2010, CPU sampling mode) tells me that a single instruction accounts for 14.5% of the CPU samples in the AVX version of the function:

000000013FB320CE  vaddps      ymm0,ymm5,ymmword ptr [rsi+r9]

Apparently neither the hardware prefetcher nor the prefetch instruction helps for this memory address, even though the prefetch instruction does appear in the assembly, about 17 instructions earlier:

000000013FB32078  prefetcht0  [rsi+r9]

For the SSE version, several instructions each take about 2%~4% of the CPU samples, but no single instruction exceeds 5%. This seems to confirm that the SSE code's memory access pattern (many small loads) is friendlier to the CPU's stream detection and hardware prefetching.

Finally, I have two more questions:

1. What happens if memory location A is prefetched into cache set C at time t1, and then another memory location B that maps to the same set C is prefetched or accessed at time t2 (by "accessed" I mean an instruction uses B as a memory operand), while all 8 ways of set C are occupied? For IVB this means A and B are a multiple of 4 KB apart, since the L1D is 32 KB and 8-way set-associative. What cache-line replacement algorithm does Ivy Bridge use? If it is least-recently-used, will A be evicted if no instruction has accessed A between t1 and t2? In that case A has not been used at all, so by LRU it seems it should be the one replaced by B.

Although this situation is unlikely in my code, it is an interesting question.
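
As a quick check of that 4 KB figure: a 32 KB, 8-way L1D with 64-byte lines has 32768 / (8 * 64) = 64 sets, so two addresses map to the same set exactly when they differ by a multiple of 64 * 64 = 4096 bytes. A purely illustrative snippet that computes the set indices:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uintptr_t A = 0x12345040u;                 /* arbitrary example address */
    uintptr_t B = A + 4096u;                   /* 4 KB away                 */
    unsigned setA = (unsigned)(A >> 6) & 63u;  /* bits 6..11 select the set */
    unsigned setB = (unsigned)(B >> 6) & 63u;
    printf("set(A) = %u, set(B) = %u\n", setA, setB);  /* prints the same set */
    return 0;
}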

2. Memory address A is prefetched at time t1; since memory latency is around 200~300 cycles, A will not appear in the L1D until roughly 200 cycles later. But an instruction uses A as a memory operand at time t2, before the prefetch completes. Will that instruction trigger another request to the memory controller/bus, or is the CPU smart enough to know that A is already on its way and will soon be in the L1D, so it only has to wait a short while? This question may be relevant to my problem since, in my code, the prefetch is issued only 10~20 instructions before the memory is used in the AVX version.

 

Chen_S_
Beginner

By modifying the prefetch to fetch 64 floats (256 bytes) ahead, the AVX code improves significantly:


    SURFPixData * ptr_x0y0 = ptrWeak->Feature.ptrPtOffsets[0] + pixOffset;
    _mm_prefetch((char*)(ptr_x0y0->data + 64), _MM_HINT_T0);
    SURFPixData * ptr_x0y1 = ptrWeak->Feature.ptrPtOffsets[1] + pixOffset;
    _mm_prefetch((char*)(ptr_x0y1->data + 64), _MM_HINT_T0);
    ...

 

SSE code: 122 ms; AVX code: 127 ms. AVX is still 3~4% slower.

 

Bernard
Valued Contributor I

 

>>>In addition, my profiler (MSVC2010, CPU sampling mode) tells me that there is a single instruction that takes 14.5% of the CPU samples for the AVX version of the function:

000000013FB320CE  vaddps      ymm0,ymm5,ymmword ptr [rsi+r9]>>>

Did you check with the VS debugger what memory address the RSI register points to, and what kind of value (an index?) is in R9?

I suppose the quoted instruction references either a struct or an array base address and loads a value from the index contained in the R9 register.

Chen_S_
Beginner

Finally, by running the program in parallel with 6 OpenMP threads (on 4 cores), both the AVX and SSE versions run at almost the same speed, with AVX slightly (~1%) faster. Performance drops with 7 or 8 threads. The memory bandwidth utilized is >20 GB/s. I think we can draw a conclusion for this thread: in ordinary single-threaded, memory-bound programs, SSE may outperform AVX because its memory access pattern (many small accesses) is friendlier to the hardware prefetcher, but multi-threading with OpenMP compensates by keeping both the CPU and the memory controller busy.
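
For completeness, a minimal sketch of splitting the window loop across OpenMP threads -- the function and parameter names are illustrative, not the actual program:

#include <omp.h>

/* Rows of window positions are processed independently; 6 threads worked best
   here on the 4-core / 8-thread E3-1230 V2. */
void scan_image_parallel(SURFWeakClassifier *ptrWeak, int width, int height,
                         int rowStride, int step, float threshold)
{
    #pragma omp parallel for num_threads(6) schedule(dynamic)
    for (int y = 0; y < height; y += step)
        for (int x = 0; x < width; x += step)
        {
            float v = _SURFEvalProjection_2C2R_Fast_AVX(ptrWeak, y * rowStride + x);
            if (v > threshold)
            {
                /* record a detection at (x, y); use per-thread lists or a
                   critical section when storing shared results */
            }
        }
}

schedule(dynamic) hands rows to threads as they finish, which keeps the cores busy if rows take uneven amounts of time.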

McCalpinJohn
Honored Contributor III

Follow-up & correction on my comments from 2014-06-09 (https://software.intel.com/en-us/forums/topic/516265#comment-1790852)

In that set of results I said that the scalar code was faster than the vector code for L2 and L3 accesses, and that I was a bit suspicious of the results.   It is a good thing that I included that disclaimer, because I was wrong!

The compiler removed 1/2 of the work in the scalar cases, so the apparent results were inflated.   I fixed the part of my code that allowed the compiler to make this optimization and used performance counters to verify that the correct amount of work was being done.   The results now show that scalar code is slower than vector code for L2 and L3-contained data.

Scalar code is still slightly faster for data in memory.

emmanuel_attia
Beginner

This is a bit off-topic, but you should avoid

  return res0.m256_f32[7];

Use _mm_extract_ps(_mm256_extractf128_ps(res0, 1), 3) instead to avoid penalties (and your code will be more portable across compilers).
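
One caveat about that exact expression: _mm_extract_ps returns the selected element's bit pattern as an int, not a float. A small sketch of a portable way to pull element 7 out as a float (the helper name is illustrative):

#include <immintrin.h>

static inline float extract_elem7(__m256 v)
{
    __m128 hi = _mm256_extractf128_ps(v, 1);               /* elements 4..7     */
    hi = _mm_shuffle_ps(hi, hi, _MM_SHUFFLE(3, 3, 3, 3));  /* broadcast elem 7  */
    return _mm_cvtss_f32(hi);                              /* low lane as float */
}

/* e.g. the AVX function could end with:   return extract_elem7(res0);          */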
