Hi,
We are benchmarking the code below and see unexpected behavior on Intel Xeon under Win64. Throughput peaks at 128 bytes and then drops to one fifth at 256 bytes, whereas on Opteron CPUs throughput is uniform across sizes.
I have looked for potential cache misses, but the numbers are almost the same in both analyses.
Are there any bottlenecks here? I am not sure how to correct the sharp drop in performance at 256 bytes. Note that a 256-byte test means calling MOVE128_SSE2_INSTRINSIC_MFENCE twice. The code below is a simplified extract; other things happen besides this wrapper, such as checks, but those checks are identical for the 128- and 256-byte cases.
#include <emmintrin.h> /* SSE2: __m128i, _mm_load/_mm_store_si128, _mm_mfence; also pulls in _mm_prefetch */

void
MOVE128_SSE2_INSTRINSIC_MFENCE(volatile unsigned *src,
                               volatile unsigned *dst)
{
    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
    __m128i *src_ptr = (__m128i *)src;
    __m128i *dst_ptr = (__m128i *)dst;

    /* Prefetch the first 64 bytes of the source into L1. */
    _mm_prefetch((char *)src_ptr, _MM_HINT_T0);
    xmm0 = _mm_load_si128(src_ptr);     /* aligned 16-byte loads */
    xmm1 = _mm_load_si128(src_ptr + 1);
    xmm2 = _mm_load_si128(src_ptr + 2);
    xmm3 = _mm_load_si128(src_ptr + 3);
    /* Prefetch the second 64 bytes of the source. */
    _mm_prefetch((char *)(src_ptr + 4), _MM_HINT_T0);
    xmm4 = _mm_load_si128(src_ptr + 4);
    xmm5 = _mm_load_si128(src_ptr + 5);
    xmm6 = _mm_load_si128(src_ptr + 6);
    xmm7 = _mm_load_si128(src_ptr + 7);
    /* Aligned 16-byte stores of all 128 bytes. */
    _mm_store_si128(dst_ptr,     xmm0);
    _mm_store_si128(dst_ptr + 1, xmm1);
    _mm_store_si128(dst_ptr + 2, xmm2);
    _mm_store_si128(dst_ptr + 3, xmm3);
    _mm_store_si128(dst_ptr + 4, xmm4);
    _mm_store_si128(dst_ptr + 5, xmm5);
    _mm_store_si128(dst_ptr + 6, xmm6);
    _mm_store_si128(dst_ptr + 7, xmm7);
    /* Flushing: mfence orders all prior loads and stores. */
    _mm_mfence();
}
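For reference, here is a minimal sketch of what the 256-byte case looks like; the offset arithmetic is my own illustration, not the actual test harness:

/* 256 bytes = two back-to-back calls. src/dst are volatile unsigned*,
   so advancing by 32 elements skips 128 bytes. The buffers must be
   16-byte aligned for the aligned loads/stores to be valid. */
MOVE128_SSE2_INSTRINSIC_MFENCE(src, dst);
MOVE128_SSE2_INSTRINSIC_MFENCE(src + 32, dst + 32);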
One more thing: I noticed that the performance at 256 bytes is corrected if I place a flush before the second _mm_prefetch(). The drawback is that the throughput at 128 bytes decreases by about 20%, which is quite a lot.
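Concretely (sketch only; I am using the same _mm_mfence() as the flush here), the change is this fragment:

    xmm3 = _mm_load_si128(src_ptr + 3);
    _mm_mfence(); /* flush inserted here fixes 256 bytes but costs ~20% at 128 */
    _mm_prefetch((char *)(src_ptr + 4), _MM_HINT_T0);
    xmm4 = _mm_load_si128(src_ptr + 4);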
Any suggestions would be most appreciated.
Best regards,
Calin
1 Reply
The fix I have found consists of inserting several fence operations before the prefetches. Performance also improves if the destination pointers are prefetched as well.
#include <emmintrin.h> /* SSE2: __m128i, loads/stores, _mm_lfence, _mm_sfence, _mm_prefetch */

void
MOVE128_SSE2_INSTRINSIC_MFENCE(volatile unsigned *src,
                               volatile unsigned *dst)
{
    __m128i *src_ptr = (__m128i *)src;
    __m128i *dst_ptr = (__m128i *)dst;
    register __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

    /* Prefetch and load the first 64 bytes of the source. */
    _mm_prefetch((char *)src_ptr, _MM_HINT_T0);
    xmm0 = _mm_load_si128(src_ptr);
    xmm1 = _mm_load_si128(src_ptr + 1);
    xmm2 = _mm_load_si128(src_ptr + 2);
    xmm3 = _mm_load_si128(src_ptr + 3);
    /* lfence: complete the loads above before the next prefetch issues. */
    _mm_lfence();
    _mm_prefetch((char *)(src_ptr + 4), _MM_HINT_T0);
    xmm4 = _mm_load_si128(src_ptr + 4);
    xmm5 = _mm_load_si128(src_ptr + 5);
    xmm6 = _mm_load_si128(src_ptr + 6);
    xmm7 = _mm_load_si128(src_ptr + 7);
    _mm_lfence();
    /* Prefetch the destination before storing to it. */
    _mm_prefetch((char *)dst_ptr, _MM_HINT_T0);
    _mm_store_si128(dst_ptr,     xmm0);
    _mm_store_si128(dst_ptr + 1, xmm1);
    _mm_store_si128(dst_ptr + 2, xmm2);
    _mm_store_si128(dst_ptr + 3, xmm3);
    /* sfence: drain the first four stores before the next prefetch. */
    _mm_sfence();
    _mm_prefetch((char *)(dst_ptr + 4), _MM_HINT_T0);
    _mm_store_si128(dst_ptr + 4, xmm4);
    _mm_store_si128(dst_ptr + 5, xmm5);
    _mm_store_si128(dst_ptr + 6, xmm6);
    _mm_store_si128(dst_ptr + 7, xmm7);
    /* Final sfence: make all stores globally visible before returning. */
    _mm_sfence();
}
Now 128 bytes gives the expected throughput, and there is no performance penalty at 256 bytes and above.
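For completeness, here is a rough sketch of the kind of harness I use to measure throughput on Win64; the buffer sizes, iteration count, and QueryPerformanceCounter timing shown here are illustrative assumptions, not the exact benchmark:

#include <windows.h>
#include <emmintrin.h>

void MOVE128_SSE2_INSTRINSIC_MFENCE(volatile unsigned *src,
                                    volatile unsigned *dst); /* routine above */

/* Hypothetical harness: copies SIZE bytes ITERS times and returns MB/s.
   16-byte alignment is required by the aligned SSE2 loads/stores. */
enum { SIZE = 256, ITERS = 1000000 };
static __declspec(align(16)) volatile unsigned src[SIZE / sizeof(unsigned)];
static __declspec(align(16)) volatile unsigned dst[SIZE / sizeof(unsigned)];

static double measure_mb_per_s(void)
{
    LARGE_INTEGER freq, t0, t1;
    int i;
    unsigned off;
    QueryPerformanceFrequency(&freq);
    QueryPerformanceCounter(&t0);
    for (i = 0; i < ITERS; i++)
        for (off = 0; off < SIZE / sizeof(unsigned); off += 32)
            MOVE128_SSE2_INSTRINSIC_MFENCE(src + off, dst + off);
    QueryPerformanceCounter(&t1);
    return ((double)SIZE * ITERS / (1024.0 * 1024.0)) /
           ((double)(t1.QuadPart - t0.QuadPart) / (double)freq.QuadPart);
}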
Best regards,
Calin