- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi,
I have a program which does 1-byte vector sum using SSE intrinsics.
When I compile it with Intel compiler (ICC 15.0.1, x86_64, Linux, SandyBridge CPU) it segfaults, it looks like the end condition of the loop is not checked correctly. Same code works with GCC.
My only optimization flag -s "-O3" (When compiling with -O1 the program works).
#include <stdio.h> #include <string.h> #include <stdint.h> #include <stdlib.h> #include <xmmintrin.h> #include <emmintrin.h> static inline void add_single(void *dst, void *src) { *((int8_t*)dst) += *((int8_t*)src); } __attribute__((noinline)) void vector_sum_char(void *dst, void *src, unsigned length) { const unsigned factor = sizeof(__m128i); __m128i s[1], d[1]; int i, j; i = 0; while(i < (int)(length - factor + 1)) { printf("src=%p dst=%p i=%d max=%d\n", src, dst, i, (int)(length - factor + 1)); j = 0; d= (__m128i)_mm_loadu_si128(dst); s = (__m128i)_mm_loadu_si128(src); src += sizeof(__m128i); j = 0; _mm_storeu_si128(dst, _mm_add_epi8(d , s )); dst += sizeof(__m128i); i += factor; } for (i = 0; i < (length%factor); ++i) { add_single(dst, src); ++dst; ++src; } } int main(int argc, char **argv) { int num_elems = 17; void *src = calloc(1, num_elems); void *dst = calloc(1, num_elems); vector_sum_char(dst, src, num_elems); free(src); free(dst); return 0; }
Output looks like this:
src=0x6122e0 dst=0x612300 i=0 max=2 src=0x6122f0 dst=0x612310 i=16 max=2 src=0x612300 dst=0x612320 i=32 max=2 src=0x612310 dst=0x612330 i=48 max=2 src=0x612320 dst=0x612340 i=64 max=2 src=0x612330 dst=0x612350 i=80 max=2 src=0x612340 dst=0x612360 i=96 max=2 src=0x612350 dst=0x612370 i=112 max=2 ... Segmentation fault (core dumped) gdb: Program received signal SIGSEGV, Segmentation fault. vector_sum_char (dst=0x7fffffffd7a0, src=0x0, length=1083768336) 42 _mm_storeu_si128(dst, _mm_add_epi8(d, s )); (gdb) bt #0 vector_sum_char (dst=0x7fffffffd7a0, src=0x0, length=1083768336) #1 0x00000000004015ff in main (argc=-10336, argv=0x0) (gdb) f 0 #0 vector_sum_char (dst=0x7fffffffd7a0, src=0x0, length=1083768336) 42 _mm_storeu_si128(dst, _mm_add_epi8(d , s )); 0x000000000040167f <+95>: mov %rbp,%rsi 0x0000000000401682 <+98>: mov %rbx,%rdx 0x0000000000401685 <+101>: mov %r12d,%ecx 0x0000000000401688 <+104>: mov %r15d,%r8d 0x000000000040168b <+107>: xor %eax,%eax 0x000000000040168d <+109>: callq 0x401338 <printf@plt> => 0x0000000000401692 <+114>: movdqu (%rbx),%xmm1 0x0000000000401696 <+118>: movdqu 0x0(%rbp),%xmm0 0x000000000040169b <+123>: paddb %xmm0,%xmm1 0x000000000040169f <+127>: inc %r14d 0x00000000004016a2 <+130>: movdqu %xmm1,(%rbx) 0x00000000004016a6 <+134>: add $0x10,%rbp 0x00000000004016aa <+138>: add $0x10,%rbx 0x00000000004016ae <+142>: add $0x10,%r12d 0x00000000004016b2 <+146>: cmp %r13d,%r14d 0x00000000004016b5 <+149>: jl 0x40167a <vector_sum_char+90>
please note here that in address <+146> r13d is compared, but on <+101> and <+142> r12d is being used as "i".
Link Copied
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
You should not be performing arithmetic on void*
A void "object" has no defined size, not even 0.
To expect src += sizeof(__m128i), where src is of type void* is expecting undefined behavior to perform as you expect.
The fact that one compiler may provide for units of 1 is no assurance that all compilers (or options) will exhibit this behavior..
(uintptr_t)src += sizeof(__m128i); would be better;
same with ((char*)src)++;
See if the code produces the same effect after correcting the arithmetic statements performed on void*.
Jim Dempsey
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
jimdempseyatthecove wrote:
You should not be performing arithmetic on void*
Hi Jim,
I've modified my program to avoid void* arithmetic, and it still fails the same way. here's the modified program:
static inline void add_single(int8_t *dst, int8_t *src) { *((int8_t*)dst) += *((int8_t*)src); } __attribute__((noinline)) void vector_sum_char(int8_t *dst, int8_t *src, unsigned length) { const unsigned factor = sizeof(__m128i); __m128i s[1], d[1]; int i, j; i = 0; while(i < (int)(length - factor + 1)) { printf("src=%p dst=%p i=%d max=%d\n", src, dst, i, (int)(length - factor + 1)); j = 0; d= (__m128i)_mm_loadu_si128((void*)dst); s = (__m128i)_mm_loadu_si128((void*)src); src += sizeof(__m128i); j = 0; _mm_storeu_si128((void*)dst, _mm_add_epi8(d , s )); dst += sizeof(__m128i); i += factor; } for (i = 0; i < (length%factor); ++i) { add_single(dst, src); ++dst; ++src; } }

- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page