Community
cancel
Showing results for 
Search instead for 
Did you mean: 
developer1
Beginner
82 Views

possible icc bug with for loop condition?

Hi,

I have a program which does 1-byte vector sum using SSE intrinsics.

When I compile it with Intel compiler (ICC 15.0.1, x86_64, Linux, SandyBridge CPU) it segfaults, it looks like the end condition of the loop is not checked correctly. Same code works with GCC.

My only optimization flag -s "-O3" (When compiling with -O1 the program works).

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stdlib.h>
#include <xmmintrin.h>
#include <emmintrin.h>

static inline void add_single(void *dst, void *src) {
    *((int8_t*)dst) += *((int8_t*)src);
}

__attribute__((noinline))
void vector_sum_char(void *dst, void *src, unsigned length)
{
    const unsigned factor = sizeof(__m128i);
    __m128i s[1], d[1];
    int i, j;

    i = 0;
    while(i < (int)(length - factor + 1))
    {
        printf("src=%p dst=%p i=%d max=%d\n", src, dst, i, (int)(length - factor + 1));
        j = 0;
        d = (__m128i)_mm_loadu_si128(dst);
        s = (__m128i)_mm_loadu_si128(src);
        src += sizeof(__m128i);

        j = 0;
        _mm_storeu_si128(dst, _mm_add_epi8(d, s));
        dst += sizeof(__m128i);

        i += factor;
    }
    for (i = 0; i < (length%factor); ++i) {
        add_single(dst, src);
        ++dst;
        ++src;
    }
}

int main(int argc, char **argv)
{
    int  num_elems = 17;
    void *src = calloc(1, num_elems);
    void *dst = calloc(1, num_elems);

    vector_sum_char(dst, src, num_elems);

    free(src);
    free(dst);
    return 0;
}

Output looks like this:

src=0x6122e0 dst=0x612300 i=0 max=2
src=0x6122f0 dst=0x612310 i=16 max=2
src=0x612300 dst=0x612320 i=32 max=2
src=0x612310 dst=0x612330 i=48 max=2
src=0x612320 dst=0x612340 i=64 max=2
src=0x612330 dst=0x612350 i=80 max=2
src=0x612340 dst=0x612360 i=96 max=2
src=0x612350 dst=0x612370 i=112 max=2
...
Segmentation fault (core dumped)

gdb:

Program received signal SIGSEGV, Segmentation fault.
vector_sum_char (dst=0x7fffffffd7a0, src=0x0, length=1083768336)
42	        _mm_storeu_si128(dst, _mm_add_epi8(d, s));
(gdb) bt
#0  vector_sum_char (dst=0x7fffffffd7a0, src=0x0, length=1083768336)
#1  0x00000000004015ff in main (argc=-10336, argv=0x0)
(gdb) f 0
#0  vector_sum_char (dst=0x7fffffffd7a0, src=0x0, length=1083768336)
42	        _mm_storeu_si128(dst, _mm_add_epi8(d, s));

   0x000000000040167f <+95>:	mov    %rbp,%rsi
   0x0000000000401682 <+98>:	mov    %rbx,%rdx
   0x0000000000401685 <+101>:	mov    %r12d,%ecx
   0x0000000000401688 <+104>:	mov    %r15d,%r8d
   0x000000000040168b <+107>:	xor    %eax,%eax
   0x000000000040168d <+109>:	callq  0x401338 <printf@plt>
=> 0x0000000000401692 <+114>:	movdqu (%rbx),%xmm1
   0x0000000000401696 <+118>:	movdqu 0x0(%rbp),%xmm0
   0x000000000040169b <+123>:	paddb  %xmm0,%xmm1
   0x000000000040169f <+127>:	inc    %r14d
   0x00000000004016a2 <+130>:	movdqu %xmm1,(%rbx)
   0x00000000004016a6 <+134>:	add    $0x10,%rbp
   0x00000000004016aa <+138>:	add    $0x10,%rbx
   0x00000000004016ae <+142>:	add    $0x10,%r12d
   0x00000000004016b2 <+146>:	cmp    %r13d,%r14d
   0x00000000004016b5 <+149>:	jl     0x40167a <vector_sum_char+90>

please note here that in address <+146> r13d is compared, but on <+101> and <+142> r12d is being used as "i".

0 Kudos
2 Replies
jimdempseyatthecove
Black Belt
82 Views

You should not be performing arithmetic on void*

A void "object" has no defined size, not even 0.

To expect src += sizeof(__m128i), where src is of type void* is expecting undefined behavior to perform as you expect.

The fact that one compiler may provide for units of 1 is no assurance that all compilers (or options) will exhibit this behavior..

(uintptr_t)src += sizeof(__m128i); would be better;

same with ((char*)src)++;

See if the code produces the same effect after correcting the arithmetic statements performed on void*.

Jim Dempsey

developer1
Beginner
82 Views

jimdempseyatthecove wrote:

You should not be performing arithmetic on void*

Hi Jim, 

I've modified my program to avoid void* arithmetic, and it still fails the same way. here's the modified program:

static inline void add_single(int8_t *dst, int8_t *src) {
    *((int8_t*)dst) += *((int8_t*)src);
}

__attribute__((noinline))
void vector_sum_char(int8_t *dst, int8_t *src, unsigned length)
{
    const unsigned factor = sizeof(__m128i);
    __m128i s[1], d[1];
    int i, j;

    i = 0;
    while(i < (int)(length - factor + 1))
    {
        printf("src=%p dst=%p i=%d max=%d\n", src, dst, i, (int)(length - factor + 1));
        j = 0;
        d = (__m128i)_mm_loadu_si128((void*)dst);
        s = (__m128i)_mm_loadu_si128((void*)src);
        src += sizeof(__m128i);

        j = 0;
        _mm_storeu_si128((void*)dst, _mm_add_epi8(d, s));
        dst += sizeof(__m128i);

        i += factor;
    }
    for (i = 0; i < (length%factor); ++i) {
        add_single(dst, src);
        ++dst;
        ++src;
    }
}

 

Reply