Hi Evan,

nemequ · ‎07-08-2016

When memcpy is used for unaligned access, ICC generates sub-optimal code. This comes up a lot in data compression: for example, LZ4, zstd, and LZFSE use memcpy for this (as do a *lot* of other codecs), so they will be slower when built with ICC. Yann Collet wrote a blog post about the issue a while back; see https://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html (note that the issues with GCC on ARM have since been resolved, so I think ICC is the last hold-out, though I haven't tested MSVC).

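Here is a quick test which shows how gcc, clang, and icc handle the same code. gcc and clang compile the memcpy versions to a single load, identical to the direct dereference, whereas icc additionally stores the loaded value to the stack and reloads it before returning: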

nemequ@peltast:~/t$ cat align.c
#include <stdint.h>
#include <string.h>

/*
 * Read a 32-bit value from ptr by copying sizeof(uint32_t) bytes into a
 * local with memcpy and returning that local.
 *
 * memcpy places no alignment requirement on ptr, so this form is valid for
 * unaligned addresses. The bytes are copied as-is (host byte order).
 */
uint32_t read32_memcpy(const void* ptr) {
  uint32_t v;
  memcpy(&v, ptr, sizeof(uint32_t));
  return v;
}

/*
 * Read a 32-bit value from ptr by casting it to const uint32_t * and
 * dereferencing directly.
 *
 * Undefined behaviour in C when ptr is not suitably aligned for uint32_t
 * (hence the "_ub" suffix); serves as the comparison point for
 * read32_memcpy.
 */
uint32_t read32_ub(const void* ptr) {
  return *((const uint32_t *) ptr);
}

/*
 * Read a 64-bit value from ptr by copying sizeof(uint64_t) bytes into a
 * local with memcpy and returning that local.
 *
 * memcpy places no alignment requirement on ptr, so this form is valid for
 * unaligned addresses. The bytes are copied as-is (host byte order).
 */
uint64_t read64_memcpy(const void* ptr) {
  uint64_t v;
  memcpy(&v, ptr, sizeof(uint64_t));
  return v;
}

/*
 * Read a 64-bit value from ptr by casting it to const uint64_t * and
 * dereferencing directly.
 *
 * Undefined behaviour in C when ptr is not suitably aligned for uint64_t
 * (hence the "_ub" suffix); serves as the comparison point for
 * read64_memcpy.
 */
uint64_t read64_ub(const void* ptr) {
  return *((const uint64_t *) ptr);
}
nemequ@peltast:~/t$ gcc --version && gcc -O3 -c -o align.o align.c && objdump -d -M intel -S align.o
gcc (GCC) 6.1.1 20160621 (Red Hat 6.1.1-3)
Copyright (C) 2016 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.


align.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <read32_memcpy>:
   0:	8b 07                	mov    eax,DWORD PTR [rdi]
   2:	c3                   	ret    
   3:	0f 1f 00             	nop    DWORD PTR [rax]
   6:	66 2e 0f 1f 84 00 00 	nop    WORD PTR cs:[rax+rax*1+0x0]
   d:	00 00 00 

0000000000000010 <read32_ub>:
  10:	8b 07                	mov    eax,DWORD PTR [rdi]
  12:	c3                   	ret    
  13:	0f 1f 00             	nop    DWORD PTR [rax]
  16:	66 2e 0f 1f 84 00 00 	nop    WORD PTR cs:[rax+rax*1+0x0]
  1d:	00 00 00 

0000000000000020 <read64_memcpy>:
  20:	48 8b 07             	mov    rax,QWORD PTR [rdi]
  23:	c3                   	ret    
  24:	66 90                	xchg   ax,ax
  26:	66 2e 0f 1f 84 00 00 	nop    WORD PTR cs:[rax+rax*1+0x0]
  2d:	00 00 00 

0000000000000030 <read64_ub>:
  30:	48 8b 07             	mov    rax,QWORD PTR [rdi]
  33:	c3                   	ret    
nemequ@peltast:~/t$ clang --version && clang -O3 -c -o align.o align.c && objdump -d -M intel -S align.o
clang version 3.8.0 (tags/RELEASE_380/final)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /usr/bin

align.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <read32_memcpy>:
   0:	8b 07                	mov    eax,DWORD PTR [rdi]
   2:	c3                   	ret    
   3:	66 66 66 66 2e 0f 1f 	data16 data16 data16 nop WORD PTR cs:[rax+rax*1+0x0]
   a:	84 00 00 00 00 00 

0000000000000010 <read32_ub>:
  10:	8b 07                	mov    eax,DWORD PTR [rdi]
  12:	c3                   	ret    
  13:	66 66 66 66 2e 0f 1f 	data16 data16 data16 nop WORD PTR cs:[rax+rax*1+0x0]
  1a:	84 00 00 00 00 00 

0000000000000020 <read64_memcpy>:
  20:	48 8b 07             	mov    rax,QWORD PTR [rdi]
  23:	c3                   	ret    
  24:	66 66 66 2e 0f 1f 84 	data16 data16 nop WORD PTR cs:[rax+rax*1+0x0]
  2b:	00 00 00 00 00 

0000000000000030 <read64_ub>:
  30:	48 8b 07             	mov    rax,QWORD PTR [rdi]
  33:	c3                   	ret    
nemequ@peltast:~/t$ icc --version && icc -O3 -c -o align.o align.c && objdump -d -M intel -S align.o
icc (ICC) 16.0.3 20160415
Copyright (C) 1985-2016 Intel Corporation.  All rights reserved.


align.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <read32_memcpy>:
   0:	8b 07                	mov    eax,DWORD PTR [rdi]
   2:	89 44 24 f8          	mov    DWORD PTR [rsp-0x8],eax
   6:	8b 44 24 f8          	mov    eax,DWORD PTR [rsp-0x8]
   a:	c3                   	ret    
   b:	0f 1f 44 00 00       	nop    DWORD PTR [rax+rax*1+0x0]

0000000000000010 <read32_ub>:
  10:	8b 07                	mov    eax,DWORD PTR [rdi]
  12:	c3                   	ret    
  13:	0f 1f 44 00 00       	nop    DWORD PTR [rax+rax*1+0x0]
  18:	0f 1f 84 00 00 00 00 	nop    DWORD PTR [rax+rax*1+0x0]
  1f:	00 

0000000000000020 <read64_memcpy>:
  20:	48 8b 07             	mov    rax,QWORD PTR [rdi]
  23:	48 89 44 24 f8       	mov    QWORD PTR [rsp-0x8],rax
  28:	48 8b 44 24 f8       	mov    rax,QWORD PTR [rsp-0x8]
  2d:	c3                   	ret    
  2e:	66 90                	xchg   ax,ax

0000000000000030 <read64_ub>:
  30:	48 8b 07             	mov    rax,QWORD PTR [rdi]
  33:	c3                   	ret    
  34:	0f 1f 44 00 00       	nop    DWORD PTR [rax+rax*1+0x0]
  39:	0f 1f 80 00 00 00 00 	nop    DWORD PTR [rax+0x0]

Anoop_M_Intel · ‎07-12-2016

Hi Evan,

I have escalated this to compiler engineering and will keep you posted on what they say.

Thanks and Regards
Anoop

ICC generates sub-optimal code for memcpy