Intel® C++ Compiler
Community support and assistance for creating C++ code that runs on platforms based on Intel® processors.

icc18 vectorizer unnecessarily ands with 0x00ff after shifting word right by 8

BMart1
New Contributor II
293 Views

This code

#include <stdint.h>

// Number of bytes in the buffer; a multiple of 8, so an 8-lane vector loop
// needs no scalar remainder.
int const count = 1024;

// Buffer that mul() scales in place.
uint8_t p[count];

// Scale every byte of p by m, in place.
//
// For each element the product p[i] * m is truncated to 16 bits and its high
// byte (bits 15:8) is stored back. The result of the shift is always in
// 0..255, so the narrowing store to uint8_t never loses information.
//
// m: 16-bit multiplier applied to every element.
void mul(uint16_t m)
{
	// Intel-compiler hint: vectorize with 8 lanes (8 x 16-bit words per pass).
#pragma simd vectorlength(8)
	for (int i = 0; i < count; ++i)
		p[i] = uint16_t(p[i] * m) >> 8;
}

compiled with -Os generates:

# mul(unsigned short) -- icc 18 -Os output for the loop above.
# Each pass of the loop loads 8 bytes of p, widens them to 16-bit words,
# multiplies by m, keeps bits 15:8 of every product, narrows back to bytes and
# stores the 8 results in place.
# Register roles: rdx = byte offset into p, rax = address of the current
# 8-byte group, xmm2 = all zeros, xmm3 = m in all 8 word lanes,
# xmm1 = 0x00ff in all 8 word lanes.
# The trailing "#line.col" notes appear to be the compiler's source-position
# annotations (9 = the for statement, 10 = the loop body, 11 = closing brace).
mul(unsigned short):
  # rdx = 0: byte offset of the first group.
  xor edx, edx #9.2
  # eax = m zero-extended from the 16-bit argument register di.
  movzx eax, di #10.26
  # xmm2 = 0; used both to zero-extend bytes and as the upper half when packing.
  pxor xmm2, xmm2 #10.19
  # xmm1 = mask constant (0x00ff per word) defined at the end of the listing.
  movdqu xmm1, XMMWORD PTR .L_2il0floatpacket.0[rip] #10.32
  # Broadcast m to all 8 word lanes of xmm3: word 0 -> words 0-1 -> low qword
  # -> both qwords.
  movd xmm3, eax #10.26
  punpcklwd xmm3, xmm3 #10.26
  punpckldq xmm3, xmm3 #10.26
  punpcklqdq xmm3, xmm3 #10.26
..B1.2: # Preds ..B1.2 ..B1.1
  # rax = &p[rdx]; the same address is reused for the load and the store.
  lea rax, QWORD PTR [p+rdx] #10.19
  # Load 8 bytes of p into the low half of xmm0.
  movq xmm0, QWORD PTR [rax] #10.19
  # Interleave with zero: each byte becomes a zero-extended 16-bit word.
  punpcklbw xmm0, xmm2 #10.19
  # Per-lane low 16 bits of p[i] * m, i.e. uint16_t(p[i] * m).
  pmullw xmm0, xmm3 #10.32
  # Logical shift right by 8: the high byte of every word is now zero.
  psrlw xmm0, 8 #10.32
  # Mask every word with 0x00ff. The preceding psrlw already cleared those
  # bits, so this is a no-op -- the redundant instruction this report is about.
  pand xmm0, xmm1 #10.32
  # Narrow the 8 words to 8 bytes (unsigned saturation; all values are already
  # <= 255, so nothing saturates). The upper 8 bytes come from xmm2 (zero).
  packuswb xmm0, xmm2 #10.32
  # Store the 8 result bytes back over the input bytes.
  movq QWORD PTR [rax], xmm0 #10.3
  # Advance by one 8-byte group and loop while the offset is below 1024.
  add rdx, 8 #9.2
  cmp rdx, 1024 #9.2
  jb ..B1.2 # Prob 99% #9.2
  ret #11.1
# Label for the global byte array p. No storage directive is shown in this
# listing (presumably filtered out by the tool that produced it).
p:
# 16-byte constant loaded into xmm1 above: 0x00ff in each of the 8 word lanes.
.L_2il0floatpacket.0:
  .long 0x00ff00ff,0x00ff00ff,0x00ff00ff,0x00ff00ff

with a redundant pand instruction.

0 Kudos
2 Replies
TimP
Honored Contributor III
293 Views
Is it the same if you shift by 8 cast to same type?
0 Kudos
BMart1
New Contributor II
293 Views

Makes no difference: https://godbolt.org/g/pzbiKu

0 Kudos
Reply