Intel® C++ Compiler
Community support and assistance for creating C++ code that runs on platforms based on Intel® processors.
7944 Discussions

automatic vectorization not using min/max instructions?

lmeyerov2
Beginner
530 Views
Hi,

I'm experimenting with automatic vectorization support in ICC XE 12 (beta build 20100512 for Windows, C++) and am getting some puzzling assembly output. Essentially, instead of using the SSE min/max instructions, the generated assembly goes through a long, convoluted series of XORs etc.

I want to loop over three arrays to compute a min/max for the 4th:

for each i:
res[i] = min(max(first[i], second[i]), third[i]);


Ideally, this will use the min and max SSE instructions. Furthermore, I'm trying to guarantee alignment of the arrays and to avoid checking for leftovers. The goal is to write this with pragmas etc. outside of the loop body (no assembly, few intrinsics) and not touch the loop body itself. E.g., going a little overboard:

// Element-wise clamp over one level: for every index i,
//   w[i] = min(max(inh.w[i], minw[i]), prefw[i]).
//
// lvl  - supplies the element count (as lengthBy4, multiplied by 4 below) and
//        the minw / prefw input arrays plus the w output array.
// inh  - supplies the inherited input array inh.w.
//
// The loop body is deliberately plain scalar code; all vectorization hints
// live outside it so the compiler's auto-vectorizer can pick the instructions.
void computeLvlSIMD(Level *lvl, Inherited &inh) {
  unsigned int len4 = lvl->lengthBy4;
  unsigned int len = 4 * len4; //no carry-over hintable?
  int *inhW = inh.w;
  int *w = lvl->w;
  int *minw = lvl->minw;
  int *prefw = lvl->prefw;

  // Intel-compiler hint: promise that each array starts on a 16-byte boundary
  // so aligned 128-bit loads/stores can be used.
  __assume_aligned(inhW, 16);
  __assume_aligned(w, 16);
  __assume_aligned(minw, 16);
  __assume_aligned(prefw, 16);

  // Intel-compiler loop hints: ignore assumed dependencies, use aligned
  // accesses, and vectorize regardless of the profitability heuristic.
  #pragma ivdep
  #pragma vector aligned
  #pragma vector always
  for (unsigned int i = 0; i < len; i ++) {
    int max = inhW[i]; //note that the loop body is 'normal' code
    if (minw[i] > max) max = minw[i];   // max = max(inhW[i], minw[i])
    int min = max;
    if (prefw[i] < min) min = prefw[i]; // min = min(max, prefw[i])
    w[i] = min;
  }
}


However, the generated assembly does a long series of pxor's etc. instead of movdqa/movdqa/movdqa/max/min/movdqa:

; -- Machine type EFI2
; mark_description "Intel C++ Compiler XE for applications running on Intel 64, Version 12.0.0.030 Beta Build 20100512";
; mark_description "-O3 -Qstd=c99 -QaxSSE4.2 -Qvec-report:3 -S";

...

.B1.4::                         ; Preds .B1.4 .B1.3
$LN25:
$LN26:
        movdqa    xmm0, XMMWORD PTR [r11+rdx*4]                 ;8.13
$LN27:
        add       eax, 4                                        ;19.3
$LN28:
        movdqa    xmm1, XMMWORD PTR [rbx+rdx*4]                 ;6.13
$LN29:
        movdqa    xmm3, xmm0                                    ;21.5
$LN30:
        pcmpgtd   xmm3, xmm1                                    ;21.5
$LN31:
        pxor      xmm0, xmm1                                    ;21.5
$LN32:
        pand      xmm3, xmm0                                    ;21.5
$LN33:
        cmp       eax, r8d                                      ;19.3
$LN34:
        pxor      xmm3, xmm1                                    ;21.5
$LN35:
        movdqa    xmm2, XMMWORD PTR [rcx+rdx*4]                 ;9.14
$LN36:
        movdqa    xmm4, xmm3                                    ;23.5
$LN37:
        pcmpgtd   xmm4, xmm2                                    ;23.5
$LN38:
        pxor      xmm2, xmm3                                    ;23.5
$LN39:
        pand      xmm4, xmm2                                    ;23.5
$LN40:
        pxor      xmm4, xmm3                                    ;23.5
$LN41:
        movdqa    XMMWORD PTR [r10+rdx*4], xmm4                 ;7.10
$LN42:
        mov       edx, eax                                      ;19.3
$LN43:
        jb        .B1.4         ; Prob 82%                      ;19.3
Any tips would be appreciated!
0 Kudos
8 Replies
lmeyerov2
Beginner
530 Views
For reference, the intrinsics version is something like

// Hand-written SSE sketch of the loop: each iteration loads four 32-bit ints
// from each input array, combines them with packed signed min/max, and stores
// four results to w.
// NOTE(review): this combines as max(prefw, min(inhw, minw)), whereas the
// scalar loop earlier in the thread computes min(max(inhW, minw), prefw) --
// confirm which order is intended.
// NOTE(review): the declarations of inhw/minw/prefw/w are not shown; if they
// are int* as in the scalar version, _mm_load_si128/_mm_store_si128 would need
// casts to __m128i* -- verify.
__m128i inhwQ, minwQ, prefwQ; //i32
  __m128i min, max;
  // NOTE(review): i advances by 4 elements but is bounded by len4 (the number
  // of 4-int groups in the scalar version); presumably the bound should be the
  // element count -- verify.
  for (unsigned int i = 0; i < len4; i += 4) {
	inhwQ = _mm_load_si128(inhw + i);
	minwQ = _mm_load_si128(minw + i);
	prefwQ = _mm_load_si128(prefw + i);
	min = _mm_min_epi32(inhwQ, minwQ);
	max = _mm_max_epi32(prefwQ, min);
	_mm_store_si128(w + i, max);
  }


and the expected assembly like

$LN16:
        movdqa    xmm0, XMMWORD PTR [rdx+r10]                   ;22.25
$LN17:
        pminsd    xmm0, XMMWORD PTR [rax+r10]                   ;25.8
$LN18:
        movdqa    xmm1, XMMWORD PTR [r11+r10]                   ;24.26
$LN19:
        pmaxsd    xmm1, xmm0                                    ;26.8
$LN20:
        movdqa    XMMWORD PTR [r8+r10], xmm1                    ;27.18
$LN21:


Regards,

- L
0 Kudos
Mark_S_Intel1
Employee
530 Views
Could you please attach a test case that can be compiled?
You can either provide the test case as a private attachment or submit via Intel Premier Support at https://premier.intel.com/

Thanks,
--mark
0 Kudos
lmeyerov2
Beginner
530 Views
I can't give the original context unfortunately, but here's an isolated rewrite:
#include <malloc.h>

// Isolated reproduction of the vectorization question: allocates four
// 16-byte-aligned int arrays and computes, element-wise,
//   res[i] = min(max(a[i], b[i]), c[i]).
//
// len - requested element count; it is rounded up to a multiple of 4 (len1)
//       so the loop trip count is always a whole number of 4-int vectors.
//
// The buffers are neither initialized nor freed here; the snippet was posted
// to be compiled to assembly (/c /S), not run.
void strange (unsigned int len) {

  unsigned int len4 = (len + 3) / 4;  // number of 4-int groups, rounded up
  unsigned int len1 = 4 * len4;       // padded element count

  int *a = (int *) _aligned_malloc( sizeof(int) * len1, 16);

  int *b = (int *) _aligned_malloc( sizeof(int) * len1, 16);

  int *c = (int *) _aligned_malloc( sizeof(int) * len1, 16);

  int *res = (int *) _aligned_malloc( sizeof(int) * len1, 16);

  // Intel-compiler hint: each array starts on a 16-byte boundary.
  __assume_aligned(a, 16);
  __assume_aligned(b, 16);
  __assume_aligned(c, 16);
  __assume_aligned(res, 16);

  // Preprocessor directives must each be on their own line.
  #pragma ivdep
  #pragma vector aligned
  #pragma vector always
  for (unsigned int i = 0; i < len1; i ++) {
    int tmp = a[i];
    if (b[i] > tmp) tmp = b[i];  // tmp = max(a[i], b[i])
    if (c[i] < tmp) tmp = c[i];  // tmp = min(tmp, c[i])
    res[i] = tmp;
  }
}
Compiled with
"C:\Program Files (x86)\Intel\CompilerPro-12.0\bin\intel64\icl" /O3 /c /Qstd=c99 /QaxSSE4.2 /Qvec-report:1 IsolatedIntrinsics.cpp /S
Thanks!
0 Kudos
Mark_S_Intel1
Employee
530 Views
Thanks for the test case. I will look into it and get back to you.

--mark
0 Kudos
Mark_S_Intel1
Employee
530 Views
I get the same .asm results as you when I use /QaxSSE4.2. If I use /QxSSE4.2, I get pminsd instructions:

movdqa xmm0, XMMWORD PTR [rdi+rcx*4] ;7.117
add edx, 4 ;16.3
pmaxsd xmm0, XMMWORD PTR [r12+rcx*4] ;18.2
cmp edx, r8d ;16.3
pminsd xmm0, XMMWORD PTR [rsi+rcx*4] ;19.2
movdqa XMMWORD PTR [rax+rcx*4], xmm0 ;7.239
mov ecx, edx ;16.3
jb .B1.8 ; Prob 82% ;16.3

I will look further and get back to you.

--mark


0 Kudos
lmeyerov2
Beginner
530 Views
Yep, that looks right! Now I'm just wondering about telling the vectorizer that the loop counter is a multiple of 4 (the generated assembly seems long).
0 Kudos
lmeyerov2
Beginner
530 Views
It looks like changing the loop condition to "i < 4 * len" helps.
0 Kudos
Mark_S_Intel1
Employee
530 Views

Yes; using "i < 4 * len" does help quite a lot!

I filed a report to resolve the case where using /QaxSSE4.2 does not generate the expected min/max instructions, and I will let you know when the issue is resolved.

--mark

0 Kudos
Reply