Intel® C++ Compiler
Community support and assistance for creating C++ code that runs on platforms based on Intel® processors.

SIMD data and std::vector

Pajarre__Eero
Beginner
844 Views

With the Microsoft compiler, when compiling in C++17 (or later) standard compliance mode, I can use SIMD data types in, for example, std::vector without custom allocators. When I try to use the Intel compiler from Visual Studio, this does not seem to work. (I am new to using the Intel compiler, so I may just be missing some option, but I think I have tried the language standard version settings, etc.)

I have attached example code at the bottom of this message. The memory allocations of course vary, and often produce aligned results even when things are not really set up correctly. The define (commented out) on the first line seems to be the one controlling the behavior in the Microsoft header files; if it is used, the code also works with the Intel compiler. As the defined symbol is in the "compiler namespace", I guess defining it as a user is not a clean solution.

There are some compatibility issues between Visual Studio 2019 and Intel compiler, so I have done my testing with Visual Studio 2017.

 Eero

//#define __cpp_aligned_new 1
#include <stdio.h>
#include <xmmintrin.h>
#include <vector>

// Element type stored in the std::vector instances of this test: an SSE
// vector member followed by a single scalar float.
struct Test_S
{
    __m128 v;  // SSE member; main() prints the address of this field
    float  b;  // trailing scalar member
};

// Reproduction driver: prints the alignment requirement of Test_S, reports
// whether the __cpp_aligned_new feature macro is defined, and then prints the
// address of the first element's SSE member for three separately allocated
// vectors, so the alignment actually delivered by std::vector's allocator can
// be inspected.
int main(){
  // alignof yields a std::size_t, so it has to be printed with %zu; passing it
  // to %d is undefined behaviour and a width mismatch on 64-bit targets.
  fprintf(stderr,"Alignment requirement = %zu\n",alignof(Test_S));
#ifdef __cpp_aligned_new
  fprintf(stderr,"__cpp_aligned_new defined\n");
#else
  fprintf(stderr,"__cpp_aligned_new not defined\n");
#endif
  // %p expects a void pointer argument, hence the explicit casts below.
  // The three vectors stay alive together so each one gets its own allocation.
  std::vector<Test_S> a;
  a.push_back(Test_S());
  fprintf(stderr,"addr=%p\n",static_cast<void*>(&(a[0].v)));
  std::vector<Test_S> b;
  b.push_back(Test_S());
  fprintf(stderr,"addr=%p\n",static_cast<void*>(&(b[0].v)));
  std::vector<Test_S> c;
  c.push_back(Test_S());
  fprintf(stderr,"addr=%p\n",static_cast<void*>(&(c[0].v)));
}

 

0 Kudos
6 Replies
Viet_H_Intel
Moderator
844 Views

Can you provide the output when you compiled with icl vs. cl?

0 Kudos
Pajarre__Eero
Beginner
844 Views

The template magic produces thousands of lines of assembly, which I found rather difficult to follow.

It is "easier" to follow the logic in the debugger while single stepping. As mentioned, the problem seems to start from a missing __cpp_aligned_new preprocessor symbol, which in the header files is carried over to the _HAS_ALIGNED_NEW symbol.

The _HAS_ALIGNED_NEW is used in xmemory0 header starting on line 138 for generating and selecting template functions for the allocator.

The call chain generated by the Microsoft compiler leads to this function:

 

; Function compile flags: /Odtp /ZI
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.16.27023\include\xmemory0
;	COMDAT ??$_Allocate@$0BA@U_Default_allocate_traits@std@@$0A@@std@@YAPAXI@Z
_TEXT	SEGMENT
$T1 = -72						; size = 4
__Passed_align$ = -4					; size = 4
__Bytes$ = 8						; size = 4
??$_Allocate@$0BA@U_Default_allocate_traits@std@@$0A@@std@@YAPAXI@Z PROC ; std::_Allocate<16,std::_Default_allocate_traits,0>, COMDAT

; 143  : 	{	// allocate _Bytes when _HAS_ALIGNED_NEW && _Align > __STDCPP_DEFAULT_NEW_ALIGNMENT__

	push	ebp
	mov	ebp, esp
	sub	esp, 72					; 00000048H
	push	ebx
	push	esi
	push	edi

; 144  : 	if (_Bytes == 0)

	cmp	DWORD PTR __Bytes$[ebp], 0
	jne	SHORT $LN2@Allocate

; 145  : 		{
; 146  : 		return (nullptr);

	xor	eax, eax
	jmp	SHORT $LN1@Allocate
$LN2@Allocate:

; 147  : 		}
; 148  : 
; 149  : 	size_t _Passed_align = _Align;

	mov	DWORD PTR __Passed_align$[ebp], 16	; 00000010H

; 150  :   #if defined(_M_IX86) || defined(_M_X64)
; 151  : 	if (_Bytes >= _Big_allocation_threshold)

	cmp	DWORD PTR __Bytes$[ebp], 4096		; 00001000H
	jb	SHORT $LN3@Allocate

; 152  : 		{	// boost the alignment of big allocations to help autovectorization
; 153  : 		_Passed_align = _Max_value(_Align, _Big_allocation_alignment);

	mov	DWORD PTR $T1[ebp], 16			; 00000010H
	push	OFFSET ?_Big_allocation_alignment@std@@3IB
	lea	eax, DWORD PTR $T1[ebp]
	push	eax
	call	??$_Max_value@I@std@@YAABIABI0@Z	; std::_Max_value<unsigned int>
	add	esp, 8
	mov	ecx, DWORD PTR [eax]
	mov	DWORD PTR __Passed_align$[ebp], ecx
$LN3@Allocate:

; 154  : 		}
; 155  :   #endif /* defined(_M_IX86) || defined(_M_X64) */
; 156  : 
; 157  : 	return (_Traits::_Allocate_aligned(_Bytes, _Passed_align));

	mov	eax, DWORD PTR __Passed_align$[ebp]
	push	eax
	mov	ecx, DWORD PTR __Bytes$[ebp]
	push	ecx
	call	?_Allocate_aligned@_Default_allocate_traits@std@@SAPAXII@Z ; std::_Default_allocate_traits::_Allocate_aligned
	add	esp, 8
$LN1@Allocate:

; 158  : 	}

	pop	edi
	pop	esi
	pop	ebx
	mov	esp, ebp
	pop	ebp
	ret	0
??$_Allocate@$0BA@U_Default_allocate_traits@std@@$0A@@std@@YAPAXI@Z ENDP ; std::_Allocate<16,std::_Default_allocate_traits,0>
_TEXT	ENDS

 

With the Intel compiler the call chain goes to the following function (notice the comment fragment on line 19 of the listing):

 

_TEXT	SEGMENT  DWORD PUBLIC FLAT  'CODE'
;	COMDAT ??$_Allocate@$07U_Default_allocate_traits@std@@$0A@@std@@YAPAXI@Z
TXTST66:
_2__routine_start_??$_Allocate@$07U_Default_allocate_traits@std@@$0A@@std@@YAPAXI@Z_66:
; -- Begin  ??$_Allocate@$07U_Default_allocate_traits@std@@$0A@@std@@YAPAXI@Z
;??$_Allocate@$07U_Default_allocate_traits@std@@$0A@@std@@YAPAXI@Z	ENDS
_TEXT	ENDS
_TEXT	SEGMENT  DWORD PUBLIC FLAT  'CODE'
;	COMDAT ??$_Allocate@$07U_Default_allocate_traits@std@@$0A@@std@@YAPAXI@Z
; mark_begin;

	PUBLIC ??$_Allocate@$07U_Default_allocate_traits@std@@$0A@@std@@YAPAXI@Z
; --- std::_Allocate<8U, std::_Default_allocate_traits, 0>(size_t)
??$_Allocate@$07U_Default_allocate_traits@std@@$0A@@std@@YAPAXI@Z	PROC NEAR 
; parameter 1(_Bytes): 8 + ebp
.B67.1:                         ; Preds .B67.0
                                ; Execution count [0.00e+000]

;;; 	{	// allocate _Bytes when !_HAS_ALIGNED_NEW || _Align <= __STDCPP_DEFAULT_NEW_ALIGNMENT__

L384::
                                                         ;180.2
$LN2543:
        push      ebp                                           ;180.2
$LN2544:
        mov       ebp, esp                                      ;180.2
$LN2545:
        sub       esp, 8                                        ;180.2
$LN2546:

;;;  #if defined(_M_IX86) || defined(_M_X64)
;;; 	if (_Bytes >= _Big_allocation_threshold)

        mov       eax, DWORD PTR [8+ebp]                        ;182.2
$LN2547:
        cmp       eax, 4096                                     ;182.2
$LN2548:
        jb        .B67.4        ; Prob 50%                      ;182.2
$LN2549:
                                ; LOE ebx ebp esi edi esp
.B67.2:                         ; Preds .B67.1
                                ; Execution count [0.00e+000]
$LN2550:

;;; 		{	// boost the alignment of big allocations to help autovectorization
;;; 		return (_Allocate_manually_vector_aligned<_Traits>(_Bytes));

        push      eax                                           ;184.3
$LN2551:
        mov       eax, DWORD PTR [8+ebp]                        ;184.3
$LN2552:
        mov       DWORD PTR [esp], eax                          ;184.3
$LN2553:
        call      ??$_Allocate_manually_vector_aligned@U_Default_allocate_traits@std@@@std@@YAPAXI@Z ;184.3
$LN2554:
                                ; LOE eax ebx ebp esi edi esp
.B67.10:                        ; Preds .B67.2
                                ; Execution count [0.00e+000]
$LN2555:
        mov       DWORD PTR [-8+ebp], eax                       ;184.3
$LN2556:
        add       esp, 4                                        ;184.3
$LN2557:
                                ; LOE ebx ebp esi edi esp
.B67.3:                         ; Preds .B67.10
                                ; Execution count [0.00e+000]
$LN2558:
        mov       eax, DWORD PTR [-8+ebp]                       ;184.3
$LN2559:
        leave                                                   ;184.3
$LN2560:
        ret                                                     ;184.3
$LN2561:
                                ; LOE
.B67.4:                         ; Preds .B67.1
                                ; Execution count [0.00e+000]
$LN2562:

;;; 		}
;;;  #endif /* defined(_M_IX86) || defined(_M_X64) */
;;; 
;;; 	if (_Bytes != 0)

        mov       eax, DWORD PTR [8+ebp]                        ;188.2
$LN2563:
        test      eax, eax                                      ;188.2
$LN2564:
        je        .B67.7        ; Prob 50%                      ;188.2
$LN2565:
                                ; LOE ebx ebp esi edi esp
.B67.5:                         ; Preds .B67.4
                                ; Execution count [0.00e+000]
$LN2566:

;;; 		{
;;; 		return (_Traits::_Allocate(_Bytes));

        push      eax                                           ;190.3
$LN2567:
        mov       eax, DWORD PTR [8+ebp]                        ;190.3
$LN2568:
        mov       DWORD PTR [esp], eax                          ;190.3
$LN2569:
;       std::_Default_allocate_traits::_Allocate(size_t)
        call      ?_Allocate@_Default_allocate_traits@std@@SAPAXI@Z ;190.3
$LN2570:
                                ; LOE eax ebx ebp esi edi esp
.B67.11:                        ; Preds .B67.5
                                ; Execution count [0.00e+000]
$LN2571:
        mov       DWORD PTR [-4+ebp], eax                       ;190.3
$LN2572:
        add       esp, 4                                        ;190.3
$LN2573:
                                ; LOE ebx ebp esi edi esp
.B67.6:                         ; Preds .B67.11
                                ; Execution count [0.00e+000]
$LN2574:
        mov       eax, DWORD PTR [-4+ebp]                       ;190.3
$LN2575:
        leave                                                   ;190.3
$LN2576:
        ret                                                     ;190.3
$LN2577:
                                ; LOE
.B67.7:                         ; Preds .B67.4
                                ; Execution count [0.00e+000]
$LN2578:

;;; 		}
;;; 
;;; 	return (nullptr);

        mov       eax, 0                                        ;193.2
$LN2579:
        leave                                                   ;193.2
$LN2580:
        ret                                                     ;193.2
$LN2581:
                                ; LOE
$LN2582:
; mark_end;

I hope I got the code pieces right....

 

 Eero

0 Kudos
Viet_H_Intel
Moderator
844 Views

Sorry, what I meant was how you compiled the test case (I didn't see an error when I compiled it) and what the results of executing the binary are.

0 Kudos
Pajarre__Eero
Beginner
844 Views

I saved my example file, and managed to compile it directly by just running icl test.cpp

(in shell started from the windows start menu entry for Intel 19.0 compiler)

Can you contact me directly, or provide more information here?

Eero

 

0 Kudos
Viet_H_Intel
Moderator
844 Views

 

C:\Temp>icl test.cpp
Intel(R) C++ Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 19.1.0.166 Build 20191121
Copyright (C) 1985-2019 Intel Corporation.  All rights reserved.

test.cpp
Microsoft (R) Incremental Linker Version 14.23.28105.4
Copyright (C) Microsoft Corporation.  All rights reserved.

-out:test.exe
test.obj

C:\Temp>test.exe
Alignment requirement = 16
__cpp_aligned_new not defined
addr=0000020AEDE8F9F0
addr=0000020AEDE8FB10
addr=0000020AEDE8FA20

C:\Temp>notepad test.cpp

//#define __cpp_aligned_new 1
#include <stdio.h>
#include <xmmintrin.h>
#include <vector>

// Element type stored in the std::vector instances of this test: an SSE
// vector member followed by a single scalar float.
struct Test_S
{
    __m128 v;  // SSE member; main() prints the address of this field
    float  b;  // trailing scalar member
};

int main(){
  fprintf(stderr,"Alignment requirement = %d\n",alignof(Test_S));
#ifdef __cpp_aligned_new
  fprintf(stderr,"__cpp_aligned_new defined\n");
#else
  fprintf(stderr,"__cpp_aligned_new not defined\n");
#endif
  std::vector<Test_S> a;
  a.push_back(Test_S());
  fprintf(stderr,"addr=%p\n",&(a[0].v));
  std::vector<Test_S> b;
  b.push_back(Test_S());
  fprintf(stderr,"addr=%p\n",&(b[0].v));
  std::vector<Test_S> c;
  c.push_back(Test_S());
  fprintf(stderr,"addr=%p\n",&(c[0].v));
}

0 Kudos
Pajarre__Eero
Beginner
844 Views

Maybe the problem only exists with the 32bit target compiler?

I have now tested the 64-bit target compiler, and it seems to avoid the problem. I have not gone through the code with the debugger, so I am not 100% sure.

With the 32bit target I get results like:

H:\Dropbox\tmp>icl test.cpp
Intel(R) C++ Intel(R) 64 Compiler for applications running on IA-32, Version 19.0.5.281 Build 20190815
Copyright (C) 1985-2019 Intel Corporation.  All rights reserved.

test.cpp
Microsoft (R) Incremental Linker Version 14.16.27035.0
Copyright (C) Microsoft Corporation.  All rights reserved.

-out:test.exe
test.obj

H:\Dropbox\tmp>test.exe
Alignment requirement = 16
__cpp_aligned_new not defined
addr=00F2CA30
addr=00F2C9B8
addr=00F2CA08

Here the last two addresses are not 16-byte aligned, so an aligned SSE load through those pointers would have crashed, wouldn't it?

 Eero

0 Kudos
Reply