- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Dear all, I wrote some code to test FMA instructions and compiled it in the VS 2010 IDE (using the Intel C++ toolset, of course), but no FMA instructions were generated. Then I compiled the same code in VS 2015, using the same Intel toolset and the same project configuration, and it generated FMA instructions successfully. I use Intel Parallel Studio XE 2016 Cluster Edition.
Is there any difference between vs2010 and vs2015?
Below is code:
#include "stdafx.h" #include <Windows.h> #include <immintrin.h> bool fma_test() { __m256 mma, mmb, mmc; float a[8], b[8], c[8]; for (int i = 0; i < 8; ++i) { a[i] = i; b[i] = i; c[i] = i; } mma = _mm256_load_ps(a); mmb = _mm256_load_ps(b); mmc = _mm256_load_ps(c); __m256 ret = _mm256_fmadd_ps(mma, mmb, mmc); if(ret.m256_f32[7] == 56.0) return true; return false; } int main() { if (fma_test()) { printf("true"); } else printf("false"); system("pause"); return 0; }
Below is disassembly in VS2010 and 2015:
VS2010:
000000013FD30FFC add byte ptr [rax],al
000000013FD30FFE add byte ptr [rax],al
--- D:\sl\XR\MoFangG\xfma\xfma.cpp ---------------------------------------------
return true;
return false;
}
int main()
{
000000013FD31000 sub rsp,78h
000000013FD31004 mov edx,9D9FFEh
000000013FD31009 mov qword ptr [rsp+60h],r13
000000013FD3100E lea r13,[rsp+3Fh]
000000013FD31013 mov ecx,3
000000013FD31018 and r13,0FFFFFFFFFFFFFFE0h
000000013FD3101C mov rax,qword ptr [__security_cookie (13FD36000h)]
000000013FD31023 xor rax,rsp
000000013FD31026 mov qword ptr [rsp+70h],rax
000000013FD3102B call __intel_new_feature_proc_init (13FD318F0h)
000000013FD31030 vstmxcsr dword ptr [rsp+68h]
000000013FD31036 or dword ptr [rsp+68h],8040h
000000013FD3103E vldmxcsr dword ptr [rsp+68h]
// xfma.cpp : Defines the entry point for the console application.
//
#include "stdafx.h"
#include <Windows.h>
#include <immintrin.h>
bool fma_test()
{
__m256 mma, mmb, mmc;
float a[8], b[8], c[8];
for (int i = 0; i < 8; ++i)
{
a = i;
000000013FD31044 vcvtdq2ps ymm0,ymmword ptr [__xi_z+30h (13FD33220h)]
b = i;
c = i;
}
mma = _mm256_load_ps(a);
mmb = _mm256_load_ps(b);
mmc = _mm256_load_ps(c);
__m256 ret = _mm256_fmadd_ps(mma, mmb, mmc);
000000013FD3104C db c4h
000000013FD3104D loop main+0CCh (13FD310CCh)
000000013FD3104F test al,0C0h
000000013FD31051 vmovups ymmword ptr [rbp],ymm0
if(ret.m256_f32[7] == 56.0)
000000013FD31057 vmovss xmm1,dword ptr [rbp+1Ch]
000000013FD3105D vucomiss xmm1,dword ptr [__xi_z+50h (13FD33240h)]
000000013FD31065 jp main+69h (13FD31069h)
000000013FD31067 je main+9Fh (13FD3109Fh)
}
else
printf("false");
000000013FD31069 lea rcx,[__xi_z+54h (13FD33244h)]
000000013FD31070 vzeroupper
000000013FD31073 call qword ptr [__imp_printf (13FD33180h)]
system("pause");
000000013FD31079 lea rcx,[__xi_z+5Ah (13FD3324Ah)]
000000013FD31080 call qword ptr [__imp_system (13FD330E0h)]
return 0;
000000013FD31086 mov rcx,qword ptr [rsp+70h]
000000013FD3108B xor rcx,rsp
000000013FD3108E call __security_check_cookie (13FD310D0h)
000000013FD31093 mov r13,qword ptr [rsp+60h]
000000013FD31098 xor eax,eax
000000013FD3109A add rsp,78h
000000013FD3109E ret
if (fma_test())
{
printf("true");
000000013FD3109F lea rcx,[__xi_z+60h (13FD33250h)]
000000013FD310A6 vzeroupper
000000013FD310A9 call qword ptr [__imp_printf (13FD33180h)]
000000013FD310AF jmp main+79h (13FD31079h)
000000013FD310B1 nop dword ptr [rax+rax]
000000013FD310B9 nop dword ptr [rax]
--- No source file -------------------------------------------------------------
000000013FD310C0 int 3
000000013FD310C1 int 3
000000013FD310C2 int 3
000000013FD310C3 int 3
000000013FD310C4 int 3
000000013FD310C5 int 3
000000013FD310C6 nop word ptr [rax+rax]
__security_check_cookie:
000000013FD310D0 cmp rcx,qword ptr [__security_cookie (13FD36000h)]
000000013FD310D7 jne ReportFailure (13FD310EAh)
000000013FD310D9 rol rcx,10h
000000013FD310DD test cx,0FFFFh
000000013FD310E2 jne RestoreRcx (13FD310E6h)
000000013FD310E4 rep ret
RestoreRcx:
000000013FD310E6 ror rcx,10h
ReportFailure:
000000013FD310EA jmp __report_gsfailure (13FD31440h)
000000013FD310EF int 3
__GSHandlerCheckCommon:
000000013FD310F0 push rbx
000000013FD310F2 sub rsp,20h
000000013FD310F6 mov r11d,dword ptr [r8]
000000013FD310F9 mov rbx,rdx
000000013FD310FC mov r9,rcx
000000013FD310FF and r11d,0FFFFFFF8h
000000013FD31103 test byte ptr [r8],4
000000013FD31107 mov r10,rcx
000000013FD3110A je __GSHandlerCheckCommon+2Fh (13FD3111Fh)
000000013FD3110C mov eax,dword ptr [r8+8]
000000013FD31110 movsxd r10,dword ptr [r8+4]
000000013FD31114 neg eax
000000013FD31116 add r10,rcx
000000013FD31119 movsxd rcx,eax
000000013FD3111C and r10,rcx
000000013FD3111F movsxd rax,r11d
000000013FD31122 mov rdx,qword ptr [rax+r10]
000000013FD31126 mov rax,qword ptr [rbx+10h]
000000013FD3112A mov ecx,dword ptr [rax+8]
000000013FD3112D add rcx,qword ptr [rbx+8]
000000013FD31131 test byte ptr [rcx+3],0Fh
000000013FD31135 je __GSHandlerCheckCommon+53h (13FD31143h)
000000013FD31137 movzx eax,byte ptr [rcx+3]
000000013FD3113B and eax,0FFFFFFF0h
000000013FD3113E cdqe
000000013FD31140 add r9,rax
000000013FD31143 xor r9,rdx
000000013FD31146 mov rcx,r9
000000013FD31149 add rsp,20h
000000013FD3114D pop rbx
000000013FD3114E jmp __security_check_cookie (13FD310D0h)
000000013FD31153 int 3
__GSHandlerCheck:
000000013FD31154 sub rsp,28h
000000013FD31158 mov r8,qword ptr [r9+38h]
000000013FD3115C mov rcx,rdx
000000013FD3115F mov rdx,r9
000000013FD31162 call __GSHandlerCheckCommon (13FD310F0h)
000000013FD31167 mov eax,1
000000013FD3116C add rsp,28h
000000013FD31170 ret
000000013FD31171 int 3
000000013FD31172 int 3
000000013FD31173 int 3
--- f:\dd\vctools\crt_bld\self_64_amd64\crt\src\crtexe.c -----------------------
VS2015
c = i;
000000013F0B1098 vmovaps xmmword ptr [rsp+0B0h],xmm0
}
mma = _mm256_load_ps(a);
000000013F0B10A1 vmovaps ymm1,ymmword ptr [rsp+70h]
mmb = _mm256_load_ps(b);
000000013F0B10A7 vmovaps ymm0,ymmword ptr [rsp+90h]
c = i;
000000013F0B10B0 vmovaps xmmword ptr [rsp+0C0h],xmm5
mmc = _mm256_load_ps(c);
__m256 ret = _mm256_fmadd_ps(mma, mmb, mmc);
000000013F0B10B9 vfmadd213ps ymm1,ymm0,ymmword ptr [rsp+0B0h]
000000013F0B10C3 vmovaps ymmword ptr [r13],ymm1
if(ret.m256_f32[7] == 56.0)
000000013F0B10C9 vmovss xmm2,dword ptr [r13+1Ch]
if (fma_test())
{
printf("true");
}
else
{
printf("false");
000000013F0B10CF vucomiss xmm2,dword ptr [__xt_z+18h (013F0B4290h)]
if (fma_test())
Link Copied
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Noting that full avx2 support was introduced in a vs2013 update, this may not be entirely surprising. If you did use icl, this might be influenced by your arch setting and icl version.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Tim's correct, Raymond. AVX2, including the new FMA3 instructions, was introduced in the 13.0 initial release, and improved support for 3rd gen (-axCORE-AVX-I, -axCORE-AVX2) etc. was introduced (along with range-based for loops, C++11, etc.) in updates. So it could be related to the arch and icl version per se. You should try the subsequent update releases and find out. That said, it's a very old version. I'd suggest you upgrade to the latest 16.0 update 2 release that's out.
Regards,
Kittur
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Dear Tim and Kittur Ganesh, as I said, I used the same project configuration for VS2010 and VS2015: the arch is x64 Release in both, and the icl version is 2016 XE update 2 in both. Everything is the same for VS2010 and VS2015: same project configuration (I checked several times), same computer (i5-4590 with 8GB RAM), same ICL. This confuses me very much.
Please check out the attachment files(projects and source codes for VS2010 and VS2015).
Kittur Ganesh (Intel) wrote:
Tim's correct, Raymond. AVX2 including the new FMA3 instructions were introduced 13.0 initial release and improved support for 3rd gen (-axCORE-AVX-I, -axCORE_AVX2) etc introduced (with range based for loops, c++11 etc) in updates. So, it could be related to the arch and icl version per-se. You should try the subsequent update releases and find out. That said, it's a very old version. I'd suggest you upgrade to the latest 16.0 update 2 release that's out.
Regards,
Kittur
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Thanks for the attachments, Raymond; I will investigate accordingly. Much appreciated.
Kittur
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi, Raymond
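Tim was right. AVX is not supported in Visual Studio 2010; support starts with Visual Studio 2013.
The disassembly is parsed by the Visual Studio 2010 debugger, which cannot show FMA instructions correctly. In the VS2010 listing above, the lines `db c4h`, `loop main+0CCh`, and `test al,0C0h` are really the five bytes C4 E2 7D A8 C0, i.e. a single VEX-encoded `vfmadd213ps ymm0,ymm0,ymm0` that the old disassembler failed to decode, so the compiler did generate the FMA instruction in both cases.
Hope that resolves your confusion.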
Thanks.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Yuan,
Why would the MS VS selection affect the same Intel PS XE (16.0.2) on the same CPU and O/S?
While I can accept that the Disassembly window of the VS debugger might not disassemble an FMA instruction, I do not see that as a compelling reason not to generate the instruction sequence on that version of the IDE. Most users do not use the disassembly window.
Jim Dempsey
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page