- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Just for reference, I am using the Intel Compiler 12.1 (ComposerXE 2011 Update 6) under Windows 7 on an I7 2600K processor with VS2010.
When compiling one of our applications in 32-bit release mode, some code is being inlined in a strange way.
My Current Compiler options are /GR /O3 /Ox /Qinline-dllimport /arch:SSE2 /QaxSSE4.2 /Zi /Qvc10
but I have also tried with /O2 etc., and nothing I have tried so far fixes the issue I am going to describe.
Here is a simple function that we use. VxrF32vec4 is just a class derived from F32vec4 with additional inline methods and nothing else.
// Fills the three output vectors px, py and pz. Each output is one lane of
// `position` replicated across all four lanes, plus the element-wise product
// of c0 with the matching delta vector (xdx, xdy or xdz).
//
// With both shuffle sources equal to `position`, the immediate selects a
// single source lane for every result lane:
//   0xFF -> lane 3, 0xAA -> lane 2, 0x55 -> lane 1.
// NOTE(review): lane 0 of `position` is never read here -- confirm this
// matches the intended layout of `position`.
//
// __regcall is the Intel compiler's register-based calling convention.
// All arguments are taken by reference; only px, py and pz are written.
void __regcall InitPosition(VxrF32vec4 &px,VxrF32vec4 &py,VxrF32vec4 &pz,const VxrF32vec4 &position,const VxrF32vec4 &c0,const VxrF32vec4 &xdx,const VxrF32vec4 &xdy,const VxrF32vec4 &xdz)
{
// px = splat(position[3]) + c0 * xdx
px = _mm_shuffle_ps(position,position,0xFF) + c0*xdx;
// py = splat(position[2]) + c0 * xdy
py = _mm_shuffle_ps(position,position,0xAA) + c0*xdy;
// pz = splat(position[1]) + c0 * xdz
pz = _mm_shuffle_ps(position,position,0x55) + c0*xdz;}
And it is compiled like this:
void __regcall InitPosition(VxrF32vec4 &px,VxrF32vec4 &py,VxrF32vec4 &pz,const VxrF32vec4 &position,const VxrF32vec4 &c0,const VxrF32vec4 &xdx,const VxrF32vec4 &xdy,const VxrF32vec4 &xdz)
{
0053E830 push ebx
px = _mm_shuffle_ps(position,position,0xFF) + c0*xdx;
0053E831 mov ebx,dword ptr [xdx]
0053E835 movaps xmm0,xmmword ptr [esi]
0053E838 movaps xmm1,xmmword ptr [edi]
0053E83B mulps xmm0,xmmword ptr [ebx]
0053E83E shufps xmm1,xmm1,0FFh
0053E842 addps xmm1,xmm0
0053E845 movaps xmmword ptr [eax],xmm1
py = _mm_shuffle_ps(position,position,0xAA) + c0*xdy;
0053E848 mov eax,dword ptr [xdy]
0053E84C movaps xmm2,xmmword ptr [esi]
0053E84F movaps xmm3,xmmword ptr [edi]
0053E852 mulps xmm2,xmmword ptr [eax]
0053E855 shufps xmm3,xmm3,0AAh
0053E859 addps xmm3,xmm2
0053E85C movaps xmmword ptr [ecx],xmm3
pz = _mm_shuffle_ps(position,position,0x55) + c0*xdz;
0053E85F mov ecx,dword ptr [xdz]
0053E863 movaps xmm0,xmmword ptr [esi]
0053E866 movaps xmm1,xmmword ptr [edi]
0053E869 mulps xmm0,xmmword ptr [ecx]
0053E86C shufps xmm1,xmm1,55h
0053E870 addps xmm1,xmm0
0053E873 movaps xmmword ptr [edx],xmm1
}
0053E876 pop ebx
0053E877 ret
Which is perfectly fine as far as I can see.
However, when I try to inline that function, here is the code generated for it:
InitPosition(px,py,pz,position,c0,xdx,xdy,xdz);
0069C9A2 mov dword ptr [ebp-1978h],edi
0069C9A8 mov dword ptr [ebp-1970h],eax
0069C9AE mov dword ptr [ebp-1968h],ecx
0069C9B4 lea eax,[ebp-918h]
0069C9BA mov dword ptr [ebp-1960h],eax
0069C9C0 lea edx,[ebp-0BC8h]
0069C9C6 mov dword ptr [ebp-1958h],edx
0069C9CC lea esi,[ebp-0A68h]
0069C9D2 mov dword ptr [ebp-1950h],esi
0069C9D8 lea ecx,[ebp-0A38h]
0069C9DE mov dword ptr [ebp-1948h],ecx
0069C9E4 lea ecx,[ebp-0A08h]
0069C9EA mov dword ptr [ebp-1940h],ecx
0069C9F0 mov dword ptr [ebp-1938h],eax
0069C9F6 lea ecx,[ebp-8A8h]
0069C9FC mov dword ptr [ebp-1930h],ecx
0069CA02 movaps xmm0,xmmword ptr [ebp-918h]
0069CA09 movaps xmmword ptr [ebp-8A8h],xmm0
0069CA10 mov dword ptr [ebp-1928h],eax
0069CA16 lea ecx,[ebp-898h]
0069CA1C mov dword ptr [ebp-1920h],ecx
0069CA22 movaps xmm1,xmmword ptr [ebp-918h]
0069CA29 movaps xmmword ptr [ebp-898h],xmm1
0069CA30 movaps xmm2,xmmword ptr [ebp-8A8h]
0069CA37 shufps xmm2,xmmword ptr [ebp-898h],0FFh
0069CA3F lea ecx,[ebp-888h]
0069CA45 mov dword ptr [ebp-1918h],ecx
0069CA4B movaps xmmword ptr [ebp-878h],xmm2
0069CA52 lea ecx,[ebp-888h]
0069CA58 mov dword ptr [ebp-1910h],ecx
0069CA5E movaps xmmword ptr [ebp-888h],xmm2
0069CA65 lea ecx,[ebp-868h]
0069CA6B mov dword ptr [ebp-1908h],ecx
0069CA71 mov dword ptr [ebp-1900h],edx
0069CA77 mov dword ptr [ebp-18F8h],esi
0069CA7D mov dword ptr [ebp-18F0h],edx
0069CA83 lea ecx,[ebp-858h]
0069CA89 mov dword ptr [ebp-18E8h],ecx
0069CA8F movaps xmm3,xmmword ptr [ebp-0BC8h]
0069CA96 movaps xmmword ptr [ebp-858h],xmm3
0069CA9D mov dword ptr [ebp-18E0h],esi
0069CAA3 lea esi,[ebp-848h]
0069CAA9 mov dword ptr [ebp-18D8h],esi
0069CAAF movaps xmm4,xmmword ptr [ebp-0A68h]
0069CAB6 movaps xmmword ptr [ebp-848h],xmm4
0069CABD movaps xmm5,xmmword ptr [ebp-858h]
0069CAC4 mulps xmm5,xmmword ptr [ebp-848h]
0069CACB lea esi,[ebp-868h]
0069CAD1 mov dword ptr [ebp-18D0h],esi
0069CAD7 movaps xmmword ptr [ebp-838h],xmm5
0069CADE lea ecx,[ebp-868h]
0069CAE4 mov dword ptr [ebp-18C8h],ecx
0069CAEA movaps xmmword ptr [ebp-868h],xmm5
0069CAF1 lea esi,[ebp-828h]
0069CAF7 mov dword ptr [ebp-18C0h],esi
0069CAFD lea ecx,[ebp-888h]
0069CB03 mov dword ptr [ebp-18B8h],ecx
0069CB09 lea esi,[ebp-868h]
0069CB0F mov dword ptr [ebp-18B0h],esi
0069CB15 lea ecx,[ebp-888h]
0069CB1B mov dword ptr [ebp-18A8h],ecx
0069CB21 lea esi,[ebp-818h]
0069CB27 mov dword ptr [ebp-18A0h],esi
0069CB2D movaps xmm6,xmmword ptr [ebp-888h]
0069CB34 movaps xmmword ptr [ebp-818h],xmm6
0069CB3B lea ecx,[ebp-868h]
0069CB41 mov dword ptr [ebp-1898h],ecx
0069CB47 lea esi,[ebp-808h]
0069CB4D mov dword ptr [ebp-1890h],esi
0069CB53 movaps xmm7,xmmword ptr [ebp-868h]
0069CB5A movaps xmmword ptr [ebp-808h],xmm7
0069CB61 movaps xmm0,xmmword ptr [ebp-818h]
0069CB68 addps xmm0,xmmword ptr [ebp-808h]
0069CB6F lea ecx,[ebp-828h]
0069CB75 mov dword ptr [ebp-1888h],ecx
0069CB7B movaps xmmword ptr [ebp-7F8h],xmm0
0069CB82 lea esi,[ebp-828h]
0069CB88 mov dword ptr [ebp-1880h],esi
0069CB8E movaps xmmword ptr [ebp-828h],xmm0
0069CB95 mov dword ptr [ebp-1878h],edi
0069CB9B lea edi,[ebp-828h]
0069CBA1 mov dword ptr [ebp-1870h],edi
0069CBA7 movaps xmm1,xmmword ptr [ebp-828h]
0069CBAE movaps xmmword ptr [ebp-8D8h],xmm1
0069CBB5 mov dword ptr [ebp-1868h],eax
0069CBBB lea esi,[ebp-898h]
0069CBC1 mov dword ptr [ebp-1860h],esi
0069CBC7 movaps xmm2,xmmword ptr [ebp-918h]
0069CBCE movaps xmmword ptr [ebp-898h],xmm2
0069CBD5 mov dword ptr [ebp-1858h],eax
0069CBDB lea edi,[ebp-8A8h]
0069CBE1 mov dword ptr [ebp-1850h],edi
0069CBE7 movaps xmm3,xmmword ptr [ebp-918h]
0069CBEE movaps xmmword ptr [ebp-8A8h],xmm3
0069CBF5 movaps xmm4,xmmword ptr [ebp-898h]
0069CBFC shufps xmm4,xmmword ptr [ebp-8A8h],0AAh
0069CC04 lea ecx,[ebp-888h]
0069CC0A mov dword ptr [ebp-1848h],ecx
0069CC10 movaps xmmword ptr [ebp-7E8h],xmm4
0069CC17 lea ecx,[ebp-888h]
0069CC1D mov dword ptr [ebp-1840h],ecx
0069CC23 movaps xmmword ptr [ebp-888h],xmm4
0069CC2A lea ecx,[ebp-828h]
0069CC30 mov dword ptr [ebp-1838h],ecx
0069CC36 mov dword ptr [ebp-1830h],edx
0069CC3C lea ecx,[ebp-0A38h]
0069CC42 mov dword ptr [ebp-1828h],ecx
0069CC48 mov dword ptr [ebp-1820h],edx
0069CC4E lea edx,[ebp-7D8h]
0069CC54 mov dword ptr [ebp-1818h],edx
0069CC5A movaps xmm5,xmmword ptr [ebp-0BC8h]
0069CC61 movaps xmmword ptr [ebp-7D8h],xmm5
0069CC68 mov dword ptr [ebp-1810h],ecx
0069CC6E lea edx,[ebp-7C8h]
0069CC74 mov dword ptr [ebp-1808h],edx
0069CC7A movaps xmm6,xmmword ptr [ebp-0A38h]
0069CC81 movaps xmmword ptr [ebp-7C8h],xmm6
0069CC88 movaps xmm0,xmmword ptr [ebp-7D8h]
0069CC8F mulps xmm0,xmmword ptr [ebp-7C8h]
0069CC96 lea ecx,[ebp-828h]
0069CC9C mov dword ptr [ebp-1800h],ecx
0069CCA2 movaps xmmword ptr [ebp-7B8h],xmm0
0069CCA9 lea edx,[ebp-828h]
0069CCAF mov dword ptr [ebp-17F8h],edx
0069CCB5 movaps xmmword ptr [ebp-828h],xmm0
0069CCBC lea ecx,[ebp-868h]
0069CCC2 mov dword ptr [ebp-17F0h],ecx
0069CCC8 lea edx,[ebp-888h]
0069CCCE mov dword ptr [ebp-17E8h],edx
0069CCD4 lea ecx,[ebp-828h]
0069CCDA mov dword ptr [ebp-17E0h],ecx
0069CCE0 lea edx,[ebp-888h]
0069CCE6 mov dword ptr [ebp-17D8h],edx
0069CCEC lea ecx,[ebp-7A8h]
0069CCF2 mov dword ptr [ebp-17D0h],ecx
0069CCF8 movaps xmm7,xmmword ptr [ebp-888h]
0069CCFF movaps xmmword ptr [ebp-7A8h],xmm7
0069CD06 lea edx,[ebp-828h]
0069CD0C mov dword ptr [ebp-17C8h],edx
0069CD12 lea ecx,[ebp-798h]
0069CD18 mov dword ptr [ebp-17C0h],ecx
0069CD1E movaps xmm0,xmmword ptr [ebp-828h]
0069CD25 movaps xmmword ptr [ebp-798h],xmm0
0069CD2C movaps xmm1,xmmword ptr [ebp-7A8h]
0069CD33 addps xmm1,xmmword ptr [ebp-798h]
0069CD3A lea edx,[ebp-868h]
0069CD40 mov dword ptr [ebp-17B8h],edx
0069CD46 movaps xmmword ptr [ebp-788h],xmm1
0069CD4D lea ecx,[ebp-868h]
0069CD53 mov dword ptr [ebp-17B0h],ecx
0069CD59 movaps xmmword ptr [ebp-868h],xmm1
0069CD60 lea edx,[ebp-8C8h]
0069CD66 mov dword ptr [ebp-17A8h],edx
0069CD6C lea ecx,[ebp-868h]
0069CD72 mov dword ptr [ebp-17A0h],ecx
0069CD78 movaps xmm2,xmmword ptr [ebp-868h]
0069CD7F movaps xmmword ptr [ebp-8C8h],xmm2
0069CD86 mov dword ptr [ebp-1798h],eax
0069CD8C mov dword ptr [ebp-1790h],esi
0069CD92 movaps xmm3,xmmword ptr [ebp-918h]
0069CD99 movaps xmmword ptr [ebp-898h],xmm3
0069CDA0 mov dword ptr [ebp-1788h],eax
0069CDA6 mov dword ptr [ebp-1780h],edi
0069CDAC movaps xmm4,xmmword ptr [ebp-918h]
0069CDB3 movaps xmmword ptr [ebp-8A8h],xmm4
0069CDBA movaps xmm5,xmmword ptr [ebp-898h]
0069CDC1 shufps xmm5,xmmword ptr [ebp-8A8h],55h
0069CDC9 lea eax,[ebp-888h]
0069CDCF mov dword ptr [ebp-1778h],eax
0069CDD5 movaps xmmword ptr [ebp-778h],xmm5
0069CDDC lea esi,[ebp-888h]
0069CDE2 mov dword ptr [ebp-1770h],esi
0069CDE8 movaps xmmword ptr [ebp-888h],xmm5
0069CDEF lea edi,[ebp-828h]
0069CDF5 mov dword ptr [ebp-1768h],edi
0069CDFB lea eax,[ebp-0BC8h]
0069CE01 mov dword ptr [ebp-1760h],eax
0069CE07 lea ecx,[ebp-0A08h]
0069CE0D mov dword ptr [ebp-1758h],ecx
0069CE13 mov dword ptr [ebp-1750h],eax
0069CE19 lea edx,[ebp-768h]
0069CE1F mov dword ptr [ebp-1748h],edx
0069CE25 movaps xmm6,xmmword ptr [ebp-0BC8h]
0069CE2C movaps xmmword ptr [ebp-768h],xmm6
0069CE33 mov dword ptr [ebp-1740h],ecx
0069CE39 lea esi,[ebp-758h]
0069CE3F mov dword ptr [ebp-1738h],esi
0069CE45 movaps xmm7,xmmword ptr [ebp-0A08h]
0069CE4C movaps xmmword ptr [ebp-758h],xmm7
0069CE53 movaps xmm0,xmmword ptr [ebp-768h]
0069CE5A mulps xmm0,xmmword ptr [ebp-758h]
0069CE61 lea edi,[ebp-828h]
0069CE67 mov dword ptr [ebp-1730h],edi
0069CE6D movaps xmmword ptr [ebp-748h],xmm0
0069CE74 lea eax,[ebp-828h]
0069CE7A mov dword ptr [ebp-1728h],eax
0069CE80 movaps xmmword ptr [ebp-828h],xmm0
0069CE87 lea edx,[ebp-868h]
0069CE8D mov dword ptr [ebp-1720h],edx
0069CE93 lea ecx,[ebp-888h]
0069CE99 mov dword ptr [ebp-1718h],ecx
0069CE9F lea esi,[ebp-828h]
0069CEA5 mov dword ptr [ebp-1710h],esi
0069CEAB lea edi,[ebp-888h]
0069CEB1 mov dword ptr [ebp-1708h],edi
0069CEB7 lea eax,[ebp-738h]
0069CEBD mov dword ptr [ebp-1700h],eax
0069CEC3 movaps xmm1,xmmword ptr [ebp-888h]
0069CECA movaps xmmword ptr [ebp-738h],xmm1
0069CED1 lea edx,[ebp-828h]
0069CED7 mov dword ptr [ebp-16F8h],edx
0069CEDD lea ecx,[ebp-728h]
0069CEE3 mov dword ptr [ebp-16F0h],ecx
0069CEE9 movaps xmm2,xmmword ptr [ebp-828h]
0069CEF0 movaps xmmword ptr [ebp-728h],xmm2
0069CEF7 movaps xmm3,xmmword ptr [ebp-738h]
0069CEFE addps xmm3,xmmword ptr [ebp-728h]
0069CF05 lea esi,[ebp-868h]
0069CF0B mov dword ptr [ebp-16E8h],esi
0069CF11 movaps xmmword ptr [ebp-718h],xmm3
0069CF18 lea edi,[ebp-868h]
0069CF1E mov dword ptr [ebp-16E0h],edi
0069CF24 movaps xmmword ptr [ebp-868h],xmm3
0069CF2B lea eax,[ebp-8B8h]
0069CF31 mov dword ptr [ebp-16D8h],eax
0069CF37 lea edx,[ebp-868h]
0069CF3D mov dword ptr [ebp-16D0h],edx
0069CF43 movaps xmm4,xmmword ptr [ebp-868h]
0069CF4A movaps xmmword ptr [ebp-8B8h],xmm4
It is doing the exact same thing but with 4 or 5 times more code!!!
And this pattern is repeated all over the place in our code with other methods and functions, which makes our application far less optimized than we had hoped.
This behaviour was not seen at all with Intel Compiler 11.
Any idea?
Best Regards,
Laurent Lessieux
Link Copied
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hello,
I tried to reproduce the error with the small code snippets you provided, but for me the assembler code of the function you provided didn't change after inlining.
So I need a complete reproducer for that problem to continue investigation.
Thanks,
Alex
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
I will see what I can do but I certainly can't provide the full code of our rendering engine. So I will have to do some specific application for that and then I can't guarantee that it will fail or not.
Apparently I am not the only one reporting performance degradation with the new compiler, though, so it must be quite a common occurrence.
Laurent.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
I tried to make a small test application with the same engine (reduced to the minimum) but I am failing to get the same issue.
I spent almost a full morning, I will continue a bit to see what is different between the real code and the test code but I will probably not be able to spend a lot more time on the issue.
Also I noticed that even with /Ob2 in that case it was not inlining small functions that usually are inlined (like simple operators or simple class accessors.)
Laurent.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Here are the flags I am using now.
Somehow adding /Qcxx-features and /Ox seems to have fixed it or perhaps /Oa-. Not sure.
Anyway we go with that now.
I am not sure that I will be able to justify spending more time on that one now.
Laurent
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Actually, I can now confirm that simply removing /Qcxx-features from the above list makes the code way bigger and ends up generating debug-like assembly code for me.
Laurent.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi Laurent,
Thank you very much for your thorough investigation. But without some code that engineering can test, it will be very hard to find the issue and fix it.
I've done some more investigation on your code and see that I
get very similar results if I disable inlining with /Ob0. So I think
inlining is disabled for this piece of code. Maybe
the compiler heuristics have changed for that. Please try to use
#pragma forceinline recursive
before calling the function.
You can also try to adjust the inlining factor. Default is 100, you can try to increase it to 200
/Qinline-factor 200
Hope this helps to solve this issue for you!
Thanks,
Alex

- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page