Intel® C++ Compiler
Community support and assistance for creating C++ code that runs on platforms based on Intel® processors.
7956 Discussions

Inlined code looks like debug code without any optimization.

LLess
Beginner
347 Views
Hi,

Just for reference, I am using the Intel Compiler 12.1 (ComposerXE 2011 Update 6) under Windows 7 on an I7 2600K processor with VS2010.

When compiling one of our applications in 32-bit release mode, we see some code being inlined strangely.
My current compiler options are /GR /O3 /Ox /Qinline-dllimport /arch:SSE2 /QaxSSE4.2 /Zi /Qvc10,
but I have also tried with /O2, etc., and nothing I have tried so far fixes the issue I am going to describe.

Here is a simple function that we use. VxrF32vec4 is just a class derived from F32vec4 with additional inline methods and nothing else.

void __regcall InitPosition(VxrF32vec4 &px,VxrF32vec4 &py,VxrF32vec4 &pz,const VxrF32vec4 &position,const VxrF32vec4 &c0,const VxrF32vec4 &xdx,const VxrF32vec4 &xdy,const VxrF32vec4 &xdz)

{

px = _mm_shuffle_ps(position,position,0xFF) + c0*xdx;

py = _mm_shuffle_ps(position,position,0xAA) + c0*xdy;

pz = _mm_shuffle_ps(position,position,0x55) + c0*xdz;
}

And it is compiled like this:

void __regcall InitPosition(VxrF32vec4 &px,VxrF32vec4 &py,VxrF32vec4 &pz,const VxrF32vec4 &position,const VxrF32vec4 &c0,const VxrF32vec4 &xdx,const VxrF32vec4 &xdy,const VxrF32vec4 &xdz)

{

0053E830 push ebx

px = _mm_shuffle_ps(position,position,0xFF) + c0*xdx;

0053E831 mov ebx,dword ptr [xdx]

0053E835 movaps xmm0,xmmword ptr [esi]

0053E838 movaps xmm1,xmmword ptr [edi]

0053E83B mulps xmm0,xmmword ptr [ebx]

0053E83E shufps xmm1,xmm1,0FFh

0053E842 addps xmm1,xmm0

0053E845 movaps xmmword ptr [eax],xmm1

py = _mm_shuffle_ps(position,position,0xAA) + c0*xdy;

0053E848 mov eax,dword ptr [xdy]

0053E84C movaps xmm2,xmmword ptr [esi]

0053E84F movaps xmm3,xmmword ptr [edi]

0053E852 mulps xmm2,xmmword ptr [eax]

0053E855 shufps xmm3,xmm3,0AAh

0053E859 addps xmm3,xmm2

0053E85C movaps xmmword ptr [ecx],xmm3

pz = _mm_shuffle_ps(position,position,0x55) + c0*xdz;

0053E85F mov ecx,dword ptr [xdz]

0053E863 movaps xmm0,xmmword ptr [esi]

0053E866 movaps xmm1,xmmword ptr [edi]

0053E869 mulps xmm0,xmmword ptr [ecx]

0053E86C shufps xmm1,xmm1,55h

0053E870 addps xmm1,xmm0

0053E873 movaps xmmword ptr [edx],xmm1

}

0053E876 pop ebx

0053E877 ret

Which is perfectly fine as far as I can see.

However, when I try to inline that function, here is the code generated for it.

InitPosition(px,py,pz,position,c0,xdx,xdy,xdz);

0069C9A2 mov dword ptr [ebp-1978h],edi

0069C9A8 mov dword ptr [ebp-1970h],eax

0069C9AE mov dword ptr [ebp-1968h],ecx

0069C9B4 lea eax,[ebp-918h]

0069C9BA mov dword ptr [ebp-1960h],eax

0069C9C0 lea edx,[ebp-0BC8h]

0069C9C6 mov dword ptr [ebp-1958h],edx

0069C9CC lea esi,[ebp-0A68h]

0069C9D2 mov dword ptr [ebp-1950h],esi

0069C9D8 lea ecx,[ebp-0A38h]

0069C9DE mov dword ptr [ebp-1948h],ecx

0069C9E4 lea ecx,[ebp-0A08h]

0069C9EA mov dword ptr [ebp-1940h],ecx

0069C9F0 mov dword ptr [ebp-1938h],eax

0069C9F6 lea ecx,[ebp-8A8h]

0069C9FC mov dword ptr [ebp-1930h],ecx

0069CA02 movaps xmm0,xmmword ptr [ebp-918h]

0069CA09 movaps xmmword ptr [ebp-8A8h],xmm0

0069CA10 mov dword ptr [ebp-1928h],eax

0069CA16 lea ecx,[ebp-898h]

0069CA1C mov dword ptr [ebp-1920h],ecx

0069CA22 movaps xmm1,xmmword ptr [ebp-918h]

0069CA29 movaps xmmword ptr [ebp-898h],xmm1

0069CA30 movaps xmm2,xmmword ptr [ebp-8A8h]

0069CA37 shufps xmm2,xmmword ptr [ebp-898h],0FFh

0069CA3F lea ecx,[ebp-888h]

0069CA45 mov dword ptr [ebp-1918h],ecx

0069CA4B movaps xmmword ptr [ebp-878h],xmm2

0069CA52 lea ecx,[ebp-888h]

0069CA58 mov dword ptr [ebp-1910h],ecx

0069CA5E movaps xmmword ptr [ebp-888h],xmm2

0069CA65 lea ecx,[ebp-868h]

0069CA6B mov dword ptr [ebp-1908h],ecx

0069CA71 mov dword ptr [ebp-1900h],edx

0069CA77 mov dword ptr [ebp-18F8h],esi

0069CA7D mov dword ptr [ebp-18F0h],edx

0069CA83 lea ecx,[ebp-858h]

0069CA89 mov dword ptr [ebp-18E8h],ecx

0069CA8F movaps xmm3,xmmword ptr [ebp-0BC8h]

0069CA96 movaps xmmword ptr [ebp-858h],xmm3

0069CA9D mov dword ptr [ebp-18E0h],esi

0069CAA3 lea esi,[ebp-848h]

0069CAA9 mov dword ptr [ebp-18D8h],esi

0069CAAF movaps xmm4,xmmword ptr [ebp-0A68h]

0069CAB6 movaps xmmword ptr [ebp-848h],xmm4

0069CABD movaps xmm5,xmmword ptr [ebp-858h]

0069CAC4 mulps xmm5,xmmword ptr [ebp-848h]

0069CACB lea esi,[ebp-868h]

0069CAD1 mov dword ptr [ebp-18D0h],esi

0069CAD7 movaps xmmword ptr [ebp-838h],xmm5

0069CADE lea ecx,[ebp-868h]

0069CAE4 mov dword ptr [ebp-18C8h],ecx

0069CAEA movaps xmmword ptr [ebp-868h],xmm5

0069CAF1 lea esi,[ebp-828h]

0069CAF7 mov dword ptr [ebp-18C0h],esi

0069CAFD lea ecx,[ebp-888h]

0069CB03 mov dword ptr [ebp-18B8h],ecx

0069CB09 lea esi,[ebp-868h]

0069CB0F mov dword ptr [ebp-18B0h],esi

0069CB15 lea ecx,[ebp-888h]

0069CB1B mov dword ptr [ebp-18A8h],ecx

0069CB21 lea esi,[ebp-818h]

0069CB27 mov dword ptr [ebp-18A0h],esi

0069CB2D movaps xmm6,xmmword ptr [ebp-888h]

0069CB34 movaps xmmword ptr [ebp-818h],xmm6

0069CB3B lea ecx,[ebp-868h]

0069CB41 mov dword ptr [ebp-1898h],ecx

0069CB47 lea esi,[ebp-808h]

0069CB4D mov dword ptr [ebp-1890h],esi

0069CB53 movaps xmm7,xmmword ptr [ebp-868h]

0069CB5A movaps xmmword ptr [ebp-808h],xmm7

0069CB61 movaps xmm0,xmmword ptr [ebp-818h]

0069CB68 addps xmm0,xmmword ptr [ebp-808h]

0069CB6F lea ecx,[ebp-828h]

0069CB75 mov dword ptr [ebp-1888h],ecx

0069CB7B movaps xmmword ptr [ebp-7F8h],xmm0

0069CB82 lea esi,[ebp-828h]

0069CB88 mov dword ptr [ebp-1880h],esi

0069CB8E movaps xmmword ptr [ebp-828h],xmm0

0069CB95 mov dword ptr [ebp-1878h],edi

0069CB9B lea edi,[ebp-828h]

0069CBA1 mov dword ptr [ebp-1870h],edi

0069CBA7 movaps xmm1,xmmword ptr [ebp-828h]

0069CBAE movaps xmmword ptr [ebp-8D8h],xmm1

0069CBB5 mov dword ptr [ebp-1868h],eax

0069CBBB lea esi,[ebp-898h]

0069CBC1 mov dword ptr [ebp-1860h],esi

0069CBC7 movaps xmm2,xmmword ptr [ebp-918h]

0069CBCE movaps xmmword ptr [ebp-898h],xmm2

0069CBD5 mov dword ptr [ebp-1858h],eax

0069CBDB lea edi,[ebp-8A8h]

0069CBE1 mov dword ptr [ebp-1850h],edi

0069CBE7 movaps xmm3,xmmword ptr [ebp-918h]

0069CBEE movaps xmmword ptr [ebp-8A8h],xmm3

0069CBF5 movaps xmm4,xmmword ptr [ebp-898h]

0069CBFC shufps xmm4,xmmword ptr [ebp-8A8h],0AAh

0069CC04 lea ecx,[ebp-888h]

0069CC0A mov dword ptr [ebp-1848h],ecx

0069CC10 movaps xmmword ptr [ebp-7E8h],xmm4

0069CC17 lea ecx,[ebp-888h]

0069CC1D mov dword ptr [ebp-1840h],ecx

0069CC23 movaps xmmword ptr [ebp-888h],xmm4

0069CC2A lea ecx,[ebp-828h]

0069CC30 mov dword ptr [ebp-1838h],ecx

0069CC36 mov dword ptr [ebp-1830h],edx

0069CC3C lea ecx,[ebp-0A38h]

0069CC42 mov dword ptr [ebp-1828h],ecx

0069CC48 mov dword ptr [ebp-1820h],edx

0069CC4E lea edx,[ebp-7D8h]

0069CC54 mov dword ptr [ebp-1818h],edx

0069CC5A movaps xmm5,xmmword ptr [ebp-0BC8h]

0069CC61 movaps xmmword ptr [ebp-7D8h],xmm5

0069CC68 mov dword ptr [ebp-1810h],ecx

0069CC6E lea edx,[ebp-7C8h]

0069CC74 mov dword ptr [ebp-1808h],edx

0069CC7A movaps xmm6,xmmword ptr [ebp-0A38h]

0069CC81 movaps xmmword ptr [ebp-7C8h],xmm6

0069CC88 movaps xmm0,xmmword ptr [ebp-7D8h]

0069CC8F mulps xmm0,xmmword ptr [ebp-7C8h]

0069CC96 lea ecx,[ebp-828h]

0069CC9C mov dword ptr [ebp-1800h],ecx

0069CCA2 movaps xmmword ptr [ebp-7B8h],xmm0

0069CCA9 lea edx,[ebp-828h]

0069CCAF mov dword ptr [ebp-17F8h],edx

0069CCB5 movaps xmmword ptr [ebp-828h],xmm0

0069CCBC lea ecx,[ebp-868h]

0069CCC2 mov dword ptr [ebp-17F0h],ecx

0069CCC8 lea edx,[ebp-888h]

0069CCCE mov dword ptr [ebp-17E8h],edx

0069CCD4 lea ecx,[ebp-828h]

0069CCDA mov dword ptr [ebp-17E0h],ecx

0069CCE0 lea edx,[ebp-888h]

0069CCE6 mov dword ptr [ebp-17D8h],edx

0069CCEC lea ecx,[ebp-7A8h]

0069CCF2 mov dword ptr [ebp-17D0h],ecx

0069CCF8 movaps xmm7,xmmword ptr [ebp-888h]

0069CCFF movaps xmmword ptr [ebp-7A8h],xmm7

0069CD06 lea edx,[ebp-828h]

0069CD0C mov dword ptr [ebp-17C8h],edx

0069CD12 lea ecx,[ebp-798h]

0069CD18 mov dword ptr [ebp-17C0h],ecx

0069CD1E movaps xmm0,xmmword ptr [ebp-828h]

0069CD25 movaps xmmword ptr [ebp-798h],xmm0

0069CD2C movaps xmm1,xmmword ptr [ebp-7A8h]

0069CD33 addps xmm1,xmmword ptr [ebp-798h]

0069CD3A lea edx,[ebp-868h]

0069CD40 mov dword ptr [ebp-17B8h],edx

0069CD46 movaps xmmword ptr [ebp-788h],xmm1

0069CD4D lea ecx,[ebp-868h]

0069CD53 mov dword ptr [ebp-17B0h],ecx

0069CD59 movaps xmmword ptr [ebp-868h],xmm1

0069CD60 lea edx,[ebp-8C8h]

0069CD66 mov dword ptr [ebp-17A8h],edx

0069CD6C lea ecx,[ebp-868h]

0069CD72 mov dword ptr [ebp-17A0h],ecx

0069CD78 movaps xmm2,xmmword ptr [ebp-868h]

0069CD7F movaps xmmword ptr [ebp-8C8h],xmm2

0069CD86 mov dword ptr [ebp-1798h],eax

0069CD8C mov dword ptr [ebp-1790h],esi

0069CD92 movaps xmm3,xmmword ptr [ebp-918h]

0069CD99 movaps xmmword ptr [ebp-898h],xmm3

0069CDA0 mov dword ptr [ebp-1788h],eax

0069CDA6 mov dword ptr [ebp-1780h],edi

0069CDAC movaps xmm4,xmmword ptr [ebp-918h]

0069CDB3 movaps xmmword ptr [ebp-8A8h],xmm4

0069CDBA movaps xmm5,xmmword ptr [ebp-898h]

0069CDC1 shufps xmm5,xmmword ptr [ebp-8A8h],55h

0069CDC9 lea eax,[ebp-888h]

0069CDCF mov dword ptr [ebp-1778h],eax

0069CDD5 movaps xmmword ptr [ebp-778h],xmm5

0069CDDC lea esi,[ebp-888h]

0069CDE2 mov dword ptr [ebp-1770h],esi

0069CDE8 movaps xmmword ptr [ebp-888h],xmm5

0069CDEF lea edi,[ebp-828h]

0069CDF5 mov dword ptr [ebp-1768h],edi

0069CDFB lea eax,[ebp-0BC8h]

0069CE01 mov dword ptr [ebp-1760h],eax

0069CE07 lea ecx,[ebp-0A08h]

0069CE0D mov dword ptr [ebp-1758h],ecx

0069CE13 mov dword ptr [ebp-1750h],eax

0069CE19 lea edx,[ebp-768h]

0069CE1F mov dword ptr [ebp-1748h],edx

0069CE25 movaps xmm6,xmmword ptr [ebp-0BC8h]

0069CE2C movaps xmmword ptr [ebp-768h],xmm6

0069CE33 mov dword ptr [ebp-1740h],ecx

0069CE39 lea esi,[ebp-758h]

0069CE3F mov dword ptr [ebp-1738h],esi

0069CE45 movaps xmm7,xmmword ptr [ebp-0A08h]

0069CE4C movaps xmmword ptr [ebp-758h],xmm7

0069CE53 movaps xmm0,xmmword ptr [ebp-768h]

0069CE5A mulps xmm0,xmmword ptr [ebp-758h]

0069CE61 lea edi,[ebp-828h]

0069CE67 mov dword ptr [ebp-1730h],edi

0069CE6D movaps xmmword ptr [ebp-748h],xmm0

0069CE74 lea eax,[ebp-828h]

0069CE7A mov dword ptr [ebp-1728h],eax

0069CE80 movaps xmmword ptr [ebp-828h],xmm0

0069CE87 lea edx,[ebp-868h]

0069CE8D mov dword ptr [ebp-1720h],edx

0069CE93 lea ecx,[ebp-888h]

0069CE99 mov dword ptr [ebp-1718h],ecx

0069CE9F lea esi,[ebp-828h]

0069CEA5 mov dword ptr [ebp-1710h],esi

0069CEAB lea edi,[ebp-888h]

0069CEB1 mov dword ptr [ebp-1708h],edi

0069CEB7 lea eax,[ebp-738h]

0069CEBD mov dword ptr [ebp-1700h],eax

0069CEC3 movaps xmm1,xmmword ptr [ebp-888h]

0069CECA movaps xmmword ptr [ebp-738h],xmm1

0069CED1 lea edx,[ebp-828h]

0069CED7 mov dword ptr [ebp-16F8h],edx

0069CEDD lea ecx,[ebp-728h]

0069CEE3 mov dword ptr [ebp-16F0h],ecx

0069CEE9 movaps xmm2,xmmword ptr [ebp-828h]

0069CEF0 movaps xmmword ptr [ebp-728h],xmm2

0069CEF7 movaps xmm3,xmmword ptr [ebp-738h]

0069CEFE addps xmm3,xmmword ptr [ebp-728h]

0069CF05 lea esi,[ebp-868h]

0069CF0B mov dword ptr [ebp-16E8h],esi

0069CF11 movaps xmmword ptr [ebp-718h],xmm3

0069CF18 lea edi,[ebp-868h]

0069CF1E mov dword ptr [ebp-16E0h],edi

0069CF24 movaps xmmword ptr [ebp-868h],xmm3

0069CF2B lea eax,[ebp-8B8h]

0069CF31 mov dword ptr [ebp-16D8h],eax

0069CF37 lea edx,[ebp-868h]

0069CF3D mov dword ptr [ebp-16D0h],edx

0069CF43 movaps xmm4,xmmword ptr [ebp-868h]

0069CF4A movaps xmmword ptr [ebp-8B8h],xmm4

It is doing the exact same thing but with 4 or 5 times more code!!!

And this pattern is repeated all over the place in our code with other methods and functions, which makes our application far less optimized than hoped.

This behaviour was not seen at all with Intel Compiler 11.

Any idea?

Best Regards,

Laurent Lessieux

0 Kudos
7 Replies
Alexander_W_Intel
347 Views

Hello,

I tried to reproduce the error with the small code snippets you provided, but for me the assembly code of the function you provided did not change after inlining.

So I need a complete reproducer for that problem to continue investigation.

Thanks,
Alex


0 Kudos
LLess
Beginner
347 Views
Hi Alex,

I will see what I can do, but I certainly can't provide the full code of our rendering engine. So I will have to write a specific test application for that, and even then I can't guarantee that it will reproduce the failure.

Apparently I am not the only one reporting performance degradation with the new compiler, though, so it must be quite a common occurrence.

Laurent.
0 Kudos
Alexander_W_Intel
347 Views
Hi Laurent,
the whole render engine would be far too big. If you can provide a small reproducer, that would be very nice.
We need to be able to reproduce your issue on our side to fix it.
Thanks,
Alex
0 Kudos
LLess
Beginner
347 Views
Hi Alex,

I tried to make a small test application with the same engine (reduced to the minimum) but I am failing to get the same issue.
I spent almost a full morning, I will continue a bit to see what is different between the real code and the test code but I will probably not be able to spend a lot more time on the issue.

Also I noticed that even with /Ob2 in that case it was not inlining small functions that usually are inlined (like simple operators or simple class accessors.)

Laurent.
0 Kudos
LLess
Beginner
347 Views
Actually it seems that I managed to get the optimization back.
Here are the flags I am using now.












Somehow adding /Qcxx-features and /Ox seems to have fixed it or perhaps /Oa-. Not sure.
Anyway we go with that now.

I am not sure that I will be able to justify spending more time on that one now.

Laurent
0 Kudos
LLess
Beginner
347 Views
Hi Alex,

Actually, I can now confirm that simply removing /Qcxx-features from the above list makes the code way bigger and ends up generating debug-like assembly code for me.

Laurent.
0 Kudos
Alexander_W_Intel
347 Views

Hi Laurent,

thank you very much for your thorough investigation. But without some code that the engineering team can test, it will be very hard to find the issue and fix it.

I've done some more investigation on your code and I see very similar results if I disable inlining with /Ob0. So I think inlining is being disabled for this piece of code. Maybe the compiler heuristics have changed for that. Please try to use

#pragma forceinline recursive

before calling the function.
You can also try to adjust the inlining factor. The default is 100; you can try increasing it to 200:

/Qinline-factor 200

Hope this helps to solve this issue for you!
Thanks,
Alex


0 Kudos
Reply