Assembly-level optimization issues

djordje_v_petrovic · ‎12-03-2010

Hello all,

First of all, I'd like to let you know that I'm new to IA-32 assembly programming, so any tips regarding the following code would be welcome.

I've got a piece of code generated by the VS C++ compiler that runs about 28% faster than my hand-coded, "optimized" version, and I can't figure out why.

Also, I'd like to know if there's an easy way to see/count the gaps in my pipeline and/or see what the scheduler does to my code. (I've tried interleaving independent instruction sequences in hopes of reducing the cycle count, but with no observable improvement, possibly because the scheduler already does that.)

I'm measuring the cycle count by executing CPUID; RDTSC before and after the code in question and subtracting the results. Prior to this, I issue the following calls:

[cpp]SetPriorityClass (GetCurrentThread(), REALTIME_PRIORITY_CLASS);
SetThreadPriority (GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL);
SetProcessPriorityBoost (GetCurrentThread(), 0);
SetThreadAffinityMask (GetCurrentThread(), 0x1);[/cpp]

as advised by Peter Kankowski in his article.

One more thing: the CPU I'm running/testing the code on is a T5470 (Core 2 Duo @ 1.6 GHz).

Compiler generated version of the code:

[plain]   108: 		{
   109: 			for (qint32 i = 0; i < (qint32)num_data_points; i++)
004012AD  cvttsd2si   edx,mmword ptr [ebp+38h] 
004012B2  xor         eax,eax 
004012B4  test        edx,edx 
004012B6  jle         LOOP_END+59h (401301h) 
004012B8  mov         ecx,dword ptr [ebp+64h] 
004012BB  movsd       xmm0,mmword ptr [ebp] 
004012C0  movsd       xmm1,mmword ptr [ebp-1Ch] 
004012C5  movsd       xmm2,mmword ptr [ebp+18h] 
004012CA  movsd       xmm3,mmword ptr [ebp+28h] 
004012CF  add         ecx,4 
   110: 			{
   111: 				plot_points.setX((qint32)data_offset);
004012D2  cvttsd2si   edi,xmm0 
004012D6  mov         dword ptr [esi+eax*8],edi 
   112: 				data_offset += num_pixels_per_point;
004012D9  movapd      xmm4,xmm1 
004012DD  addsd       xmm4,xmm0 
004012E1  movapd      xmm0,xmm4 
   113: 				plot_points.setY((qint32)(data->y()) * normalized_zoomed_height + offset_height);
004012E5  cvtsi2sd    xmm4,dword ptr [ecx] 
004012E9  mulsd       xmm4,xmm2 
004012ED  addsd       xmm4,xmm3 
004012F1  cvttsd2si   edi,xmm4 
004012F5  mov         dword ptr [esi+eax*8+4],edi 
004012F9  inc         eax  
   114: 				data++;
004012FA  add         ecx,8 
004012FD  cmp         eax,edx 
004012FF  jl          LOOP_END+2Ah (4012D2h) 
   115: 			}
   116: 		}[/plain]

My hand-coded version:

[plain]    66: 		{
    67: 			__asm 
    68: 			{
    69: 	//   240: 		for (qint32 i = 0; i < (qint32)num_data_points; i++)
    70: 				//xor         eax,eax // i
    71: 				movsd       xmm4, num_data_points
0040124C  movsd       xmm4,mmword ptr [ebp+38h] 
    72: 				mov         ecx, data
00401251  mov         ecx,dword ptr [ebp+64h] 
    73: 				add         ecx,4 // ptr to data[0].y
00401254  add         ecx,4 
    74: 				mov         esi, plot_points
00401257  mov         esi,dword ptr [ebp+5Ch] 
    75: 	//   241: 		{
    76: 	//   242: 			plot_points.setX((qint32)data_offset);
    77: 				movsd       xmm2, data_offset
0040125A  movsd       xmm2,mmword ptr [ebp] 
    78: 				movsd       xmm3, normalized_zoomed_height
0040125F  movsd       xmm3,mmword ptr [ebp+18h] 
    79: 				movsd       xmm5, offset_height
00401264  movsd       xmm5,mmword ptr [ebp+28h] 
    80: 				movsd       xmm6, num_pixels_per_point
00401269  movsd       xmm6,mmword ptr [ebp-1Ch] 
    81: 				cvttsd2si   eax, xmm4
0040126E  cvttsd2si   eax,xmm4 
    82: 				test        eax, eax
00401272  test        eax,eax 
    83: 				jle         LOOP_END
00401274  jle         LOOP_END (4012A8h) 
00401276  jmp         LOOP_START (401280h) 
00401278  lea         esp,[esp] 
0040127F  nop              
    84: 				align 16
    85: LOOP_START:
    86: 				cvttsd2si   edi, xmm2
00401280  cvttsd2si   edi,xmm2 
    87: 				mov         dword ptr [esi], edi
00401284  mov         dword ptr [esi],edi 
    88: 
    89: 	//   244: 			plot_points.setY((qint32)(data->y()) * normalized_zoomed_height + offset_height);
    90: 				cvtsi2sd    xmm4, dword ptr [ecx]
00401286  cvtsi2sd    xmm4,dword ptr [ecx] 
    91: 	//   243: 			offset += num_pixels_per_point;
    92: 				addsd       xmm2, xmm6
0040128A  addsd       xmm2,xmm6 
    93: 				mulsd       xmm4, xmm3
0040128E  mulsd       xmm4,xmm3 
    94: 				add         esi, 4
00401292  add         esi,4 
    95: 				addsd       xmm4, xmm5
00401295  addsd       xmm4,xmm5 
    96: 				add         ecx, 8
00401299  add         ecx,8 
    97: 				cvttsd2si   edi, xmm4
0040129C  cvttsd2si   edi,xmm4 
    98: 				mov         dword ptr [esi], edi
004012A0  mov         dword ptr [esi],edi 
    99: 				add         esi, 4
004012A2  add         esi,4 
   100: 	//   245: 			data++;
   101: 				dec         eax
004012A5  dec         eax  
   102: 				jnz         LOOP_START
004012A6  jne         LOOP_START (401280h) 
   103: 	//   246: 		}
   104: LOOP_END:
   105: 			}
   106: 		}[/plain]

Thanks in advance,

Djordje

djordje_v_petrovic · ‎12-04-2010

Hi all,

It seems the main issue is here: using

movapd xmm4,xmm1
addsd xmm4,xmm0
movapd xmm0,xmm4

instead of

addsd xmm0,xmm1

makes the compiler generated loop perform significantly faster than the hand-coded version.

Any ideas as to why this is happening?

Thanks,

Djordje

Taronyu · ‎12-10-2010

I'm not very deep into this low-level stuff, but my guess would be that it has something to do with either the CPU pipeline or caching. I would recommend consulting the Intel Software Optimization Guide for information on these topics; it's very helpful.