Software Archive
Read-only legacy content
17061 Discussions

Assembly-level optimization issues

djordje_v_petrovic
770 Views

Hello all,

First of all, I'd like to let you know that I'm new to IA-32 assembly programming, so any tips regarding the following code would be welcome.

I've got a piece of code generated by the VS C++ compiler that runs about 28% faster than my hand-coded, "optimized" version, and I can't figure out why.

Also, I'd like to know if there's an easy way to see/count the gaps in my pipeline and/or see what the scheduler does to my code. (I've tried interleaving independent instruction sequences in hopes of reducing the cycle count, but with no observable improvement, possibly because the scheduler already does that.)

I'm measuring the cycle count by executing CPUID; RDTSC before and after the code in question and subtracting the results. Prior to this, I issue the following calls:

[cpp]SetPriorityClass (GetCurrentThread(), REALTIME_PRIORITY_CLASS);
SetThreadPriority (GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL);
SetProcessPriorityBoost (GetCurrentThread(), 0);
SetThreadAffinityMask (GetCurrentThread(), 0x1);[/cpp]

as advised by Peter Kankowski in his article.

One more thing: The CPU I'm running/testing the code on is T5470 (Core2Duo @ 1.6GHz).

Compiler-generated version of the code:

[plain]   108: 		{
   109: 			for (qint32 i = 0; i < (qint32)num_data_points; i++)
004012AD  cvttsd2si   edx,mmword ptr [ebp+38h] 
004012B2  xor         eax,eax 
004012B4  test        edx,edx 
004012B6  jle         LOOP_END+59h (401301h) 
004012B8  mov         ecx,dword ptr [ebp+64h] 
004012BB  movsd       xmm0,mmword ptr [ebp] 
004012C0  movsd       xmm1,mmword ptr [ebp-1Ch] 
004012C5  movsd       xmm2,mmword ptr [ebp+18h] 
004012CA  movsd       xmm3,mmword ptr [ebp+28h] 
004012CF  add         ecx,4 
   110: 			{
   111: 				plot_points.setX((qint32)data_offset);
004012D2  cvttsd2si   edi,xmm0 
004012D6  mov         dword ptr [esi+eax*8],edi 
   112: 				data_offset += num_pixels_per_point;
004012D9  movapd      xmm4,xmm1 
004012DD  addsd       xmm4,xmm0 
004012E1  movapd      xmm0,xmm4 
   113: 				plot_points.setY((qint32)(data->y()) * normalized_zoomed_height + offset_height);
004012E5  cvtsi2sd    xmm4,dword ptr [ecx] 
004012E9  mulsd       xmm4,xmm2 
004012ED  addsd       xmm4,xmm3 
004012F1  cvttsd2si   edi,xmm4 
004012F5  mov         dword ptr [esi+eax*8+4],edi 
004012F9  inc         eax  
   114: 				data++;
004012FA  add         ecx,8 
004012FD  cmp         eax,edx 
004012FF  jl          LOOP_END+2Ah (4012D2h) 
   115: 			}
   116: 		}[/plain]

My hand-coded version:

[plain]    66: 		{
    67: 			__asm 
    68: 			{
    69: 	//   240: 		for (qint32 i = 0; i < (qint32)num_data_points; i++)
    70: 				//xor         eax,eax // i
    71: 				movsd       xmm4, num_data_points
0040124C  movsd       xmm4,mmword ptr [ebp+38h] 
    72: 				mov         ecx, data
00401251  mov         ecx,dword ptr [ebp+64h] 
    73: 				add         ecx,4 // ptr to data[0].y
00401254  add         ecx,4 
    74: 				mov         esi, plot_points
00401257  mov         esi,dword ptr [ebp+5Ch] 
    75: 	//   241: 		{
    76: 	//   242: 			plot_points.setX((qint32)data_offset);
    77: 				movsd       xmm2, data_offset
0040125A  movsd       xmm2,mmword ptr [ebp] 
    78: 				movsd       xmm3, normalized_zoomed_height
0040125F  movsd       xmm3,mmword ptr [ebp+18h] 
    79: 				movsd       xmm5, offset_height
00401264  movsd       xmm5,mmword ptr [ebp+28h] 
    80: 				movsd       xmm6, num_pixels_per_point
00401269  movsd       xmm6,mmword ptr [ebp-1Ch] 
    81: 				cvttsd2si   eax, xmm4
0040126E  cvttsd2si   eax,xmm4 
    82: 				test        eax, eax
00401272  test        eax,eax 
    83: 				jle         LOOP_END
00401274  jle         LOOP_END (4012A8h) 
00401276  jmp         LOOP_START (401280h) 
00401278  lea         esp,[esp] 
0040127F  nop              
    84: 				align 16
    85: LOOP_START:
    86: 				cvttsd2si   edi, xmm2
00401280  cvttsd2si   edi,xmm2 
    87: 				mov         dword ptr [esi], edi
00401284  mov         dword ptr [esi],edi 
    88: 
    89: 	//   244: 			plot_points.setY((qint32)(data->y()) * normalized_zoomed_height + offset_height);
    90: 				cvtsi2sd    xmm4, dword ptr [ecx]
00401286  cvtsi2sd    xmm4,dword ptr [ecx] 
    91: 	//   243: 			offset += num_pixels_per_point;
    92: 				addsd       xmm2, xmm6
0040128A  addsd       xmm2,xmm6 
    93: 				mulsd       xmm4, xmm3
0040128E  mulsd       xmm4,xmm3 
    94: 				add         esi, 4
00401292  add         esi,4 
    95: 				addsd       xmm4, xmm5
00401295  addsd       xmm4,xmm5 
    96: 				add         ecx, 8
00401299  add         ecx,8 
    97: 				cvttsd2si   edi, xmm4
0040129C  cvttsd2si   edi,xmm4 
    98: 				mov         dword ptr [esi], edi
004012A0  mov         dword ptr [esi],edi 
    99: 				add         esi, 4
004012A2  add         esi,4 
   100: 	//   245: 			data++;
   101: 				dec         eax
004012A5  dec         eax  
   102: 				jnz         LOOP_START
004012A6  jne         LOOP_START (401280h) 
   103: 	//   246: 		}
   104: LOOP_END:
   105: 			}
   106: 		}[/plain]

Thanks in advance,

Djordje

0 Kudos
2 Replies
djordje_v_petrovic
770 Views

Hi all,

It seems the main difference is here. Using

movapd xmm4,xmm1
addsd xmm4,xmm0
movapd xmm0,xmm4

instead of

addsd xmm0,xmm1

makes the compiler-generated loop perform significantly faster than the hand-coded version.

Any ideas as to why this is happening?

Thanks,

Djordje

0 Kudos
Taronyu
Beginner
770 Views
I'm not very deep into this low-level stuff, but my guess would be that it has something to do with either the CPU pipeline or caching. I would recommend consulting the Intel Software Optimization Guide for information on these topics; it's very helpful.
0 Kudos
Reply