- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hello all,
First of all, I'd like to let you know that I'm new to IA-32 assembly programming, so any tips regarding the following code would be welcome.
I've got a piece of code generated by the VS C++ compiler that runs about 28% faster than my hand-coded, "optimized" version, and I can't figure out why.
Also, I'd like to know if there's an easy way to see/count the gaps in my pipeline and/or see what the scheduler does to my code. (I've tried interleaving independent instruction sequences in the hope of reducing the cycle count, but saw no observable improvement, possibly because the scheduler already does that.)
I'm measuring the cycle count by executing CPUID; RDTSC before and after the code in question and subtracting the results. Prior to this, I issue the following calls:
[cpp]SetPriorityClass (GetCurrentThread(), REALTIME_PRIORITY_CLASS); SetThreadPriority (GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL); SetProcessPriorityBoost (GetCurrentThread(), 0); SetThreadAffinityMask (GetCurrentThread(), 0x1);[/cpp]
as advised by Peter Kankowski in his article.
One more thing: the CPU I'm running/testing the code on is a T5470 (Core 2 Duo @ 1.6 GHz).
Compiler-generated version of the code:
[plain] 108: { 109: for (qint32 i = 0; i < (qint32)num_data_points; i++) 004012AD cvttsd2si edx,mmword ptr [ebp+38h] 004012B2 xor eax,eax 004012B4 test edx,edx 004012B6 jle LOOP_END+59h (401301h) 004012B8 mov ecx,dword ptr [ebp+64h] 004012BB movsd xmm0,mmword ptr [ebp] 004012C0 movsd xmm1,mmword ptr [ebp-1Ch] 004012C5 movsd xmm2,mmword ptr [ebp+18h] 004012CA movsd xmm3,mmword ptr [ebp+28h] 004012CF add ecx,4 110: { 111: plot_points.setX((qint32)data_offset); 004012D2 cvttsd2si edi,xmm0 004012D6 mov dword ptr [esi+eax*8],edi 112: data_offset += num_pixels_per_point; 004012D9 movapd xmm4,xmm1 004012DD addsd xmm4,xmm0 004012E1 movapd xmm0,xmm4 113: plot_points.setY((qint32)(data->y()) * normalized_zoomed_height + offset_height); 004012E5 cvtsi2sd xmm4,dword ptr [ecx] 004012E9 mulsd xmm4,xmm2 004012ED addsd xmm4,xmm3 004012F1 cvttsd2si edi,xmm4 004012F5 mov dword ptr [esi+eax*8+4],edi 004012F9 inc eax 114: data++; 004012FA add ecx,8 004012FD cmp eax,edx 004012FF jl LOOP_END+2Ah (4012D2h) 115: } 116: }[/plain]
My hand-coded version:
[plain] 66: { 67: __asm 68: { 69: // 240: for (qint32 i = 0; i < (qint32)num_data_points; i++) 70: //xor eax,eax // i 71: movsd xmm4, num_data_points 0040124C movsd xmm4,mmword ptr [ebp+38h] 72: mov ecx, data 00401251 mov ecx,dword ptr [ebp+64h] 73: add ecx,4 // ptr to data[0].y 00401254 add ecx,4 74: mov esi, plot_points 00401257 mov esi,dword ptr [ebp+5Ch] 75: // 241: { 76: // 242: plot_points.setX((qint32)data_offset); 77: movsd xmm2, data_offset 0040125A movsd xmm2,mmword ptr [ebp] 78: movsd xmm3, normalized_zoomed_height 0040125F movsd xmm3,mmword ptr [ebp+18h] 79: movsd xmm5, offset_height 00401264 movsd xmm5,mmword ptr [ebp+28h] 80: movsd xmm6, num_pixels_per_point 00401269 movsd xmm6,mmword ptr [ebp-1Ch] 81: cvttsd2si eax, xmm4 0040126E cvttsd2si eax,xmm4 82: test eax, eax 00401272 test eax,eax 83: jle LOOP_END 00401274 jle LOOP_END (4012A8h) 00401276 jmp LOOP_START (401280h) 00401278 lea esp,[esp] 0040127F nop 84: align 16 85: LOOP_START: 86: cvttsd2si edi, xmm2 00401280 cvttsd2si edi,xmm2 87: mov dword ptr [esi], edi 00401284 mov dword ptr [esi],edi 88: 89: // 244: plot_points.setY((qint32)(data->y()) * normalized_zoomed_height + offset_height); 90: cvtsi2sd xmm4, dword ptr [ecx] 00401286 cvtsi2sd xmm4,dword ptr [ecx] 91: // 243: offset += num_pixels_per_point; 92: addsd xmm2, xmm6 0040128A addsd xmm2,xmm6 93: mulsd xmm4, xmm3 0040128E mulsd xmm4,xmm3 94: add esi, 4 00401292 add esi,4 95: addsd xmm4, xmm5 00401295 addsd xmm4,xmm5 96: add ecx, 8 00401299 add ecx,8 97: cvttsd2si edi, xmm4 0040129C cvttsd2si edi,xmm4 98: mov dword ptr [esi], edi 004012A0 mov dword ptr [esi],edi 99: add esi, 4 004012A2 add esi,4 100: // 245: data++; 101: dec eax 004012A5 dec eax 102: jnz LOOP_START 004012A6 jne LOOP_START (401280h) 103: // 246: } 104: LOOP_END: 105: } 106: }[/plain]
Thanks in advance,
Djordje
Link Copied
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi all,
It seems the main difference is this: using
movapd xmm4,xmm1
addsd xmm4,xmm0
movapd xmm0,xmm4
instead of
addsd xmm0,xmm1
makes the compiler-generated loop perform significantly faster than the hand-coded version.
Any ideas as to why this is happening?
Thanks,
Djordje
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page