- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hello all,
First of all, I'd like to let you know that I'm new to IA-32 assembly programming, so any tips regarding the following code would be welcome.
I've got a piece of code generated by the VS C++ compiler running about 28% faster than my hand-coded, "optimized" piece, and I can't figure out why.
Also, I'd like to know if there's an easy way to see/count the gaps in my pipeline and/or see what the scheduler does to my code. (I've tried interleaving independent instruction sequences in hopes of reducing the cycle count, but with no observable improvement, possibly because the scheduler already does that.)
I'm measuring the cycle count by executing CPUID; RDTSC before and after the code in question and subtracting the results. Prior to this, I issue the following calls:
[cpp]// Reduce scheduling noise before taking the CPUID/RDTSC measurements.
// SetPriorityClass and SetProcessPriorityBoost operate on a *process* handle,
// so they must be given GetCurrentProcess(); passing the GetCurrentThread()
// pseudo-handle makes both calls fail and leaves the priority class unchanged.
SetPriorityClass (GetCurrentProcess(), REALTIME_PRIORITY_CLASS);
SetThreadPriority (GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL);
// NOTE(review): the second argument is bDisablePriorityBoost, so 0 leaves
// dynamic priority boosting enabled - confirm that this is what is intended.
SetProcessPriorityBoost (GetCurrentProcess(), 0);
// Pin the thread to logical CPU 0 so both RDTSC reads come from the same core.
SetThreadAffinityMask (GetCurrentThread(), 0x1);[/cpp]
as advised by Peter Kankowski in his article.
One more thing: The CPU I'm running/testing the code on is a T5470 (Core 2 Duo @ 1.6 GHz).
Compiler generated version of the code:
[plain] 108: {
109: for (qint32 i = 0; i < (qint32)num_data_points; i++)
004012AD cvttsd2si edx,mmword ptr [ebp+38h]
004012B2 xor eax,eax
004012B4 test edx,edx
004012B6 jle LOOP_END+59h (401301h)
004012B8 mov ecx,dword ptr [ebp+64h]
004012BB movsd xmm0,mmword ptr [ebp]
004012C0 movsd xmm1,mmword ptr [ebp-1Ch]
004012C5 movsd xmm2,mmword ptr [ebp+18h]
004012CA movsd xmm3,mmword ptr [ebp+28h]
004012CF add ecx,4
110: {
111: plot_points.setX((qint32)data_offset);
004012D2 cvttsd2si edi,xmm0
004012D6 mov dword ptr [esi+eax*8],edi
112: data_offset += num_pixels_per_point;
004012D9 movapd xmm4,xmm1
004012DD addsd xmm4,xmm0
004012E1 movapd xmm0,xmm4
113: plot_points.setY((qint32)(data->y()) * normalized_zoomed_height + offset_height);
004012E5 cvtsi2sd xmm4,dword ptr [ecx]
004012E9 mulsd xmm4,xmm2
004012ED addsd xmm4,xmm3
004012F1 cvttsd2si edi,xmm4
004012F5 mov dword ptr [esi+eax*8+4],edi
004012F9 inc eax
114: data++;
004012FA add ecx,8
004012FD cmp eax,edx
004012FF jl LOOP_END+2Ah (4012D2h)
115: }
116: }[/plain] My hand-coded version:
[plain] 66: {
67: __asm
68: {
69: // 240: for (qint32 i = 0; i < (qint32)num_data_points; i++)
70: //xor eax,eax // i
71: movsd xmm4, num_data_points
0040124C movsd xmm4,mmword ptr [ebp+38h]
72: mov ecx, data
00401251 mov ecx,dword ptr [ebp+64h]
73: add ecx,4 // ptr to data[0].y
00401254 add ecx,4
74: mov esi, plot_points
00401257 mov esi,dword ptr [ebp+5Ch]
75: // 241: {
76: // 242: plot_points.setX((qint32)data_offset);
77: movsd xmm2, data_offset
0040125A movsd xmm2,mmword ptr [ebp]
78: movsd xmm3, normalized_zoomed_height
0040125F movsd xmm3,mmword ptr [ebp+18h]
79: movsd xmm5, offset_height
00401264 movsd xmm5,mmword ptr [ebp+28h]
80: movsd xmm6, num_pixels_per_point
00401269 movsd xmm6,mmword ptr [ebp-1Ch]
81: cvttsd2si eax, xmm4
0040126E cvttsd2si eax,xmm4
82: test eax, eax
00401272 test eax,eax
83: jle LOOP_END
00401274 jle LOOP_END (4012A8h)
00401276 jmp LOOP_START (401280h)
00401278 lea esp,[esp]
0040127F nop
84: align 16
85: LOOP_START:
86: cvttsd2si edi, xmm2
00401280 cvttsd2si edi,xmm2
87: mov dword ptr [esi], edi
00401284 mov dword ptr [esi],edi
88:
89: // 244: plot_points.setY((qint32)(data->y()) * normalized_zoomed_height + offset_height);
90: cvtsi2sd xmm4, dword ptr [ecx]
00401286 cvtsi2sd xmm4,dword ptr [ecx]
91: // 243: offset += num_pixels_per_point;
92: addsd xmm2, xmm6
0040128A addsd xmm2,xmm6
93: mulsd xmm4, xmm3
0040128E mulsd xmm4,xmm3
94: add esi, 4
00401292 add esi,4
95: addsd xmm4, xmm5
00401295 addsd xmm4,xmm5
96: add ecx, 8
00401299 add ecx,8
97: cvttsd2si edi, xmm4
0040129C cvttsd2si edi,xmm4
98: mov dword ptr [esi], edi
004012A0 mov dword ptr [esi],edi
99: add esi, 4
004012A2 add esi,4
100: // 245: data++;
101: dec eax
004012A5 dec eax
102: jnz LOOP_START
004012A6 jne LOOP_START (401280h)
103: // 246: }
104: LOOP_END:
105: }
106: }[/plain] Thanks in advance,
Djordje
Link Copied
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi all,
It seems the main issue is here. Using
movapd xmm4,xmm1
addsd xmm4,xmm0
movapd xmm0,xmm4
instead of
addsd xmm0,xmm1
makes the compiler-generated loop perform significantly faster than the hand-coded version.
Any ideas as to why this is happening?
Thanks,
Djordje
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page