- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi,
my email:zhoudka@gmail.com
my msn:dk_zhou@hotmail.com
There are two attached files. One is generated by gcc with the -O2 optimization option (nbody.s.1); the other is modified from nbody.s.1 (nbody.s), and in it I use SSE instructions in place of the x87 instructions.
My version uses standard x86 instructions and SSE instructions to do the calculation; the gcc version, compiled with the -O2 optimization option, accomplishes the same function.
My code is 104 lines of assembly, and gcc's code is 105 lines of assembly.
My binary code is 356 bytes, and gcc's code is 233 bytes.
I call the function 10000000 times.
My code is slower than gcc's code by about half the time. Why?
Can somebody give me some ideas?
gcc's code:
00000008
8: d9 ee fldz
a: 55 push %ebp
b: 89 e5 mov %esp,%ebp
d: 83 ec 1c sub $0x1c,%esp
10: 57 push %edi
11: 56 push %esi
12: 53 push %ebx
13: 31 d2 xor %edx,%edx
15: 3b 55 0c cmp 0xc(%ebp),%edx
18: 0f 8d c5 00 00 00 jge e3
1e: 66 90 xchg %ax,%ax
20: 8b 4d 10 mov 0x10(%ebp),%ecx
23: 8d 04 d5 00 00 00 00 lea 0x0(,%edx,8),%eax
2a: 29 d0 sub %edx,%eax
2c: 8d 34 c1 lea (%ecx,%eax,8),%esi
2f: dd 46 18 fldl 0x18(%esi)
32: dd 46 20 fldl 0x20(%esi)
35: d9 c9 fxch %st(1)
37: d8 c8 fmul %st(0),%st
39: d9 c9 fxch %st(1)
3b: d8 c8 fmul %st(0),%st
3d: de c1 faddp %st,%st(1)
3f: dd 46 28 fldl 0x28(%esi)
42: d8 c8 fmul %st(0),%st
44: dd 46 30 fldl 0x30(%esi)
47: dc 0d 00 00 00 00 fmull 0x0
4d: d9 ca fxch %st(2)
4f: de c1 faddp %st,%st(1)
51: de c9 fmulp %st,%st(1)
53: 8b 45 0c mov 0xc(%ebp),%eax
56: 42 inc %edx
57: 89 55 fc mov %edx,-0x4(%ebp)
5a: de c1 faddp %st,%st(1)
5c: 39 c2 cmp %eax,%edx
5e: 7d 77 jge d7
60: 8b 7d 0c mov 0xc(%ebp),%edi
63: 89 d0 mov %edx,%eax
65: c1 e0 03 shl $0x3,%eax
68: 29 d0 sub %edx,%eax
6a: 8d 1c c1 lea (%ecx,%eax,8),%ebx
6d: 29 d7 sub %edx,%edi
6f: 90 nop
70: dd 06 fldl (%esi)
72: dd 46 08 fldl 0x8(%esi)
75: dd 46 10 fldl 0x10(%esi)
78: d9 ca fxch %st(2)
7a: dc 23 fsubl (%ebx)
7c: d9 c9 fxch %st(1)
7e: dc 63 08 fsubl 0x8(%ebx)
81: d9 ca fxch %st(2)
83: dc 63 10 fsubl 0x10(%ebx)
86: d9 c9 fxch %st(1)
88: d8 c8 fmul %st(0),%st
8a: d9 ca fxch %st(2)
8c: d8 c8 fmul %st(0),%st
8e: d9 c9 fxch %st(1)
90: d8 c8 fmul %st(0),%st
92: d9 ca fxch %st(2)
94: de c1 faddp %st,%st(1)
96: de c1 faddp %st,%st(1)
98: d9 c0 fld %st(0)
9a: d9 fa fsqrt
9c: dd e0 fucom %st(0)
9e: df e0 fnstsw %ax
a0: 80 e4 45 and $0x45,%ah
a3: 80 fc 40 cmp $0x40,%ah
a6: 74 1d je c5
a8: dd d8 fstp %st(0)
aa: 83 c4 f8 add $0xfffffff8,%esp
ad: 83 ec 08 sub $0x8,%esp
b0: dd 1c 24 fstpl (%esp)
b3: db 7d f0 fstpt -0x10(%ebp)
b6: e8 fc ff ff ff call b7
bb: 83 c4 10 add $0x10,%esp
be: db 6d f0 fldt -0x10(%ebp)
c1: d9 c9 fxch %st(1)
c3: eb 02 jmp c7
c5: dd d9 fstp %st(1)
c7: dd 46 30 fldl 0x30(%esi)
ca: dc 4b 30 fmull 0x30(%ebx)
cd: de f1 fdivp %st,%st(1)
cf: 83 c3 38 add $0x38,%ebx
d2: de e9 fsubrp %st,%st(1)
d4: 4f dec %edi
d5: 75 99 jne 70
d7: 8b 55 fc mov -0x4(%ebp),%edx
da: 3b 55 0c cmp 0xc(%ebp),%edx
dd: 0f 8c 3d ff ff ff jl 20
e3: 8b 45 08 mov 0x8(%ebp),%eax
e6: dd 18 fstpl (%eax)
e8: 8d 65 d8 lea -0x28(%ebp),%esp
eb: 5b pop %ebx
ec: 5e pop %esi
ed: 5f pop %edi
ee: 89 ec mov %ebp,%esp
f0: 5d pop %ebp
f1: c3 ret
my code:
00000008
8: 55 push %ebp
9: 89 e5 mov %esp,%ebp
b: 83 ec 40 sub $0x40,%esp
e: c7 45 d0 00 00 00 00 movl $0x0,-0x30(%ebp)
15: c7 45 c8 00 00 00 00 movl $0x0,-0x38(%ebp)
1c: c7 45 cc 00 00 00 00 movl $0x0,-0x34(%ebp)
23: bb 00 00 00 00 mov $0x0,%ebx
28: 66 0f 3a 22 fb 02 pinsrd $0x2,%ebx,%xmm7
2e: bb 00 00 00 00 mov $0x0,%ebx
33: 66 0f 3a 22 fb 03 pinsrd $0x3,%ebx,%xmm7
39: 8b 45 d0 mov -0x30(%ebp),%eax
3c: 39 45 0c cmp %eax,0xc(%ebp)
3f: 7f 0c jg 4d
41: 8b 45 08 mov 0x8(%ebp),%eax
44: 66 0f 17 38 movhpd %xmm7,(%eax)
48: e9 11 01 00 00 jmp 15e
4d: 8b 45 d0 mov -0x30(%ebp),%eax
50: 89 c1 mov %eax,%ecx
52: 6b c0 38 imul $0x38,%eax,%eax
55: 41 inc %ecx
56: 89 4d d4 mov %ecx,-0x2c(%ebp)
59: 03 45 10 add 0x10(%ebp),%eax
5c: 89 c1 mov %eax,%ecx
5e: 89 4d d8 mov %ecx,-0x28(%ebp)
61: f2 0f f0 70 28 lddqu 0x28(%eax),%xmm6
66: 0f 12 c6 movhlps %xmm6,%xmm0
69: f2 0f 59 05 38 00 00 mulsd 0x38,%xmm0
70: 00
71: f2 0f f0 68 18 lddqu 0x18(%eax),%xmm5
76: 66 0f 59 ed mulpd %xmm5,%xmm5
7a: f2 0f 59 f6 mulsd %xmm6,%xmm6
7e: 0f 12 cd movhlps %xmm5,%xmm1
81: f2 0f 58 e9 addsd %xmm1,%xmm5
85: f2 0f 58 ee addsd %xmm6,%xmm5
89: f2 0f 59 c5 mulsd %xmm5,%xmm0
8d: 0f 12 cf movhlps %xmm7,%xmm1
90: f2 0f 58 c1 addsd %xmm1,%xmm0
94: 66 0f 13 45 c8 movlpd %xmm0,-0x38(%ebp)
99: 0f 16 f8 movlhps %xmm0,%xmm7
9c: 8b 45 d4 mov -0x2c(%ebp),%eax
9f: 39 45 0c cmp %eax,0xc(%ebp)
a2: 7f 05 jg a9
a4: ff 45 d0 incl -0x30(%ebp)
a7: eb 90 jmp 39
a9: 66 0f 12 7d d4 movlpd -0x2c(%ebp),%xmm7
ae: 66 0f 3a 16 f8 00 pextrd $0x0,%xmm7,%eax
b4: 6b c0 38 imul $0x38,%eax,%eax
b7: 66 0f 3a 16 f9 00 pextrd $0x0,%xmm7,%ecx
bd: 41 inc %ecx
be: 89 4d d4 mov %ecx,-0x2c(%ebp)
c1: 03 45 10 add 0x10(%ebp),%eax
c4: 89 c1 mov %eax,%ecx
c6: 89 4d dc mov %ecx,-0x24(%ebp)
c9: 66 0f 3a 16 fa 01 pextrd $0x1,%xmm7,%edx
cf: 66 0f 12 02 movlpd (%edx),%xmm0
d3: 66 0f 3a 16 fb 01 pextrd $0x1,%xmm7,%ebx
d9: 66 0f 16 43 08 movhpd 0x8(%ebx),%xmm0
de: f2 0f f0 30 lddqu (%eax),%xmm6
e2: 66 0f 5c c6 subpd %xmm6,%xmm0
e6: 66 0f 12 48 30 movlpd 0x30(%eax),%xmm1
eb: 0f 16 c8 movlhps %xmm0,%xmm1
ee: 66 0f 3a 16 fb 01 pextrd $0x1,%xmm7,%ebx
f4: 66 0f 12 53 30 movlpd 0x30(%ebx),%xmm2
f9: 0f 16 d0 movlhps %xmm0,%xmm2
fc: 66 0f 59 ca mulpd %xmm2,%xmm1
100: 66 0f 13 45 e0 movlpd %xmm0,-0x20(%ebp)
105: 66 0f 3a 16 fa 01 pextrd $0x1,%xmm7,%edx
10b: 66 0f 12 52 10 movlpd 0x10(%edx),%xmm2
110: f2 0f 5c 50 10 subsd 0x10(%eax),%xmm2
115: 66 0f 3a 0d c2 01 blendpd $0x1,%xmm2,%xmm0
11b: 66 0f 3a 0d d0 02 blendpd $0x2,%xmm0,%xmm2
121: 66 0f 59 c2 mulpd %xmm2,%xmm0
125: 66 0f 17 55 e8 movhpd %xmm2,-0x18(%ebp)
12a: 66 0f 13 55 f0 movlpd %xmm2,-0x10(%ebp)
12f: 0f 12 d1 movhlps %xmm1,%xmm2
132: 0f 12 d8 movhlps %xmm0,%xmm3
135: f2 0f 58 d3 addsd %xmm3,%xmm2
139: f2 0f 58 d0 addsd %xmm0,%xmm2
13d: f2 0f 51 c2 sqrtsd %xmm2,%xmm0
141: f2 0f 5e c8 divsd %xmm0,%xmm1
145: 66 0f 13 45 f8 movlpd %xmm0,-0x8(%ebp)
14a: 0f 12 c7 movhlps %xmm7,%xmm0
14d: f2 0f 5c c1 subsd %xmm1,%xmm0
151: 66 0f 13 45 c8 movlpd %xmm0,-0x38(%ebp)
156: 0f 16 f8 movlhps %xmm0,%xmm7
159: e9 3e ff ff ff jmp 9c
15e: 89 ec mov %ebp,%esp
160: 5d pop %ebp
161: c3 ret
My code modifies esi, edi, and ebx, so if you want to call the function, you have to save them first!
My code uses a global constant:
LDouble0$inline:
.long 0x0,0x3fe00000
Link Copied
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Thanks for your question. I'll move this thread to our CPU instructions forum, where more of the Intel engineers and the community Black Belts can have a look at it.
Best regards,
==
Aubrey W.
Intel Software Network Support
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Instead of just attempting to optimize a scalar function, you should try to process things in parallel to benefit from SSE.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Whether SSE instructions can be used is decided by the logic of the source code. I have tried to use packed SSE instructions.
Why is the denser code more efficient? Is it related to the pipeline?
Using parallelism is another option, but my job is to optimize a compiler, so I want to make a single thread faster!
Using gcc is another choice, but I am trying to make a better compiler than gcc.
May I ask another question:
If I use packed SSE instructions to handle non-contiguous integer, float, or double math operations, will it improve performance?
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
If there is a simple way to improve performance by changing from x87 to packed sse, any auto-vectorizing compiler should give you a good starting point. Many here have devoted much of their career to making Intel icc a "better compiler than gcc," but gcc is a good compiler if you give it a chance.
As much of the time spent might be expected to occur in the divide/sqrt sequence, that would appear to be the first place to attempt optimization.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page