Hi, I am optimizing a compiler to use more SSE instructions

dk_zhou · ‎02-26-2011

Hi,

my email:zhoudka@gmail.com

my msn:dk_zhou@hotmail.com

There are two attached files. One is generated by gcc with the optimization option -O2 (nbody.s.1); the other is modified from nbody.s.1 (nbody.s), where I use SSE instructions in place of the x87 instructions.

I use standard x86 instructions and SSE instructions to do the calculation, and I use gcc with the -O2 optimization option to accomplish the same function.

My code is 104 lines of assembly, and gcc's code is 105 lines of assembly.

My binary code is 356 bytes, and gcc's code is 233 bytes.

I call the function 10000000 times.

My code is about 50% slower than gcc's code. Why?

Can somebody give me some ideas?

gcc's code:

00000008 :
8: d9 ee fldz
a: 55 push %ebp
b: 89 e5 mov %esp,%ebp
d: 83 ec 1c sub $0x1c,%esp
10: 57 push %edi
11: 56 push %esi
12: 53 push %ebx
13: 31 d2 xor %edx,%edx
15: 3b 55 0c cmp 0xc(%ebp),%edx
18: 0f 8d c5 00 00 00 jge e3
1e: 66 90 xchg %ax,%ax
20: 8b 4d 10 mov 0x10(%ebp),%ecx
23: 8d 04 d5 00 00 00 00 lea 0x0(,%edx,8),%eax
2a: 29 d0 sub %edx,%eax
2c: 8d 34 c1 lea (%ecx,%eax,8),%esi
2f: dd 46 18 fldl 0x18(%esi)
32: dd 46 20 fldl 0x20(%esi)
35: d9 c9 fxch %st(1)
37: d8 c8 fmul %st(0),%st
39: d9 c9 fxch %st(1)
3b: d8 c8 fmul %st(0),%st
3d: de c1 faddp %st,%st(1)
3f: dd 46 28 fldl 0x28(%esi)
42: d8 c8 fmul %st(0),%st
44: dd 46 30 fldl 0x30(%esi)
47: dc 0d 00 00 00 00 fmull 0x0
4d: d9 ca fxch %st(2)
4f: de c1 faddp %st,%st(1)
51: de c9 fmulp %st,%st(1)
53: 8b 45 0c mov 0xc(%ebp),%eax
56: 42 inc %edx
57: 89 55 fc mov %edx,-0x4(%ebp)
5a: de c1 faddp %st,%st(1)

5c: 39 c2 cmp %eax,%edx
5e: 7d 77 jge d7
60: 8b 7d 0c mov 0xc(%ebp),%edi
63: 89 d0 mov %edx,%eax
65: c1 e0 03 shl $0x3,%eax
68: 29 d0 sub %edx,%eax
6a: 8d 1c c1 lea (%ecx,%eax,8),%ebx
6d: 29 d7 sub %edx,%edi
6f: 90 nop
70: dd 06 fldl (%esi)
72: dd 46 08 fldl 0x8(%esi)
75: dd 46 10 fldl 0x10(%esi)
78: d9 ca fxch %st(2)
7a: dc 23 fsubl (%ebx)
7c: d9 c9 fxch %st(1)
7e: dc 63 08 fsubl 0x8(%ebx)
81: d9 ca fxch %st(2)
83: dc 63 10 fsubl 0x10(%ebx)
86: d9 c9 fxch %st(1)
88: d8 c8 fmul %st(0),%st
8a: d9 ca fxch %st(2)
8c: d8 c8 fmul %st(0),%st
8e: d9 c9 fxch %st(1)
90: d8 c8 fmul %st(0),%st
92: d9 ca fxch %st(2)
94: de c1 faddp %st,%st(1)
96: de c1 faddp %st,%st(1)
98: d9 c0 fld %st(0)
9a: d9 fa fsqrt
9c: dd e0 fucom %st(0)
9e: df e0 fnstsw %ax
a0: 80 e4 45 and $0x45,%ah
a3: 80 fc 40 cmp $0x40,%ah
a6: 74 1d je c5
a8: dd d8 fstp %st(0)
aa: 83 c4 f8 add $0xfffffff8,%esp
ad: 83 ec 08 sub $0x8,%esp
b0: dd 1c 24 fstpl (%esp)
b3: db 7d f0 fstpt -0x10(%ebp)
b6: e8 fc ff ff ff call b7
bb: 83 c4 10 add $0x10,%esp
be: db 6d f0 fldt -0x10(%ebp)
c1: d9 c9 fxch %st(1)
c3: eb 02 jmp c7

c5: dd d9 fstp %st(1)
c7: dd 46 30 fldl 0x30(%esi)
ca: dc 4b 30 fmull 0x30(%ebx)
cd: de f1 fdivp %st,%st(1)
cf: 83 c3 38 add $0x38,%ebx
d2: de e9 fsubrp %st,%st(1)
d4: 4f dec %edi
d5: 75 99 jne 70
d7: 8b 55 fc mov -0x4(%ebp),%edx
da: 3b 55 0c cmp 0xc(%ebp),%edx
dd: 0f 8c 3d ff ff ff jl 20
e3: 8b 45 08 mov 0x8(%ebp),%eax
e6: dd 18 fstpl (%eax)
e8: 8d 65 d8 lea -0x28(%ebp),%esp
eb: 5b pop %ebx
ec: 5e pop %esi
ed: 5f pop %edi
ee: 89 ec mov %ebp,%esp
f0: 5d pop %ebp
f1: c3 ret

my code:

00000008 :
8: 55 push %ebp
9: 89 e5 mov %esp,%ebp
b: 83 ec 40 sub $0x40,%esp
e: c7 45 d0 00 00 00 00 movl $0x0,-0x30(%ebp)
15: c7 45 c8 00 00 00 00 movl $0x0,-0x38(%ebp)
1c: c7 45 cc 00 00 00 00 movl $0x0,-0x34(%ebp)
23: bb 00 00 00 00 mov $0x0,%ebx
28: 66 0f 3a 22 fb 02 pinsrd $0x2,%ebx,%xmm7
2e: bb 00 00 00 00 mov $0x0,%ebx
33: 66 0f 3a 22 fb 03 pinsrd $0x3,%ebx,%xmm7
39: 8b 45 d0 mov -0x30(%ebp),%eax
3c: 39 45 0c cmp %eax,0xc(%ebp)
3f: 7f 0c jg 4d
41: 8b 45 08 mov 0x8(%ebp),%eax
44: 66 0f 17 38 movhpd %xmm7,(%eax)
48: e9 11 01 00 00 jmp 15e
4d: 8b 45 d0 mov -0x30(%ebp),%eax
50: 89 c1 mov %eax,%ecx
52: 6b c0 38 imul $0x38,%eax,%eax
55: 41 inc %ecx
56: 89 4d d4 mov %ecx,-0x2c(%ebp)
59: 03 45 10 add 0x10(%ebp),%eax
5c: 89 c1 mov %eax,%ecx
5e: 89 4d d8 mov %ecx,-0x28(%ebp)
61: f2 0f f0 70 28 lddqu 0x28(%eax),%xmm6
66: 0f 12 c6 movhlps %xmm6,%xmm0
69: f2 0f 59 05 38 00 00 mulsd 0x38,%xmm0
70: 00
71: f2 0f f0 68 18 lddqu 0x18(%eax),%xmm5
76: 66 0f 59 ed mulpd %xmm5,%xmm5
7a: f2 0f 59 f6 mulsd %xmm6,%xmm6
7e: 0f 12 cd movhlps %xmm5,%xmm1
81: f2 0f 58 e9 addsd %xmm1,%xmm5
85: f2 0f 58 ee addsd %xmm6,%xmm5
89: f2 0f 59 c5 mulsd %xmm5,%xmm0
8d: 0f 12 cf movhlps %xmm7,%xmm1

90: f2 0f 58 c1 addsd %xmm1,%xmm0
94: 66 0f 13 45 c8 movlpd %xmm0,-0x38(%ebp)
99: 0f 16 f8 movlhps %xmm0,%xmm7
9c: 8b 45 d4 mov -0x2c(%ebp),%eax
9f: 39 45 0c cmp %eax,0xc(%ebp)
a2: 7f 05 jg a9
a4: ff 45 d0 incl -0x30(%ebp)
a7: eb 90 jmp 39
a9: 66 0f 12 7d d4 movlpd -0x2c(%ebp),%xmm7
ae: 66 0f 3a 16 f8 00 pextrd $0x0,%xmm7,%eax
b4: 6b c0 38 imul $0x38,%eax,%eax
b7: 66 0f 3a 16 f9 00 pextrd $0x0,%xmm7,%ecx
bd: 41 inc %ecx
be: 89 4d d4 mov %ecx,-0x2c(%ebp)
c1: 03 45 10 add 0x10(%ebp),%eax
c4: 89 c1 mov %eax,%ecx
c6: 89 4d dc mov %ecx,-0x24(%ebp)
c9: 66 0f 3a 16 fa 01 pextrd $0x1,%xmm7,%edx
cf: 66 0f 12 02 movlpd (%edx),%xmm0
d3: 66 0f 3a 16 fb 01 pextrd $0x1,%xmm7,%ebx
d9: 66 0f 16 43 08 movhpd 0x8(%ebx),%xmm0
de: f2 0f f0 30 lddqu (%eax),%xmm6
e2: 66 0f 5c c6 subpd %xmm6,%xmm0
e6: 66 0f 12 48 30 movlpd 0x30(%eax),%xmm1
eb: 0f 16 c8 movlhps %xmm0,%xmm1
ee: 66 0f 3a 16 fb 01 pextrd $0x1,%xmm7,%ebx
f4: 66 0f 12 53 30 movlpd 0x30(%ebx),%xmm2
f9: 0f 16 d0 movlhps %xmm0,%xmm2
fc: 66 0f 59 ca mulpd %xmm2,%xmm1
100: 66 0f 13 45 e0 movlpd %xmm0,-0x20(%ebp)
105: 66 0f 3a 16 fa 01 pextrd $0x1,%xmm7,%edx
10b: 66 0f 12 52 10 movlpd 0x10(%edx),%xmm2
110: f2 0f 5c 50 10 subsd 0x10(%eax),%xmm2
115: 66 0f 3a 0d c2 01 blendpd $0x1,%xmm2,%xmm0
11b: 66 0f 3a 0d d0 02 blendpd $0x2,%xmm0,%xmm2
121: 66 0f 59 c2 mulpd %xmm2,%xmm0
125: 66 0f 17 55 e8 movhpd %xmm2,-0x18(%ebp)
12a: 66 0f 13 55 f0 movlpd %xmm2,-0x10(%ebp)
12f: 0f 12 d1 movhlps %xmm1,%xmm2
132: 0f 12 d8 movhlps %xmm0,%xmm3
135: f2 0f 58 d3 addsd %xmm3,%xmm2
139: f2 0f 58 d0 addsd %xmm0,%xmm2
13d: f2 0f 51 c2 sqrtsd %xmm2,%xmm0
141: f2 0f 5e c8 divsd %xmm0,%xmm1
145: 66 0f 13 45 f8 movlpd %xmm0,-0x8(%ebp)

14a: 0f 12 c7 movhlps %xmm7,%xmm0
14d: f2 0f 5c c1 subsd %xmm1,%xmm0
151: 66 0f 13 45 c8 movlpd %xmm0,-0x38(%ebp)
156: 0f 16 f8 movlhps %xmm0,%xmm7
159: e9 3e ff ff ff jmp 9c
15e: 89 ec mov %ebp,%esp
160: 5d pop %ebp
161: c3 ret

My code modifies esi, edi, and ebx, so if you want to call the function, you have to save them first!

My code uses a global constant:

LDouble0$inline:
.long 0x0,0x3fe00000

Aubrey_W_ · ‎03-10-2011

Hello,

Thanks for your question. I'll move this thread to our CPU instructions forum, where more of the Intel engineers and the community Black Belts can have a look at it.

Best regards,

==
Aubrey W.
Intel Software Network Support

TimP · ‎03-10-2011

Why not have gcc generate sse code? Recent gcc, particularly current and near future releases, do a good job. Gains with sse are usually achieved by performing the same job with fewer instructions.

capens__nicolas · ‎03-10-2011

I'm seeing mostly scalar SSE instructions in your code. That means it's not going to be any faster than x87 code. The reason you're actually seeing lower performance is probably due to the many extract/insert/shuffle kinds of instructions. With x87, fxch is free, but all those data movement SSE instructions take extra cycles. Also, the x87 code is much denser, so you might be seeing an instruction cache or decoding benefit too.

Instead of just attempting to optimize a scalar function, you should try to process things in parallel to benefit from SSE.

dk_zhou · ‎03-26-2011

Thanks for all the replies.
Whether to use SSE instructions is decided by the logic of the source code. I have tried to use packed SSE instructions.
Why is denser code more efficient? Is it related to the pipeline?

Using parallelism is another option, but my job is to optimize a compiler, so I want to make a single thread faster!

Using gcc is another choice, but I am trying to make a better compiler than gcc.

May I ask another question:

If I use packed SSE instructions to handle non-contiguous integer, float, or double math operations, will it improve performance?

TimP · ‎03-26-2011

If there is a simple way to improve performance by changing from x87 to packed sse, any auto-vectorizing compiler should give you a good starting point. Many here have devoted much of their career to making Intel icc a "better compiler than gcc," but gcc is a good compiler if you give it a chance.

As much of the time spent might be expected to occur in the divide sqrt sequence, that would appear to be the first place to attempt optimization.

capens__nicolas · ‎03-28-2011

If you aim to outperform GCC I can highly recommend working on LLVM. It is a very modular compiler framework, which makes it very easy to write your own optimization passes.

Auto-vectorization still has mixed results though, with any compiler. In general the developer has to be well aware of the limitations to get good results. And even if the generated assembly code is as expected, the performance can be disappointing. Hardware support for gather and scatter instructions would revolutionize auto-vectorization though, as they're the parallel equivalent of load and store instructions. It would make a lot more loops amenable to parallel execution.

Anyway, could you post the source code for your function? That might help us help you optimize this code for SSE.

Hi, I am optimizing a compiler to use more SSE instructions, but I get the result that the SSE instructions are about 50% slower than the x87 instructions. Do you have any comments? Thanks.