Software Archive
Read-only legacy content

(pain reloaded) Why isn't performance much better in Euclidean distance with SSE2?

dario_mx
Beginner
Hello,

I am trying to improve the performance of a very simple program by using SSE2. The program snippet only needs to calculate the distance between all pairs of a set of N points (I know I could take advantage of symmetry, but for now I am focusing on low-level vectorization).

I am using intrinsics and have examined the generated assembler a bit (though I am far from being an expert), and it looks fine. However, the runtime is only about 10% lower than the original version's. What am I missing?

Thanks.


The program is the following:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <emmintrin.h>

#define SSE2_ALIGNED __attribute__ ((aligned (16)))
#define print_y(x) printf("%d ",(int) x)
#define print_n(x) x
#define print print_n

int D;
float *X,*Y;

inline static void dist(int i,int j)
{
float xd = X[i] - X[j];
float yd = Y[i] - Y[j];
print(rint(xd*xd + yd*yd));
}

inline static void dist_sse(int i)
{
float d[8] SSE2_ALIGNED;
int j;
__m128 xmm0 = _mm_set1_ps(X[i]);
__m128 xmm1 = xmm0;
__m128 xmm2 = _mm_set1_ps(Y[i]);
__m128 xmm3 = xmm2;
__m128 xmm4,xmm5,xmm6,xmm7;
for(j=0; j<D; j+=8)
{
xmm4 =_mm_load_ps(X+j);
xmm5 =_mm_load_ps(X+j+4);
xmm6 =_mm_load_ps(Y+j);
xmm7 =_mm_load_ps(Y+j+4);
xmm4 = _mm_sub_ps(xmm0,xmm4);
xmm5 = _mm_sub_ps(xmm1,xmm5);
xmm6 = _mm_sub_ps(xmm2,xmm6);
xmm7 = _mm_sub_ps(xmm3,xmm7);
xmm4 = _mm_mul_ps(xmm4,xmm4);
xmm5 = _mm_mul_ps(xmm5,xmm5);
xmm6 = _mm_mul_ps(xmm6,xmm6);
xmm7 = _mm_mul_ps(xmm7,xmm7);
xmm4 = _mm_add_ps(xmm4,xmm6);
xmm5 = _mm_add_ps(xmm5,xmm7);
_mm_store_ps(d,xmm4);
_mm_store_ps(d+4,xmm5);
print(rint(d[0]));
print(rint(d[1]));
print(rint(d[2]));
print(rint(d[3]));
print(rint(d[4]));
print(rint(d[5]));
print(rint(d[6]));
print(rint(d[7]));
}
}

int main(int argc, char * argv[])
{
int i,j,opc;

if ( argc != 3 )
{
fprintf(stderr," Usage: %s ",argv[0]);
return 1;
}

opc = atoi(argv[1]);
D = atoi(argv[2]);

if ( D %8 != 0 )
{
fprintf(stderr," Dimension %d must be multiple of 8: ",D);
return 2;
}

if ( opc == 0 )
{
X = (float *) malloc(D * sizeof(float));
Y = (float *) malloc(D * sizeof(float));
}
else
{
X = (float *) _mm_malloc(D * sizeof(float), 16);
Y = (float *) _mm_malloc(D * sizeof(float), 16);
}

for(i=0;i<D;i++)
{
X[i] = i;
Y[i] = D - i;
}

if ( opc == 0 )
for(i=0;i<D;i++) for(j=0;j<D;j++) dist(i,j);
else
for(i=0;i<D;i++) dist_sse(i);

return 0;
}

I am compiling with:

CC = gcc
CFLAGS = -O3 -Wall -march=pentium-m -msse2

all: kk

And generated assembler is:

	.file	"kk.c"
.def ___main; .scl 2; .type 32; .endef
.section .rdata,"dr"
LC0:
.ascii "12Usage: %s 1212�"
.align 4
LC1:
.ascii "12Dimension %d must be multiple of 8: 1212�"
.text
.p2align 4,,15
.globl _main
.def _main; .scl 2; .type 32; .endef
_main:
pushl %ebp
movl $16, %eax
movl %esp, %ebp
pushl %edi
pushl %esi
pushl %ebx
subl $108, %esp
movl 12(%ebp), %ebx
andl $-16, %esp
call __alloca
call ___main
cmpl $3, 8(%ebp)
je L2
call ___getreent
movl (%ebx), %esi
movl $LC0, %ecx
movl %ecx, 4(%esp)
movl %esi, 8(%esp)
movl 12(%eax), %edx
movl %edx, (%esp)
call _fprintf
movl $1, %eax
leal -12(%ebp), %esp
L90:
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
L2:
movl 4(%ebx), %edi
movl %edi, (%esp)
call _atoi
movl %eax, %edi
movl 8(%ebx), %eax
movl %eax, (%esp)
call _atoi
movl %eax, _D
testb $7, %al
movl %eax, %ecx
jne L82
testl %edi, %edi
je L83
xorl %edx, %edx
sall $2, %eax
jne L84
L7:
movl %edx, _X
movl %ecx, %eax
xorl %edx, %edx
sall $2, %eax
jne L85
L12:
movl %edx, _Y
movl %edx, %ebx
L5:
xorl %edx, %edx
cmpl %ecx, %edx
jge L59
movl _X, %esi
.p2align 4,,15
L19:
movl %ecx, %eax
cvtsi2ss %edx, %xmm1
subl %edx, %eax
cvtsi2ss %eax, %xmm0
movss %xmm1, (%esi,%edx,4)
movss %xmm0, (%ebx,%edx,4)
incl %edx
cmpl %ecx, %edx
jl L19
L59:
testl %edi, %edi
jne L20
xorl %esi, %esi
cmpl %ecx, %esi
jge L30
.p2align 4,,15
L92:
xorl %ebx, %ebx
cmpl %ecx, %ebx
jge L63
.p2align 4,,15
L91:
movl _X, %edx
movl _Y, %edi
flds (%edx,%ebx,4)
flds (%edi,%ebx,4)
fxch %st(1)
incl %ebx
fsubrs (%edx,%esi,4)
fxch %st(1)
fsubrs (%edi,%esi,4)
fxch %st(1)
fmul %st(0), %st
fxch %st(1)
fmul %st(0), %st
faddp %st, %st(1)
fstpl (%esp)
call _rint
fstp %st(0)
movl _D, %ecx
cmpl %ecx, %ebx
jl L91
L63:
incl %esi
cmpl %ecx, %esi
jl L92
L30:
leal -12(%ebp), %esp
xorl %eax, %eax
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
L83:
leal 0(,%eax,4), %eax
movl %eax, (%esp)
call _malloc
movl %eax, _X
movl _D, %esi
sall $2, %esi
movl %esi, (%esp)
call _malloc
movl %eax, _Y
movl _D, %ecx
movl %eax, %ebx
jmp L5
L20:
xorl %edi, %edi
cmpl %ecx, %edi
jge L30
.p2align 4,,15
L75:
movl _X, %edx
movl (%edx,%edi,4), %eax
movl %eax, -60(%ebp)
movl (%ebx,%edi,4), %esi
movss -60(%ebp), %xmm2
movl %esi, -64(%ebp)
xorl %esi, %esi
shufps $0, %xmm2, %xmm2
movss -64(%ebp), %xmm4
cmpl %ecx, %esi
movaps %xmm2, -88(%ebp)
shufps $0, %xmm4, %xmm4
movaps %xmm4, -104(%ebp)
jl L76
jmp L66
.p2align 4,,7
L67:
movl _X, %edx
movl _Y, %ebx
L76:
movaps -88(%ebp), %xmm7
leal 0(,%esi,4), %ecx
leal (%ecx,%edx), %edx
movaps (%edx), %xmm5
addl %ebx, %ecx
addl $8, %esi
movaps (%ecx), %xmm6
movaps -104(%ebp), %xmm4
subps %xmm5, %xmm7
movaps %xmm7, %xmm5
movaps 16(%edx), %xmm3
mulps %xmm5, %xmm5
subps %xmm6, %xmm4
movaps 16(%ecx), %xmm1
movaps %xmm4, %xmm7
mulps %xmm4, %xmm7
movaps -88(%ebp), %xmm2
addps %xmm7, %xmm5
movaps -104(%ebp), %xmm0
movaps %xmm5, -56(%ebp)
flds -56(%ebp)
subps %xmm3, %xmm2
subps %xmm1, %xmm0
movaps %xmm2, %xmm3
movaps %xmm0, %xmm6
mulps %xmm2, %xmm3
mulps %xmm0, %xmm6
addps %xmm6, %xmm3
movaps %xmm3, -40(%ebp)
fstpl (%esp)
call _rint
fstp %st(0)
flds -52(%ebp)
fstpl (%esp)
call _rint
fstp %st(0)
flds -48(%ebp)
fstpl (%esp)
call _rint
fstp %st(0)
flds -44(%ebp)
fstpl (%esp)
call _rint
fstp %st(0)
flds -40(%ebp)
fstpl (%esp)
call _rint
fstp %st(0)
flds -36(%ebp)
fstpl (%esp)
call _rint
fstp %st(0)
flds -32(%ebp)
fstpl (%esp)
call _rint
fstp %st(0)
flds -28(%ebp)
fstpl (%esp)
call _rint
fstp %st(0)
movl _D, %ecx
cmpl %ecx, %esi
jl L67
L66:
incl %edi
cmpl %ecx, %edi
jge L30
movl _Y, %ebx
jmp L75
L85:
addl $16, %eax
movl %eax, (%esp)
call _malloc
testl %eax, %eax
movl %eax, %edx
je L78
leal 16(%eax), %ecx
andl $-16, %ecx
movl %ecx, %edx
movl %eax, -4(%ecx)
L78:
movl _D, %ecx
jmp L12
L84:
addl $16, %eax
movl %eax, (%esp)
call _malloc
testl %eax, %eax
movl %eax, %edx
je L77
leal 16(%eax), %ebx
andl $-16, %ebx
movl %ebx, %edx
movl %eax, -4(%ebx)
L77:
movl _D, %ecx
jmp L7
L82:
call ___getreent
movl _D, %ecx
movl $LC1, %edx
movl %edx, 4(%esp)
movl %ecx, 8(%esp)
movl 12(%eax), %ebx
movl %ebx, (%esp)
call _fprintf
movl $2, %eax
leal -12(%ebp), %esp
jmp L90
.comm _D, 16 # 4
.comm _X, 16 # 4
.comm _Y, 16 # 4
.def _atoi; .scl 3; .type 32; .endef
.def ___getreent; .scl 3; .type 32; .endef
.def _fprintf; .scl 3; .type 32; .endef
.def _rint; .scl 3; .type 32; .endef
.def _malloc; .scl 3; .type 32; .endef


9 Replies
TimP
Honored Contributor III
This topic has already been covered extensively in the following public e-mail thread:
http://gcc.gnu.org/ml/gcc-help/2008-04/msg00073.html
It was never clear why a major speedup was expected with printf() in the inner loop, nor whether a vectorizing compiler such as g++ 4.3 or icpc was tried (with printf removed so as to attempt auto-vectorization).
C99 math functions (e.g. rint) presumably are available in g++, with -std=gnu99.
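For illustration, a minimal sketch of the kind of scalar loop such a compiler can try to auto-vectorize once the printf()/rint() call is taken out of it; the helper name and the compile flags are assumptions, not something tested in this thread:

/* Sketch only: a scalar loop written so that a vectorizing compiler
 * (e.g. g++ 4.3 or icpc, as mentioned above) can attempt auto-vectorization
 * once no library call remains inside it.  The function name and the flags
 * are assumptions, e.g.:
 *     gcc -O3 -msse2 -ftree-vectorize -ftree-vectorizer-verbose=2 kk.c
 */
static void dist_row(const float *X, const float *Y, float *Z, int D, int i)
{
    int j;
    float xi = X[i], yi = Y[i];
    for (j = 0; j < D; j++) {
        float xd = xi - X[j];
        float yd = yi - Y[j];
        Z[j] = xd * xd + yd * yd;   /* no printf()/rint() here, so the loop can vectorize */
    }
}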
dario_mx
Beginner
I will read the post ... but meanwhile I can tell you that the printf is conditionally removed from the code. I used it only for debugging the correctness of the vectorized version; for the real timing, of course, I removed it.

Thanks.
dario_mx
Beginner
LOL

The post you mentioned is just the one I made first, before coming here to the Intel forums!

With "extensively covered" you mean the topic was left unanswered? ;-1 Cause thats what happened. I made all suggestions I got, and still no signigicant improvement. The modified version is the one I posted here, at Intel.

There you have the assembler generated by GCC ... there we can see the SSE2 instructions. So, what is wrong with this picture?!

I would expect that, if there is one place on earth with experts in Intel assembler, that place must be here ... hehe. So, please check the assembler and tell me what is wrong.

Thanks.

TimP
Honored Contributor III
People on the gcc-help list did their best to help you define what you wanted to do. If you wanted only to make a non-vector cross between a macro and an inline function, in a style understood only by gcc, with parallel SSE inside that function, then perhaps what you showed was what you wanted.
Otherwise, you could have posted a shorter example, which could be compiled by standard compilers, illustrating your interest.
The rint() function is not recognized as a vectorizable function by either gcc or icc. If you meant it as a substitute for sqrtf(), the latter can be vectorized inline by icc, but apparently not yet by gcc.
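To illustrate that remark, a sketch assuming sqrtf() really is the intended operation (the helper name is hypothetical):

#include <math.h>

/* Sketch only, assuming sqrtf() is what was actually wanted in place of rint():
 * icc can vectorize this loop with an inline sqrtps, whereas rint() forces a
 * scalar library call per element. */
static void dist_row_sqrt(const float *X, const float *Y, float *Z, int D, int i)
{
    int j;
    for (j = 0; j < D; j++) {
        float xd = X[i] - X[j];
        float yd = Y[i] - Y[j];
        Z[j] = sqrtf(xd * xd + yd * yd);
    }
}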
dario_mx
Beginner
Certainly I appreciate all the advice the gcc-help community provided me. Indeed, I followed all of it.

I was not asking for a way to achieve vectorization ... I already did it. Neither was I looking for automatic vectorization ... I coded it manually. We can see the assembler there, generated directly from the intrinsic functions ... it includes SSE2 of course (thus, this is not a question of whether the compiler automatically vectorized or not).

What I am asking for, kindly of course hehe, is advice about why my manually coded vector version does not improve performance significantly (it offers a gain of 10% in runtime, which is very poor considering I am vectorizing the whole thing ... I would expect a gain of about 75%).

Thanks.
Anat_S_Intel
Employee

Hi,

Try changing the allocation scheme to allocate one aligned block for all the data instead of several separate allocations. This will improve data locality in the caches and might gain performance.
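A minimal sketch of that idea, assuming the two arrays of the posted program (the single-block layout and the NULL check are illustrative, not code from this thread):

/* One aligned allocation carved into X and Y instead of two separate
 * _mm_malloc calls; keeps the arrays adjacent in memory. */
float *block = (float *) _mm_malloc(2 * D * sizeof(float), 16);
if (block != NULL)
{
    X = block;      /* first D floats */
    Y = block + D;  /* next D floats; D is a multiple of 8, so Y stays 16-byte aligned */
}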

The code here uses rint(). This part is not parallelized and is similar in the two versions. I don't know what it does, so it is impossible to estimate its duration relative to the other operations. It might be another reason for the low speedup.

Regards

SHIH_K_Intel
Employee

Just to add a few more comments to explain the likely reasons for the lack of speedup you observe.

In addition to what my colleagues have pointed out about calling an external function like rint: the code you've shown has significant overhead from calling scalar external functions. For the purpose of rounding, another integer conversion technique might make more sense than throwing a bunch of rint calls at the end of each hand-vectorized SSE loop. Secondly, depending on the parameter "D" you use when testing, it is possible that in some portion of your loop iterations the rounding of floating-point to integer experiences exceptions. That can produce different amounts of delay in x87 code and SSE code.

I did a quick test by simplifying your code somewhat to use the cvtps2pi intrinsic for rounding. I also modified the scalar C code into two versions to compare the overhead of using rint vs. a simple type-cast conversion.

Using a fixed value of D = 1024 (10^6 scalar loop iterations, each with 2 multiplies, 1 add, and 2 subtracts), compiled with just /O2 on ICC and MSC:

the scalar distance calculation with rint took ~ 40 M cycles

a modified scalar calculation with type cast conversion took ~ 17 M cycles

the modified SSE code with _mm_cvtps_pi32 took ~ 5 M cycles.

There will certainly be variation across compilers and between an external function like rint, a type-cast conversion, and an intrinsic conversion. The value of D you choose and the timing methodology will add further variance to your measured speedup.
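As an aside, a minimal timing sketch using clock() from <time.h>, keeping all I/O outside the measured region; the REPS value and the driver function are assumptions for illustration and reuse D and dist_sse() from the program posted earlier:

#include <stdio.h>
#include <time.h>

#define REPS 1000   /* arbitrary; chosen so clock()'s resolution is not the dominant error */

/* Hypothetical driver: times only the kernel, no printf inside the loop. */
static void time_sse_kernel(void)
{
    clock_t t0, t1;
    int r, i;

    t0 = clock();
    for (r = 0; r < REPS; r++)
        for (i = 0; i < D; i++)
            dist_sse(i);        /* measure the scalar dist() loop the same way */
    t1 = clock();

    printf("elapsed: %.3f s\n", (double) (t1 - t0) / CLOCKS_PER_SEC);
}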

For your reference, the modifications I made are based on accumulating the results of each evaluation of

int dist(int i,int j)
{ float xd = X[i] - X[j];
float yd = Y[i] - Y[j];
int z = rint(xd*xd + yd*yd);
return z;
}

int distB(int i,int j) // compare overhead of rint
{ float xd = X[i] - X[j];
float yd = Y[i] - Y[j];
return (int) (xd*xd + yd*yd);
}

The SSE version replaces rint with _mm_cvtps_pi32 plus a bunch of _mm_add_pi32, so that the differently structured vectorized loop performs the same number of adds as the double-nested scalar loop and accumulates the same result.
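For illustration, a sketch along those lines, but using the SSE2 packed convert _mm_cvtps_epi32 and _mm_add_epi32 rather than the MMX-register _mm_cvtps_pi32/_mm_add_pi32 named above (which avoids having to issue emms); this is not the exact code that produced the cycle counts:

#include <emmintrin.h>

/* Rounding done with a packed convert instead of rint(), accumulating the
 * converted results so the work cannot be optimized away. */
static int dist_sse_accum(const float *X, const float *Y, int D, int i)
{
    __m128  xi  = _mm_set1_ps(X[i]);
    __m128  yi  = _mm_set1_ps(Y[i]);
    __m128i acc = _mm_setzero_si128();
    int j, sum[4];

    for (j = 0; j < D; j += 4) {
        __m128 xd = _mm_sub_ps(xi, _mm_load_ps(X + j));
        __m128 yd = _mm_sub_ps(yi, _mm_load_ps(Y + j));
        __m128 d  = _mm_add_ps(_mm_mul_ps(xd, xd), _mm_mul_ps(yd, yd));
        acc = _mm_add_epi32(acc, _mm_cvtps_epi32(d));   /* round and accumulate */
    }
    _mm_storeu_si128((__m128i *) sum, acc);
    return sum[0] + sum[1] + sum[2] + sum[3];
}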

dario_mx
Beginner
Hello,

You may be looking at an old version of the program. The initial posts I put on the gcc-help list showed an AoS (Array of Structures). Following the suggestions, I changed that to an SoA (Structure of Arrays). I now have a few aligned arrays with all the data.

I am going to post the new version, which includes the latest suggestions I received here. You may want to take a look at it.

Thanks.
dario_mx
Beginner
Hello,

It seems the rint function introduced a lot of noise ;-) It was not an essential part of my problem, so I removed it for the sake of this proof of concept. I now see a 30% reduction in execution time. But still, I would expect more, given that SSE2 is performing 4 operations at a time, right?

Unless maybe I am missing something ... as usual ;-|

Thanks for your attention (C code and assembler below).

PS: I don't know why, but when I used an if to duplicate the loop and call one function or the other, instead of a function pointer, the gain was worse than 30%. I know that using a function pointer may prevent inlining of the functions, so I tried to avoid it ... with the surprise that the SSE2 gain was smaller! That's the reason I kept that part. Although this may be an interesting point on its own, I think it's another topic, not related to this serial vs. vectorized debate.



#include <stdio.h>
#include <stdlib.h>
#include <emmintrin.h>

int D;
float *X,*Y,*Z;

inline static void dist(int i)
{
float xd,yd;
int j;
for(j=0; j<D; j++)
{
xd = X[i] - X[j];
yd = Y[i] - Y[j];
Z[j] = xd*xd + yd*yd;
}
}

inline static void dist_sse(int i)
{
int j;
__m128 xmm0 = _mm_set1_ps(X[i]);
__m128 xmm1 = xmm0;
__m128 xmm2 = _mm_set1_ps(Y[i]);
__m128 xmm3 = xmm2;
__m128 xmm4,xmm5,xmm6,xmm7;
for(j=0; j<D; j+=8)
{
xmm4 =_mm_load_ps(X+j);
xmm5 =_mm_load_ps(X+j+4);
xmm6 =_mm_load_ps(Y+j);
xmm7 =_mm_load_ps(Y+j+4);
xmm4 = _mm_sub_ps(xmm0,xmm4);
xmm5 = _mm_sub_ps(xmm1,xmm5);
xmm6 = _mm_sub_ps(xmm2,xmm6);
xmm7 = _mm_sub_ps(xmm3,xmm7);
xmm4 = _mm_mul_ps(xmm4,xmm4);
xmm5 = _mm_mul_ps(xmm5,xmm5);
xmm6 = _mm_mul_ps(xmm6,xmm6);
xmm7 = _mm_mul_ps(xmm7,xmm7);
xmm4 = _mm_add_ps(xmm4,xmm6);
xmm5 = _mm_add_ps(xmm5,xmm7);
_mm_store_ps(Z+j,xmm4);
_mm_store_ps(Z+j+4,xmm5);
}
}

int main(int argc, char * argv[])
{
int i,j,opc,debug;
void (*opc_func)(int);

if ( argc != 4 )
{
fprintf(stderr," Usage: %s ",argv[0]);
return 1;
}

opc = atoi(argv[1]);
D = atoi(argv[2]);
debug = atoi(argv[3]);

if ( D %8 != 0 )
{
fprintf(stderr," Dimension %d must be multiple of 8: ",D);
return 2;
}

if ( opc == 0 )
{
X = (float *) malloc(D * sizeof(float));
Y = (float *) malloc(D * sizeof(float));
Z = (float *) malloc(D * sizeof(float));
}
else
{
X = (float *) _mm_malloc(D * sizeof(float), 16);
Y = (float *) _mm_malloc(D * sizeof(float), 16);
Z = (float *) _mm_malloc(D * sizeof(float), 16);
}

for(i=0;i<D;i++)
{
X[i] = i;
Y[i] = D - i;
}

opc_func = opc == 0? dist : dist_sse;
for(i=0;i<D;i++)
{
opc_func(i);
if ( debug )
{
for(j=0; j<D; j++) printf("%f ",Z[j]);
}
}

return 0;
}


.file "kk.c"
.text
.p2align 4,,15
.def _dist; .scl 3; .type 32; .endef
_dist:
pushl %ebp
xorl %eax, %eax
movl %esp, %ebp
pushl %edi
movl 8(%ebp), %edi
pushl %esi
movl _D, %esi
pushl %ebx
cmpl %esi, %eax
jge L7
movl _X, %ecx
movl _Y, %edx
movl _Z, %ebx
.p2align 4,,15
L5:
flds (%ecx,%eax,4)
flds (%edx,%eax,4)
fxch %st(1)
fsubrs (%ecx,%edi,4)
fxch %st(1)
fsubrs (%edx,%edi,4)
fxch %st(1)
fmul %st(0), %st
fxch %st(1)
fmul %st(0), %st
faddp %st, %st(1)
fstps (%ebx,%eax,4)
incl %eax
cmpl %esi, %eax
jl L5
L7:
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
.p2align 4,,15
.def _dist_sse; .scl 3; .type 32; .endef
_dist_sse:
pushl %ebp
movl %esp, %ebp
pushl %esi
pushl %ebx
subl $8, %esp
movl 8(%ebp), %esi
movl _X, %ebx
movl _Y, %ecx
movl (%ebx,%esi,4), %edx
movl %edx, -12(%ebp)
movl (%ecx,%esi,4), %eax
xorl %esi, %esi
movss -12(%ebp), %xmm5
movl %eax, -16(%ebp)
movss -16(%ebp), %xmm4
cmpl _D, %esi
shufps $0, %xmm5, %xmm5
shufps $0, %xmm4, %xmm4
jl L36
jmp L34
.p2align 4,,7
L35:
movl _X, %ebx
movl _Y, %ecx
L36:
leal 0(,%esi,4), %edx
movaps %xmm5, %xmm1
leal (%edx,%ebx), %eax
movaps (%eax), %xmm2
leal (%edx,%ecx), %ebx
movaps %xmm5, %xmm6
movaps (%ebx), %xmm0
addl $8, %esi
movaps 16(%eax), %xmm3
subps %xmm2, %xmm1
movl _Z, %eax
movaps 16(%ebx), %xmm7
movaps %xmm1, %xmm2
movaps %xmm4, %xmm1
subps %xmm0, %xmm1
movl %edx, %ebx
movaps %xmm1, %xmm0
mulps %xmm2, %xmm2
mulps %xmm0, %xmm0
subps %xmm3, %xmm6
addl %eax, %ebx
addps %xmm0, %xmm2
movaps %xmm6, %xmm3
movaps %xmm2, (%ebx)
movaps %xmm4, %xmm6
subps %xmm7, %xmm6
movl _Z, %ecx
movaps %xmm6, %xmm1
mulps %xmm3, %xmm3
mulps %xmm6, %xmm1
addps %xmm1, %xmm3
addl %ecx, %edx
movaps %xmm3, 16(%edx)
cmpl _D, %esi
jl L35
L34:
addl $8, %esp
popl %ebx
popl %esi
popl %ebp
ret
.def ___main; .scl 2; .type 32; .endef
.section .rdata,"dr"
.align 4
LC1:
.ascii "12Usage: %s 1212�"
LC3:
.ascii "%f12�"
.align 4
LC2:
.ascii "12Dimension %d must be multiple of 8: 1212�"
.text
.p2align 4,,15
.globl _main
.def _main; .scl 2; .type 32; .endef
_main:
pushl %ebp
movl $16, %eax
movl %esp, %ebp
pushl %edi
pushl %esi
pushl %ebx
subl $28, %esp
movl 12(%ebp), %ebx
andl $-16, %esp
call __alloca
call ___main
cmpl $4, 8(%ebp)
je L38
call ___getreent
movl (%ebx), %esi
movl $LC1, %ecx
movl %ecx, 4(%esp)
movl %esi, 8(%esp)
movl 12(%eax), %edx
movl %edx, (%esp)
call _fprintf
movl $1, %eax
leal -12(%ebp), %esp
L99:
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
.p2align 4,,7
L38:
movl 4(%ebx), %edx
movl %edx, (%esp)
call _atoi
movl %eax, -16(%ebp)
movl 8(%ebx), %eax
movl %eax, (%esp)
call _atoi
movl %eax, _D
movl 12(%ebx), %edi
movl %edi, (%esp)
call _atoi
movl %eax, -20(%ebp)
movl _D, %ecx
testb $7, %cl
jne L89
movl -16(%ebp), %edi
testl %edi, %edi
je L90
movl %ecx, %eax
xorl %edx, %edx
sall $2, %eax
jne L91
L43:
movl %edx, _X
movl %ecx, %eax
xorl %edx, %edx
sall $2, %eax
jne L92
L48:
movl %edx, _Y
movl %ecx, %eax
xorl %edx, %edx
sall $2, %eax
jne L93
L53:
movl %edx, _Z
L41:
xorl %edx, %edx
cmpl %ecx, %edx
jge L73
movl _X, %esi
movl _Y, %ebx
.p2align 4,,15
L60:
movl %ecx, %eax
cvtsi2ss %edx, %xmm1
subl %edx, %eax
cvtsi2ss %eax, %xmm0
movss %xmm1, (%esi,%edx,4)
movss %xmm0, (%ebx,%edx,4)
incl %edx
cmpl %ecx, %edx
jl L60
L73:
movl -16(%ebp), %ebx
movl $_dist, %edi
movl $_dist_sse, %edx
testl %ebx, %ebx
cmovne %edx, %edi
xorl %esi, %esi
cmpl %ecx, %esi
jge L75
.p2align 4,,15
L101:
movl %esi, (%esp)
call *%edi
movl -20(%ebp), %ecx
testl %ecx, %ecx
je L77
movl _D, %eax
xorl %ebx, %ebx
cmpl %eax, %ebx
jge L65
.p2align 4,,15
L100:
movl _Z, %eax
flds (%eax,%ebx,4)
incl %ebx
movl $LC3, (%esp)
fstpl 4(%esp)
call _printf
movl _D, %eax
cmpl %eax, %ebx
jl L100
L65:
incl %esi
cmpl %eax, %esi
L102:
jl L101
L75:
leal -12(%ebp), %esp
xorl %eax, %eax
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
L90:
leal 0(,%ecx,4), %ebx
movl %ebx, (%esp)
call _malloc
movl %eax, _X
movl _D, %edx
sall $2, %edx
movl %edx, (%esp)
call _malloc
movl %eax, _Y
movl _D, %eax
sall $2, %eax
movl %eax, (%esp)
call _malloc
movl %eax, _Z
movl _D, %ecx
jmp L41
.p2align 4,,7
L77:
movl _D, %eax
incl %esi
cmpl %eax, %esi
jmp L102
L93:
addl $16, %eax
movl %eax, (%esp)
call _malloc
testl %eax, %eax
movl %eax, %edx
je L86
leal 16(%eax), %edi
andl $-16, %edi
movl %edi, %edx
movl %eax, -4(%edi)
L86:
movl _D, %ecx
jmp L53
L92:
addl $16, %eax
movl %eax, (%esp)
call _malloc
testl %eax, %eax
movl %eax, %edx
je L85
leal 16(%eax), %esi
andl $-16, %esi
movl %esi, %edx
movl %eax, -4(%esi)
L85:
movl _D, %ecx
jmp L48
L91:
addl $16, %eax
movl %eax, (%esp)
call _malloc
testl %eax, %eax
movl %eax, %edx
je L84
leal 16(%eax), %ecx
andl $-16, %ecx
movl %ecx, %edx
movl %eax, -4(%ecx)
L84:
movl _D, %ecx
jmp L43
L89:
call ___getreent
movl _D, %esi
movl $LC2, %ecx
movl %ecx, 4(%esp)
movl %esi, 8(%esp)
movl 12(%eax), %ebx
movl %ebx, (%esp)
call _fprintf
movl $2, %eax
leal -12(%ebp), %esp
jmp L99
.comm _D, 16 # 4
.comm _X, 16 # 4
.comm _Y, 16 # 4
.comm _Z, 16 # 4
.def _printf; .scl 3; .type 32; .endef
.def _atoi; .scl 3; .type 32; .endef
.def ___getreent; .scl 3; .type 32; .endef
.def _fprintf; .scl 3; .type 32; .endef
.def _malloc; .scl 3; .type 32; .endef
