Software Archive
Read-only legacy content

(pain reloaded) Why isn't performance much better in Euclidean distance with SSE2?

dario_mx
Beginner
Hello,

I am trying to improve the performance of a very simple program by using SSE2. The program snippet only needs to calculate the distance between all pairs of a set of N points (I know I could take advantage of symmetry, but for now I am focusing on low-level vectorization).

I am using intrinsics and have examined the generated assembler a bit (though I am far from being an expert), and it looks fine. However, the runtime is only about 10% lower than the original version's. What am I missing?

Thanks.


The program is the following:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <emmintrin.h>

#define SSE2_ALIGNED __attribute__ ((aligned (16)))
#define print_y(x) printf("%d ",(int) x)
#define print_n(x) x
#define print print_n

int D;
float *X,*Y;

inline static void dist(int i,int j)
{
float xd = X[i] - X[j];
float yd = Y[i] - Y[j];
print(rint(xd*xd + yd*yd));
}

inline static void dist_sse(int i)
{
float d[8] SSE2_ALIGNED;
int j;
__m128 xmm0 = _mm_set1_ps(X[i]);
__m128 xmm1 = xmm0;
__m128 xmm2 = _mm_set1_ps(Y[i]);
__m128 xmm3 = xmm2;
__m128 xmm4,xmm5,xmm6,xmm7;
for(j=0; j<D; j+=8)
{
xmm4 =_mm_load_ps(X+j);
xmm5 =_mm_load_ps(X+j+4);
xmm6 =_mm_load_ps(Y+j);
xmm7 =_mm_load_ps(Y+j+4);
xmm4 = _mm_sub_ps(xmm0,xmm4);
xmm5 = _mm_sub_ps(xmm1,xmm5);
xmm6 = _mm_sub_ps(xmm2,xmm6);
xmm7 = _mm_sub_ps(xmm3,xmm7);
xmm4 = _mm_mul_ps(xmm4,xmm4);
xmm5 = _mm_mul_ps(xmm5,xmm5);
xmm6 = _mm_mul_ps(xmm6,xmm6);
xmm7 = _mm_mul_ps(xmm7,xmm7);
xmm4 = _mm_add_ps(xmm4,xmm6);
xmm5 = _mm_add_ps(xmm5,xmm7);
_mm_store_ps(d,xmm4);
_mm_store_ps(d+4,xmm5);
print(rint(d[0]));
print(rint(d[1]));
print(rint(d[2]));
print(rint(d[3]));
print(rint(d[4]));
print(rint(d[5]));
print(rint(d[6]));
print(rint(d[7]));
}
}

int main(int argc, char * argv[])
{
int i,j,opc;

if ( argc != 3 )
{
fprintf(stderr," Usage: %s ",argv[0]);
return 1;
}

opc = atoi(argv[1]);
D = atoi(argv[2]);

if ( D %8 != 0 )
{
fprintf(stderr," Dimension %d must be multiple of 8: ",D);
return 2;
}

if ( opc == 0 )
{
X = (float *) malloc(D * sizeof(float));
Y = (float *) malloc(D * sizeof(float));
}
else
{
X = (float *) _mm_malloc(D * sizeof(float), 16);
Y = (float *) _mm_malloc(D * sizeof(float), 16);
}

for(i=0;i<D;i++)
{
X[i] = i;
Y[i] = D - i;
}

if ( opc == 0 )
for(i=0;i<D;i++) for(j=0;j<D;j++) dist(i,j);
else
for(i=0;i<D;i++) dist_sse(i);

return 0;
}

I am compiling with:

CC = gcc
CFLAGS = -O3 -Wall -march=pentium-m -msse2

all: kk

And generated assembler is:

	.file	"kk.c"
.def ___main; .scl 2; .type 32; .endef
.section .rdata,"dr"
LC0:
.ascii "12Usage: %s 1212�"
.align 4
LC1:
.ascii "12Dimension %d must be multiple of 8: 1212�"
.text
.p2align 4,,15
.globl _main
.def _main; .scl 2; .type 32; .endef
_main:
pushl %ebp
movl $16, %eax
movl %esp, %ebp
pushl %edi
pushl %esi
pushl %ebx
subl $108, %esp
movl 12(%ebp), %ebx
andl $-16, %esp
call __alloca
call ___main
cmpl $3, 8(%ebp)
je L2
call ___getreent
movl (%ebx), %esi
movl $LC0, %ecx
movl %ecx, 4(%esp)
movl %esi, 8(%esp)
movl 12(%eax), %edx
movl %edx, (%esp)
call _fprintf
movl $1, %eax
leal -12(%ebp), %esp
L90:
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
L2:
movl 4(%ebx), %edi
movl %edi, (%esp)
call _atoi
movl %eax, %edi
movl 8(%ebx), %eax
movl %eax, (%esp)
call _atoi
movl %eax, _D
testb $7, %al
movl %eax, %ecx
jne L82
testl %edi, %edi
je L83
xorl %edx, %edx
sall $2, %eax
jne L84
L7:
movl %edx, _X
movl %ecx, %eax
xorl %edx, %edx
sall $2, %eax
jne L85
L12:
movl %edx, _Y
movl %edx, %ebx
L5:
xorl %edx, %edx
cmpl %ecx, %edx
jge L59
movl _X, %esi
.p2align 4,,15
L19:
movl %ecx, %eax
cvtsi2ss %edx, %xmm1
subl %edx, %eax
cvtsi2ss %eax, %xmm0
movss %xmm1, (%esi,%edx,4)
movss %xmm0, (%ebx,%edx,4)
incl %edx
cmpl %ecx, %edx
jl L19
L59:
testl %edi, %edi
jne L20
xorl %esi, %esi
cmpl %ecx, %esi
jge L30
.p2align 4,,15
L92:
xorl %ebx, %ebx
cmpl %ecx, %ebx
jge L63
.p2align 4,,15
L91:
movl _X, %edx
movl _Y, %edi
flds (%edx,%ebx,4)
flds (%edi,%ebx,4)
fxch %st(1)
incl %ebx
fsubrs (%edx,%esi,4)
fxch %st(1)
fsubrs (%edi,%esi,4)
fxch %st(1)
fmul %st(0), %st
fxch %st(1)
fmul %st(0), %st
faddp %st, %st(1)
fstpl (%esp)
call _rint
fstp %st(0)
movl _D, %ecx
cmpl %ecx, %ebx
jl L91
L63:
incl %esi
cmpl %ecx, %esi
jl L92
L30:
leal -12(%ebp), %esp
xorl %eax, %eax
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
L83:
leal 0(,%eax,4), %eax
movl %eax, (%esp)
call _malloc
movl %eax, _X
movl _D, %esi
sall $2, %esi
movl %esi, (%esp)
call _malloc
movl %eax, _Y
movl _D, %ecx
movl %eax, %ebx
jmp L5
L20:
xorl %edi, %edi
cmpl %ecx, %edi
jge L30
.p2align 4,,15
L75:
movl _X, %edx
movl (%edx,%edi,4), %eax
movl %eax, -60(%ebp)
movl (%ebx,%edi,4), %esi
movss -60(%ebp), %xmm2
movl %esi, -64(%ebp)
xorl %esi, %esi
shufps $0, %xmm2, %xmm2
movss -64(%ebp), %xmm4
cmpl %ecx, %esi
movaps %xmm2, -88(%ebp)
shufps $0, %xmm4, %xmm4
movaps %xmm4, -104(%ebp)
jl L76
jmp L66
.p2align 4,,7
L67:
movl _X, %edx
movl _Y, %ebx
L76:
movaps -88(%ebp), %xmm7
leal 0(,%esi,4), %ecx
leal (%ecx,%edx), %edx
movaps (%edx), %xmm5
addl %ebx, %ecx
addl $8, %esi
movaps (%ecx), %xmm6
movaps -104(%ebp), %xmm4
subps %xmm5, %xmm7
movaps %xmm7, %xmm5
movaps 16(%edx), %xmm3
mulps %xmm5, %xmm5
subps %xmm6, %xmm4
movaps 16(%ecx), %xmm1
movaps %xmm4, %xmm7
mulps %xmm4, %xmm7
movaps -88(%ebp), %xmm2
addps %xmm7, %xmm5
movaps -104(%ebp), %xmm0
movaps %xmm5, -56(%ebp)
flds -56(%ebp)
subps %xmm3, %xmm2
subps %xmm1, %xmm0
movaps %xmm2, %xmm3
movaps %xmm0, %xmm6
mulps %xmm2, %xmm3
mulps %xmm0, %xmm6
addps %xmm6, %xmm3
movaps %xmm3, -40(%ebp)
fstpl (%esp)
call _rint
fstp %st(0)
flds -52(%ebp)
fstpl (%esp)
call _rint
fstp %st(0)
flds -48(%ebp)
fstpl (%esp)
call _rint
fstp %st(0)
flds -44(%ebp)
fstpl (%esp)
call _rint
fstp %st(0)
flds -40(%ebp)
fstpl (%esp)
call _rint
fstp %st(0)
flds -36(%ebp)
fstpl (%esp)
call _rint
fstp %st(0)
flds -32(%ebp)
fstpl (%esp)
call _rint
fstp %st(0)
flds -28(%ebp)
fstpl (%esp)
call _rint
fstp %st(0)
movl _D, %ecx
cmpl %ecx, %esi
jl L67
L66:
incl %edi
cmpl %ecx, %edi
jge L30
movl _Y, %ebx
jmp L75
L85:
addl $16, %eax
movl %eax, (%esp)
call _malloc
testl %eax, %eax
movl %eax, %edx
je L78
leal 16(%eax), %ecx
andl $-16, %ecx
movl %ecx, %edx
movl %eax, -4(%ecx)
L78:
movl _D, %ecx
jmp L12
L84:
addl $16, %eax
movl %eax, (%esp)
call _malloc
testl %eax, %eax
movl %eax, %edx
je L77
leal 16(%eax), %ebx
andl $-16, %ebx
movl %ebx, %edx
movl %eax, -4(%ebx)
L77:
movl _D, %ecx
jmp L7
L82:
call ___getreent
movl _D, %ecx
movl $LC1, %edx
movl %edx, 4(%esp)
movl %ecx, 8(%esp)
movl 12(%eax), %ebx
movl %ebx, (%esp)
call _fprintf
movl $2, %eax
leal -12(%ebp), %esp
jmp L90
.comm _D, 16 # 4
.comm _X, 16 # 4
.comm _Y, 16 # 4
.def _atoi; .scl 3; .type 32; .endef
.def ___getreent; .scl 3; .type 32; .endef
.def _fprintf; .scl 3; .type 32; .endef
.def _rint; .scl 3; .type 32; .endef
.def _malloc; .scl 3; .type 32; .endef


9 Replies
TimP
Honored Contributor III
This topic has already been covered extensively in the following public e-mail thread:
http://gcc.gnu.org/ml/gcc-help/2008-04/msg00073.html
It was never clear why a major speedup was expected with printf() in the inner loop, nor whether a vectorizing compiler such as g++ 4.3 or icpc was tried (with printf removed so as to attempt auto-vectorization).
C99 math functions (e.g. rint) presumably are available in g++, with -std=gnu99.
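For illustration, a minimal sketch of the kind of scalar loop such a compiler can try to auto-vectorize once the printf()/rint() call is taken out of it; the helper name and the compile flags are assumptions, not something tested in this thread:

/* Sketch only: a scalar loop written so that a vectorizing compiler
 * (e.g. g++ 4.3 or icpc, as mentioned above) can attempt auto-vectorization
 * once no library call remains inside it.  The function name and the flags
 * are assumptions, e.g.:
 *     gcc -O3 -msse2 -ftree-vectorize -ftree-vectorizer-verbose=2 kk.c
 */
static void dist_row(const float *X, const float *Y, float *Z, int D, int i)
{
    int j;
    float xi = X[i], yi = Y[i];
    for (j = 0; j < D; j++) {
        float xd = xi - X[j];
        float yd = yi - Y[j];
        Z[j] = xd * xd + yd * yd;   /* no printf()/rint() here, so the loop can vectorize */
    }
}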
dario_mx
Beginner
I will read the post ... but meanwhile I can tell you that the printf is conditionally removed from the code. I used it only for debugging the correctness of the vectorized version; for the real timing, of course, I removed it.

Thanks.
dario_mx
Beginner
LOL

The post you mentioned is just the one I made first, before coming here to the Intel forums!

With "extensively covered" you mean the topic was left unanswered? ;-1 Cause thats what happened. I made all suggestions I got, and still no signigicant improvement. The modified version is the one I posted here, at Intel.

There you have the assembler generated by GCC ... there we can see the SSE2 instructions. So, what is wrong with this picture?!

I would expect that, if there is one place on earth with experts in Intel assembler, that place must be here ... hehe. So, please check the assembler and tell me what is wrong.

Thanks.

TimP
Honored Contributor III
People on the gcc-help list did their best to help you define what you wanted to do. If you wanted only to make a non-vector cross between a macro and an inline function, in a style understood only by gcc, with parallel SSE inside that function, then perhaps what you showed was what you wanted.
Otherwise, you could have posted a shorter example, which could be compiled by standard compilers, illustrating your interest.
The rint() function is not recognized as a vectorizable function by either gcc or icc. If you meant it as a substitute for sqrtf(), the latter can be vectorized inline by icc, but apparently not yet by gcc.
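To illustrate that remark, a sketch assuming sqrtf() really is the intended operation (the helper name is hypothetical):

#include <math.h>

/* Sketch only, assuming sqrtf() is what was actually wanted in place of rint():
 * icc can vectorize this loop with an inline sqrtps, whereas rint() forces a
 * scalar library call per element. */
static void dist_row_sqrt(const float *X, const float *Y, float *Z, int D, int i)
{
    int j;
    for (j = 0; j < D; j++) {
        float xd = X[i] - X[j];
        float yd = Y[i] - Y[j];
        Z[j] = sqrtf(xd * xd + yd * yd);
    }
}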
dario_mx
Beginner
Certainly I appreciate all the advice the gcc-help community provided me. Indeed, I followed all of it.

I was not asking for a way to achieve vectorization ... I already did it. Neither was I looking for automatic vectorization ... I coded it manually. We can see the assembler there, generated directly from the intrinsic functions ... it includes SSE2 of course (thus, this is not a question of whether the compiler automatically vectorized or not).

What I am asking for, kindly of course hehe, is advice about why my manually coded vector version does not improve performance significantly (it offers a gain of 10% in runtime, which is very poor considering I am vectorizing the whole thing ... I would expect a gain of about 75%).

Thanks.
Anat_S_Intel
Employee

Hi,

Try changing the allocation scheme to allocate one aligned block for all the data instead of several separate allocations. This will improve data locality in the caches and might gain performance.
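A minimal sketch of that idea, assuming the two arrays of the posted program (the single-block layout and the NULL check are illustrative, not code from this thread):

/* One aligned allocation carved into X and Y instead of two separate
 * _mm_malloc calls; keeps the arrays adjacent in memory. */
float *block = (float *) _mm_malloc(2 * D * sizeof(float), 16);
if (block != NULL)
{
    X = block;      /* first D floats */
    Y = block + D;  /* next D floats; D is a multiple of 8, so Y stays 16-byte aligned */
}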

The code here uses rint(). This part is not parallelized and is similar in the two versions. I don't know what it does, so it is impossible to estimate its duration relative to the other operations. It might be another reason for the low speedup.

Regards

SHIH_K_Intel
Employee

Just to add a few more comments to explain the likely reasons for the lack of speedup you observe.

In addition to what my colleagues have pointed out about calling an external function like rint: the code you've shown has significant overhead from calling scalar external functions. For the purpose of rounding, another integer conversion technique might make more sense than throwing a bunch of rint calls at the end of each hand-vectorized SSE loop. Secondly, depending on the parameter "D" you use when testing, it is possible that in some portion of your loop iterations the rounding of floating-point to integer experiences exceptions. That can produce different amounts of delay in x87 code and SSE code.

I did a quick test by simplifying your code somewhat to use the cvtps2pi intrinsic for rounding. I also modified the scalar C code into two versions to compare the overhead of using rint vs. a simple type-cast conversion.

Using a fixed value of D = 1024 (10^6 scalar loop iterations, each with 2 multiplies, 1 add, and 2 subtracts), compiled with just /O2 on ICC and MSC:

the scalar distance calculation with rint took ~ 40 M cycles

a modified scalar calculation with type cast conversion took ~ 17 M cycles

the modified SSE code with _mm_cvtps_pi32 took ~ 5 M cycles.

There will certainly be variation across compilers and between an external function like rint, a type-cast conversion, and an intrinsic conversion. The value of D you choose and the timing methodology will add further variance to your measured speedup.
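As an aside, a minimal timing sketch using clock() from <time.h>, keeping all I/O outside the measured region; the REPS value and the driver function are assumptions for illustration and reuse D and dist_sse() from the program posted earlier:

#include <stdio.h>
#include <time.h>

#define REPS 1000   /* arbitrary; chosen so clock()'s resolution is not the dominant error */

/* Hypothetical driver: times only the kernel, no printf inside the loop. */
static void time_sse_kernel(void)
{
    clock_t t0, t1;
    int r, i;

    t0 = clock();
    for (r = 0; r < REPS; r++)
        for (i = 0; i < D; i++)
            dist_sse(i);        /* measure the scalar dist() loop the same way */
    t1 = clock();

    printf("elapsed: %.3f s\n", (double) (t1 - t0) / CLOCKS_PER_SEC);
}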

For your reference, the modifications I made are based on accumulating the results of each evaluation of

int dist(int i,int j)
{ float xd = X[i] - X[j];
float yd = Y[i] - Y[j];
int z = rint(xd*xd + yd*yd);
return z;
}

int distB(int i,int j) // compare overhead of rint
{ float xd = X[i] - X[j];
float yd = Y[i] - Y[j];
return (int) (xd*xd + yd*yd);
}

The SSE version replaces rint with _mm_cvtps_pi32 plus a bunch of _mm_add_pi32, so that the differently structured vectorized loop performs the same number of adds as the double-nested scalar loop and accumulates the same result.
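For illustration, a sketch along those lines, but using the SSE2 packed convert _mm_cvtps_epi32 and _mm_add_epi32 rather than the MMX-register _mm_cvtps_pi32/_mm_add_pi32 named above (which avoids having to issue emms); this is not the exact code that produced the cycle counts:

#include <emmintrin.h>

/* Rounding done with a packed convert instead of rint(), accumulating the
 * converted results so the work cannot be optimized away. */
static int dist_sse_accum(const float *X, const float *Y, int D, int i)
{
    __m128  xi  = _mm_set1_ps(X[i]);
    __m128  yi  = _mm_set1_ps(Y[i]);
    __m128i acc = _mm_setzero_si128();
    int j, sum[4];

    for (j = 0; j < D; j += 4) {
        __m128 xd = _mm_sub_ps(xi, _mm_load_ps(X + j));
        __m128 yd = _mm_sub_ps(yi, _mm_load_ps(Y + j));
        __m128 d  = _mm_add_ps(_mm_mul_ps(xd, xd), _mm_mul_ps(yd, yd));
        acc = _mm_add_epi32(acc, _mm_cvtps_epi32(d));   /* round and accumulate */
    }
    _mm_storeu_si128((__m128i *) sum, acc);
    return sum[0] + sum[1] + sum[2] + sum[3];
}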

dario_mx
Beginner
Hello,

You may be looking at an old version of the program. The initial posts I put on the gcc-help list showed an AoS (Array of Structures). Following the suggestions, I changed that to an SoA (Structure of Arrays). I now have a few aligned arrays with all the data.

I am going to post the new version, which includes the latest suggestions I received here. You may want to take a look at it.

Thanks.
dario_mx
Beginner
Hello,

It seems the rint function introduced a lot of noise ;-) It was not an essential part of my problem, so I removed it for the sake of this proof of concept. I now see a 30% reduction in execution time. But still, I would expect more, given that SSE2 is performing 4 operations at a time, right?

Unless maybe I am missing something ... as usual ;-|

Thanks for your attention (C code and assembler below).

PS: I don't know why, but when I used an if to duplicate the loop and call one function or the other, instead of a function pointer, the gain was worse than 30%. I know that using a function pointer may prevent inlining of the functions, so I tried to avoid it ... with the surprise that the SSE2 gain was smaller! That's the reason I kept that part. Although this may be an interesting point on its own, I think it's another topic, not related to this serial vs. vectorized debate.



#include <stdio.h>
#include <stdlib.h>
#include <emmintrin.h>

int D;
float *X,*Y,*Z;

inline static void dist(int i)
{
float xd,yd;
int j;
for(j=0; j<D; j++)
{
xd = X[i] - X[j];
yd = Y[i] - Y[j];
Z[j] = xd*xd + yd*yd;
}
}

inline static void dist_sse(int i)
{
int j;
__m128 xmm0 = _mm_set1_ps(X[i]);
__m128 xmm1 = xmm0;
__m128 xmm2 = _mm_set1_ps(Y[i]);
__m128 xmm3 = xmm2;
__m128 xmm4,xmm5,xmm6,xmm7;
for(j=0; j<D; j+=8)
{
xmm4 =_mm_load_ps(X+j);
xmm5 =_mm_load_ps(X+j+4);
xmm6 =_mm_load_ps(Y+j);
xmm7 =_mm_load_ps(Y+j+4);
xmm4 = _mm_sub_ps(xmm0,xmm4);
xmm5 = _mm_sub_ps(xmm1,xmm5);
xmm6 = _mm_sub_ps(xmm2,xmm6);
xmm7 = _mm_sub_ps(xmm3,xmm7);
xmm4 = _mm_mul_ps(xmm4,xmm4);
xmm5 = _mm_mul_ps(xmm5,xmm5);
xmm6 = _mm_mul_ps(xmm6,xmm6);
xmm7 = _mm_mul_ps(xmm7,xmm7);
xmm4 = _mm_add_ps(xmm4,xmm6);
xmm5 = _mm_add_ps(xmm5,xmm7);
_mm_store_ps(Z+j,xmm4);
_mm_store_ps(Z+j+4,xmm5);
}
}

int main(int argc, char * argv[])
{
int i,j,opc,debug;
void (*opc_func)(int);

if ( argc != 4 )
{
fprintf(stderr," Usage: %s ",argv[0]);
return 1;
}

opc = atoi(argv[1]);
D = atoi(argv[2]);
debug = atoi(argv[3]);

if ( D %8 != 0 )
{
fprintf(stderr," Dimension %d must be multiple of 8: ",D);
return 2;
}

if ( opc == 0 )
{
X = (float *) malloc(D * sizeof(float));
Y = (float *) malloc(D * sizeof(float));
Z = (float *) malloc(D * sizeof(float));
}
else
{
X = (float *) _mm_malloc(D * sizeof(float), 16);
Y = (float *) _mm_malloc(D * sizeof(float), 16);
Z = (float *) _mm_malloc(D * sizeof(float), 16);
}

for(i=0;i<D;i++)
{
X[i] = i;
Y[i] = D - i;
}

opc_func = opc == 0? dist : dist_sse;
for(i=0;i<D;i++)
{
opc_func(i);
if ( debug )
{
for(j=0; j<D; j++) printf("%f ",Z[j]);
}
}

return 0;
}


.file "kk.c"
.text
.p2align 4,,15
.def _dist; .scl 3; .type 32; .endef
_dist:
pushl %ebp
xorl %eax, %eax
movl %esp, %ebp
pushl %edi
movl 8(%ebp), %edi
pushl %esi
movl _D, %esi
pushl %ebx
cmpl %esi, %eax
jge L7
movl _X, %ecx
movl _Y, %edx
movl _Z, %ebx
.p2align 4,,15
L5:
flds (%ecx,%eax,4)
flds (%edx,%eax,4)
fxch %st(1)
fsubrs (%ecx,%edi,4)
fxch %st(1)
fsubrs (%edx,%edi,4)
fxch %st(1)
fmul %st(0), %st
fxch %st(1)
fmul %st(0), %st
faddp %st, %st(1)
fstps (%ebx,%eax,4)
incl %eax
cmpl %esi, %eax
jl L5
L7:
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
.p2align 4,,15
.def _dist_sse; .scl 3; .type 32; .endef
_dist_sse:
pushl %ebp
movl %esp, %ebp
pushl %esi
pushl %ebx
subl $8, %esp
movl 8(%ebp), %esi
movl _X, %ebx
movl _Y, %ecx
movl (%ebx,%esi,4), %edx
movl %edx, -12(%ebp)
movl (%ecx,%esi,4), %eax
xorl %esi, %esi
movss -12(%ebp), %xmm5
movl %eax, -16(%ebp)
movss -16(%ebp), %xmm4
cmpl _D, %esi
shufps $0, %xmm5, %xmm5
shufps $0, %xmm4, %xmm4
jl L36
jmp L34
.p2align 4,,7
L35:
movl _X, %ebx
movl _Y, %ecx
L36:
leal 0(,%esi,4), %edx
movaps %xmm5, %xmm1
leal (%edx,%ebx), %eax
movaps (%eax), %xmm2
leal (%edx,%ecx), %ebx
movaps %xmm5, %xmm6
movaps (%ebx), %xmm0
addl $8, %esi
movaps 16(%eax), %xmm3
subps %xmm2, %xmm1
movl _Z, %eax
movaps 16(%ebx), %xmm7
movaps %xmm1, %xmm2
movaps %xmm4, %xmm1
subps %xmm0, %xmm1
movl %edx, %ebx
movaps %xmm1, %xmm0
mulps %xmm2, %xmm2
mulps %xmm0, %xmm0
subps %xmm3, %xmm6
addl %eax, %ebx
addps %xmm0, %xmm2
movaps %xmm6, %xmm3
movaps %xmm2, (%ebx)
movaps %xmm4, %xmm6
subps %xmm7, %xmm6
movl _Z, %ecx
movaps %xmm6, %xmm1
mulps %xmm3, %xmm3
mulps %xmm6, %xmm1
addps %xmm1, %xmm3
addl %ecx, %edx
movaps %xmm3, 16(%edx)
cmpl _D, %esi
jl L35
L34:
addl $8, %esp
popl %ebx
popl %esi
popl %ebp
ret
.def ___main; .scl 2; .type 32; .endef
.section .rdata,"dr"
.align 4
LC1:
.ascii "12Usage: %s 1212�"
LC3:
.ascii "%f12�"
.align 4
LC2:
.ascii "12Dimension %d must be multiple of 8: 1212�"
.text
.p2align 4,,15
.globl _main
.def _main; .scl 2; .type 32; .endef
_main:
pushl %ebp
movl $16, %eax
movl %esp, %ebp
pushl %edi
pushl %esi
pushl %ebx
subl $28, %esp
movl 12(%ebp), %ebx
andl $-16, %esp
call __alloca
call ___main
cmpl $4, 8(%ebp)
je L38
call ___getreent
movl (%ebx), %esi
movl $LC1, %ecx
movl %ecx, 4(%esp)
movl %esi, 8(%esp)
movl 12(%eax), %edx
movl %edx, (%esp)
call _fprintf
movl $1, %eax
leal -12(%ebp), %esp
L99:
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
.p2align 4,,7
L38:
movl 4(%ebx), %edx
movl %edx, (%esp)
call _atoi
movl %eax, -16(%ebp)
movl 8(%ebx), %eax
movl %eax, (%esp)
call _atoi
movl %eax, _D
movl 12(%ebx), %edi
movl %edi, (%esp)
call _atoi
movl %eax, -20(%ebp)
movl _D, %ecx
testb $7, %cl
jne L89
movl -16(%ebp), %edi
testl %edi, %edi
je L90
movl %ecx, %eax
xorl %edx, %edx
sall $2, %eax
jne L91
L43:
movl %edx, _X
movl %ecx, %eax
xorl %edx, %edx
sall $2, %eax
jne L92
L48:
movl %edx, _Y
movl %ecx, %eax
xorl %edx, %edx
sall $2, %eax
jne L93
L53:
movl %edx, _Z
L41:
xorl %edx, %edx
cmpl %ecx, %edx
jge L73
movl _X, %esi
movl _Y, %ebx
.p2align 4,,15
L60:
movl %ecx, %eax
cvtsi2ss %edx, %xmm1
subl %edx, %eax
cvtsi2ss %eax, %xmm0
movss %xmm1, (%esi,%edx,4)
movss %xmm0, (%ebx,%edx,4)
incl %edx
cmpl %ecx, %edx
jl L60
L73:
movl -16(%ebp), %ebx
movl $_dist, %edi
movl $_dist_sse, %edx
testl %ebx, %ebx
cmovne %edx, %edi
xorl %esi, %esi
cmpl %ecx, %esi
jge L75
.p2align 4,,15
L101:
movl %esi, (%esp)
call *%edi
movl -20(%ebp), %ecx
testl %ecx, %ecx
je L77
movl _D, %eax
xorl %ebx, %ebx
cmpl %eax, %ebx
jge L65
.p2align 4,,15
L100:
movl _Z, %eax
flds (%eax,%ebx,4)
incl %ebx
movl $LC3, (%esp)
fstpl 4(%esp)
call _printf
movl _D, %eax
cmpl %eax, %ebx
jl L100
L65:
incl %esi
cmpl %eax, %esi
L102:
jl L101
L75:
leal -12(%ebp), %esp
xorl %eax, %eax
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
L90:
leal 0(,%ecx,4), %ebx
movl %ebx, (%esp)
call _malloc
movl %eax, _X
movl _D, %edx
sall $2, %edx
movl %edx, (%esp)
call _malloc
movl %eax, _Y
movl _D, %eax
sall $2, %eax
movl %eax, (%esp)
call _malloc
movl %eax, _Z
movl _D, %ecx
jmp L41
.p2align 4,,7
L77:
movl _D, %eax
incl %esi
cmpl %eax, %esi
jmp L102
L93:
addl $16, %eax
movl %eax, (%esp)
call _malloc
testl %eax, %eax
movl %eax, %edx
je L86
leal 16(%eax), %edi
andl $-16, %edi
movl %edi, %edx
movl %eax, -4(%edi)
L86:
movl _D, %ecx
jmp L53
L92:
addl $16, %eax
movl %eax, (%esp)
call _malloc
testl %eax, %eax
movl %eax, %edx
je L85
leal 16(%eax), %esi
andl $-16, %esi
movl %esi, %edx
movl %eax, -4(%esi)
L85:
movl _D, %ecx
jmp L48
L91:
addl $16, %eax
movl %eax, (%esp)
call _malloc
testl %eax, %eax
movl %eax, %edx
je L84
leal 16(%eax), %ecx
andl $-16, %ecx
movl %ecx, %edx
movl %eax, -4(%ecx)
L84:
movl _D, %ecx
jmp L43
L89:
call ___getreent
movl _D, %esi
movl $LC2, %ecx
movl %ecx, 4(%esp)
movl %esi, 8(%esp)
movl 12(%eax), %ebx
movl %ebx, (%esp)
call _fprintf
movl $2, %eax
leal -12(%ebp), %esp
jmp L99
.comm _D, 16 # 4
.comm _X, 16 # 4
.comm _Y, 16 # 4
.comm _Z, 16 # 4
.def _printf; .scl 3; .type 32; .endef
.def _atoi; .scl 3; .type 32; .endef
.def ___getreent; .scl 3; .type 32; .endef
.def _fprintf; .scl 3; .type 32; .endef
.def _malloc; .scl 3; .type 32; .endef
