- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

Hi,

I am trying to optimize a simple loop with SIMD SSE instructions. The code compute a 2D field and calculate heat propagation trough time.

Loops use only simple operations, and reference loop is really simple.

The following program contains 3 loops : loop 0 is the reference, loop 1 is a decomposition of the operations, and loop 2 is the optimize loop. However, the second loop is slightly faster than the first one, and the optimize loop is really slow. I am using x64 gcc 4.7 and icc 11 compilers, sames results with -O2, -O3, -msse2/-xSSE2.

Did I make something wrong ?

With my best regards.

Moomba

[csharp]

#include <stdio.h>

#include <time.h>

#include<emmintrin.h>

#define n1 102

#define n2 102

#define niter 20000

double U0[n1][n2];

double U1[n1][n2];

double U2[n1][n2]; /* buffer*/

double *cU0[n1][n2];

/* simd */

__m128d vU0[n1][n2];

__m128d vU1[n1][n2];

__m128d vU2[n1][n2]; /* buffer*/

int i,j,t;

double Dx,Dy,Lx,Ly,InvDxDx,InvDyDy,Dt,alpha,totaltime,Stab,DtAlpha,DxDx,DyDy;

clock_t time0,time1;

FILE *f1;

int main()

{

/* ---- GENERAL ---- */

alpha = 0.4;

totaltime = 1.0;

Dt = totaltime/((niter-1)*1.0);

Lx = 1.0;

Ly = 1.0;

Dx = Lx/((n1-1)*1.0);

Dy = Ly/((n2-1)*1.0);

InvDxDx = 1.0/(Dx*Dx);

InvDyDy = 1.0/(Dy*Dy);

DxDx = Dx*Dx;

DyDy = Dy*Dy;

Stab = alpha*Dt*(InvDxDx+InvDyDy);

DtAlpha = Dt*alpha;

/* Stability if result <= 0.5 */

printf("Stability factor : %f \n",Stab);

/* +----------------+ */

/* | LOOP 0 | */

/* +----------------+ */

/* Init */

for( i = 0; i < n1; i++)

{

for( j = 0; j < n2; j++)

{

U0

U1

}

}

for( i = 0; i < n1; i++)

{

U0*[0] = 1.0; U1*

*[0] = 1.0; } printf("Init OK \n");/* Core */ time0=clock(); for( t = 0; t < niter; t++) { /* even */ for( i = 1; i < n1-1; i++) { for( j = 1; j < n2-1; j++) { U1*

= U0 + DtAlpha*( (U0[i+1]-2.0*U0

+U0[i-1])*InvDxDx + (U0

*[j+1]-2.0*U0*

+U0 *[j-1])*InvDyDy); } } /* odd */ for( i = 1; i < n1-1; i++) { for( j = 1; j < n2-1; j++) { U0*

= U1 + DtAlpha*( (U1[i+1]-2.0*U1

+U1[i-1])*InvDxDx + (U1

*[j+1]-2.0*U1*

+U1 *[j-1])*InvDyDy); } } } time1=clock(); printf("Loop 0, total time : %f \n", (double) time1-time0); f1 = fopen ("out0.dat", "wt"); for( i = 1; i < n1-1; i++) { for( j = 1; j < n2-1; j++) { fprintf (f1, "%d\t%d\t%f\n", i, j, U0*

); } }/* +----------------+ *//* | LOOP 1 | *//* +----------------+ *//* Init */ for( i = 0; i < n1; i++) { for( j = 0; j < n2; j++) { U0 = 0.0; U1

= 0.0; } } for( i = 0; i < n1; i++) { U0

*[0] = 1.0; U1*

*[0] = 1.0; } printf("Init OK \n");/* Core */ time0=clock(); for( t = 0; t < niter; t++) { /* even */ for( i = 1; i < n1-1; i++) { for( j = 1; j < n2-1; j++) { /* U1* = U0 + Dt*alpha*( (U0[i+1]-2.0*U0

+U0[i-1])*InvDxDx + (U0

*[j+1]-2.0*U0*

+U0 *[j-1])*InvDyDy); */ U1*

= -2.0*U0 ; U1

= U1

+ U0[i+1]; U1

= U1

+ U0[i-1]; U1

= U1

* InvDxDx; U2

= -2.0*U0

; U2

= U2

+ U0

*[j+1]; U2*

= U2 + U0

*[j-1]; U2*

= U2 * InvDyDy; U1

= U1

+ U2

; U1

= U1

* DtAlpha; U1

= U1

+ U0

; } } /* odd */ for( i = 1; i < n1-1; i++) { for( j = 1; j < n2-1; j++) { U0

= -2.0*U1

; U0

= U0

+ U1[i+1]; U0

= U0

+ U1[i-1]; U0

= U0

* InvDxDx; U2

= -2.0*U1

; U2

= U2

+ U1

*[j+1]; U2*

= U2 + U1

*[j-1]; U2*

= U2 * InvDyDy; U0

= U0

+ U2

; U0

= U0

* DtAlpha; U0

= U0

+ U1

; } } } time1=clock(); printf("Loop 1, total time : %f \n", (double) time1-time0);/* End */ f1 = fopen ("out1.dat", "wt"); for( i = 1; i < n1-1; i++) { for( j = 1; j < n2-1; j++) { fprintf (f1, "%d\t%d\t%f\n", i, j, U0

); } }

/* +----------------+ */

/* | LOOP 3 | */

/* +----------------+ */

/* Init */

for( i = 0; i < n1; i++)

{

for( j = 0; j < n2; j++)

{

vU0

vU1

}

}

for( i = 0; i < n1; i++)

{

vU0*[0] = _mm_set1_pd(1.0); vU1*

*[0] = _mm_set1_pd(1.0); } printf("Init OK \n");/* Core */ time0=clock(); for( t = 0; t < niter; t++) { /* even */ for( i = 1; i < n1-1; i++) { for( j = 1; j < n2-1; j+=2) { /* U1*

= U0 + Dt*alpha*( (U0[i+1]-2.0*U0

+U0[i-1])*InvDxDx + (U0

*[j+1]-2.0*U0*

+U0 *[j-1])*InvDyDy); */ __m128d va = _mm_set1_pd(-2.0); /* U1*

= -2.0*U0 ; */ vU1

= _mm_mul_pd(va,vU0

); vU1

= _mm_add_pd(vU1

,vU0[i+1]); /* U1

= U1

+ U0[i+1]; */ vU1

= _mm_add_pd(vU1

,vU0[i-1]); /* U1

= U1

+ U0[i-1]; */ __m128d vb = _mm_set1_pd(InvDxDx); /* U1

= U1

* InvDxDx; */ vU1

= _mm_mul_pd(vb,vU1

); vU2

= _mm_mul_pd(va,vU0

); /* U2

= -2.0*U0

; */ vU2

= _mm_add_pd(vU2

,vU0[i+1]); /* U2

= U2

+ U0[i+1]; */ vU2

= _mm_add_pd(vU2

,vU0[i-1]); /* U2

= U2

+ U0[i-1]; */ __m128d vc = _mm_set1_pd(InvDyDy); /* U2

= U2

* InvDyDy; */ vU2

= _mm_mul_pd(vc,vU2

); vU1

= _mm_add_pd(vU1

,vU2

); /* U1

= U1

+ U2

; */ __m128d vd = _mm_set1_pd(DtAlpha); /* U1

= U1

* DtAlpha; */ vU1

= _mm_mul_pd(vd,vU1

); vU1

= _mm_add_pd(vU1

,vU0

); /* U1

= U1

+ U0

; */ } } /* odd */ for( i = 1; i < n1-1; i++) { for( j = 1; j < n2-1; j++) { __m128d va = _mm_set1_pd(-2.0); /* U0

= -2.0*U1

; */ vU0

= _mm_mul_pd(va,vU1

); vU0

= _mm_add_pd(vU0

,vU1[i+1]); /* U0

= U0

+ U1[i+1]; */ vU0

= _mm_add_pd(vU0

,vU1[i-1]); /* U0

= U0

+ U1[i-1]; */ __m128d vb = _mm_set1_pd(InvDxDx); /* U0

= U0

* InvDxDx; */ vU0

= _mm_mul_pd(vb,vU0

); vU2

= _mm_mul_pd(va,vU1

); /* U2

= -2.0*U1

; */ vU2

= _mm_add_pd(vU2

,vU1[i+1]); /* U2

= U2

+ U1[i+1]; */ vU2

= _mm_add_pd(vU2

,vU1[i-1]); /* U2

= U2

+ U1[i-1]; */ __m128d vc = _mm_set1_pd(InvDyDy); /* U2

= U2

* InvDyDy; */ vU2

= _mm_mul_pd(vc,vU2

); vU0

= _mm_add_pd(vU0

,vU2

); /* U0

= U0

+ U2

; */ __m128d vd = _mm_set1_pd(DtAlpha); /* U0

= U0

* DtAlpha; */ vU0

= _mm_mul_pd(vd,vU0

); vU0

= _mm_add_pd(vU0

,vU1

); /* U0

= U0

+ U1

; */ } } } time1=clock(); printf("Loop 3, total time : %f \n", (double) time1-time0);/* End */ f1 = fopen ("out3.dat", "wt"); for( i = 1; i < n1-1; i++) { for( j = 1; j < n2-1; j+=2) { fprintf (f1, "%d\t%d\t%f\n", i, j, vU0

); } }}

[/csharp]

Link Copied

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page