Community
cancel
Showing results for
Did you mean:
Beginner
321 Views

Using SIMD instructions to optimize Numeric 2D derivation

Hi,

I am trying to optimize a simple loop with SIMD SSE instructions. The code compute a 2D field and calculate heat propagation trough time.

Loops use only simple operations, and reference loop is really simple.

The following program contains 3 loops : loop 0 is the reference, loop 1 is a decomposition of the operations, and loop 2 is the optimize loop. However, the second loop is slightly faster than the first one, and the optimize loop is really slow.  I am using x64 gcc 4.7 and icc 11 compilers, sames results with -O2, -O3, -msse2/-xSSE2.

Did I make something wrong ?

With my best regards.

Moomba

[csharp]

#include <stdio.h>

#include <time.h>

#include<emmintrin.h>

#define n1 102

#define n2 102

#define niter 20000

double U0[n1][n2];

double U1[n1][n2];

double U2[n1][n2]; /* buffer*/

double *cU0[n1][n2];

/* simd */

__m128d vU0[n1][n2];

__m128d vU1[n1][n2];

__m128d vU2[n1][n2]; /* buffer*/

int i,j,t;

double Dx,Dy,Lx,Ly,InvDxDx,InvDyDy,Dt,alpha,totaltime,Stab,DtAlpha,DxDx,DyDy;

clock_t time0,time1;

FILE *f1;

int main()

{

/* ---- GENERAL ---- */

alpha = 0.4;

totaltime = 1.0;

Dt = totaltime/((niter-1)*1.0);

Lx = 1.0;

Ly = 1.0;

Dx = Lx/((n1-1)*1.0);

Dy = Ly/((n2-1)*1.0);

InvDxDx = 1.0/(Dx*Dx);

InvDyDy = 1.0/(Dy*Dy);

DxDx = Dx*Dx;

DyDy = Dy*Dy;

Stab = alpha*Dt*(InvDxDx+InvDyDy);

DtAlpha = Dt*alpha;

/* Stability if result <= 0.5 */

printf("Stability factor : %f \n",Stab);

/* +----------------+ */

/* |     LOOP 0     | */

/* +----------------+ */

/* Init */

for( i = 0; i < n1; i++)

{

for( j = 0; j < n2; j++)

{

U0 = 0.0;

U1 = 0.0;

}

}

for( i = 0; i < n1; i++)

{

U0[0] = 1.0;

U1[0] = 1.0;

}

printf("Init OK \n");

/* Core */

time0=clock();

for( t = 0; t < niter; t++)

{

/* even */

for( i = 1; i < n1-1; i++)

{

for( j = 1; j < n2-1; j++)

{

U1 = U0 + DtAlpha*( (U0[i+1]-2.0*U0+U0[i-1])*InvDxDx + (U0[j+1]-2.0*U0+U0[j-1])*InvDyDy);

}

}

/* odd */

for( i = 1; i < n1-1; i++)

{

for( j = 1; j < n2-1; j++)

{

U0 = U1 + DtAlpha*( (U1[i+1]-2.0*U1+U1[i-1])*InvDxDx + (U1[j+1]-2.0*U1+U1[j-1])*InvDyDy);

}

}

}

time1=clock();

printf("Loop 0, total time : %f \n", (double) time1-time0);

f1 = fopen ("out0.dat", "wt");

for( i = 1; i < n1-1; i++)

{

for( j = 1; j < n2-1; j++)

{

fprintf (f1, "%d\t%d\t%f\n", i, j, U0);

}

}

/* +----------------+ */

/* |     LOOP 1     | */

/* +----------------+ */

/* Init */

for( i = 0; i < n1; i++)

{

for( j = 0; j < n2; j++)

{

U0 = 0.0;

U1 = 0.0;

}

}

for( i = 0; i < n1; i++)

{

U0[0] = 1.0;

U1[0] = 1.0;

}

printf("Init OK \n");

/* Core */

time0=clock();

for( t = 0; t < niter; t++)

{

/* even */

for( i = 1; i < n1-1; i++)

{

for( j = 1; j < n2-1; j++)

{

/* U1 = U0 + Dt*alpha*( (U0[i+1]-2.0*U0+U0[i-1])*InvDxDx + (U0[j+1]-2.0*U0+U0[j-1])*InvDyDy); */

U1 = -2.0*U0;

U1 = U1 + U0[i+1];

U1 = U1 + U0[i-1];

U1 = U1 * InvDxDx;

U2 = -2.0*U0;

U2 = U2 + U0[j+1];

U2 = U2 + U0[j-1];

U2 = U2 * InvDyDy;

U1 = U1 + U2;

U1 = U1 * DtAlpha;

U1 = U1 + U0;

}

}

/* odd */

for( i = 1; i < n1-1; i++)

{

for( j = 1; j < n2-1; j++)

{

U0 = -2.0*U1;

U0 = U0 + U1[i+1];

U0 = U0 + U1[i-1];

U0 = U0 * InvDxDx;

U2 = -2.0*U1;

U2 = U2 + U1[j+1];

U2 = U2 + U1[j-1];

U2 = U2 * InvDyDy;

U0 = U0 + U2;

U0 = U0 * DtAlpha;

U0 = U0 + U1;

}

}

}

time1=clock();

printf("Loop 1, total time : %f \n", (double) time1-time0);

/* End */

f1 = fopen ("out1.dat", "wt");

for( i = 1; i < n1-1; i++)

{

for( j = 1; j < n2-1; j++)

{

fprintf (f1, "%d\t%d\t%f\n", i, j, U0);

}

}

/* +----------------+ */

/* |     LOOP 3     | */

/* +----------------+ */

/* Init */

for( i = 0; i < n1; i++)

{

for( j = 0; j < n2; j++)

{

vU0 = _mm_set1_pd(0.0);

vU1 = _mm_set1_pd(0.0);

}

}

for( i = 0; i < n1; i++)

{

vU0[0] = _mm_set1_pd(1.0);

vU1[0] = _mm_set1_pd(1.0);

}

printf("Init OK \n");

/* Core */

time0=clock();

for( t = 0; t < niter; t++)

{

/* even */

for( i = 1; i < n1-1; i++)

{

for( j = 1; j < n2-1; j+=2)

{

/* U1 = U0 + Dt*alpha*( (U0[i+1]-2.0*U0+U0[i-1])*InvDxDx + (U0[j+1]-2.0*U0+U0[j-1])*InvDyDy); */

__m128d va = _mm_set1_pd(-2.0);            /* U1 = -2.0*U0; */

vU1 = _mm_mul_pd(va,vU0);

vU1 = _mm_add_pd(vU1,vU0[i+1]);    /* U1 = U1 + U0[i+1]; */

vU1 = _mm_add_pd(vU1,vU0[i-1]);    /* U1 = U1 + U0[i-1]; */

__m128d vb = _mm_set1_pd(InvDxDx);            /* U1 = U1 * InvDxDx; */

vU1 = _mm_mul_pd(vb,vU1);

vU2 = _mm_mul_pd(va,vU0);            /* U2 = -2.0*U0; */

vU2 = _mm_add_pd(vU2,vU0[i+1]);    /* U2 = U2 + U0[i+1]; */

vU2 = _mm_add_pd(vU2,vU0[i-1]);    /* U2 = U2 + U0[i-1]; */

__m128d vc = _mm_set1_pd(InvDyDy);            /* U2 = U2 * InvDyDy; */

vU2 = _mm_mul_pd(vc,vU2);

vU1 = _mm_add_pd(vU1,vU2);        /* U1 = U1 + U2; */

__m128d vd = _mm_set1_pd(DtAlpha);            /* U1 = U1 * DtAlpha; */

vU1 = _mm_mul_pd(vd,vU1);

vU1 = _mm_add_pd(vU1,vU0);        /* U1 = U1 + U0; */

}

}

/* odd */

for( i = 1; i < n1-1; i++)

{

for( j = 1; j < n2-1; j++)

{

__m128d va = _mm_set1_pd(-2.0);            /* U0 = -2.0*U1; */

vU0 = _mm_mul_pd(va,vU1);

vU0 = _mm_add_pd(vU0,vU1[i+1]);    /* U0 = U0 + U1[i+1]; */

vU0 = _mm_add_pd(vU0,vU1[i-1]);    /* U0 = U0 + U1[i-1]; */

__m128d vb = _mm_set1_pd(InvDxDx);            /* U0 = U0 * InvDxDx; */

vU0 = _mm_mul_pd(vb,vU0);

vU2 = _mm_mul_pd(va,vU1);            /* U2 = -2.0*U1; */

vU2 = _mm_add_pd(vU2,vU1[i+1]);    /* U2 = U2 + U1[i+1]; */

vU2 = _mm_add_pd(vU2,vU1[i-1]);    /* U2 = U2 + U1[i-1]; */

__m128d vc = _mm_set1_pd(InvDyDy);            /* U2 = U2 * InvDyDy; */

vU2 = _mm_mul_pd(vc,vU2);

vU0 = _mm_add_pd(vU0,vU2);        /* U0 = U0 + U2; */

__m128d vd = _mm_set1_pd(DtAlpha);            /* U0 = U0 * DtAlpha; */

vU0 = _mm_mul_pd(vd,vU0);

vU0 = _mm_add_pd(vU0,vU1);        /* U0 = U0 + U1; */

}

}

}

time1=clock();

printf("Loop 3, total time : %f \n", (double) time1-time0);

/* End */

f1 = fopen ("out3.dat", "wt");

for( i = 1; i < n1-1; i++)

{

for( j = 1; j < n2-1; j+=2)

{

fprintf (f1, "%d\t%d\t%f\n", i, j, vU0);

}

}

}

[/csharp]

28 Replies
Valued Contributor II
270 Views
>>... >>double U0[n1][n2]; >>double U1[n1][n2]; >>double U2[n1][n2]; /* buffer*/ >>double *cU0[n1][n2]; >>/* simd */ >>__m128d vU0[n1][n2]; >>__m128d vU1[n1][n2]; >>__m128d vU2[n1][n2]; /* buffer*/ >>... Could you try to declare all these buffers with a '__declspec( align( 16 ) )' specificator?
Beginner
270 Views
I changed to __attribute__ ((aligned(16))) because my gcc couldn't use the __declspec( align( 16 ) ). [csharp] __attribute__ ((aligned(16))) double U0[n1][n2]; __attribute__ ((aligned(16))) double U1[n1][n2]; __attribute__ ((aligned(16))) double U2[n1][n2]; __attribute__ ((aligned(16))) __m128d vU0[n1][n2]; __attribute__ ((aligned(16))) __m128d vU1[n1][n2]; __attribute__ ((aligned(16))) __m128d vU2[n1][n2]; [/csharp] It has no effects, times are the sames. I tried on a Core 2 and on an Xeon i7, same effects.
Valued Contributor II
270 Views
>>...the second loop is slightly faster than the first one, and the optimize loop is really slow... Actually, I would expect a different situation regarding performance: Fastest - 'optimize loop' ( SSE2 ) Then - 'first one' ( Reference ) Followed by - 'second one' ( Decomposition ) I can't promise for 100% but I'll try to look at your case tomorrow on a Windows XP ( 32-bit ) and VS 2005 ( MS & Intel C++ compilers ).
Beginner
270 Views
I would really appreciate if you could test it. This very simple code simulate the same kind of loop we face in CFD computations, and our codes can stay weeks running on 1024 Westmer CPUs. Reducing the cost of this loop would be very effective in terms of time and energy consumption. On top of that, with the new AVX, I expect results to be two times better.
Valued Contributor II
270 Views
I will investigate your test-cases #1 and #3 and won't do anything with the test case #2. >>...I would really appreciate if you could test it. This very simple code simulate the same kind of loop we face in CFD computations... What is CFD? It happens again and again. Guys, when you're using some abbreviation(s) don't assume that everybody knows what it means! I think it is a matter of respect to explain in a short form what some abbreviation means.
Black Belt
270 Views
>>>What is CFD?>>> IIRC "CFD" stands for Computational Fluid Dynamics.
Beginner
270 Views
Yes, it's for Computational Fluid Dynamics. My apologies. We decompose spatial domain in millions of points, and use Navier Stokes equations to resolve fluid advancement in time. Considering that at each iteration, time step is around 1e-8 seconds of real simulated time, and that an iteration takes something like 15s to be done, you can imagine how important it is to optimize calculations. (after 1 week you have 6ms of simulated fluid in average) The example program below is resolving heat diffusion in fluids, and is a good and simple representation of what is done at larger scales in our codes.
Valued Contributor II
270 Views
>>Yes, it's for Computational Fluid Dynamics. My apologies. >>We decompose spatial domain in millions of points, and use Navier Stokes equations to resolve fluid advancement in time. Considering >>that at each iteration, time step is around 1e-8 seconds of real simulated time, and that an iteration takes something like 15s to be done, >>you can imagine how important it is to optimize calculations. (after 1 week you have 6ms of simulated fluid in average). Thank you and that looks Very Intersting. So, I've spent some time today and improved the performance of Test-Case #1 ( LOOP 0 ) for about 12% - 15%. I will upload optimized source codes some time tomorrow since I need to do a couple of more verifications related to a precision loss ( there 3 cases in total and I'll provide details ). Best regards, Sergey
Valued Contributor II
270 Views
>>...after 1 week you have 6ms of simulated fluid in average... What computers do you use? Could you provide some details about a generation of CPU and model? Is it Intel, AMD, or something else? Thanks in advance.
Black Belt
270 Views
>>>We decompose spatial domain in millions of points, and use Navier Stokes equations to resolve fluid advancement in time>>> What numerical approximation is used to solve Navier-Stokes equation?
Beginner
270 Views
Thank you. >> What computers do you use? Could you provide some details about a generation of CPU and model? Is it Intel, AMD, or something else? In fact, a lot of different systems. I ran the code on the following type of architecture: AMD Opteron (don't remember the type), Intel Xeon with 775 socket (JADE 1 at CINES - France), Intel Xeon Westmere with 1366 socket (ANTARES at CRIHAN - France) and I used PowerPC processors with 4.7 Ghz, 32mo L3 cache (ANTARES at IDRISS - France). I used from 16 to 4096 CPUs with MPI library [http://fr.wikipedia.org/wiki/Message_Passing_Interface]. However, most of systems tend to be intel only now, most of the time Westmere and maybe 2011 socket PCUs now because of the tri and quad memory channels (more important than CPU frequency in our cases). I also plan to use Nvidia CUDA Fermi to solve Chemistry because when used in simulations, it calls a lot of mathematical functions and simple matrix operations. >> What numerical approximation is used to solve Navier-Stokes equation? I use Low Mach approach: I suppose that sound propagates at infinite speed. So I cut spatial domain on a small grid with respects to Kolmogorov Scale [http://en.wikipedia.org/wiki/Kolmogorov_microscales] which is an important part of Direct Numerical Simulations. The order of spatial step is around 10 µm. Then, Navier Stokes equations are solved using discretization [http://en.wikipedia.org/wiki/Finite_difference] and finites difference (6 order) for space and Runge Kutta time advancement. (3 iterations for one time step). To conclude, I need to solve a Poisson equations for pressure resolution which is implicit and difficult to solve on memory distributed CPU. I use a BICGSTAB approach combined with a multigrid preconditioning system to solve respectively high and low frequencies. Our code to solve Poisson converges in 15 Iterations in average, independently of the number of CPU used, which is a good achievement. A good example of results can be found here: https://www.youtube.com/results?search_query=direct+numerical+simulation
Black Belt
270 Views
>>>I also plan to use Nvidia CUDA Fermi to solve Chemistry because when used in simulations, it calls a lot of mathematical functions and simple matrix operations.>>> CUDA will be very helpful in your case.Do chemistry calculation could be easily vectorized like a video processing? >>>To conclude, I need to solve a Poisson equations for pressure resolution which is implicit and difficult to solve on memory distributed CPU. I use a BICGSTAB approach combined with a multigrid preconditioning system to solve respectively high and low frequencies. Our code to solve Poisson converges in 15 Iterations in average, independently of the number of CPU used, which is a good achievement.>>> Very interesting info,but it is too advanced for me.I have only written 1D numerical integrators.By reading your code it could be implemented relatively easily with the help of intrinsics.
Valued Contributor II
270 Views
>>...In fact, a lot of different systems. I ran the code... Is it an Open Source project?
Valued Contributor II
270 Views
Here are some comments: - Review all equations because in some cases they could be normalized in order to reduce number of multiplications ( it is related to variables InvDxDx, InvDyDy, DtAlpha ) - I evaluated performance of 4 versions of the Test-Case #1 ( LOOP 0 ) and numbers are as follows: [ With Microsoft C/C++ compiler / Release configuration / All optimizations are Disabled ] --- Version 1 ( Original ) = Base --- Version 2 ( Combined / Process priority Normal ) = Improvement by ~8.42% --- Version 3 ( Unrolled Loops 2-in-1 / Process priority Normal ) = Improvement by ~9.48% --- Version 4 ( Unrolled Loops 3-in-1 / Process priority Normal ) = Improvement by ~10.95% --- Version 4 ( Unrolled Loops 3-in-1 / Process priority Real-time ) = Improvement by ~12.09% [ With MinGW C/C++ compiler / Release configuration / All optimizations are Disabled ] --- Version 4 ( Unrolled Loops 3-in-1 / Process priority Real-time ) = Improvement by ~16.02% - I was very impressed with performance results when MinGW C/C++ compiler was used - I tried to use 'float' data type instead of 'double' data type and I don't recommend to use 'floats' ( performance is almost the same however there is a significant precision loss ) - If all calculations are done on a dedicated computer I recommend to use a priority boost to Real-time or High - I don't expect a significant improvement in performance in a case when C/C++ compiler optimizations are enabled and I would consider instead a better hand-optimization with SSE or AVX instructions - Review a Test-Case #3 ( your SSE codes ) because you've declared a couple of 128-bit variables on the stack and it affects performance ( some memory for these variables will be allocated and de-allocated more than ~208,080,000 times )
Valued Contributor II
270 Views
Modified sources attached.
Valued Contributor II
270 Views
Here are some details regarding a precision loss: ... 61 89 Ref=0.039838 Optimized=0.039839 ... 75 34 Ref=0.326130 Optimized=0.326131 ... 86 57 Ref=0.093223 Optimized=0.093224 ... There are only three cases and all of them related to limitations of IEE 754 Standard. You need to decide if this is acceptable for your R&D work. I also attached a zip-file with outputs for all versions of codes.
Valued Contributor II
270 Views
>>... I also attached a zip-file with outputs for all versions of codes... I used Microsoft's WinDiff utility to compare results.
Valued Contributor II
270 Views
I've done a quick test with a 20-bit precision Fixed Floating Point ( FFP ) type instead of a 53-bit precision Double-Precision floating point 'double' type. I could say that was a right decision to use 'double' data type because there was a significant loss in precision of results if FFP types are used.
Beginner
270 Views