Using SIMD instructions to optimize Numeric 2D derivation

benoit_leveugle · ‎11-08-2012

Hi,

I am trying to optimize a simple loop with SIMD SSE instructions. The code compute a 2D field and calculate heat propagation trough time.

Loops use only simple operations, and reference loop is really simple.

The following program contains 3 loops : loop 0 is the reference, loop 1 is a decomposition of the operations, and loop 2 is the optimize loop. However, the second loop is slightly faster than the first one, and the optimize loop is really slow. I am using x64 gcc 4.7 and icc 11 compilers, sames results with -O2, -O3, -msse2/-xSSE2.

Did I make something wrong ?

With my best regards.

Moomba

[csharp]

#include <stdio.h>

#include <time.h>

#include<emmintrin.h>

#define n1 102

#define n2 102

#define niter 20000

double U0[n1][n2];

double U1[n1][n2];

double U2[n1][n2]; /* buffer*/

double *cU0[n1][n2];

/* simd */

__m128d vU0[n1][n2];

__m128d vU1[n1][n2];

__m128d vU2[n1][n2]; /* buffer*/

int i,j,t;

double Dx,Dy,Lx,Ly,InvDxDx,InvDyDy,Dt,alpha,totaltime,Stab,DtAlpha,DxDx,DyDy;

clock_t time0,time1;

FILE *f1;

int main()

{

/* ---- GENERAL ---- */

   alpha = 0.4;

   totaltime = 1.0;

   Dt = totaltime/((niter-1)*1.0);

   Lx = 1.0;

   Ly = 1.0;

   Dx = Lx/((n1-1)*1.0);

   Dy = Ly/((n2-1)*1.0);

   InvDxDx = 1.0/(Dx*Dx);

   InvDyDy = 1.0/(Dy*Dy);

   DxDx = Dx*Dx;

   DyDy = Dy*Dy;

   Stab = alpha*Dt*(InvDxDx+InvDyDy);

   DtAlpha = Dt*alpha;

/* Stability if result <= 0.5 */

   printf("Stability factor : %f \n",Stab);

/* +----------------+ */

/* |     LOOP 0     | */

/* +----------------+ */

/* Init */

   for( i = 0; i < n1; i++)

   {

      for( j = 0; j < n2; j++)

      {

         U0 = 0.0;

         U1 = 0.0;

      }

   }

   for( i = 0; i < n1; i++)

   {

      U0[0] = 1.0;

      U1[0] = 1.0;

   }

   printf("Init OK \n");

/* Core */

time0=clock();

for( t = 0; t < niter; t++)

{

    /* even */

    for( i = 1; i < n1-1; i++)

    {

       for( j = 1; j < n2-1; j++)

       {

          U1 = U0 + DtAlpha*( (U0[i+1]-2.0*U0+U0[i-1])*InvDxDx + (U0[j+1]-2.0*U0+U0[j-1])*InvDyDy);

       }

    }

    /* odd */

    for( i = 1; i < n1-1; i++)

    {

       for( j = 1; j < n2-1; j++)

       {

          U0 = U1 + DtAlpha*( (U1[i+1]-2.0*U1+U1[i-1])*InvDxDx + (U1[j+1]-2.0*U1+U1[j-1])*InvDyDy);

       }

    }

   }

time1=clock();

printf("Loop 0, total time : %f \n", (double) time1-time0);

f1 = fopen ("out0.dat", "wt");

for( i = 1; i < n1-1; i++)

{

    for( j = 1; j < n2-1; j++)

    {

       fprintf (f1, "%d\t%d\t%f\n", i, j, U0);

    }

}

/* +----------------+ */

/* |     LOOP 1     | */

/* +----------------+ */

/* Init */

   for( i = 0; i < n1; i++)

   {

      for( j = 0; j < n2; j++)

      {

         U0 = 0.0;

         U1 = 0.0;

      }

   }

   for( i = 0; i < n1; i++)

   {

      U0[0] = 1.0;

      U1[0] = 1.0;

   }

   printf("Init OK \n");

/* Core */

time0=clock();

for( t = 0; t < niter; t++)

{

    /* even */

    for( i = 1; i < n1-1; i++)

    {

       for( j = 1; j < n2-1; j++)

       {

          /* U1 = U0 + Dt*alpha*( (U0[i+1]-2.0*U0+U0[i-1])*InvDxDx + (U0[j+1]-2.0*U0+U0[j-1])*InvDyDy); */

          U1 = -2.0*U0;

          U1 = U1 + U0[i+1];

          U1 = U1 + U0[i-1];

          U1 = U1 * InvDxDx;

          U2 = -2.0*U0;

          U2 = U2 + U0[j+1];

          U2 = U2 + U0[j-1];

          U2 = U2 * InvDyDy;

          U1 = U1 + U2;

          U1 = U1 * DtAlpha;

          U1 = U1 + U0;

       }

    }

    /* odd */

    for( i = 1; i < n1-1; i++)

    {

       for( j = 1; j < n2-1; j++)

       {

          U0 = -2.0*U1;

          U0 = U0 + U1[i+1];

          U0 = U0 + U1[i-1];

          U0 = U0 * InvDxDx;

          U2 = -2.0*U1;

          U2 = U2 + U1[j+1];

          U2 = U2 + U1[j-1];

          U2 = U2 * InvDyDy;

          U0 = U0 + U2;

          U0 = U0 * DtAlpha;

          U0 = U0 + U1;

       }

    }

   }

time1=clock();

printf("Loop 1, total time : %f \n", (double) time1-time0);

/* End */

f1 = fopen ("out1.dat", "wt");

for( i = 1; i < n1-1; i++)

{

    for( j = 1; j < n2-1; j++)

    {

       fprintf (f1, "%d\t%d\t%f\n", i, j, U0);

    }

}

/* +----------------+ */

/* |     LOOP 3     | */

/* +----------------+ */

/* Init */

   for( i = 0; i < n1; i++)

   {

      for( j = 0; j < n2; j++)

      {

         vU0 = _mm_set1_pd(0.0);

         vU1 = _mm_set1_pd(0.0);

      }

   }

   for( i = 0; i < n1; i++)

   {

      vU0[0] = _mm_set1_pd(1.0);

      vU1[0] = _mm_set1_pd(1.0);

   }

   printf("Init OK \n");

/* Core */

time0=clock();

for( t = 0; t < niter; t++)

{

    /* even */

    for( i = 1; i < n1-1; i++)

    {

       for( j = 1; j < n2-1; j+=2)

       {

          /* U1 = U0 + Dt*alpha*( (U0[i+1]-2.0*U0+U0[i-1])*InvDxDx + (U0[j+1]-2.0*U0+U0[j-1])*InvDyDy); */

          __m128d va = _mm_set1_pd(-2.0);           /* U1 = -2.0*U0; */

          vU1 = _mm_mul_pd(va,vU0);

          vU1 = _mm_add_pd(vU1,vU0[i+1]);   /* U1 = U1 + U0[i+1]; */

          vU1 = _mm_add_pd(vU1,vU0[i-1]);   /* U1 = U1 + U0[i-1]; */

          __m128d vb = _mm_set1_pd(InvDxDx);           /* U1 = U1 * InvDxDx; */

          vU1 = _mm_mul_pd(vb,vU1);

          vU2 = _mm_mul_pd(va,vU0);           /* U2 = -2.0*U0; */



          vU2 = _mm_add_pd(vU2,vU0[i+1]);   /* U2 = U2 + U0[i+1]; */

          vU2 = _mm_add_pd(vU2,vU0[i-1]);   /* U2 = U2 + U0[i-1]; */



          __m128d vc = _mm_set1_pd(InvDyDy);           /* U2 = U2 * InvDyDy; */

          vU2 = _mm_mul_pd(vc,vU2);



          vU1 = _mm_add_pd(vU1,vU2);       /* U1 = U1 + U2; */



          __m128d vd = _mm_set1_pd(DtAlpha);           /* U1 = U1 * DtAlpha; */

          vU1 = _mm_mul_pd(vd,vU1);



          vU1 = _mm_add_pd(vU1,vU0);       /* U1 = U1 + U0; */



       }

    }

    /* odd */

    for( i = 1; i < n1-1; i++)

    {

       for( j = 1; j < n2-1; j++)

       {

          __m128d va = _mm_set1_pd(-2.0);           /* U0 = -2.0*U1; */

          vU0 = _mm_mul_pd(va,vU1);

          vU0 = _mm_add_pd(vU0,vU1[i+1]);   /* U0 = U0 + U1[i+1]; */

          vU0 = _mm_add_pd(vU0,vU1[i-1]);   /* U0 = U0 + U1[i-1]; */

          __m128d vb = _mm_set1_pd(InvDxDx);           /* U0 = U0 * InvDxDx; */

          vU0 = _mm_mul_pd(vb,vU0);

          vU2 = _mm_mul_pd(va,vU1);           /* U2 = -2.0*U1; */



          vU2 = _mm_add_pd(vU2,vU1[i+1]);   /* U2 = U2 + U1[i+1]; */

          vU2 = _mm_add_pd(vU2,vU1[i-1]);   /* U2 = U2 + U1[i-1]; */



          __m128d vc = _mm_set1_pd(InvDyDy);           /* U2 = U2 * InvDyDy; */

          vU2 = _mm_mul_pd(vc,vU2);



          vU0 = _mm_add_pd(vU0,vU2);       /* U0 = U0 + U2; */



          __m128d vd = _mm_set1_pd(DtAlpha);           /* U0 = U0 * DtAlpha; */

          vU0 = _mm_mul_pd(vd,vU0);



          vU0 = _mm_add_pd(vU0,vU1);       /* U0 = U0 + U1; */

       }

    }

   }

time1=clock();

printf("Loop 3, total time : %f \n", (double) time1-time0);

/* End */

f1 = fopen ("out3.dat", "wt");

for( i = 1; i < n1-1; i++)

{

    for( j = 1; j < n2-1; j+=2)

    {

       fprintf (f1, "%d\t%d\t%f\n", i, j, vU0);

    }

}

}

[/csharp]

SergeyKostrov · ‎11-08-2012

>>... >>double U0[n1][n2]; >>double U1[n1][n2]; >>double U2[n1][n2]; /* buffer*/ >>double *cU0[n1][n2]; >>/* simd */ >>__m128d vU0[n1][n2]; >>__m128d vU1[n1][n2]; >>__m128d vU2[n1][n2]; /* buffer*/ >>... Could you try to declare all these buffers with a '__declspec( align( 16 ) )' specificator?

benoit_leveugle · ‎11-08-2012

I changed to __attribute__ ((aligned(16))) because my gcc couldn't use the __declspec( align( 16 ) ). [csharp] __attribute__ ((aligned(16))) double U0[n1][n2]; __attribute__ ((aligned(16))) double U1[n1][n2]; __attribute__ ((aligned(16))) double U2[n1][n2]; __attribute__ ((aligned(16))) __m128d vU0[n1][n2]; __attribute__ ((aligned(16))) __m128d vU1[n1][n2]; __attribute__ ((aligned(16))) __m128d vU2[n1][n2]; [/csharp] It has no effects, times are the sames. I tried on a Core 2 and on an Xeon i7, same effects.

SergeyKostrov · ‎11-08-2012

>>...the second loop is slightly faster than the first one, and the optimize loop is really slow... Actually, I would expect a different situation regarding performance: Fastest - 'optimize loop' ( SSE2 ) Then - 'first one' ( Reference ) Followed by - 'second one' ( Decomposition ) I can't promise for 100% but I'll try to look at your case tomorrow on a Windows XP ( 32-bit ) and VS 2005 ( MS & Intel C++ compilers ).

benoit_leveugle · ‎11-09-2012

I would really appreciate if you could test it. This very simple code simulate the same kind of loop we face in CFD computations, and our codes can stay weeks running on 1024 Westmer CPUs. Reducing the cost of this loop would be very effective in terms of time and energy consumption. On top of that, with the new AVX, I expect results to be two times better.

SergeyKostrov · ‎11-10-2012

I will investigate your test-cases #1 and #3 and won't do anything with the test case #2. >>...I would really appreciate if you could test it. This very simple code simulate the same kind of loop we face in CFD computations... What is CFD? It happens again and again. Guys, when you're using some abbreviation(s) don't assume that everybody knows what it means! I think it is a matter of respect to explain in a short form what some abbreviation means.

Bernard · ‎11-10-2012

>>>What is CFD?>>> IIRC "CFD" stands for Computational Fluid Dynamics.

benoit_leveugle · ‎11-10-2012

Yes, it's for Computational Fluid Dynamics. My apologies. We decompose spatial domain in millions of points, and use Navier Stokes equations to resolve fluid advancement in time. Considering that at each iteration, time step is around 1e-8 seconds of real simulated time, and that an iteration takes something like 15s to be done, you can imagine how important it is to optimize calculations. (after 1 week you have 6ms of simulated fluid in average) The example program below is resolving heat diffusion in fluids, and is a good and simple representation of what is done at larger scales in our codes.

SergeyKostrov · ‎11-10-2012

>>Yes, it's for Computational Fluid Dynamics. My apologies. >>We decompose spatial domain in millions of points, and use Navier Stokes equations to resolve fluid advancement in time. Considering >>that at each iteration, time step is around 1e-8 seconds of real simulated time, and that an iteration takes something like 15s to be done, >>you can imagine how important it is to optimize calculations. (after 1 week you have 6ms of simulated fluid in average). Thank you and that looks Very Intersting. So, I've spent some time today and improved the performance of Test-Case #1 ( LOOP 0 ) for about 12% - 15%. I will upload optimized source codes some time tomorrow since I need to do a couple of more verifications related to a precision loss ( there 3 cases in total and I'll provide details ). Best regards, Sergey

SergeyKostrov · ‎11-10-2012

>>...after 1 week you have 6ms of simulated fluid in average... What computers do you use? Could you provide some details about a generation of CPU and model? Is it Intel, AMD, or something else? Thanks in advance.

Bernard · ‎11-10-2012

>>>We decompose spatial domain in millions of points, and use Navier Stokes equations to resolve fluid advancement in time>>> What numerical approximation is used to solve Navier-Stokes equation?

benoit_leveugle · ‎11-11-2012

Thank you. >> What computers do you use? Could you provide some details about a generation of CPU and model? Is it Intel, AMD, or something else? In fact, a lot of different systems. I ran the code on the following type of architecture: AMD Opteron (don't remember the type), Intel Xeon with 775 socket (JADE 1 at CINES - France), Intel Xeon Westmere with 1366 socket (ANTARES at CRIHAN - France) and I used PowerPC processors with 4.7 Ghz, 32mo L3 cache (ANTARES at IDRISS - France). I used from 16 to 4096 CPUs with MPI library [http://fr.wikipedia.org/wiki/Message_Passing_Interface]. However, most of systems tend to be intel only now, most of the time Westmere and maybe 2011 socket PCUs now because of the tri and quad memory channels (more important than CPU frequency in our cases). I also plan to use Nvidia CUDA Fermi to solve Chemistry because when used in simulations, it calls a lot of mathematical functions and simple matrix operations. >> What numerical approximation is used to solve Navier-Stokes equation? I use Low Mach approach: I suppose that sound propagates at infinite speed. So I cut spatial domain on a small grid with respects to Kolmogorov Scale [http://en.wikipedia.org/wiki/Kolmogorov_microscales] which is an important part of Direct Numerical Simulations. The order of spatial step is around 10 µm. Then, Navier Stokes equations are solved using discretization [http://en.wikipedia.org/wiki/Finite_difference] and finites difference (6 order) for space and Runge Kutta time advancement. (3 iterations for one time step). To conclude, I need to solve a Poisson equations for pressure resolution which is implicit and difficult to solve on memory distributed CPU. I use a BICGSTAB approach combined with a multigrid preconditioning system to solve respectively high and low frequencies. Our code to solve Poisson converges in 15 Iterations in average, independently of the number of CPU used, which is a good achievement. A good example of results can be found here: https://www.youtube.com/results?search_query=direct+numerical+simulation

Bernard · ‎11-11-2012

>>>I also plan to use Nvidia CUDA Fermi to solve Chemistry because when used in simulations, it calls a lot of mathematical functions and simple matrix operations.>>> CUDA will be very helpful in your case.Do chemistry calculation could be easily vectorized like a video processing? >>>To conclude, I need to solve a Poisson equations for pressure resolution which is implicit and difficult to solve on memory distributed CPU. I use a BICGSTAB approach combined with a multigrid preconditioning system to solve respectively high and low frequencies. Our code to solve Poisson converges in 15 Iterations in average, independently of the number of CPU used, which is a good achievement.>>> Very interesting info,but it is too advanced for me.I have only written 1D numerical integrators.By reading your code it could be implemented relatively easily with the help of intrinsics.

SergeyKostrov · ‎11-11-2012

>>...In fact, a lot of different systems. I ran the code... Is it an Open Source project?

SergeyKostrov · ‎11-11-2012

Here are some comments: - Review all equations because in some cases they could be normalized in order to reduce number of multiplications ( it is related to variables InvDxDx, InvDyDy, DtAlpha ) - I evaluated performance of 4 versions of the Test-Case #1 ( LOOP 0 ) and numbers are as follows: [ With Microsoft C/C++ compiler / Release configuration / All optimizations are Disabled ] --- Version 1 ( Original ) = Base --- Version 2 ( Combined / Process priority Normal ) = Improvement by ~8.42% --- Version 3 ( Unrolled Loops 2-in-1 / Process priority Normal ) = Improvement by ~9.48% --- Version 4 ( Unrolled Loops 3-in-1 / Process priority Normal ) = Improvement by ~10.95% --- Version 4 ( Unrolled Loops 3-in-1 / Process priority Real-time ) = Improvement by ~12.09% [ With MinGW C/C++ compiler / Release configuration / All optimizations are Disabled ] --- Version 4 ( Unrolled Loops 3-in-1 / Process priority Real-time ) = Improvement by ~16.02% - I was very impressed with performance results when MinGW C/C++ compiler was used - I tried to use 'float' data type instead of 'double' data type and I don't recommend to use 'floats' ( performance is almost the same however there is a significant precision loss ) - If all calculations are done on a dedicated computer I recommend to use a priority boost to Real-time or High - I don't expect a significant improvement in performance in a case when C/C++ compiler optimizations are enabled and I would consider instead a better hand-optimization with SSE or AVX instructions - Review a Test-Case #3 ( your SSE codes ) because you've declared a couple of 128-bit variables on the stack and it affects performance ( some memory for these variables will be allocated and de-allocated more than ~208,080,000 times )

SergeyKostrov · ‎11-11-2012

Modified sources attached.

SergeyKostrov · ‎11-11-2012

Here are some details regarding a precision loss: ... 61 89 Ref=0.039838 Optimized=0.039839 ... 75 34 Ref=0.326130 Optimized=0.326131 ... 86 57 Ref=0.093223 Optimized=0.093224 ... There are only three cases and all of them related to limitations of IEE 754 Standard. You need to decide if this is acceptable for your R&D work. I also attached a zip-file with outputs for all versions of codes.

SergeyKostrov · ‎11-11-2012

>>... I also attached a zip-file with outputs for all versions of codes... I used Microsoft's WinDiff utility to compare results.

SergeyKostrov · ‎11-12-2012

I've done a quick test with a 20-bit precision Fixed Floating Point ( FFP ) type instead of a 53-bit precision Double-Precision floating point 'double' type. I could say that was a right decision to use 'double' data type because there was a significant loss in precision of results if FFP types are used.

benoit_leveugle · ‎11-18-2012

A bit late, my apologies. I was on travel and I couldn't get a secured internet connection. I am downloading your code and I will take a look at it tomorrow. >>> CUDA will be very helpful in your case.Do chemistry calculation could be easily vectorized like a video processing? Unfortunately no, and that is the challenge. Another team is working on it, and they face difficulties with the Fortran 77 code that is using a lot of “go to” instructructions. >>> Very interesting info,but it is too advanced for me.I have only written 1D numerical integrators.By reading your code it could be implemented relatively easily with the help of intrinsics. Yes. The equations are relatively complex, but the main core is always the same : Derivative calculations, which is the same than image processing. >>> Is it an Open Source project? Not know, but it is on the way to be, yes. We are currently rewriting some parts of the code to make it more “usable”. Then, we will release the sources, which could be used in OpenFoam For example (an open source fluid mechanic solver). >>> - Review all equations because in some cases they could be normalized in order to reduce number of multiplications ( it is related to variables InvDxDx, InvDyDy, DtAlpha ) I gave you a simple loop. In normal time, Dx and Dy are not constant, so the loop can be more complicated. You need to calculate each terms at a time, but yes, I think we could comment the main loop and rewrite it in a more optimized way. >>> - I was very impressed with performance results when MinGW C/C++ compiler was used Yes, they made impressive improvements lately. >>> - Review a Test-Case #3 ( your SSE codes ) because you've declared a couple of 128-bit variables on the stack and it affects performance ( some memory for these variables will be allocated and de-allocated more than ~208,080,000 times ) I will try this and report results tomorrow. >>> >>... I also attached a zip-file with outputs for all versions of codes... I used Microsoft's WinDiff utility to compare results. >>> I've done a quick test with a 20-bit precision Fixed Floating Point ( FFP ) type instead of a 53-bit precision Double-Precision floating point 'double' type. I could say that was a right decision to use 'double' data type because there was a significant loss in precision of results if FFP types are used. Thank you. I don’t know this program, so I will take a look. In fact, because the solver can perform this loop more than 300 times per iterations, and considering that a run can be more than 100,000 iterations, precision is extremely important. For example, we tried to calculate using simple reals, and the results diverged after only 12 iterations. And if you add chemistry calculations, due to exponential operations, it explodes immediately. On top of that, the BICGSTAB solver converged at 10^-15 of precision, which cannot be achieved with simple precision. I will report tomorrow the results on my Core i7 Xeon.

SergeyKostrov · ‎11-18-2012

Thanks for the feedback! >>...I gave you a simple loop. In normal time, Dx and Dy are not constant, so the loop can be more complicated... I suspected that. >>...In fact, because the solver can perform this loop more than 300 times per iterations, and considering that a run can be more than >>100,000 iterations, precision is extremely important... I don't know if you tried to use 'long double' type but I wouldn't recommend even to try it. I've done lots of tests and it doesn't help in my case. Test is, multiplication of very big matricies.