Intel® Moderncode for Parallel Architectures
Support for developing parallel programming applications on Intel® Architecture.

Nested for loop, data dependency

Aniruddha_P_
Beginner

I have a matrix solver (BiCCG) which is used to solve a set of algebraic equations arising from a three-dimensional computational domain. I have tried to parallelise it using OpenMP but am struggling with performance issues. On inspecting the code with Intel Advisor, it is evident that almost 80% of the solution time goes into the solver, and one function accounts for 50% of the solution time. Digging deeper, I found that 5 of that function's 6 loops perform terribly, with no automatic vectorization because they suffer from data dependencies. What I do see is that there is a dependency (for example in loop 3), because the i-th iteration uses the (i-1)-th iteration's values.

How can I change the design of the parallelisation so that it is as efficient as possible with this algorithm (rather than changing the algorithm altogether)? Would specifying #pragma omp simd safelen(1) help?

The loops are as follows (every loop except loop 5 fails to vectorise automatically; together they are the main bottleneck of the function):

# pragma omp parallel num_threads(NTt) default(none) private(i,j,k, mythread, dummy) shared(STA,res_sparse_s,COEFF,p_sparse_s, ap_sparse_s,h_sparse_s,RLL, pipi_sparse, normres_sparse, riri_sparse,riri_sparse2,noemer_sparse, nx, ny, nz, nv, PeriodicBoundaryX, PeriodicBoundaryY, PeriodicBoundaryZ)
{

    mythread  = omp_get_thread_num();//0

    // loop 1
        #pragma omp for reduction(+:pipi_sparse)
        for (i=1; i<=nx; i++) for (j=1; j<=ny; j++) for (k=1; k<=nz; k++)
        {                    
            dummy = COEFF[i][j][k][6] * p_sparse_s[i][j][k];

            if (PeriodicBoundaryX && i == 1)  dummy += COEFF[i][j][k][0] * p_sparse_s[nx ][j][k];
            else                              dummy += COEFF[i][j][k][0] * p_sparse_s[i-1][j][k];

            if (PeriodicBoundaryX && i == nx) dummy += COEFF[i][j][k][1] * p_sparse_s[1  ][j][k];
            else                              dummy += COEFF[i][j][k][1] * p_sparse_s[i+1][j][k];

            if (PeriodicBoundaryY && j == 1)  dummy += COEFF[i][j][k][2] * p_sparse_s[i][ny ][k];
            else                              dummy += COEFF[i][j][k][2] * p_sparse_s[i][j-1][k];

            if (PeriodicBoundaryY && j == ny) dummy += COEFF[i][j][k][3] * p_sparse_s[i][1  ][k];
            else                              dummy += COEFF[i][j][k][3] * p_sparse_s[i][j+1][k];

            if (PeriodicBoundaryZ && k == 1)  dummy += COEFF[i][j][k][4] * p_sparse_s[i][j][nz ];
            else                              dummy += COEFF[i][j][k][4] * p_sparse_s[i][j][k-1];

            if (PeriodicBoundaryZ && k == nz) dummy += COEFF[i][j][k][5] * p_sparse_s[i][j][1  ];
            else                              dummy += COEFF[i][j][k][5] * p_sparse_s[i][j][k+1];

            ap_sparse_s[i][j][k] = dummy;
            pipi_sparse += p_sparse_s[i][j][k] * ap_sparse_s[i][j][k];
        }


    // loop 2
        #pragma omp for reduction(+:normres_sparse)
        for (i=1; i<=nx; i++) for (j=1; j<=ny; j++) for (k=1; k<=nz; k++)
        {
            STA[i][j][k]          += (riri_sparse / pipi_sparse) * p_sparse_s[i][j][k];
            res_sparse_s[i][j][k] -= (riri_sparse / pipi_sparse) * ap_sparse_s[i][j][k];

            normres_sparse += (res_sparse_s[i][j][k] * res_sparse_s[i][j][k]) / nv; // need to take the square root separately
        }


    // loop 3
        // FORWARD
        #pragma omp for schedule(static, nx/NTt)
        for (i=1; i<=nx; i++) for (j=1; j<=ny; j++) for (k=1; k<=nz; k++)
        {


            dummy = res_sparse_s[i][j][k];

                                            dummy -= COEFF[i][j][k][7] * RLL[i-1][j][k];
            if (PeriodicBoundaryX && i==nx) dummy -= COEFF[i][j][k][8] * RLL[1  ][j][k];

                                            dummy -= COEFF[i][j][k][2] * RLL[i][j-1][k];
            if (PeriodicBoundaryY && j==ny) dummy -= COEFF[i][j][k][3] * RLL[i][1  ][k];

                                            dummy -= COEFF[i][j][k][4] * RLL[i][j][k-1];
            if (PeriodicBoundaryZ && k==nz) dummy -= COEFF[i][j][k][5] * RLL[i][j][1  ];

            RLL[i][j][k] = dummy / h_sparse_s[i][j][k];
        }

    // loop 4
        // BACKWARD
        #pragma omp for schedule(static, nx/NTt)
        for (i=nx; i>=1;i--) for (j=ny; j>=1;j--) for (k=nz; k>=1;k--)
        {
            dummy = RLL[i][j][k] * h_sparse_s[i][j][k];

            if (PeriodicBoundaryX && i==1) dummy -= COEFF[i][j][k][7] * RLL[nx ][j][k];
                                           dummy -= COEFF[i][j][k][8] * RLL[i+1][j][k];

            if (PeriodicBoundaryY && j==1) dummy -= COEFF[i][j][k][2] * RLL[i][ny ][k];
                                           dummy -= COEFF[i][j][k][3] * RLL[i][j+1][k];

            if (PeriodicBoundaryZ && k==1) dummy -= COEFF[i][j][k][4] * RLL[i][j][nz ];
                                           dummy -= COEFF[i][j][k][5] * RLL[i][j][k+1];

            RLL[i][j][k] = dummy / h_sparse_s[i][j][k];
        }


    // loop 5
        #pragma omp for reduction(+:riri_sparse2)
        for (i=1; i<=nx; i++) for (j=1; j<=ny; j++) for (k=1; k<=nz; k++)
        {
            riri_sparse2 += res_sparse_s[i][j][k] * RLL[i][j][k];
        }

    // loop 6
        #pragma omp for
        for (i=1; i<=nx; i++) for (j=1; j<=ny; j++) for (k=1; k<=nz; k++)
        {
            p_sparse_s[i][j][k] = RLL[i][j][k] + (riri_sparse2 / noemer_sparse) * p_sparse_s[i][j][k];
        }
    }

    noemer_sparse = riri_sparse2;
    riri_sparse = riri_sparse2;

    return;

}

 

6 Replies
TimP
Honored Contributor III

If you don't have the time to test the more obvious tactics, it seems unlikely you will succeed. 

safelen(1) is an obvious tactic to prevent vectorization, should that be your intent.  You would likely need the simd clause only if you don't use aggressive compile options such as icc's -fp-model fast (the default for icc); for gcc you would need -ffast-math, with or without the simd clause.  You don't save yourself much work by leaving loop invariants implicit: take those scalar divides out of the loop yourself.

Not that it necessarily matters, but it seems cleaner first to remove the redundant privates, and then to eliminate others by declaring those variables locally where they are used.

It looks trivial to try splitting the inner reduction loops off.  It may not even lose data locality, if you don't split the outer loops.

Your stride of 9 or more on COEFF[] will likely prevent useful vectorization of loops using that array.

If your problem with loop 6 is that it won't compile due to apparent omissions, why do you ask about it?

You would probably need to split the inner loops so as to see whether you can vectorize the case of no periodic boundary; you probably have little chance of efficiency anyway if that case isn't responsible for most of the time spent.  You might vectorize the PeriodicBoundaryZ case by splitting out for (k=2; k<nz; k++).  The fake dependency in loops 3 and 4 is removed this way.
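For illustration, here is a self-contained toy of that peeling idea on a 1D periodic stencil (the function name, array names and coefficients are made up for the sketch, not taken from the code above):

/* Toy peel of a periodic 1D stencil: y[k] = c0*x[k-1] + c1*x[k] + c2*x[k+1],
   k = 1..nz, with wrap-around at both ends.  The wrapped end points are handled
   outside the loop, so the interior has no boundary tests and can vectorize. */
void stencil_peeled(int nz, const double *x, double *y,
                    double c0, double c1, double c2)    /* x, y indexed 1..nz */
{
    if (nz < 2) return;                                 /* nothing to peel */

    y[1] = c0 * x[nz] + c1 * x[1] + c2 * x[2];          /* k == 1: left neighbour wraps */

    #pragma omp simd                                    /* branch-free interior */
    for (int k = 2; k <= nz - 1; k++)
        y[k] = c0 * x[k-1] + c1 * x[k] + c2 * x[k+1];

    y[nz] = c0 * x[nz-1] + c1 * x[nz] + c2 * x[1];      /* k == nz: right neighbour wraps */
}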

In my experience with several compilers, optimizing a reduction inside nested loops usually requires that you take the trouble to make a separate named reduction variable, with a simd reduction on the inner loop.
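Something along these lines, for instance (a self-contained toy with made-up names rather than the arrays above; the point is only the separate named inner-loop reduction variable):

/* Toy nested reduction over an ni x nk block: the inner loop accumulates into
   a named local so the compiler can emit a SIMD reduction, while the outer
   loop carries the OpenMP thread-level reduction. */
double dot2d(int ni, int nk, double p[ni][nk], double ap[ni][nk])
{
    double total = 0.0;
    #pragma omp parallel for reduction(+:total)
    for (int i = 0; i < ni; i++) {
        double inner = 0.0;                    /* separate named reduction variable */
        #pragma omp simd reduction(+:inner)
        for (int k = 0; k < nk; k++)
            inner += p[i][k] * ap[i][k];
        total += inner;
    }
    return total;
}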

Aniruddha_P_
Beginner

Hi Tim,

It seems I have already implemented most of these suggestions, namely splitting the loops for separate reduction clauses and using simd, which definitely improves the performance of loops 2 and 6. I am currently rewriting COEFF as a 3D rather than a 4D matrix, which should help vectorisation. For loops 1, 3 and 4, which are the most compute-intensive loops right now, I am still stuck on how to remove the data dependencies.

  • Loop 1 - the reduction line for pipi_sparse can be moved out of the loop into another loop with a reduction directive. I tried it: the new loop is vectorised, but the old one, now without the reduction line, still suffers from a data dependency and is not vectorised. Looking at it, ap_sparse only depends on the p_sparse matrix at the i+1, i-1, j+1, j-1, k+1, k-1 locations, BUT the p_sparse matrix is read-only and is not written anywhere in the loop.
  • Loop 3 - a data dependency does exist: RLL depends on the RLL values at i-1, j-1 and k-1. Would skewing the loop help in this case?
  • Loop 4 - a data dependency does exist: RLL depends on the RLL values at i+1, j+1 and k+1. Would skewing (a wavefront transformation) help here? (See the toy wavefront sketch after this list.)
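For reference, the usual wavefront formulation updates all points on a hyperplane i+j+k = const in parallel, since they depend only on the previous hyperplane. A self-contained 2D toy sketch of the idea (the grid size, array name and update rule are placeholders, not the solver's):

#define N 8   /* toy grid: interior points 1..N, halo at index 0 assumed initialised */

/* Toy 2D wavefront sweep: u[i][j] depends on u[i-1][j] and u[i][j-1], so all
   points on the anti-diagonal i + j == d are mutually independent and can be
   updated in parallel once diagonal d-1 is complete. */
void wavefront_sweep(double u[N+1][N+1])
{
    for (int d = 2; d <= 2 * N; d++) {               /* sequential over diagonals */
        int ilo = (d - N > 1) ? d - N : 1;
        int ihi = (d - 1 < N) ? d - 1 : N;
        #pragma omp parallel for
        for (int i = ilo; i <= ihi; i++) {           /* parallel within one diagonal */
            int j = d - i;
            u[i][j] = 0.5 * (u[i-1][j] + u[i][j-1]);
        }
    }
}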
TimP
Honored Contributor III

I'll amplify a bit more.  Your periodicity conditions aren't true data dependencies, in that they may be vectorized by the usual peeling method (splitting the end conditions out of the loop). However, you have exceeded the number of peels which the compiler will perform automatically (1 or 2?), even though, taking the inner loop by itself, the situation is easily manageable if it is recognized.   You may also be pushing against the number of privates for which icc will optimize fully, so it may help to clean those up.

In loop 2, you could easily remove the division from the loop explicitly, if you don't care to examine whether the compiler did it properly.  In fact, the compiler would be violating the C standard if it took the /nv out of the loop by postponing the divide until afterwards, but it is an excellent change to make by hand, provided you don't over- or under-flow.
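In other words, something like this toy shape (1D placeholder arrays standing in for the 3D ones; the scalars follow loop 2):

/* Toy version of loop 2 with the loop-invariant divides hoisted out by hand.
   Note: doing the /nv once outside the loop changes the rounding slightly,
   which is where the over/underflow caveat above applies. */
void loop2_shape(int n, double *STA, double *res, const double *p, const double *ap,
                 double riri_sparse, double pipi_sparse, double nv,
                 double *normres_sparse)
{
    const double alpha  = riri_sparse / pipi_sparse;   /* hoisted divide */
    const double inv_nv = 1.0 / nv;                    /* hoisted divide */
    double norm = 0.0;

    #pragma omp parallel for simd reduction(+:norm)
    for (int i = 0; i < n; i++) {
        STA[i] += alpha * p[i];
        res[i] -= alpha * ap[i];
        norm   += res[i] * res[i] * inv_nv;
    }
    *normres_sparse = norm;                            /* square root still taken by the caller */
}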

In loop 4, the compiler probably won't vectorize unless it can reverse the loop automatically.  Again, that involves further technical violations of the C standard unless you write the reversal explicitly.  I submitted problem reports for years about the compiler needing to handle more reversed-loop cases automatically; icc may do the simplest ones now, but asking it to reverse and peel is probably outside its limits.  As the loop does nothing but introduce roundoff errors at the interior points, not computing those points should be advantageous, even if none of what remains is vectorized (and then reversal may not be needed).

If you have split the outer parallel loops, you should probably undo that and split only the inner loops.  This would improve data locality and avoid the overhead of starting and ending parallel loops.  It doesn't look as if you would have any work imbalance, other than what may be introduced by problems with data locality.
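Roughly this shape, for example (a toy sketch with made-up arrays and work; the point is a single parallel loop over i containing the split inner passes, rather than several separately parallelised outer loops):

/* Toy sketch: one parallel loop over i, with the work split only at the
   inner-loop level, so each thread keeps re-using the rows it just touched. */
void sweep_shape(int ni, int nj, double a[ni][nj], double b[ni][nj])
{
    #pragma omp parallel for
    for (int i = 1; i < ni - 1; i++) {
        for (int j = 1; j < nj - 1; j++)       /* inner pass 1: stencil-type work */
            b[i][j] = a[i][j-1] + a[i][j+1];

        for (int j = 1; j < nj - 1; j++)       /* inner pass 2: update, split off from pass 1 */
            a[i][j] += 0.25 * b[i][j];
    }
}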

Aniruddha_P_
Beginner

Hi Tim, I am back after a lot of trials of different things. Let's take the example of loop 1: I tried rewriting the loop to run from 2 to nx-1, 2 to ny-1 and 2 to nz-1, BUT then I am left with a lot of permutations to cover the whole surface of the cube for the periodic boundary condition.  Is there any other way out? Also, the optimisation report shows dependency issues in loop 1 (and also in loops 3 and 4).

I can understand the reason for the data dependency in loops 3 and 4, since the value of RLL depends on previous- and next-iteration values, for which I think I would have to resort to loop skewing to remove the dependency. BUT in loop 1 only values from p_sparse_s are accessed, and p_sparse_s is not being updated. The assumed dependency points to line 032 (in the top code). Any pointers?

Aniruddha_P_
Beginner

Updating with a simple (easy-to-run) example. Matrices a and b have a dependency of iteration i,j,k on the i-1, j-1 and k-1 iterations.

 

#include<stdio.h>
#include<stdlib.h>
#include<time.h>
#include<omp.h>

typedef double lr;

#define nx 4
#define ny 4
#define nz 4

void
print3dmatrix(double a[nx+2][ny+2][nz+2])
{
    for(int i=1; i<= nx; i++) {
        for(int j=1; j<= ny; j++) {
            for(int k=1; k<= nz; k++) {
                printf("%f ", a);
            }
            printf("\n");
        }
        printf("\n");
    }
}

int 
main()
{

    double a[nx+2][ny+2][nz+2];
    double b[nx+2][ny+2][nz+2];

    srand(3461833726);


    // matrix filling 
    // b is just a copy of a
    for(int i=0; i< nx+2; i++) for(int j=0; j< ny+2; j++) for(int k=0; k< nz+2; k++)
    {
        a[i][j][k] = rand() % 5;
        b[i][j][k] = a[i][j][k];
    }

    // loop 1
    //#pragma omp parallel for num_threads(1)
    for(int i=1; i<= nx; i++) for(int j=1; j<= ny; j++) for(int k=1; k<= nz; k++)
    {
        a[i][j][k] = -1*a[i-1][j][k] - 1*a[i][j-1][k] - 1*a[i][j][k-1] + 4*a[i][j][k];
    }

    print3dmatrix(a);
    printf("******************************\n");

    // loop 2
    //#pragma omp parallel for num_threads(1)
    for(int i=1; i<= nx; i++) 
        for(int j=1; j<= ny; j++)
            // #pragma omp simd
            for(int m=j+1; m<= j+nz; m++)
            {
                b[i][j][m-j] = -1*b[i-1][j][m-j] - 1*b[i][j-1][m-j] - 1*b[i][j][m-j-1] + 4*b[i][j][m-j];
            }

    print3dmatrix(b);
    printf("=========================\n");

    return 0;
}

Key observations-

  1. Matrix a is filled with random numbers between 0 and 5; loop 1 is the untransformed original loop, whereas loop 2 is the transformed loop.
  2. The transformed loop has been skewed so as to remove the dependency (at least for the inner loop).
  3. After the operation, matrices a and b are the same if run without the OpenMP parallelisation.
  4. If OpenMP is deployed, the answers change (maybe because of race conditions) [the loop is not parallelisable irrespective of where the pragma is placed].
  5. If #pragma omp simd is used to enforce vectorisation of the innermost loop, it fails.

Any suggestions on how to remove this dependency in this simplified example?

jimdempseyatthecove
Honored Contributor III

The inner loop (line 61) has a loop-order dependency:

b[i][j][m-j] = -1*b[i-1][j][m-j] - 1*b[i][j-1][m-j] - 1*b[i][j][m-j-1] + 4*b[i][j][m-j];

The [m-j-1] on the rhs requires the lhs value written in the prior iteration (m-1), which may not be available due to vectorization.

I haven't tested the following code; try this (and report back):

    for(int i=1; i<= nx; i++) 
        for(int j=1; j<= ny; j++)
        {
            double bTemp[nz+2];

            #pragma omp simd
            for(int m=j+1; m<= j+nz; m++)
            {
                bTemp[m-j] = -1*b[i-1][j][m-j] - 1*b[i][j-1][m-j] - 1*b[i][j][m-j-1] + 4*b[i][j][m-j];
            }
            #pragma omp simd
            for(int m=j+1; m<= j+nz; m++)
            {
                b[i][j][m-j] = bTemp[m-j];
            }
        }

Note: bTemp should be cache-line aligned for a little extra boost (depending on the CPU).
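For example, one way to get that alignment (a small fragment assuming C11, or the GCC/ICC attribute extension, in place of the bTemp declaration above):

            alignas(64) double bTemp[nz + 2];   /* cache-line-aligned scratch row; needs <stdalign.h> in C11 */
            /* or, with GCC/ICC extensions: double bTemp[nz + 2] __attribute__((aligned(64))); */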

Jim Dempsey
