ScaLAPACK pdgetrf_ factorises only one block of global matrix

Robert_S_12 · ‎07-07-2015

Hi all,

I am modifying a large C program to factorise a dense square matrix and solve the resulting system of 916 equations by back-substitution in parallel. I set up a 4x4 process grid and distribute one 229x229 block to each process. My call to pdgetrf_ operates on only the first block of the matrix, returning its LU decomposition embedded in the original matrix. Partitioning and broadcasting the global matrix to each process using pdgeadd_ works fine, as does gathering the result, also using pdgeadd_ and dgsum2d_. I believe my choice of parameters is consistent with the documentation, but I just can't get things to work as intended.

I get the feeling I am misunderstanding something. Is pdgetrf_ meant to return the factorisation of the whole distributed matrix after all processes have finished? Or should it return submatrix LU decompositions block by block?

I have played around with the indices iA, jA a bit. Searching the forums and documentation, it seems like conventional wisdom is to set these to (1, 1) for all processes. But I know where my blocks begin in the global matrix by design, and it seems intuitive that I could pass these row, column indices to pdgetrf_. Is there any situation in which iA and jA can differ for each process? I have tried specifying different indices based on the current process row and column, e.g. (1, 1), (1, 230), (1, 459), etc. Unsurprisingly, pdgetrf_ complains on every process, returning info = -4. If I pass one pair of legal indices to every process, say (230, 230), the appropriate sub-block is factorized and gathered into its place in the global matrix.

If it helps, I am linking to MKL through Composer XE 2013 SP 1.2.144 and using OpenMPI. My link line appears to be correct and I get no compiler warnings from mpicc, and no runtime errors. I run the program using PBS and mpirun, OMP_NUM_THREADS is set to 1, and my system has two machines with eight cores each.

Any insight into where I am going awry would be welcome. Thank you in advance for your help! Please let me know if there is any other useful information that I haven't provided.

Here's the relevant code snippet. This is currently the only part of the program that calls PBLAS and ScaLAPACK. IC is the dense 916x916 matrix to be factorised, and B is a 916x1 vector. The rest of the program uses nrutil dvector and dmatrix data structures, so I collect these into row-major arrays before proceeding with ScaLAPACK.

#include "mpi.h"
#include <mkl_scalapack.h>
#include <mkl_blacs.h>
#include <mkl_pblas.h>

typedef MKL_INT MDESC[9];

        ...

        // Define MKL constants and parameters
        MKL_INT i_zero = 0, i_one = 1, i_two = 2, i_four = 4, i_negone = -1;
        double zero = 0.0E0, one = 1.0E0, two = 2.0E0, negone = -1.0E0;
        MKL_INT ictxt, info, iam, nprocs;
        // num_eqns is determined elsewhere; in this case, it is 916
        MKL_INT n = num_eqns, nb = 229, mp, nq, lld, lld_local, nprow = 4, npcol = 4, myrow, mycol;
        MKL_INT iIC = 1, jIC= 1, iB = 1, jB = 1;
        MKL_INT iwork[4];
        MKL_INT *ipiv;
        
        // Store influence matrix as array
        double *IC, *B, *IC_local, *B_local, *IC_result, *B_result;
        IC_local = (double*) calloc(n*n, sizeof(double));
        B_local = (double*) calloc(n, sizeof(double));
                   
        for(i = 0; i < n; i++) {
            for(j = 0; j < n; j++) {
                IC_local[i + n*j] = ic_matrix[j+1][i+1];
            }
            B_local = b_vector[i+1];
        }

        blacs_pinfo_(&iam, &nprocs);
         
        // Initialise process grid for main computations
        blacs_get_(&i_negone, &i_zero, &ictxt);
        blacs_gridinit_(&ictxt, "R", &nprow, &npcol);
        blacs_gridinfo_(&ictxt, &nprow, &npcol, &myrow, &mycol);
        
        IC_result = (double*) calloc(n*n, sizeof(double));
        B_result = (double*) calloc(n, sizeof(double));
        
        for(i = 0; i < n; i++) {
            B_result = 0;
        }
        
        blacs_barrier_(&ictxt, "A");
        
        // Find dimensions of local blocks
        mp = numroc_(&n, &nb, &myrow, &i_zero, &nprow);
        nq = numroc_(&n, &nb, &mycol, &i_zero, &npcol);
        lld_local = MAX(numroc_(&n, &n, &myrow, &i_zero, &nprow), 1);
        lld = MAX(mp, 1);

        // Allocate distributed arrays
        IC = (double*) calloc(mp*nq, sizeof(double));
        B = (double*) calloc(mp, sizeof(double));
        ipiv = (MKL_INT*) calloc(lld + nb, sizeof(int));
        
        // Initialise array descriptor for local matrices
        MDESC descIC, descB, descIC_local, descB_local, descIC_result, descB_result;
        descinit_(descIC_local, &n, &n, &n, &n, &i_zero, &i_zero, &ictxt, &lld_local, &info);
        descinit_(descB_local, &n, &i_one, &n, &i_one, &i_zero, &i_zero, &ictxt, &lld_local, &info);
        descinit_(descIC_result, &n, &n, &n, &n, &i_zero, &i_zero, &ictxt, &lld_local, &info);
        descinit_(descB_result, &n, &i_one, &n, &i_one, &i_zero, &i_zero, &ictxt, &lld_local, &info);

        // ... and distributed matrices
        descinit_(descIC, &n, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &lld, &info);
        descinit_(descB, &n, &i_one, &nb, &nb, &i_zero, &i_zero, &ictxt, &lld, &info);
       
        // Distribute arrays onto process grid using pdgeadd
        pdgeadd_("N", &n, &n, &one, IC_local, &i_one, &i_one, descIC_local, &zero, IC, &i_one, &i_one, descIC);
        pdgeadd_("N", &n, &i_one, &one, B_local, &i_one, &i_one, descB_local, &zero, B, &i_one, &i_one, descB);

        // Get LU factorisation      
        pdgetrf_(&mp, &nq, IC, &i_one, &i_one, descIC, ipiv, &info);      
        blacs_barrier_(&ictxt, "A");
        
        // Solve equations: currently, I don't run this
        // pdgetrs_("N", &nq, &i_one, IC, &iIC, &jIC, descIC, ipiv, B, &iB, &jB, descB, &info);
        blacs_barrier_(&ictxt, "A");

        // Gather: if I comment out pdgetrf_ above, this returns the original IC and B arrays
        pdgeadd_("N", &mp, &nq, &one, IC, &i_one, &i_one, descIC, &zero, IC_result, &i_one, &i_one, descIC_result);
        pdgeadd_("N", &n, &i_one, &one, B, &i_one, &i_one, descB, &zero, B_result, &i_one, &i_one, descB_result);
        blacs_barrier_(&ictxt, "A");

        dgsum2d_(&ictxt, "A", " ", &n, &n, IC_result, &lld_local, &i_negone, &i_zero);
        dgsum2d_(&ictxt, "A", " ", &n, &n, B_result, &lld_local, &i_negone, &i_zero);
        
        blacs_barrier_(&ictxt, "A");
        blacs_gridexit_(&ictxt);

...

Ying_H_Intel · ‎07-08-2015

Hi Robert,

As i see, most of your code are no problem. What is wrong with the real code?

pdgetrf_ meant to return the factorisation of the whole distributed matrix after the function called. And it is right that conventional wisdom is to set these to (1, 1) for all processes.

The documentation explain the input and output. l pdgetrf(m, n, a, ia, ja, desca, ipiv, info)

The p?getrf routine forms the LU factorization of a general m-by-n distributed matrix Gob(A) = A(ia:ia+m-1, ja:ja+n-1)

a:

(local)
REALfor psgetrf
DOUBLE PRECISIONfor pdgetrf
COMPLEXfor pcgetrf
DOUBLE COMPLEXfor pzgetrf.
Pointer into the local memory to an array of local dimension (lld_a,
LOCc(ja+n-1)).

ia, ja (global) INTEGER. The row and column indices in the global array A

indicating the first row and the first column of the submatrix A(ia:ia+n-1,
ja:ja+n-1), respectively

So ia, ja should be related to global array A, not the position of local A (229x229). As In your case, Glob A is 916 x 916, continiously, . A (1: 1+916 -1, 1:j 1+n-1).

both ia and ja are 1 and 1.for all process.

Regards,

Ying

Robert_S_12 · ‎07-08-2015

Thanks for your reply.

I understand now that iA and jA refer to row and column indices in the global array. The code using (1,1) only returns the LU factorisation of the first block of the global matrix.

The documentation says that the array argument to pdgetrf_ is the local block to be factorized, IC in my code, and the descriptor is the descriptor of the global matrix. Am I passing the correct descriptor above? Should it be descIC_local, which describes the entire distributed matrix?

Ying_H_Intel · ‎07-09-2015

Hi Robert,

We happened to be building another sample in the thread pdpotrf() fails to identify SPD matrix and the example in there or in http://stackoverflow.com/questions/31231428/cholesky-with-scalapack (commenting MPI_Bcast()).

You may change the code to pdgetrf and let us know if any result.

I guess,there is issue in last part in your current code, where gather matrix. I test the code, it can't return the total result in

IC_result.

// Gather: if I comment out pdgetrf_ above, this returns the original IC and B arrays
        pdgeadd_("N", &mp, &nq, &one, IC, &i_one, &i_one, descIC, &zero, IC_result, &i_one, &i_one, descIC_result);
        pdgeadd_("N", &n, &i_one, &one, B, &i_one, &i_one, descB, &zero, B_result, &i_one, &i_one, descB_result);
        blacs_barrier_(&ictxt, "A");

Best Regards,

Ying

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#include <mkl_scalapack.h>
#include <mkl_blacs.h>
#include <mkl_pblas.h>

#define MAX(a,b)((a)<(b)?(b):(a))
typedef MKL_INT MDESC[9];

extern "C" {
/* Cblacs declarations */
void Cblacs_pinfo(int*, int*);
void Cblacs_get(int, int, int*);
void Cblacs_gridinit(int*, const char*, int, int);
void Cblacs_pcoord(int, int, int*, int*);
void Cblacs_gridexit(int);
void Cblacs_barrier(int, const char*);
void Cdgerv2d(int, int, int, double*, int, int, int);
void Cdgesd2d(int, int, int, double*, int, int, int);
void descinit_(int*, int*, int*, int*, int*, int*, int*, int*, int*, int*);

int numroc_(int*, int*, int*, int*, int*);
}

int main( int argc, char *argv[] )
{

/************ MPI ***************************/
/* int iam, nprocs;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &iam);
   MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
*/
   /************ BLACS ***************************/
// Define MKL constants and parameters
MKL_INT i_zero = 0, i_one = 1, i_two = 2, i_four = 4, i_negone = -1;
double zero = 0.0E0, one = 1.0E0, two = 2.0E0, negone = -1.0E0;
MKL_INT ictxt, info, iam, nprocs;
// num_eqns is determined elsewhere; in this case, it is 916
MKL_INT n = 5, nb = 2, mp, nq, lld, lld_local, nprow = 2, npcol = 2, myrow, mycol;
MKL_INT iIC = 1, jIC= 1, iB = 1, jB = 1;
MKL_INT iwork[4];
MKL_INT *ipiv;
       int i, j;

// Store influence matrix as array
double *IC, *B, *IC_local, *B_local, *IC_result, *B_result;
IC_local = (double*) calloc(n*n, sizeof(double));
B_local = (double*) calloc(n, sizeof(double));

for(i = 0; i < n; i++) {
for(j = 0; j < n; j++) {
IC_local[i + n*j] = j ; //1.0*rand()/916 * (i + n*j); //ic_matrix[j+1][i+1];
}
B_local = i ; //1.0*rand()*i/916; //b_vector[i+1];
}

blacs_pinfo_(&iam, &nprocs);

// Initialise process grid for main computations
blacs_get_(&i_negone, &i_zero, &ictxt);
blacs_gridinit_(&ictxt, "R", &nprow, &npcol);
blacs_gridinfo_(&ictxt, &nprow, &npcol, &myrow, &mycol);

IC_result = (double*) calloc(n*n, sizeof(double));
B_result = (double*) calloc(n, sizeof(double));

for(i = 0; i < n; i++) {
B_result = 0;
}

blacs_barrier_(&ictxt, "A");

// Find dimensions of local blocks
mp = numroc_(&n, &nb, &myrow, &i_zero, &nprow);
nq = numroc_(&n, &nb, &mycol, &i_zero, &npcol);
lld_local = MAX(numroc_(&n, &n, &myrow, &i_zero, &nprow), 1);
lld = MAX(mp, 1);

// Allocate distributed arrays
IC = (double*) calloc(mp*nq, sizeof(double));
B = (double*) calloc(mp, sizeof(double));
ipiv = (MKL_INT*) calloc(lld + nb, sizeof(int));

// Initialise array descriptor for local matrices
MDESC descIC, descB, descIC_local, descB_local, descIC_result, descB_result;
descinit_(descIC_local, &n, &n, &n, &n, &i_zero, &i_zero, &ictxt, &lld_local, &info);
descinit_(descB_local, &n, &i_one, &n, &i_one, &i_zero, &i_zero, &ictxt, &lld_local, &info);
descinit_(descIC_result, &n, &n, &n, &n, &i_zero, &i_zero, &ictxt, &lld_local, &info);
descinit_(descB_result, &n, &i_one, &n, &i_one, &i_zero, &i_zero, &ictxt, &lld_local, &info);

// ... and distributed matrices
descinit_(descIC, &n, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &lld, &info);
descinit_(descB, &n, &i_one, &nb, &nb, &i_zero, &i_zero, &ictxt, &lld, &info);

// Distribute arrays onto process grid using pdgeadd
pdgeadd_("N", &n, &n, &one, IC_local, &i_one, &i_one, descIC_local, &zero, IC, &i_one, &i_one, descIC);
pdgeadd_("N", &n, &i_one, &one, B_local, &i_one, &i_one, descB_local, &zero, B, &i_one, &i_one, descB);

       int id, r, c;
       for (id = 0; id < nprow*npcol; ++id) {
if (id == iam) {
printf( "A_loc on node %d , mp %d , nq %d \n", id, mp, nq);
for ( r = 0; r < mp; ++r) {
for ( c = 0; c < nq; ++c)
printf(" %f \t ", *(IC+mp*c+r));
printf("\n");
}
printf("\n");
}
Cblacs_barrier(ictxt, "All");
}

// Get LU factorisation
// pdgetrf_(&mp, &nq, IC, &i_one, &i_one, descIC, ipiv, &info);
// blacs_barrier_(&ictxt, "A");

// Solve equations: currently, I don't run this
// pdgetrs_("N", &nq, &i_one, IC, &iIC, &jIC, descIC, ipiv, B, &iB, &jB, descB, &info);
       // Gather: if I comment out pdgetrf_ above, this returns the original IC and B arrays
       printf( "gather matrix");
pdgeadd_("N", &n, &n &one, IC, &i_one, &i_one, descIC, &zero, IC_result, &i_one, &i_one, descIC_result);
//pdgeadd_("N", &n, &i_one, &one, B, &i_one, &i_one, descB, &zero, B_result, &i_one, &i_one, descB_result);


       for (id = 0; id < nprow*npcol; ++id) {
if (id == iam) {
printf( "A_loc on node %d , mp %d , nq %d \n", id, mp, nq);
for ( r = 0; r < mp; ++r) {
for ( c = 0; c < nq; ++c)
printf(" %f \t ", *(IC_result+mp*c+r));
printf("\n");
}
printf("\n");
}
Cblacs_barrier(ictxt, "All");
}

blacs_barrier_(&ictxt, "A");
blacs_gridexit_(&ictxt);
       return 0;
       }