Intel® oneAPI Math Kernel Library

ScaLAPACK crash using different block sizes

kalen__stoi
Beginner

Hi,

I would like to use the pzgesv routine to solve a system of linear equations, but it crashes if I use block size 64 with, e.g., two processes.

It doesn't crash if I run the program with mpirun -np 1 ./myapp, or if the block size is 4 with any number of processes.

Number of rows = 116, nrhs = 4; OpenMPI; mpicxx -v: gcc version 7.4.0.
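For context, here is a tiny standalone sketch (separate from my application) that only prints how many rows each process row would own for these block sizes, using numroc_ and assuming the 2x1 row grid that two processes would typically give here:

// Local row counts of an M x M matrix under the block-cyclic distribution,
// for the two block sizes mentioned above (4 and 64).
#include <iostream>
#include <mkl_scalapack.h>   // numroc_

int main()
{
    MKL_INT M = 116, zero = 0;
    MKL_INT nprows = 2;                  // assuming a 2x1 process grid for -np 2
    const MKL_INT block_sizes[] = {4, 64};
    for (MKL_INT Mb : block_sizes) {
        for (MKL_INT myrow = 0; myrow < nprows; ++myrow) {
            MKL_INT lnrows = numroc_(&M, &Mb, &myrow, &zero, &nprows);
            std::cout << "Mb=" << Mb << ", process row " << myrow
                      << ": " << lnrows << " local rows" << std::endl;
        }
    }
    return 0;
}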

Here is the backtrace from gdb.

Process 1:

Thread 1 "myapp" received signal SIGSEGV, Segmentation fault.
0x00007ffff67483b2 in PMPI_Comm_size ()
   from /usr/lib/x86_64-linux-gnu/libmpi.so.20
(gdb) bt 
#0  0x00007ffff67483b2 in PMPI_Comm_size ()
   from /usr/lib/x86_64-linux-gnu/libmpi.so.20
#1  0x000055555897967a in MKLMPI_Comm_size ()
#2  0x000055555568884c in PB_CpgemmMPI ()
#3  0x000055555564e916 in pzgemm_ ()
#4  0x00005555556355b0 in pzgetrf2_ ()
#5  0x0000555555634aaf in pzgetrf_ ()
#6  0x000055555562c60d in pzgesv_ ()
#7  0x00005555555ed944 in main (argc=1, argv=0x7fffffffd6e8)
    at Main.cpp:159

Process 0:

Thread 1 "myapp" received signal SIGSEGV, Segmentation fault.
0x00007ffff67483b2 in PMPI_Comm_size ()
   from /usr/lib/x86_64-linux-gnu/libmpi.so.20
(gdb) bt
#0  0x00007ffff67483b2 in PMPI_Comm_size ()
   from /usr/lib/x86_64-linux-gnu/libmpi.so.20
#1  0x000055555897967a in MKLMPI_Comm_size ()
#2  0x000055555568884c in PB_CpgemmMPI ()
#3  0x000055555564e916 in pzgemm_ ()
#4  0x00005555556355b0 in pzgetrf2_ ()
#5  0x0000555555634aaf in pzgetrf_ ()
#6  0x000055555562c60d in pzgesv_ ()
#7  0x00005555555ed944 in main (argc=1, argv=0x7fffffffd6e8)
    at Main.cpp:159

The descriptors and data look OK.
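For reference, the same descriptor inputs can also be cross-checked with descinit_, which returns a nonzero info when an entry (dimensions, block sizes, local leading dimension, ...) is inconsistent. A minimal sketch, assuming the variable names from my code (context, M, nrhs, Mb and the numroc_ result lnrows):

    // Optional cross-check: let descinit_ validate the descriptor entries
    // instead of filling desca/descb by hand.
    MKL_INT izero = 0, dinfo = 0;
    MKL_INT desca[9], descb[9];
    descinit_(desca, &M, &M,    &Mb, &Mb, &izero, &izero, &context, &lnrows, &dinfo);
    if (dinfo != 0) std::cout << "descinit_ for A returned info=" << dinfo << std::endl;
    descinit_(descb, &M, &nrhs, &Mb, &Mb, &izero, &izero, &context, &lnrows, &dinfo);
    if (dinfo != 0) std::cout << "descinit_ for B returned info=" << dinfo << std::endl;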

Any idea what is going on?

Regards,

sk
4 Replies
Gennady_F_Intel
Moderator

Could you give us a reproducer that we can compile and run on our side?

kalen__stoi
Beginner

Gennady F. (Blackbelt) wrote:

Could you give us a reproducer that we can compile and run on our side?

It is not exactly the same code that gave the previous backtrace, but the following one shows similar behavior: when I set the block size to Mb=64 it fails, and it passes with Mb=2.

#include <iostream>
#include <memory>
#include <complex>

#include <mpi.h>
#include <mkl.h>
#include <mkl_scalapack.h>

extern "C"
{
/* BLACS C interface */
void Cblacs_pinfo(int* mypnum, int* nprocs);
void Cblacs_get( MKL_INT context, MKL_INT request, MKL_INT* value);
void Cblacs_gridinit( MKL_INT* context, char * order, MKL_INT np_row, MKL_INT np_col);
void Cblacs_gridinfo( MKL_INT context, MKL_INT*  np_row, MKL_INT* np_col, MKL_INT*  my_row,
                      MKL_INT*  my_col);
void Cblacs_gridexit(MKL_INT ictxt);
void Cblacs_barrier(MKL_INT ictxt, char * order);
void Cblacs_exit(int);
void Czgerv2d(int, int, int, std::complex<double>*, int, int, int);
void Czgesd2d(int, int, int, std::complex<double>*, int, int, int);
void Cdgerv2d(int, int, int, double*, int, int, int);
void Cdgesd2d(int, int, int, double*, int, int, int);
}

int main(int argc, char ** argv)
{
    int dims[] = {0, 0};
    int myid, nprocs;
    MKL_INT nprows, npcols, context, myrow, mycol;
    Cblacs_pinfo(&myid, &nprocs);
    MPI_Dims_create(nprocs, 2, dims);
    nprows = (MKL_INT)dims[0];
    npcols = (MKL_INT)dims[1];

    int negone = -1, zero = 0, one = 1;
    Cblacs_get(negone, zero, &context);   //default system context.

    char row_major[] = "Row";
    Cblacs_gridinit(&context, row_major, nprows, npcols);
    Cblacs_gridinfo(context, &nprows, &npcols, &myrow, &mycol);

    MKL_INT Mb = 64, M = 9, nrhs = 1;
    MKL_INT myone = 1, info=0;

    auto lnrows = numroc_ (&M, &Mb, &myrow, &zero, &nprows);
    auto lncols = numroc_ (&M, &Mb, &mycol, &zero, &npcols);

    //GLOBAL
    std::complex<double> * A;
    A = new std::complex<double> [81] {19,3,1,12,1,16,1,3,11,-19,3,1,12,1,16,1,3,11,-19,-3,1,12,1,16,1,3,11,-19,-3,-1,12,1,16,1,3,11,-19, \
    -3,-1,-12,1,16,1,3,11,-19,-3,-1,-12,-1,16,1,3,11,-19,-3,-1,-12,-1,-16,1,3,11,-19,-3,-1,-12,-1,-16,-1,3,11,-19, \
    -3,-1,-12,-1,-16,-1,-3,11};

    //LOCAL
    auto a = new std::complex<double> [lnrows*lncols]();
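    // map each local index (lr, lc) to its global index (gr, gc) in the block-cyclic layout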
    for(int lr=0;lr < lnrows; ++lr) {
        int gr = lr % Mb + Mb * myrow + (lr / Mb) * Mb * nprows;
        for (int lc = 0; lc < lncols; ++lc) {
            int gc = lc % Mb + Mb * mycol + (lc / Mb) * Mb * npcols;
            a[lr + lnrows * lc] = A[gr * M + gc]; //col-major <- row-major
        }
    }

    std::complex<double> * B = new std::complex<double> [9] {0,0,1,0,0,0,0,0,0};
    auto b = new std::complex<double> [lnrows*lncols]();
    for(int lr=0;lr < lnrows; ++lr) {
        int gr = lr % Mb + Mb * myrow + (lr / Mb) * Mb * nprows;
        b[lr] = B[gr]; //col-major <- row-major
    }

    MKL_INT* ipiv = new MKL_INT [lncols*M + Mb]();
    MKL_INT desca[9];
    desca[0] = 1; // descriptor type
    desca[1] = context; // blacs context
    desca[2] = M; // global number of rows
    desca[3] = M; // global number of columns
    desca[4] = Mb; // row block size
    desca[5] = Mb; // column block size
    desca[6] = 0; // initial process row
    desca[7] = 0; // initial process column
    desca[8] = lnrows; // leading dimension of local array
    MKL_INT descb[9];
    descb[0] = 1; // descriptor type
    descb[1] = context; // blacs context
    descb[2] = M; // global number of rows
    descb[3] = nrhs; // global number of columns
    descb[4] = Mb; // row block size
    descb[5] = Mb; // column block size
    descb[6] = 0; // initial process row
    descb[7] = 0; // initial process column
    descb[8] = lnrows; // leading dimension of local array

    std::cout << "lnrows=" << lnrows << " nprows=" << nprows<< " npcols=" << npcols<< std::endl;

    pzgesv_(&M, &nrhs, (MKL_Complex16*)a, &myone, &myone, desca,ipiv,
            (MKL_Complex16*)b, &myone, &myone, descb, &info);

    if(info != 0) {
        std::cout << "PZGESV problem! Info " << info << std::endl;
    }
    for(int i=0; i< lnrows; ++i){
       std::cout << " sol[" << i << "]= " << b[i].real() <<std::endl;
    }
    delete [] ipiv;

    Cblacs_barrier(context, "All");
    Cblacs_gridexit(context);
    Cblacs_exit(0);

    return 0;

}

 
Gennady_F_Intel
Moderator

To be exactly on the same page, please show how you link and which version of MPI you use.

kalen__stoi
Beginner

It is a CMake project:

target_link_libraries(main PRIVATE nlohmann_json::nlohmann_json -Wl,--start-group $ENV{MKLROOT}/lib/intel64/libmkl_scalapack_lp64.a $ENV{MKLROOT}/lib/intel64/libmkl_blacs_openmpi_lp64.a $ENV{MKLROOT}/lib/intel64/libmkl_intel_lp64.a $ENV{MKLROOT}/lib/intel64/libmkl_gnu_thread.a $ENV{MKLROOT}/lib/intel64/libmkl_core.a -Wl,--end-group -lgomp -lpthread -lm -ldl ${HDF5_C_LIBRARIES} ${Boost_LIBRARIES} )
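For reference, the MKL part of that CMake line corresponds roughly to the following standalone command (same MKLROOT layout; the json/HDF5/Boost libraries are left out):

mpicxx Main.cpp -o myapp -m64 -I${MKLROOT}/include -Wl,--start-group ${MKLROOT}/lib/intel64/libmkl_scalapack_lp64.a ${MKLROOT}/lib/intel64/libmkl_blacs_openmpi_lp64.a ${MKLROOT}/lib/intel64/libmkl_intel_lp64.a ${MKLROOT}/lib/intel64/libmkl_gnu_thread.a ${MKLROOT}/lib/intel64/libmkl_core.a -Wl,--end-group -lgomp -lpthread -lm -ldl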

mpirun -V
mpirun (Open MPI) 2.1.1
 

There is no problem when compiling and linking.