Hi,
I would like to use the pzgesv routine to solve a system of linear equations, but it crashes if I use block size 64 and, for example, two processes.
It does not crash if I run the program with mpirun -np 1 ./myapp, or if the block size is 4 with any number of processes.
Number of rows = 116, nrhs = 4; Open MPI, mpicxx -v: gcc version 7.4.0.
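For reference, a small sketch that re-implements ScaLAPACK's NUMROC formula (just an illustration, not a call into MKL) shows how the 116 rows are distributed for these block sizes over two process rows: with a block size of 64 the split is 64/52, with a block size of 4 it is 60/56.

#include <iostream>

// Re-implementation of ScaLAPACK's NUMROC formula (sketch for illustration):
// number of rows (or columns) of a block-cyclically distributed dimension
// owned by process iproc.
static int numroc_sketch(int n, int nb, int iproc, int isrcproc, int nprocs)
{
    int mydist  = (nprocs + iproc - isrcproc) % nprocs;
    int nblocks = n / nb;                    // number of complete blocks
    int count   = (nblocks / nprocs) * nb;   // whole rounds of blocks
    int extra   = nblocks % nprocs;
    if (mydist < extra)
        count += nb;                         // one additional full block
    else if (mydist == extra)
        count += n % nb;                     // the trailing partial block
    return count;
}

int main()
{
    const int M = 116, nprocs = 2;
    for (int nb : {64, 4})
        for (int p = 0; p < nprocs; ++p)
            std::cout << "Mb=" << nb << "  process " << p << " owns "
                      << numroc_sketch(M, nb, p, 0, nprocs) << " rows\n";
    return 0;
}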
Here is the backtrace from gdb:
process 1
Thread 1 "myapp" received signal SIGSEGV, Segmentation fault.
0x00007ffff67483b2 in PMPI_Comm_size ()
from /usr/lib/x86_64-linux-gnu/libmpi.so.20
(gdb) bt
#0 0x00007ffff67483b2 in PMPI_Comm_size ()
from /usr/lib/x86_64-linux-gnu/libmpi.so.20
#1 0x000055555897967a in MKLMPI_Comm_size ()
#2 0x000055555568884c in PB_CpgemmMPI ()
#3 0x000055555564e916 in pzgemm_ ()
#4 0x00005555556355b0 in pzgetrf2_ ()
#5 0x0000555555634aaf in pzgetrf_ ()
#6 0x000055555562c60d in pzgesv_ ()
#7 0x00005555555ed944 in main (argc=1, argv=0x7fffffffd6e8)
at Main.cpp:159
process 0
Thread 1 "myapp" received signal SIGSEGV, Segmentation fault.
0x00007ffff67483b2 in PMPI_Comm_size ()
from /usr/lib/x86_64-linux-gnu/libmpi.so.20
(gdb) bt
#0 0x00007ffff67483b2 in PMPI_Comm_size ()
from /usr/lib/x86_64-linux-gnu/libmpi.so.20
#1 0x000055555897967a in MKLMPI_Comm_size ()
#2 0x000055555568884c in PB_CpgemmMPI ()
#3 0x000055555564e916 in pzgemm_ ()
#4 0x00005555556355b0 in pzgetrf2_ ()
#5 0x0000555555634aaf in pzgetrf_ ()
#6 0x000055555562c60d in pzgesv_ ()
#7 0x00005555555ed944 in main (argc=1, argv=0x7fffffffd6e8)
at Main.cpp:159
The descriptors and data look OK.
Any idea what is going on?
Regards,
sk
Could you give us a reproducer that we can compile and run on our side?
Gennady F. (Blackbelt) wrote: Could you give us a reproducer that we can compile and run on our side?
It is not exactly the same code that gives the previous backtrace, but the following one shows similar behavior: when I set the block size to Mb=64 it fails, and it passes with Mb=2.
#include <iostream>
#include <memory>
#include <complex>
#include <mpi.h>
#include <mkl.h>
#include <mkl_scalapack.h>

extern "C" {
    /* BLACS C interface */
    void Cblacs_pinfo(int* mypnum, int* nprocs);
    void Cblacs_get(MKL_INT context, MKL_INT request, MKL_INT* value);
    int  Cblacs_gridinit(MKL_INT* context, char* order, MKL_INT np_row, MKL_INT np_col);
    void Cblacs_gridinfo(MKL_INT context, MKL_INT* np_row, MKL_INT* np_col, MKL_INT* my_row, MKL_INT* my_col);
    void Cblacs_gridexit(MKL_INT ictxt);
    void Cblacs_barrier(MKL_INT ictxt, char* order);
    void Cblacs_exit(int);
    void Czgerv2d(int, int, int, std::complex<double>*, int, int, int);
    void Czgesd2d(int, int, int, std::complex<double>*, int, int, int);
    void Cdgerv2d(int, int, int, double*, int, int, int);
    void Cdgesd2d(int, int, int, double*, int, int, int);
}

int main(int argc, char** argv)
{
    int dims[] = {0, 0};
    int myid, nprocs;
    MKL_INT nprows, npcols, context, myrow, mycol;

    Cblacs_pinfo(&myid, &nprocs);
    MPI_Dims_create(nprocs, 2, dims);
    nprows = (MKL_INT)dims[0];
    npcols = (MKL_INT)dims[1];

    int negone = -1, zero = 0, one = 1;
    Cblacs_get(negone, zero, &context); // default system context
    char row_major[] = "Row";
    Cblacs_gridinit(&context, row_major, nprows, npcols);
    Cblacs_gridinfo(context, &nprows, &npcols, &myrow, &mycol);

    MKL_INT Mb = 64, M = 9, nrhs = 1;
    MKL_INT myone = 1, info = 0;
    auto lnrows = numroc_(&M, &Mb, &myrow, &zero, &nprows);
    auto lncols = numroc_(&M, &Mb, &mycol, &zero, &npcols);

    // GLOBAL 9x9 matrix, stored row-major
    std::complex<double>* A = new std::complex<double>[81] {
         19,  3,  1,  12,  1,  16,  1,  3, 11,
        -19,  3,  1,  12,  1,  16,  1,  3, 11,
        -19, -3,  1,  12,  1,  16,  1,  3, 11,
        -19, -3, -1,  12,  1,  16,  1,  3, 11,
        -19, -3, -1, -12,  1,  16,  1,  3, 11,
        -19, -3, -1, -12, -1,  16,  1,  3, 11,
        -19, -3, -1, -12, -1, -16,  1,  3, 11,
        -19, -3, -1, -12, -1, -16, -1,  3, 11,
        -19, -3, -1, -12, -1, -16, -1, -3, 11
    };

    // LOCAL block-cyclic piece of A (column-major)
    auto a = new std::complex<double>[lnrows * lncols]();
    for (int lr = 0; lr < lnrows; ++lr)
    {
        int gr = lr % Mb + Mb * myrow + (lr / Mb) * Mb * nprows;
        for (int lc = 0; lc < lncols; ++lc)
        {
            int gc = lc % Mb + Mb * mycol + (lc / Mb) * Mb * npcols;
            a[lr + lnrows * lc] = A[gr * M + gc]; // col-major <- row-major
        }
    }

    std::complex<double>* B = new std::complex<double>[9] {0, 0, 1, 0, 0, 0, 0, 0, 0};
    auto b = new std::complex<double>[lnrows * lncols]();
    for (int lr = 0; lr < lnrows; ++lr)
    {
        int gr = lr % Mb + Mb * myrow + (lr / Mb) * Mb * nprows;
        b[lr] = B[gr]; // col-major <- row-major
    }

    MKL_INT* ipiv = new MKL_INT[lncols * M + Mb]();

    MKL_INT desca[9];
    desca[0] = 1;       // descriptor type
    desca[1] = context; // BLACS context
    desca[2] = M;       // global number of rows
    desca[3] = M;       // global number of columns
    desca[4] = Mb;      // row block size
    desca[5] = Mb;      // column block size
    desca[6] = 0;       // initial process row
    desca[7] = 0;       // initial process column
    desca[8] = lnrows;  // leading dimension of local array

    MKL_INT descb[9];
    descb[0] = 1;       // descriptor type
    descb[1] = context; // BLACS context
    descb[2] = M;       // global number of rows
    descb[3] = nrhs;    // global number of columns
    descb[4] = Mb;      // row block size
    descb[5] = Mb;      // column block size
    descb[6] = 0;       // initial process row
    descb[7] = 0;       // initial process column
    descb[8] = lnrows;  // leading dimension of local array

    std::cout << "lnrows=" << lnrows << " nprows=" << nprows << " npcols=" << npcols << std::endl;

    pzgesv_(&M, &nrhs, (MKL_Complex16*)a, &myone, &myone, desca, ipiv,
            (MKL_Complex16*)b, &myone, &myone, descb, &info);

    if (info != 0) {
        std::cout << "PZGESV problem!\nInfo " << info << std::endl;
    }

    for (int i = 0; i < lnrows; ++i) {
        std::cout << " sol[" << i << "]= " << b[i].real() << std::endl;
    }

    delete [] ipiv;
    delete [] A;
    delete [] a;
    delete [] B;
    delete [] b;

    char all[] = "All";
    Cblacs_barrier(context, all);
    Cblacs_gridexit(context);
    Cblacs_exit(0);
    return 0;
}
To make sure we are on the same page, please show how you link and which version of MPI you use.
It is a CMake project:
target_link_libraries(main PRIVATE
    nlohmann_json::nlohmann_json
    -Wl,--start-group
    $ENV{MKLROOT}/lib/intel64/libmkl_scalapack_lp64.a
    $ENV{MKLROOT}/lib/intel64/libmkl_blacs_openmpi_lp64.a
    $ENV{MKLROOT}/lib/intel64/libmkl_intel_lp64.a
    $ENV{MKLROOT}/lib/intel64/libmkl_gnu_thread.a
    $ENV{MKLROOT}/lib/intel64/libmkl_core.a
    -Wl,--end-group
    -lgomp -lpthread -lm -ldl
    ${HDF5_C_LIBRARIES}
    ${Boost_LIBRARIES}
)
mpirun -V
mpirun (Open MPI) 2.1.1
There is no problem during compilation and linking.
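Just in case it is useful, here is a minimal sketch (not part of the project above) that can be built with the same link line and prints which MPI and MKL builds the executable resolves at run time; the 198-byte buffer size is arbitrary:

#include <iostream>
#include <mpi.h>
#include <mkl.h>

// Standalone check: report the MPI and MKL versions seen at run time.
int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);

    char mpi_version[MPI_MAX_LIBRARY_VERSION_STRING];
    int  mpi_len = 0;
    MPI_Get_library_version(mpi_version, &mpi_len);   // e.g. "Open MPI v2.1.1, ..."

    char mkl_version[198];
    mkl_get_version_string(mkl_version, 198);

    int rank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if (rank == 0) {
        std::cout << "MPI: " << mpi_version << std::endl;
        std::cout << "MKL: " << mkl_version << std::endl;
    }

    MPI_Finalize();
    return 0;
}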