Intel® oneAPI Math Kernel Library

problem with pdsyevx when calculating some eigenvalues only

davydden1
Beginner

Dear all,

I see the following terminal output when using pdsyevx from Intel MKL 2019.0.117 with MPICH 3.2.1 to calculate only some of the eigenvalues:

{    0,    0}:  On entry to 
PDORMTR parameter number   12 had an illegal value 
{    0,    0}:  On entry to 
PDORMTR parameter number   12 had an illegal value 
{    0,    0}:  On entry to 
PDORMTR parameter number   12 had an illegal value 

whereas no warnings are given when using OpenBLAS 0.3.3 + netlib-scalapack 2.0.2 + OpenMPI 3.1.2. For both setups, GCC 7.3.0 from Ubuntu 18.04.1 LTS is used.

Note that I do NOT call pdormtr manually; it must be called from within pdsyevx. It looks like its implementation in Intel MKL has some issue for this case.

Below is an MWE that demonstrates the problem.

#include <mpi.h>
#include <iostream>
#include <iomanip>
#include <string>
#include <fstream>
#include <iostream>
#include <stdlib.h>
#include <sstream>
#include <cmath>
#include <vector>
#include <functional>
using namespace std;

// test program to compute last 5 eigenvalues without eigenvectors for symmetric operators
//
// compile with:
// mpic++ -std=c++11 example.cc -o example -lscalapack -lopenblas
// mpic++ -std=c++11 example.cc -o example -lmkl_scalapack_lp64 -lmkl_blacs_intelmpi_lp64 -lmkl_intel_lp64 -lmkl_core -lmkl_sequential


extern "C" {
  /* Cblacs declarations */
  void Cblacs_pinfo(int *, int *);
  void Cblacs_get(int, int, int *);
  void Cblacs_gridinit(int *, const char *, int, int);
  void Cblacs_gridinfo(int, int *, int *, int *,int *);
  void Cblacs_pcoord(int, int, int *, int *);
  void Cblacs_gridexit(int);
  void Cblacs_barrier(int, const char *);
  void Cdgerv2d(int, int, int, double *, int, int, int);
  void Cdgesd2d(int, int, int, double *, int, int, int);

  int numroc_ (const int *n, const int *nb, const int *iproc, const int *isproc, const int *nprocs);

  void pdpotrf_(const char *UPLO,
                const int *N,
                double *A, const int *IA, const int *JA, const int *DESCA,
                int *INFO);

  void descinit_ (int *desc, const int *m, const int *n, const int *mb, const int *nb, const int *irsrc, const int *icsrc, const int *ictxt, const int *lld, int *info);

  int indxl2g_ (const int *indxloc, const int *nb, const int *iproc, const int *isrcproc, const int *nprocs);

  void pdpotri_(const char *UPLO,
                const int *N,
                double *A, const int *IA, const int *JA, const int *DESCA,
                int *INFO);

  void
  pdsyevx_(const char *  jobz,
           const char *  range,
           const char *  uplo,
           const int *   n,
           double *      A,
           const int *   ia,
           const int *   ja,
           const int *   desca,
           const double *VL,
           const double *VU,
           const int *   il,
           const int *   iu,
           const double *abstol,
           int *         m,
           int *         nz,
           double *      w,
           double *      orfac,
           double *      Z,
           const int *   iz,
           const int *   jz,
           const int *   descz,
           double *      work,
           int *         lwork,
           int *         iwork,
           int *         liwork,
           int *         ifail,
           int *         iclustr,
           double *      gap,
           int *         info);

    double
    pdlamch_(const int *ictxt, const char *cmach);
}

int main(int argc, char **argv)
{
  int mpirank,nprocs;
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &mpirank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
  const char uplo('L');
  const int first_process_row(0);
  const int first_process_column(0);

  const bool mpiroot = (mpirank == 0);

  /* Scalapack / Blacs Vars */
  const int N = 200, Nb = 32;
  int M = N, Mb = Nb;

  if (mpiroot)
    cout << "Running with N=" << N << " Nb=" << Nb << std::endl;

  int procrows = 0, proccols = 0;
  {
    const int n_processes_heuristic = int(std::ceil((1.*N)/Nb))*
                                      int(std::ceil((1.*M)/Mb));
    const int Np = std::min(n_processes_heuristic, nprocs);
    const double ratio = 1.0;
    const int Pc = std::sqrt(ratio * Np);
    proccols = std::min (Np, std::max(2, Pc));
    procrows = Np / proccols ;
  }

  /* Begin Cblas context */
  int ctxt = 0, myrow = 0, mycol = 0, numproc = nprocs;
  // Cblacs_pinfo(&myid, &numproc);
  Cblacs_get(0, 0, &ctxt);
  Cblacs_gridinit(&ctxt, "Row-major", procrows, proccols);
  Cblacs_gridinfo( ctxt, &procrows, &proccols, &myrow, &mycol );
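  // note: Cblacs_gridinfo sets myrow = mycol = -1 on processes that are not
  // part of the grid, hence the (myrow > -1) && (mycol > -1) guard below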

  if (mpiroot)
    cout << "Process grid " << procrows << "x"<<proccols << "<=" << numproc << std::endl;

  if ((myrow > -1) && (mycol > -1))
    {
      if (mpiroot)
        cout << "setup..." << std::endl;

      // we won't need eigenvectors, use a sufficiently small distributed matrix on the same process grid:
      const int N_B = procrows;
      const int M_B = procrows;
      const int Nb_B = 1;
      const int Mb_B = 1;

      const int nrows_B = numroc_(&N_B, &Nb_B, &myrow, &first_process_row, &procrows);
      const int ncols_B = numroc_(&M_B, &Mb_B, &mycol, &first_process_column, &proccols);
      const int lda_B = max(1,nrows_B);

      int info=0;
      int descB[9];
      descinit_(descB, &N_B, &M_B, &Nb_B, &Mb_B,&first_process_row,&first_process_column,&ctxt, &lda_B, &info);
      if (info != 0)
        cerr << "error in descinit_" << std::endl;

      std::vector<double> B(nrows_B*ncols_B);

      // set the matrix:
      const int nrows = numroc_(&N, &Nb, &myrow, &first_process_row, &procrows);
      const int ncols = numroc_(&M, &Mb, &mycol, &first_process_column, &proccols);

      const int lda = max(1,nrows);
      int descA[9];
      descinit_(descA, &N, &M, &Nb, &Mb,&first_process_row,&first_process_column,&ctxt, &lda, &info);
      if (info != 0)
        cerr << "error in descinit_" << std::endl;

      std::vector<double> A(nrows*ncols);

      // set to SPD
      {
        int seed = 13;
        srand(seed);
        std::vector<double> global_A(N*M);
        auto gind = [&] (const int row, const int col)
        {
          return col*M+row;
        };

        for (unsigned int c = 0; c < N; ++c)
          for (unsigned int r = c; r < M; ++r)
            {
              const double val = ((double) rand())/ ((double)RAND_MAX);
              if (r==c)
                global_A[gind(r,c)] = val + N;
              else
                {
                  global_A[gind(r,c)] = val;
                  global_A[gind(c,r)] = val;
                }
            }

        int k = 0;
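        // copy the locally owned entries: indxl2g_ maps 1-based local indices
        // to 1-based global indices (hence the -1 for the 0-based global_A)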
        for (int c=1; c <= ncols; ++c)
          {
            const int glob_c = indxl2g_ (&c, &Nb, &mycol, &first_process_column, &proccols) - 1;
            for (int r = 1; r <= nrows; ++r)
              {
                const int glob_r = indxl2g_ (&r, &Mb, &myrow, &first_process_row, &procrows) - 1;
                A[k] = global_A[gind(glob_r,glob_c)];
                k++;
              }
          }
      }

      const int submatrix_row = 1;
      const int submatrix_column = 1;

      double *A_loc = &A[0];
      double *B_loc = &B[0];

      std::vector<double> ev(N);

      char jobz = 'N';
      char range = 'I';
      double vl = 0, vu = 0;
      int il = N-5, iu = N; // last 5 eigenvalues
      int m = 0;  // output: total number of eigenvalues found
      int nz = 0;
      double abstol = 0;
      double orfac = 0;
      std::vector<int> ifail;
      std::vector<int> iclustr;
      std::vector<double> gap(nrows*ncols);

      // get work sizes:
      int lwork = -1;
      int liwork = -1;
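      // lwork = -1 and liwork = -1 request a workspace-size query only:
      // pdsyevx returns the required sizes in work[0] and iwork[0]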
      double *eigenvectors_loc = nullptr; // won't need eigenvectors at all
      std::vector<double> work(1);
      std::vector<int> iwork(1);

      char cmach = 'S';
      abstol = pdlamch_(&ctxt, &cmach);
      abstol *= 2;
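      // 2*pdlamch('S') is the abstol value recommended by the pdsyevx
      // documentation for the most accurate eigenvalues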
      ifail.resize(N);
      iclustr.resize(2*procrows*proccols);
      gap.resize(procrows*proccols);

      info = 0;
      if (mpiroot)
        cout << "pdsyevx_..." << std::endl;

      pdsyevx_(&jobz,
                 &range,
                 &uplo,
                 &N,
                 A_loc,
                 &submatrix_row,
                 &submatrix_column,
                 descA,
                 &vl,
                 &vu,
                 &il,
                 &iu,
                 &abstol,
                 &m,
                 &nz,
                 &ev[0],
                 &orfac,
                 eigenvectors_loc,
                 &submatrix_row,
                 &submatrix_column,
                 descB,
                 &work[0],
                 &lwork,
                 &iwork[0],
                 &liwork,
                 &ifail[0],
                 &iclustr[0],
                 &gap[0],
                 &info);

      if (info != 0)
        {
          cerr << "error in pdpotrf_ " << info << std::endl;
          return 1;
        }

      if (mpiroot)
        std::cout << "done!" << std::endl;

      Cblacs_gridexit(ctxt);
    }
  MPI_Finalize();
}
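
For completeness: the MWE above stops right after the workspace query (lwork = liwork = -1), which is already enough to trigger the warning. A minimal sketch of the usual follow-up call, reusing the variables defined in the MWE, would be:

      // allocate the workspace sizes reported by the query ...
      lwork = static_cast<int>(work[0]);
      liwork = iwork[0];
      work.resize(lwork);
      iwork.resize(liwork);
      // ... and run the actual computation with the same arguments
      pdsyevx_(&jobz, &range, &uplo, &N, A_loc,
               &submatrix_row, &submatrix_column, descA,
               &vl, &vu, &il, &iu, &abstol, &m, &nz, &ev[0], &orfac,
               eigenvectors_loc, &submatrix_row, &submatrix_column, descB,
               &work[0], &lwork, &iwork[0], &liwork,
               &ifail[0], &iclustr[0], &gap[0], &info);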

 

Gennady_F_Intel
Moderator

David, how did you run the code? 

davydden1
Beginner

Gennady, I ran it with 4 MPI processes, but I am quite certain the extra message appears even with a single-core run.

Gennady_F_Intel
Moderator

David, yes, the problem is reproduced and has been escalated. We will check with the ScaLAPACK team and inform you in case of any updates from the team. Thanks for the report.

Gennady_F_Intel
Moderator

MKL v.2020 has been released and contains the fix for this problem.
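
To double-check at run time which MKL build an executable is actually linked against, a minimal sketch using mkl_get_version_string() (declared in mkl.h) is:

#include <mkl.h>
#include <cstdio>

int main()
{
  char buf[256];
  // fills buf with the human-readable MKL version string
  mkl_get_version_string(buf, (int)sizeof(buf));
  std::printf("%s\n", buf);
  return 0;
}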
