Intel® oneAPI Math Kernel Library
Ask questions and share information with other developers who use Intel® Math Kernel Library.

Is it safe to call ?feast_syev/?feast_heev from multiple threads?

Arne_K_
Beginner

Hi,

I'm developing an application that needs to compute various eigenvalue decompositions. Is it possible to call zfeast_heev from multiple threads in parallel? Of course, each thread has its own memory. I could not find this kind of information in the documentation. Currently I'm using zhpevd, which works fine when called from multiple threads; zfeast_heev, however, does not.
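
For comparison, the zhpevd path that works for me looks roughly like the sketch below. This is illustrative only, not my real code or data; it assumes the LAPACKE interface that comes with mkl.h and uses std::thread for brevity.

// Minimal sketch: each thread owns a copy of the packed matrix and its own output
// buffers; LAPACKE allocates workspace internally, so nothing is shared between threads.
#include "mkl.h"

#include <thread>
#include <vector>

void eig_task( int n, std::vector<MKL_Complex16> ap ) {
 std::vector<double>        w( n );      // eigenvalues
 std::vector<MKL_Complex16> z( n * n );  // eigenvectors
 MKL_INT info = LAPACKE_zhpevd( LAPACK_COL_MAJOR, 'V', 'L', n, &ap[0], &w[0], &z[0], n );
 (void)info;
}

int main() {
 int const n = 50;
 // placeholder input: the identity matrix in lower packed (column-major) storage
 std::vector<MKL_Complex16> ap( (n * (n + 1)) / 2 );
 for( int j = 0, k = 0; j < n; ++j )
  for( int i = j; i < n; ++i, ++k ) { ap[k].real = (i == j) ? 1.0 : 0.0; ap[k].imag = 0.0; }

 std::thread t1( eig_task, n, ap );
 std::thread t2( eig_task, n, ap );
 t1.join();
 t2.join();
 return 0;
}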

Looking forward to your answers

4 Replies
Gennady_F_Intel
Moderator

Yes, it should work correctly in this environment. If there is any concern, please give us an example for investigation.

Arne_K_
Beginner

Hi,

I made a small program, similar to my actual usage, that shows the error.

Can you reproduce the error?

#include "mkl.h"

#include <boost/thread/thread.hpp>
#include <boost/bind.hpp>
#include <boost/random/normal_distribution.hpp>
#include <boost/random/mersenne_twister.hpp>

#include <vector>
#include <algorithm>
#include <iostream>

#include <cstdio>

//#define USE_THREADS
//#define LOWER_ONLY

struct RunData {
 std::vector<MKL_Complex16> C;
 std::vector<MKL_Complex16> U;
 std::vector<double> D;
 MKL_INT info;
 MKL_INT dim;
 double e_max;
};

struct ThreadCache {
 MKL_INT *fpm;
 double *epsout;
 double *res;
};
 
void compute_eigenvalues( RunData& data, ThreadCache& cache ) {

 feastinit( cache.fpm );

 MKL_INT loop;
 double e_min = 0;
 double e_max = data.e_max;
 MKL_INT n   = data.dim;
 MKL_INT lda = data.dim;
 MKL_INT m0  = data.dim;
 MKL_INT m   = data.dim;
#ifdef LOWER_ONLY
 // note: zfeast_heev expects a dense, lda-based matrix even for uplo = 'U'/'L';
 // the packed triangular layout built under LOWER_ONLY in setup_data() does not match that.
 char uplo   = 'U';
#else
 char uplo   = 'F';
#endif
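 // zfeast_heev argument order: uplo/n/C/lda describe the dense input matrix, fpm holds
 // the FEAST parameters initialised by feastinit, epsout returns the relative trace error,
 // loop the number of refinement loops, [e_min, e_max] is the search interval, m0 the size
 // of the search subspace, D and U receive the eigenvalues and eigenvectors, m the number
 // of eigenvalues found, res the residuals, and info the status code.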

 zfeast_heev(&uplo, &n, &data.C[0], &lda, cache.fpm, cache.epsout, &loop, &e_min, &e_max, &m0, &data.D[0], &data.U[0], &m, cache.res, &data.info );

}

void thread_compute_function( std::vector< RunData >* data, unsigned int start, unsigned int end ) {
 if( data->empty() || start >= end || data->size() < end ) {
  return;
 }

 std::vector<MKL_INT> fpm(128);                                   // FEAST parameter array (must hold 128 entries)
 std::vector<double>  epsout( (*data)[0].dim * (*data)[0].dim );  // zfeast_heev writes only a single value here
 std::vector<double>  res( (*data)[0].dim * (*data)[0].dim );     // residuals; only the first m0 (= dim) entries are used

 ThreadCache tc;
 tc.fpm    = &(fpm[0]);
 tc.epsout = &(epsout[0]);
 tc.res    = &(res[0]);


 for( unsigned int i = start; i < end; ++i ) {
  std::cerr << "compute eigenvalues ( " << (i+1-start) << " / " << end-start << " )\n";
  compute_eigenvalues( (*data)[i], tc );
 }
}

// generate random vectors and compute an estimated correlation matrix
void setup_data( std::vector< RunData > &data, unsigned int n_data, unsigned int n_dims, unsigned int n_loops = 5 ) {

 boost::random::mt19937 rng( 0 );
 boost::random::normal_distribution<double> normal_dist( 0.15, 3.21 );

 data.resize( n_data );
 for( unsigned int i_data = 0; i_data < n_data; ++i_data ) {

#ifdef LOWER_ONLY
  unsigned int size_C = ((n_dims+1) * n_dims)/2;
#else
  unsigned int size_C = n_dims * n_dims;
#endif

  RunData &r_data = data[i_data];
  r_data.dim = n_dims;
  r_data.e_max = 0.0;
  r_data.info = 0;
  r_data.C.resize( size_C );
  r_data.U.resize( n_dims * n_dims );
  r_data.D.resize( n_dims );

  std::vector< MKL_Complex16 > X( n_dims );

  for( unsigned int i_loop = 0; i_loop < n_loops; ++i_loop ) {

   unsigned int cnt = 0;
   // generate random vector
   for( unsigned int i_dim = 0; i_dim < n_dims; ++i_dim ) {
    X[i_dim].real = normal_dist( rng );
    X[i_dim].imag = normal_dist( rng );
   }

   for( unsigned int i = 0; i < n_dims; ++i ) {
#ifdef LOWER_ONLY
    unsigned int j_start = i;
#else
    unsigned int j_start = 0;
#endif
    for( unsigned int j = j_start; j < n_dims; ++j ) {
     MKL_Complex16 x_ij;
     x_ij.real = ( X[i].real * X[j].real + X[i].imag * X[j].imag ) / (n_loops * n_loops);
     x_ij.imag = ( X[i].imag * X[j].real - X[i].real * X[j].imag ) / (n_loops * n_loops);

     r_data.C[cnt++] = x_ij;
    }
   }
  }

  double sum = 0;
  unsigned int cnt = 0;
  for( unsigned int i = 0; i < n_dims; ++i ) {
#ifdef LOWER_ONLY
   unsigned int j_start = i;
#else
   unsigned int j_start = 0;
#endif
   for( unsigned int j = j_start; j < n_dims; ++j ) {
    if( i == j )
     sum += r_data.C[cnt].real * r_data.C[cnt].real + r_data.C[cnt].imag * r_data.C[cnt].imag;
    ++cnt;
   }
  }

  r_data.e_max = sum;
 }
}

int main(int argc, char* argv[])
{
 unsigned int const n_data  = 20;
 unsigned int const n_dims  = 50;

 std::vector< RunData > data;

 setup_data( data, n_data, n_dims );

#ifdef USE_THREADS
 unsigned int const n_threads = 4;

 boost::thread_group thread_pool;

 unsigned int data_per_thread = std::max<unsigned int>( 1, data.size() / n_threads );

 for( unsigned int i_thread = 0; i_thread < n_threads; ++i_thread ) {
  unsigned int start = i_thread * data_per_thread;
  unsigned int end = static_cast<unsigned int>( std::min<std::size_t>( (i_thread+1) * data_per_thread, data.size() ) );
  thread_pool.create_thread( boost::bind( thread_compute_function, &data, start, end ) );
 }

 thread_pool.join_all();
#else
 thread_compute_function( &data, 0, data.size() );
#endif


 for( std::vector< RunData >::const_iterator iter = data.begin(); iter != data.end(); ++iter ) {
  std::cout << "zfeast_heev info value: " << iter->info << std::endl;
 }

 std::cout << "Done!\n";

 getchar();

 std::cout << "Closing\n";

 return 0;
}

Gennady_F_Intel
Moderator

I see the crash even with mkl_set_num_threads(1). I suspect you need to check how you split the input data before calling the MKL routine.
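
For example, a split along the lines of the sketch below (an illustration only, not tied to your RunData type) makes it easy to check that each thread gets a disjoint, in-bounds range and that every element is handed to the routine exactly once:

#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

// Illustration only: split `count` work items into one contiguous [start, end) range
// per thread; the last range absorbs the remainder, so every item is covered exactly once.
std::vector< std::pair<std::size_t, std::size_t> >
split_ranges( std::size_t count, std::size_t n_threads )
{
 assert( n_threads > 0 );
 std::vector< std::pair<std::size_t, std::size_t> > ranges;
 std::size_t per_thread = count / n_threads;
 for( std::size_t t = 0; t < n_threads; ++t ) {
  std::size_t start = t * per_thread;
  std::size_t end   = ( t + 1 == n_threads ) ? count : start + per_thread;
  assert( start <= end && end <= count );
  ranges.push_back( std::make_pair( start, end ) );
 }
 return ranges;
}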

Arne_K_
Beginner

Hi,

The input data sets are all independent of each other, so I don't know what else there is to check. If USE_THREADS is not defined, everything works; if it is defined, it often, but not always, crashes. Since the standard divide-and-conquer routine works flawlessly when used in parallel, I don't understand why this function does not.

For now I will not use this method in my scenario as I can't get it to work properly.

Thank you for your time and comments
