mkl_dcsrsymv crashes in parallel mode

Igor_Tsukanov · ‎06-26-2010

When I am using mkl_dcsrsymv function called from C++ in MKL parallel mode, my program reports memory error and crashes. However, if I use MKL in sequential mode, the program works fine.

How I can I work around this problem? I really need to run this function in parallel mode.

Thank you very much in advance.

Gennady_F_Intel · ‎06-27-2010

Can you give us the example? This is an unexpected behavior.Actually, MKL is thread-safe, which means that all Intel MKL functions1 work correctly duringsimultaneous execution by multiple threads.

--Gennady

Igor_Tsukanov · ‎06-27-2010

Gennady,
thank you for yor response. Please find
below a modified conjugate gradient solver originally taken from Numerical Recipes.
A sparse matrix is stored in sa[] array, ija[] array is used to store column indices of the elements
and p_row[] array contains indices of first non-zero element in a row. These are one based arrays
(artifact of Numerical Recipes) with zero index elements not used.

This function produces correct results if MKL is used in a sequential mode. If I switch to parallel mode
the first call of mkl_dcsrsymv causes the memory access violation error.

#define USE_INTEL_MKL // use Intel MKL
#define EPS 1.0e-14
void matrixclass::linbcg_symmetric(MatrixIndexType n, double b[], double x[], int itol, double tol,
MatrixIndexType itmax, MatrixIndexType *iter, double *err)
{
// FAST VECTORIZED VERSION (tuned for this package) -- does not use A^T

double snrm(MatrixIndexType n, double sx[], int itol);
double ak,akden,akden1,akden2,akden3,akden4,bk,bkden,bknum,bknum1,bknum2,bknum3,bknum4,bnrm;
double *p,*r,*z;
MatrixIndexType j,m,m1;
MatrixIndexType i;

#ifdef USE_INTEL_MKL
#ifdef FM_SYMMETRIC_STORAGE_SCHEME_LOW_TRIANGLE
char uplo = 'L';
#else
char uplo = 'U';
#endif
#endif

#ifdef USE_INTEL_MKL
p = (double *)mkl_malloc( n * sizeof(double), MKL_ALLIGNMENT); p--;
r = (double *)mkl_malloc( n * sizeof(double), MKL_ALLIGNMENT); r--;
z = (double *)mkl_malloc( n * sizeof(double), MKL_ALLIGNMENT); z--;
#else
p=dvector(1,n);
r=dvector(1,n);
z=dvector(1,n);
#endif

#ifdef FM_SYMMETRIC_STORAGE_SCHEME_LOW_TRIANGLE
atimes_symmetric(n,x,r,0);
#else
atimes_symmetric(n,x,r,1);
#endif

m = n % 4;
m1 = m+1;
for ( j=1; j<=m; j++)
{
r=b-r;
}

for ( j=m1; j<=n; j+=4)
{
r = b -r;
r[j+1] = b[j+1]-r[j+1];
r[j+2] = b[j+2]-r[j+2];
r[j+3] = b[j+3]-r[j+3];
}

// atimes(n,r,rr,0); // minimal residual algorithm (check compatibility with symmetric matrix)

bnrm=snrm(n,b,itol);
// For homogeneous systems *****
if( fabs(bnrm) < EPS )
{
bnrm = 1.0;
}

//******************************
asolve_new_symmetric(n,r,p);
//////////////////////////

#ifdef USE_INTEL_MKL
int incr = 1;
bknum = ddot((const int *)&n, &p[1], &incr, &r[1], &incr);
#else
bknum = 0.0;
bknum1 = 0.0;
bknum2 = 0.0;
bknum3 = 0.0;
bknum4 = 0.0;
for ( j=1; j<=m; j++)
{
bknum += p*r; // p instead of z
}

for ( j=m1; j<=n; j+=4)
{
bknum1 += p *r; // p instead of z
bknum2 += p[j+1]*r[j+1]; // p instead of z
bknum3 += p[j+2]*r[j+2]; // p instead of z
bknum4 += p[j+3]*r[j+3]; // p instead of z
}
bknum += bknum1+bknum2+bknum3+bknum4;
#endif

bkden = bknum;

#ifdef USE_INTEL_MKL
// ----------------------- Problem is here ------------------------------------------------------>
mkl_dcsrsymv((char *)&uplo, (int *)&n, &sa[1], (int *)&p_row[1], (int *)&ija[1], &p[1], &z[1]);
// <---------------------------------------------------------------------------------------------------
akden = ddot((const int *)&n, &z[1], &incr, &p[1], &incr);
ak=bknum/akden;

daxpy((const int *)&n, &ak, &p[1], &incr, &x[1], &incr);
ak *= -1.0;
daxpy((const int *)&n, &ak, &z[1], &incr, &r[1], &incr);
ak *= -1.0;
#else
atimes_new_symmetric(n, p, z); // z and zz are computed. the rest of the code is unchanged
akden = 0.0;
akden1 = 0.0;
akden2 = 0.0;
akden3 = 0.0;
akden4 = 0.0;
for (j=1; j<=m; j++)
{
akden += z*p;
}

for (j=m1; j<=n; j+=4)
{
akden1 += z*p;
akden2 += z[j+1]*p[j+1];
akden3 += z[j+2]*p[j+2];
akden4 += z[j+3]*p[j+3];
}
akden += akden1 + akden2 + akden3 + akden4;

ak=bknum/akden;

for (j=1;j<=m;j++)
{
x += ak*p;
r -= ak*z;
}

for (j=m1;j<=n;j+=4)
{
x += ak*p ;
r -= ak*z ;

x[j+1] += ak*p [j+1];
r[j+1] -= ak*z [j+1];

x[j+2] += ak*p [j+2];
r[j+2] -= ak*z [j+2];

x[j+3] += ak*p [j+3];
r[j+3] -= ak*z [j+3];
}
#endif

asolve_error_symmetric(n,r,z,err); // 12/12/2005

//////////////////////////
for( i=2; i <= itmax; i++)
{
*iter = i;
*err /= bnrm;
if( (m_ProgressReportCallBack1 != NULL) && (i % m_report_frequency == 0) )
{
m_ProgressReportCallBack1( i, *err );
}
else if( (m_ProgressReportCallBack2 != NULL) && (i % m_report_frequency == 0) )
{
m_ProgressReportCallBack2( i, *err, physical_error_estimator(x, r, b, n) );
}
if (*err <= tol) break;

#ifdef USE_INTEL_MKL
int incr = 1;
bknum = ddot((const int *)&n, &z[1], &incr, &r[1], &incr);
#else
bknum = 0.0;
bknum1 = 0.0;
bknum2 = 0.0;
bknum3 = 0.0;
bknum4 = 0.0;
for ( j=1; j<=m; j++)
{
bknum += z*r;
}

for ( j=m1; j<=n; j+=4)
{
bknum1 += z*r;
bknum2 += z[j+1]*r[j+1];
bknum3 += z[j+2]*r[j+2];
bknum4 += z[j+3]*r[j+3];
}
bknum += bknum1+bknum2+bknum3+bknum4;
#endif

bk=bknum/bkden;

for (j=1;j<=m;j++)
{
p=bk*p+z;
}

for (j=m1;j<=n;j+=4)
{
p =bk*p +z ;
p[j+1] =bk*p [j+1] +z [j+1];
p[j+2] =bk*p [j+2] +z [j+2];
p[j+3] =bk*p [j+3] +z [j+3];
}

bkden=bknum;

#ifdef USE_INTEL_MKL
mkl_dcsrsymv((char *)&uplo, (int *)&n, &sa[1], (int *)&p_row[1], (int *)&ija[1], &p[1], &z[1]);
akden = ddot((const int *)&n, &z[1], &incr, &p[1], &incr);
ak=bknum/akden;
daxpy((const int *)&n, &ak, &p[1], &incr, &x[1], &incr);
ak *= -1.0;
daxpy((const int *)&n, &ak, &z[1], &incr, &r[1], &incr);
ak *= -1.0;
#else
atimes_new_symmetric(n, p, z);

akden = 0.0;
akden1 = 0.0;
akden2 = 0.0;
akden3 = 0.0;
akden4 = 0.0;
for (j=1; j<=m; j++)
{
akden += z*p;
}

for (j=m1; j<=n; j+=4)
{
akden1 += z *p;
akden2 += z[j+1]*p[j+1];
akden3 += z[j+2]*p[j+2];
akden4 += z[j+3]*p[j+3];
}
akden += akden1 + akden2 + akden3 + akden4;
ak=bknum/akden;
for (j=1;j<=m;j++)
{
x += ak*p;
r -= ak*z;
}

for (j=m1;j<=n;j+=4)
{
x += ak*p;
r -= ak*z;

x[j+1] += ak*p[j+1];
r[j+1] -= ak*z[j+1];

x[j+2] += ak*p[j+2];
r[j+2] -= ak*z[j+2];

x[j+3] += ak*p[j+3];
r[j+3] -= ak*z[j+3];
}
#endif

asolve_error_symmetric(n,r,z,err); // 12/12/2005
}

*iter -= 1;

m_solution_error_Siemens = physical_error_estimator(x, r, b, n);

if (*err <= tol)
{
if( m_status_report_flag )
{
#ifdef __FIELDMAGIG_64BIT_MATRIX_INDEX
printf("%I64d iterations were performed. Accuracy of solution is %17.10le\n",*iter,*err);
#else
printf("%d iterations were performed. Accuracy of solution is %17.10le\n",*iter,*err);
#endif
printf("Accuracy of solution (energy criterion) is %17.10le\n",m_solution_error_Siemens);
}
}
else
{
if( m_status_report_flag )
{
#ifdef __FIELDMAGIG_64BIT_MATRIX_INDEX
printf("Slow convergence: maximum number (%I64d) of iterations was performed.\nAccuracy of solution is %17.10le\n",*iter,*err);
#else
printf("Slow convergence: maximum number (%d) of iterations was performed.\nAccuracy of solution is %17.10le\n",*iter,*err);
#endif
printf("Accuracy of solution (energy criterion) is %17.10le\n",m_solution_error_Siemens);
}
}

#ifdef USE_INTEL_MKL
p++; r++; z++;
mkl_free( p );
mkl_free( r );
mkl_free( z );
mkl_free_buffers();
#else
free_dvector(p,1,n);
free_dvector(r,1,n);
free_dvector(z,1,n);
#endif
}
#undef EPS

And this is a function which I would like to replace with MKL function mkl_dcsrsymv:

void matrixclass::atimes_new_symmetric(MatrixIndexType n, double x[], double b[])
{
// performs multiplication A X
// optimized code for symmetric matrices ---->
MatrixIndexType i, j, k, k1, m, j1, j2, j3, j4;

double xi1, s1b, s2b, s3b, s4b;

m = n % 4;

for( i=1; i<=n; i++ )
{
b = 0.0;
}

#ifdef FM_SYMMETRIC_STORAGE_SCHEME_LOW_TRIANGLE
for( i=1; i<=n; i++)
{
k1 = p_row[i+1]-2; // exclude diagonal element
m = p_row + ((k1 - p_row + 1) % 4)-1;

s1b = 0.0;
s2b = 0.0;
s3b = 0.0;
s4b = 0.0;
xi1 = x;

for( k = p_row; k <= m; k++ )
{
j=ija;
s1b += sa * x; // A
b += sa * xi1;
}

for( k = m+1; k <= k1; k+=4 )
{
j1=ija;
j2=ija[k+1];
j3=ija[k+2];
j4=ija[k+3];

s1b += sa * x [j1]; // A
b [j1] += sa * xi1;

s2b += sa[k+1] * x [j2]; // A
b [j2] += sa[k+1] * xi1;

s3b += sa[k+2] * x [j3]; // A
b [j3] += sa[k+2] * xi1;

s4b += sa[k+3] * x [j4]; // A
b [j4] += sa[k+3] * xi1;
}
k = k1 + 1;
j1=ija;
s1b += sa * x [j1]; // diagonal element
b += s1b+s2b+s3b+s4b;
}
#else
for( i=1; i<=n; i++)
{
k1 = p_row[i+1]-1;

m = p_row + ((k1 - p_row) % 4);

xi1 = x;

k = p_row;
s1b = sa * xi1; // diagonal element
s2b = 0.0;
s3b = 0.0;
s4b = 0.0;

for( k = p_row+1; k <= m; k++ ) // exclude diagonal element
{
j=ija;
s1b += sa * x; // A
b += sa * xi1;
}

for( k = m+1; k <= k1; k+=4 )
{
j1=ija;
j2=ija[k+1];
j3=ija[k+2];
j4=ija[k+3];

s1b += sa * x [j1]; // A
b [j1] += sa * xi1;

s2b += sa[k+1] * x [j2]; // A
b [j2] += sa[k+1] * xi1;

s3b += sa[k+2] * x [j3]; // A
b [j3] += sa[k+2] * xi1;

s4b += sa[k+3] * x [j4]; // A
b [j4] += sa[k+3] * xi1;
}
b += s1b+s2b+s3b+s4b;
}
#endif
}

Todd_R_Intel · ‎10-26-2010

A fix for this was released in Intel MKL 10.2.6. -Todd