dcsrmm is throwing 'integer division by zero' after upgrading to 11.0 update 5

Henrik_A_ · ‎01-12-2015

After updating the installed MKL version when attempting to avoid another bug we discovered, previously working code has started throwing division by zero errors after sparse matrices exceed a certain nonzero count. (Again using parallel 64bit MKL)

Unhandled exception at 0x000007FEDEC0DB3D (mkl_avx.dll) in TestSparseMultiply.exe: 0xC0000094: Integer division by zero.

Callstack:

     mkl_avx.dll!000007fedec0db3d()   Unknown
    mkl_intel_thread.dll!000007fee0549de3()   Unknown

    mkl_intel_thread.dll!000007fee02d5037()   Unknown

Repro case:

#include <mkl.h>



int _tmain(int argc, _TCHAR* argv[])
{
	const size_t sparseMatrixWidth = 6045696;
	const size_t sparseMatrixHeight = 200;
	const size_t sparseMatrixUsage = 1000000 / 2; // <= this should crash
//	const size_t sparseMatrixUsage = 1000000 / 3; // <= This still works

	double *sparseMatrixData = new double[sparseMatrixUsage * sparseMatrixHeight];
	::memset(sparseMatrixData, 0, sizeof(double) * sparseMatrixUsage * sparseMatrixHeight);

	MKL_INT *colIndices = new MKL_INT[sparseMatrixUsage * sparseMatrixHeight];
	::memset(colIndices, 0, sizeof(MKL_INT) * sparseMatrixUsage * sparseMatrixHeight);

	MKL_INT *rowOffsets = new MKL_INT[sparseMatrixHeight + 1];
	::memset(colIndices, 0, sizeof(MKL_INT) * (sparseMatrixHeight + 1));

	::srand(42);
	rowOffsets[0] = 0 + 1; // 1 based
	for (unsigned long y=0;y<sparseMatrixHeight;++y)
	{
		MKL_INT lastIndex = 0;
		for (unsigned long x=0;x<sparseMatrixUsage;++x)
		{
			//	Jump forward a random amount, ensuring even if we hit the maximum jump everytime we do not exceed the matrix limits
			int jump = rand() % (sparseMatrixWidth / sparseMatrixUsage - 1);

			lastIndex = lastIndex + 1 + jump; // Ensure we jump forward at least 1 column
			sparseMatrixData[x + y*sparseMatrixUsage] = 1.0;
			colIndices[x + y*sparseMatrixUsage] = lastIndex + 1; // 1 based
		}
		rowOffsets[y + 1] = sparseMatrixUsage * (y + 1) + 1; // 1 based
	}


	//	Doublecheck: Verify sparse matrix properties
	for (unsigned long y=0;y<sparseMatrixHeight;++y)
	{
		//	Since we are using one based matrices, nothing is allowed to be zero
		if (rowOffsets == 0)
			return -1;
		//	Row data must be ordered in memory
		if (rowOffsets[y + 1]< rowOffsets)
			return -1;
		//	If rows are not empty ...
		if (rowOffsets != rowOffsets[y + 1])
		{
			// ... make sure column indices are one based and do not exceed the matrix size
			for (unsigned long i = rowOffsets;i < rowOffsets[y + 1];++i)
			{
				if (colIndices[i - 1] <= 0)
					return -1;
				if (colIndices[i - 1] >= sparseMatrixWidth)
					return -1;
			}

			//	... make sure column indices are in ascending order
			for (unsigned long i = rowOffsets;i < rowOffsets[y + 1] - 1;++i)
			{
				if (colIndices[i - 1 + 1] <= colIndices[i - 1])
					return -1;
			}
		}
	}

	//	Calculate matrix average by multiplying with a vector containing ones and scaling by the inverse vector length
	const size_t rightVectorLength = sparseMatrixWidth;
	double *vectorData = new double[rightVectorLength];
	for (unsigned long x=0;x<rightVectorLength;++x)
	{
		vectorData = 1.0;
	}

	//	Store the result in this vector
	const size_t resultVectorLength = sparseMatrixHeight;
	double *resultData = new double[resultVectorLength];
	

	char matdescra[6] = {'g', 'l', 'n', 'f', 'x', 'x'};
	/* https://software.intel.com/sites/products/documentation/doclib/iss/2013/mkl/mklman/GUID-34C8DB79-0139-46E0-8B53-99F3BEE7B2D4.htm#TBL2-6
	G: General. D: Diagonal
	L/U Lower/Upper triangular (ignored with G)
	N: non-unit diagonal (ignored with G)
	C: zero-based indexing. / F: one-based indexing
	*/

	MKL_INT strideA = static_cast<MKL_INT>(rightVectorLength);

	char transposeAtxt = 'N';
	MKL_INT numRowsA = static_cast<MKL_INT>(sparseMatrixHeight),
			numDestCols = static_cast<MKL_INT>(1),
			numColsA = static_cast<MKL_INT>(sparseMatrixWidth);

	double tempAlpha = 1.0 / sparseMatrixWidth;
	double tempBeta = 0.0;
	MKL_INT resultStride = static_cast<MKL_INT>(resultVectorLength);

	::mkl_dcsrmm(	&transposeAtxt,
					&numRowsA,
					&numDestCols,
					&numColsA,
					&tempAlpha,
					matdescra,
					sparseMatrixData,
					colIndices,
					rowOffsets,
					rowOffsets + 1,
					vectorData, &strideA,
					&tempBeta,
					resultData, &resultStride );


	return 0;
}

VipinKumar_E_Intel · ‎01-14-2015

Henrik,

I can reproduce the crash in MKL 11.2 as well with this testcase. Let's investigate it further.

--Vipin

VipinKumar_E_Intel · ‎06-23-2015

Hi Henrik,

This is to inform you that the fix for this issue is available in our recently release 11.3 beta update 1 and 11.2.3 MKL releases.

Vipin