Solved: Writing c extension for python that calls mkl and can use more than 1 cpu.

Roger_X_ · ‎06-16-2016

I tried calling MKL directly from Python with ctypes, but in that case MKL only uses a single CPU. I have not been able to find the cause of that problem.

I am writing a c extension for python that calls the mkl as an alternative approach.

The following C extension can be imported into Python without a problem. However, when I call the function, it produces the following error message:

    Intel MKL FATAL ERROR: Cannot load libmkl_mc.so or libmkl_def.so

What are the correct options for the icc compiler that I should use in setup.py?

I took some of the options in setup.py from the Intel Link Line Advisor, but I can't get all of its options into setup.py.

mkl_helper.c

#include "Python.h"
#include "mkl.h"
#include "numpy/arrayobject.h"

static PyObject* test4 (PyObject *self, PyObject *args)
{
	// test4 (m, n,
	//        a, ja, ia,
	//        c, jc, ic)
	
	PyArrayObject *shape_array;
	PyArrayObject *a_array;   // csr_matrix.data
	PyArrayObject *ja_array;  // csr_matrix.indices
	PyArrayObject *ia_array;  // csr_matrix.indptr
	PyArrayObject *c_array;
	PyArrayObject *jc_array;
	PyArrayObject *ic_array;
	
	if (!PyArg_ParseTuple(args, "O!O!O!O!O!O!O!", 
				&PyArray_Type, &shape_array,
				&PyArray_Type, &a_array,
				&PyArray_Type, &ja_array,
				&PyArray_Type, &ia_array,
				&PyArray_Type, &c_array,
				&PyArray_Type, &jc_array,
				&PyArray_Type, &ic_array))
	{
		return NULL;
	}

	int  * ptr_int     = shape_array->data;
	int m               = ptr_int[0];
	int n               = ptr_int[1];
	int k               = n;

	float *  a_data_ptr =  a_array->data;
	float * ja_data_ptr = ja_array->data;
	float * ia_data_ptr = ia_array->data;
	float *  c_data_ptr =  c_array->data;
	float * jc_data_ptr = jc_array->data;
	float * ic_data_ptr = ic_array->data;

	char trans  = 'T';
	int sort    = 0;
	int nzmax   = n*n;
	int info    = 0;
	int request = 0;

	// This is supposed to "suggest" mkl use 12 threads.
        // I also tried mkl_set_num_threads(&num_cpu);
        // That also doesn't work.

	int num_cpu = 12;
	mkl_set_num_threads(12);
        mkl_set_num_threads_local(12);
        mkl_domain_set_num_threads(12,0);

	mkl_scsrmultcsr(&trans, &request, &sort,
			    &m, &n, &k,
			    a_data_ptr, ja_data_ptr, ia_data_ptr,
			    a_data_ptr, ja_data_ptr, ia_data_ptr,
			    c_data_ptr, jc_data_ptr, ic_data_ptr,
			    &nzmax, &info);

	return PyInt_FromLong(info);
}


/* Method table of the mkl_helper module; test4 is its only exported callable. */
static struct PyMethodDef methods[] = {
    {"test4", test4, METH_VARARGS,
     "test4(shape, a, ja, ia, c, jc, ic)\n"
     " Run mkl_scsrmultcsr on the CSR arrays (a, ja, ia), store the product in\n"
     " the preallocated CSR arrays (c, jc, ic) and return the MKL info status."},
    {NULL, NULL, 0, NULL}   /* sentinel terminating the table */
};

PyMODINIT_FUNC
initmkl_helper (void)
{
    (void)Py_InitModule("mkl_helper", methods);
    import_array();
}

setup.py

from distutils.core import setup, Extension
import numpy as np
# Every option for the build is collected in one list that is handed to the
# linker step only.
# NOTE(review): "-I..." and "-DMKL_ILP64" are compiler options but are passed
# as link arguments here; "-L{$MKLROOT}" is spelled differently from the other
# "${MKLROOT}" entries; "-L" is followed by .a files rather than directories.
# NOTE(review): presumably "${MKLROOT}" is not expanded because no shell is
# involved - verify.
extra_link_args = [
    "-Bstatic",
    "-I${MKLROOT}/include",
    "-L{$MKLROOT}/lib/intel64/",
    "-mkl",
    "-lrt",
    "-L${MKLROOT}/lib/intel64/libmkl_intel_ilp64.a",
    "-L${MKLROOT}/lib/intel64/libmkl_core.a",
    "-L${MKLROOT}/lib/intel64/libmkl_intel_thread.a",
    "-lpthread",
    "-lm",
    "-ldl",
    "-DMKL_ILP64",
    "-qopenmp",
    "-I${MKLROOT}/include",
]

# The extension consists of the single C source file mkl_helper.c.
mkl_helper_extension = Extension(
    'mkl_helper',
    sources=['mkl_helper.c'],
    extra_link_args=extra_link_args,
)
ext_modules = [mkl_helper_extension]

setup(
    name='mkl_helper',
    version='1.0',
    include_dirs=[np.get_include()],  # NumPy headers for numpy/arrayobject.h
    ext_modules=ext_modules,
)

test.py

import mkl_helper
import numpy as np

import numpy as np
import scipy.sparse as spsp

def get_csr_handle2(data, indices, indptr, shape):
    """Return ctypes pointers to the three CSR buffers together with the shape.

    Args:
        data: NumPy array of matrix values; exposed as a ``float *``.
        indices: NumPy array of column indices; exposed as an ``int *``.
        indptr: NumPy array of row pointers; exposed as an ``int *``.
        shape: Passed through unchanged as the last tuple element.

    Returns:
        Tuple ``(a_pointer, ja_pointer, ia_pointer, shape)``. The pointers
        alias the arrays' memory, so the arrays must outlive them.
    """
    # Local import: the ctypes names used below are not imported at the top of
    # the visible script, so the function would otherwise raise NameError.
    from ctypes import POINTER, c_float, c_int

    a_pointer = data.ctypes.data_as(POINTER(c_float))
    ja_pointer = indices.ctypes.data_as(POINTER(c_int))
    ia_pointer = indptr.ctypes.data_as(POINTER(c_int))
    return (a_pointer, ja_pointer, ia_pointer, shape)

def get_csr_handle(A, clear=False):
    """Return ctypes handles for the buffers of the sparse matrix ``A``.

    Args:
        A: Object exposing ``data``, ``indices``, ``indptr`` and ``shape``
            (e.g. a scipy.sparse CSR matrix).
        clear: When truthy, all three buffers are zeroed in place first.

    Returns:
        The tuple produced by ``get_csr_handle2`` for ``A``'s buffers.
    """
    if clear:
        # Zero in place so the buffers keep their identity and size.
        for buf in (A.indptr, A.indices, A.data):
            buf[:] = 0
    return get_csr_handle2(A.data, A.indices, A.indptr, A.shape)

# Driver script: builds a random sparse 0/1 matrix, prints the dense reference
# product AA * AA^T, then calls mkl_helper.test4 on the same data.
print "test4"

test_size = 1200
test_size2 = 1200
# Dense 0/1 matrix in which each entry is 1 with probability 0.01.
AA = np.random.choice([0,1], size=(test_size,test_size2), replace=True, p=[0.99,0.01])
A_original = spsp.csr_matrix(AA)
print "Answer from scipy:"
print AA.dot(AA.T)
# Convert to float32 CSC, then wrap the CSC arrays as if they were CSR arrays.
# The CSC arrays of a matrix are the CSR arrays of its transpose, so A now
# holds the transpose of A_original.
A = A_original.astype(np.float32).tocsc()
A = spsp.csr_matrix( (A.data, A.indices, A.indptr) )

A.indptr  += 1 # convert to 1-based indexing
A.indices += 1 # convert to 1-based indexing

# Output matrix built from a dense array of ones, so every entry is stored and
# its data/indices buffers have test_size * test_size slots.
C = spsp.csr_matrix( np.ones((test_size,test_size)), dtype=np.float32)

(m,n) = A.shape
shape_arr = np.array([m,n], dtype=np.int32)
# NOTE(review): this loop has no break, so the call repeats forever and none of
# the statements below it are reached as written.
while(True):
	ret = mkl_helper.test4(shape_arr, A.data, A.indices, A.indptr, C.data, C.indices, C.indptr)
# Shift the result's index arrays back to 0-based indexing.
C.indptr  -= 1
C.indices -= 1
# Last row pointer = number of entries written to the result.
nz = C.indptr[test_size]
print "nz:",nz
print "Answer from mkl"
# Rebuild a CSR matrix from only the first nz entries of the output buffers.
C_fix = spsp.csr_matrix( (C.data[:nz], C.indices[:nz], C.indptr[:(test_size+1)]), shape=(test_size, test_size))
print C_fix.todense().astype(int)
#print C.todense()
print "ret:", ret

Gennady_F_Intel · ‎06-17-2016

If we are talking about mkl_scsrmultcsr(...), then the problem size is pretty small, and internally this computation is dispatched to single-threaded execution mode.

View solution in original post

Roger_X_ · ‎06-16-2016

I finally made a setup.py that seems to work ...

from distutils.core import setup, Extension
import numpy as np
# Substitution values for the flag templates below.
d = {'MKLROOT': "/gpfs/rxu/intel/compilers_and_libraries_2016.3.210/linux/mkl"}

# Flag templates from the Link Line Advisor (static, ILP64, OpenMP threading),
# with -liomp5 appended to the link line.
compile_template = "-DMKL_ILP64 -qopenmp -I{MKLROOT}/include"
link_template = "-Wl,--start-group {MKLROOT}/lib/intel64/libmkl_intel_ilp64.a {MKLROOT}/lib/intel64/libmkl_core.a {MKLROOT}/lib/intel64/libmkl_intel_thread.a -Wl,--end-group -lpthread -lm -ldl -liomp5"

# One list element per space-separated flag.
extra_compile_args = compile_template.format(**d).split(' ')
extra_link_args = link_template.format(**d).split(' ')

mkl_helper_extension = Extension(
    'mkl_helper',
    sources=['mkl_helper.c'],
    extra_link_args=extra_link_args,
    extra_compile_args=extra_compile_args,
)
ext_modules = [mkl_helper_extension]


setup(
    name='mkl_helper',
    version='1.0',
    include_dirs=[np.get_include()],  # NumPy headers for numpy/arrayobject.h
    ext_modules=ext_modules,
)

From mkl linkline advisor

	Intel® Math Kernel Library (Intel® MKL) Link Line Advisor v4.6
	Select Intel® product:      Intel(R) Parallel Studio XE 2016
	Select OS:	                Linux*

	Select usage model of
	Intel® Xeon Phi™
	Coprocessor:                None

	Select compiler:	        Intel(R) C/C++
	Select architecture:        Intel(R) 64

	Select dynamic or
	static linking:	            Static

	Select interface layer:     ILP64 (64-bit integer)
	Select threading layer:     OpenMP threading
	Select OpenMP library:      Intel(R) (libiomp5)
	Select cluster library:     [ ] Cluster PARDISO (BLACS required)
								[ ] CDFT (BLACS required)
								[ ] ScaLAPACK (BLACS required)
								[ ] BLACS
	Select MPI library:         <Select MPI>

	Select the Fortran
	95 interfaces:              [ ] BLAS95
								[ ] LAPACK95

	Link with Intel® MKL
	libraries explicitly:       [Check] 

	Use this link line:
	 -Wl,--start-group ${MKLROOT}/lib/intel64/libmkl_intel_ilp64.a ${MKLROOT}/lib/intel64/libmkl_core.a ${MKLROOT}/lib/intel64/libmkl_intel_thread.a -Wl,--end-group -lpthread -lm -ldl

	Compiler options:
	 -DMKL_ILP64 -qopenmp -I${MKLROOT}/include

shorthand:

    %INTELHOME=/gpfs/rxu/intel/compilers_and_libraries_2016.3.210
    %MKLHOME=%INTELHOME/linux/mkl
    %USR=/home/rxu/local_icc/

Setting up the environment

    source %CCEHOME/bin/iccvars.sh 
    source %FCEHOME/bin/ifortvars.sh 
    source %OPENMPIHOME/bin/mpivars.sh
    source %INTELHOME/bin/compilervars.sh intel64
    export PATH=%USR/bin:/share/apps/pkgs/openmpi.1.8.1/bin:$PATH
    source %MKLHOME/bin/mklvars.sh intel64

    export CC="icc"
    export CXX="icpc"
    export F77=ifort
    export LD=xild
    export AR=xiar
    export CPP="icc -E"

Output from python setup.py install

    running install
    running build
    running build_ext
    building 'mkl_helper' extension
    icc -fno-strict-aliasing -O3 -fp-model strict -fp-model source -xHost -ipo -prec-div -prec-sqrt -DNDEBUG -g -O3 -Wall -Wstrict-prototypes -fPIC -I%USR/lib/python2.7/site-packages/numpy-1.11.0-py2.7-linux-x86_64.egg/numpy/core/include -I%USR/include/python2.7 -c mkl_helper.c -o build/temp.linux-x86_64-2.7/mkl_helper.o -DMKL_ILP64 -qopenmp -I%MKLHOME/include
    icc: command line warning #10006: ignoring unknown option '-qopenmp'

    ... some harmless warning ...

    icc -shared -L%USR/lib/ -L%USR/lib64/ -L%USR/lib/thread2.7.3 -L%USR/lib/itcl4.0.4 -L%USR/lib/tdbc1.0.4 -L%USR/lib/tdbcmysql1.0.4 -L%USR/lib/tdbcodbc1.0.4 -L%USR/lib/tdbcpostgres1.0.4 -L%USR/lib/sqlite3.11.0 -L%USR/lib/thread2.7.3/ -L%USR/lib/ -L%USR/lib64/ -L%USR/lib/thread2.7.3 -L%USR/lib/itcl4.0.4 -L%USR/lib/tdbc1.0.4 -L%USR/lib/tdbcmysql1.0.4 -L%USR/lib/tdbcodbc1.0.4 -L%USR/lib/tdbcpostgres1.0.4 -L%USR/lib/sqlite3.11.0 -L%USR/lib/thread2.7.3/ -L%USR/lib/ -L%USR/lib64/ -L%USR/thread2.7.3 build/temp.linux-x86_64-2.7/mkl_helper.o -L%USR/lib -lpython2.7 -o build/lib.linux-x86_64-2.7/mkl_helper.so -Wl,--start-group %MKLHOME/lib/intel64/libmkl_intel_ilp64.a %MKLHOME/lib/intel64/libmkl_core.a %MKLHOME/lib/intel64/libmkl_intel_thread.a -Wl,--end-group -lpthread -lm -ldl -liomp5
    %INTELHOME/linux/compiler/lib/intel64/libimf.so: warning: warning: feupdateenv is not implemented and will always fail
    running install_lib
    copying build/lib.linux-x86_64-2.7/mkl_helper.so -> %USR/lib/python2.7/site-packages
    running install_egg_info
    Removing %USR/lib/python2.7/site-packages/mkl_helper-1.0-py2.7.egg-info
    Writing %USR/lib/python2.7/site-packages/mkl_helper-1.0-py2.7.egg-info

The problem was that Python's distutils library adds a bunch of flags of its own. Those flags seem to be the ones I used when I manually compiled NumPy linked against MKL.

The new error message is:

    Intel MKL ERROR: Parameter 2 was incorrect on entry to MKL_SCSRMULTCSR.

Is this an error of the code, or an error of the compiler/linker flags in setup.py?

Roger_X_ · ‎06-16-2016

In setup.py, ilp64 is used.

The 2nd parameter of mkl_scsrmultcsr is request, which has the type MKL_INT.

MKL_INT is a 64-bit integer (long long) rather than int when the ILP64 libraries are used, so the int arguments in the C code only match the LP64 libraries.

New setup.py that works:

from distutils.core import setup, Extension
import numpy as np

import os

# Location of the MKL installation. An MKLROOT environment variable (typically
# exported by MKL's mklvars.sh) takes precedence; otherwise the original
# hard-coded path is used.
d = {}
d['MKLROOT'] = os.environ.get('MKLROOT') or "/gpfs/rxu/intel/compilers_and_libraries_2016.3.210/linux/mkl"

# Static LP64 build with OpenMP threading. LP64 keeps MKL_INT a 32-bit int,
# matching the int arguments used by mkl_helper.c.
# The flags are listed explicitly instead of splitting one string on spaces,
# so an MKLROOT containing spaces stays a single argument.
mkl_lib_dir = "{MKLROOT}/lib/intel64".format(**d)
extra_compile_args = [
    "-qopenmp",
    "-I{MKLROOT}/include".format(**d),
]
extra_link_args = [
    "-Wl,--start-group",
    mkl_lib_dir + "/libmkl_intel_lp64.a",
    mkl_lib_dir + "/libmkl_core.a",
    mkl_lib_dir + "/libmkl_intel_thread.a",
    "-Wl,--end-group",
    "-lpthread",
    "-lm",
    "-ldl",
    "-liomp5",
]
ext_modules = [
    Extension(
        'mkl_helper',
        sources=['mkl_helper.c'],
        extra_link_args=extra_link_args,
        extra_compile_args=extra_compile_args,
    )
]

setup(
    name='mkl_helper',
    version='1.0',
    include_dirs=[np.get_include()],  # NumPy headers for numpy/arrayobject.h
    ext_modules=ext_modules,
)

Roger_X_ · ‎06-16-2016

Yet after all of this, MKL still uses just one of the 12 CPUs.

Changing setup.py to the following would link mkl as shared library.

This no longer produces any warning about icc failing to interpret a flag.

# Shared-library variant: link against the single dynamic library mkl_rt.
# str.format only replaces "{MKLROOT}", so a leading "$" would stay in the
# result as a literal character ("-I$/gpfs/..."); it is therefore omitted.
extra_compile_args = "-I{MKLROOT}/include".format(**d).split(" ")
extra_link_args = "-L{MKLROOT}/lib/intel64 -lmkl_rt -lpthread -lm -ldl".format(**d).split(" ")

It still uses only one of the 12 CPUs on the machine.

Gennady_F_Intel · ‎06-17-2016

If we are talking about mkl_scsrmultcsr(...), then the problem size is pretty small, and internally this computation is dispatched to single-threaded execution mode.