I've found a strange behaviour when working with IPP and MKL: FFTs performed with IPP are considerably faster if they are run after calling an MKL function.
Is there anybody able to explain this behaviour?
I append a small C code where the following steps are performed in order:
-a FFT with IPP
-a FFT with MKL
-a FFT with IPP
Where the first and the third steps are exactly the same function.
Compiling with the options:
$ g++ ipp_mkl_ipp.cpp -g -O3 -msse2 -msse -msse3 -msse4a -L/opt/intel/Compiler/11.0/081/mkl/lib/em64t -L/opt/intel/Compiler/11.0/081/ipp/em64t/sharedlib -I/opt/intel/Compiler/11.0/081/mkl/include -I/opt/intel/Compiler/11.0/081/ipp/em64t/include -L/opt/intel/Compiler/11.0/081/lib/intel64 -lmkl -lippsem64t -liomp5 -pthread -lmkl_intel_lp64 -ffast-math
the output on my Xeon E5440 is:
IPP: 0.487071
MKL: 0.382079
IPP: 0.352979
The same function called twice behaves differently. Puzzling? Probably not enough: if you compile without -ffast-math you obtain even stranger results.
The code is the following:
#include <sys/time.h>

#include <complex>
#include <cstddef>
#include <iostream>
#include <vector>

#include <ipp.h>
#include <mkl_dfti.h>
// Times 1000 MKL DFTI forward FFTs of length 2^aOrder; returns the elapsed
// wall-clock time in seconds.
double MKLBenchmark(const unsigned int aOrder);
// Times 1000 IPP forward FFTs of order aOrder; returns the elapsed wall-clock
// time in seconds.
double IPPBenchmark(const unsigned int aOrder);
int main() {
if (ippStaticInit() == ippStsNoOperationInDll) {
std::cout << "WARNING: ippStaticInit failed" << std::endl;
}
const size_t myOrder = 16;
std::cout << "IPP: " << IPPBenchmark(myOrder) << std::endl;
std::cout << "MKL: " << MKLBenchmark(myOrder) << std::endl;
std::cout << "IPP: " << IPPBenchmark(myOrder) << std::endl;
}
// ----------------------------------------------------------------------------
double MKLBenchmark(const unsigned int aOrder) {
const unsigned int myLength = (1 << aOrder);
std::complex myA[myLength];
std::complex myB[myLength];
struct timeval myStartingTime;
struct timeval myEndingTime;
DFTI_DESCRIPTOR *my_desc_handle;
DftiCreateDescriptor( &my_desc_handle, DFTI_SINGLE,
DFTI_COMPLEX, 1, myLength);
DftiSetValue( my_desc_handle, DFTI_PLACEMENT, DFTI_NOT_INPLACE);
DftiCommitDescriptor( my_desc_handle );
gettimeofday(&myStartingTime, 0);
for(unsigned int myRepetition = 0; myRepetition < 1000; ++myRepetition) {
DftiComputeForward( my_desc_handle, myA, myB);
}
gettimeofday(&myEndingTime, 0);
DftiFreeDescriptor(&my_desc_handle);
return (myEndingTime.tv_sec - myStartingTime.tv_sec +
(myEndingTime.tv_usec - myStartingTime.tv_usec) / 1000000.0);
}
// ----------------------------------------------------------------------------
double IPPBenchmark(const unsigned int aOrder) {
IppsFFTSpec_C_32fc* mySpec;
ippsFFTInitAlloc_C_32fc(&mySpec, aOrder, IPP_FFT_NODIV_BY_ANY,
ippAlgHintFast);
int myBufferSize;
ippsFFTGetBufSize_C_32fc(mySpec, &myBufferSize);
Ipp8u *myBuffer = ippsMalloc_8u(myBufferSize); // buffer used to speed up
const unsigned int myLength = (1 << aOrder);
Ipp32fc *myA = ippsMalloc_32fc(myLength);
Ipp32fc *myB = ippsMalloc_32fc(myLength);
struct timeval myStartingTime;
struct timeval myEndingTime;
gettimeofday(&myStartingTime, 0);
for (unsigned int myRepetition = 0; myRepetition < 1000; ++myRepetition) {
ippsFFTFwd_CToC_32fc(myA, myB, mySpec, myBuffer);
}
gettimeofday(&myEndingTime, 0);
ippsFree(myA);
ippsFree(myB);
ippsFree(myBuffer);
ippsFFTFree_C_32fc(mySpec);
return(myEndingTime.tv_sec - myStartingTime.tv_sec +
(myEndingTime.tv_usec - myStartingTime.tv_usec) / 1000000.0);
}