- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
I've noticed a strange behaviour when working with IPP and MKL: FFTs performed with IPP are considerably faster when they run after a call to an MKL function.
Is there anybody able to explain this behaviour?
I have appended a small C++ program in which the following steps are performed in order:
-a FFT with IPP
-a FFT with MKL
-a FFT with IPP
Where the first and the third steps are exactly the same function.
Compiling with the options:
$ g++ ipp_mkl_ipp.cpp -g -O3 -msse2 -msse -msse3 -msse4a -L/opt/intel/Compiler/11.0/081/mkl/lib/em64t -L/opt/intel/Compiler/11.0/081/ipp/em64t/sharedlib -I/opt/intel/Compiler/11.0/081/mkl/include -I/opt/intel/Compiler/11.0/081/ipp/em64t/include -L/opt/intel/Compiler/11.0/081/lib/intel64 -lmkl -lippsem64t -liomp5 -pthread -lmkl_intel_lp64 -ffast-math
the output on my Xeon E5440 is:
IPP: 0.487071
MKL: 0.382079
IPP: 0.352979
The same function, called twice, behaves differently. Puzzling? Probably not puzzling enough: if you compile without -ffast-math, you obtain even stranger results.
The code is the following:
#include <sys/time.h>
#include <complex>
#include <cstddef>
#include <iostream>
#include <vector>
#include <ipp.h>
#include <mkl_dfti.h>
double MKLBenchmark(const unsigned int aOrder);
double IPPBenchmark(const unsigned int aOrder);
int main() {
if (ippStaticInit() == ippStsNoOperationInDll) {
std::cout << "WARNING: ippStaticInit failed" << std::endl;
}
const size_t myOrder = 16;
std::cout << "IPP: " << IPPBenchmark(myOrder) << std::endl;
std::cout << "MKL: " << MKLBenchmark(myOrder) << std::endl;
std::cout << "IPP: " << IPPBenchmark(myOrder) << std::endl;
}
// ----------------------------------------------------------------------------
double MKLBenchmark(const unsigned int aOrder) {
const unsigned int myLength = (1 << aOrder);
std::complex myA[myLength];
std::complex myB[myLength];
struct timeval myStartingTime;
struct timeval myEndingTime;
DFTI_DESCRIPTOR *my_desc_handle;
DftiCreateDescriptor( &my_desc_handle, DFTI_SINGLE,
DFTI_COMPLEX, 1, myLength);
DftiSetValue( my_desc_handle, DFTI_PLACEMENT, DFTI_NOT_INPLACE);
DftiCommitDescriptor( my_desc_handle );
gettimeofday(&myStartingTime, 0);
for(unsigned int myRepetition = 0; myRepetition < 1000; ++myRepetition) {
DftiComputeForward( my_desc_handle, myA, myB);
}
gettimeofday(&myEndingTime, 0);
DftiFreeDescriptor(&my_desc_handle);
return (myEndingTime.tv_sec - myStartingTime.tv_sec +
(myEndingTime.tv_usec - myStartingTime.tv_usec) / 1000000.0);
}
// ----------------------------------------------------------------------------
double IPPBenchmark(const unsigned int aOrder) {
IppsFFTSpec_C_32fc* mySpec;
ippsFFTInitAlloc_C_32fc(&mySpec, aOrder, IPP_FFT_NODIV_BY_ANY,
ippAlgHintFast);
int myBufferSize;
ippsFFTGetBufSize_C_32fc(mySpec, &myBufferSize);
Ipp8u *myBuffer = ippsMalloc_8u(myBufferSize); // buffer used to speed up
const unsigned int myLength = (1 << aOrder);
Ipp32fc *myA = ippsMalloc_32fc(myLength);
Ipp32fc *myB = ippsMalloc_32fc(myLength);
struct timeval myStartingTime;
struct timeval myEndingTime;
gettimeofday(&myStartingTime, 0);
for (unsigned int myRepetition = 0; myRepetition < 1000; ++myRepetition) {
ippsFFTFwd_CToC_32fc(myA, myB, mySpec, myBuffer);
}
gettimeofday(&myEndingTime, 0);
ippsFree(myA);
ippsFree(myB);
ippsFree(myBuffer);
ippsFFTFree_C_32fc(mySpec);
return(myEndingTime.tv_sec - myStartingTime.tv_sec +
(myEndingTime.tv_usec - myStartingTime.tv_usec) / 1000000.0);
}
Is there anybody able to explain this behaviour?
I have appended a small C++ program in which the following steps are performed in order:
-a FFT with IPP
-a FFT with MKL
-a FFT with IPP
Where the first and the third steps are exactly the same function.
Compiling with the options:
$ g++ ipp_mkl_ipp.cpp -g -O3 -msse2 -msse -msse3 -msse4a -L/opt/intel/Compiler/11.0/081/mkl/lib/em64t -L/opt/intel/Compiler/11.0/081/ipp/em64t/sharedlib -I/opt/intel/Compiler/11.0/081/mkl/include -I/opt/intel/Compiler/11.0/081/ipp/em64t/include -L/opt/intel/Compiler/11.0/081/lib/intel64 -lmkl -lippsem64t -liomp5 -pthread -lmkl_intel_lp64 -ffast-math
the output on my Xeon E5440 is:
IPP: 0.487071
MKL: 0.382079
IPP: 0.352979
The same function, called twice, behaves differently. Puzzling? Probably not puzzling enough: if you compile without -ffast-math, you obtain even stranger results.
The code is the following:
#include
#include
#include
#include
#include
#include
double MKLBenchmark(const unsigned int aOrder);
double IPPBenchmark(const unsigned int aOrder);
int main() {
if (ippStaticInit() == ippStsNoOperationInDll) {
std::cout << "WARNING: ippStaticInit failed" << std::endl;
}
const size_t myOrder = 16;
std::cout << "IPP: " << IPPBenchmark(myOrder) << std::endl;
std::cout << "MKL: " << MKLBenchmark(myOrder) << std::endl;
std::cout << "IPP: " << IPPBenchmark(myOrder) << std::endl;
}
// ----------------------------------------------------------------------------
double MKLBenchmark(const unsigned int aOrder) {
const unsigned int myLength = (1 << aOrder);
std::complex
std::complex
struct timeval myStartingTime;
struct timeval myEndingTime;
DFTI_DESCRIPTOR *my_desc_handle;
DftiCreateDescriptor( &my_desc_handle, DFTI_SINGLE,
DFTI_COMPLEX, 1, myLength);
DftiSetValue( my_desc_handle, DFTI_PLACEMENT, DFTI_NOT_INPLACE);
DftiCommitDescriptor( my_desc_handle );
gettimeofday(&myStartingTime, 0);
for(unsigned int myRepetition = 0; myRepetition < 1000; ++myRepetition) {
DftiComputeForward( my_desc_handle, myA, myB);
}
gettimeofday(&myEndingTime, 0);
DftiFreeDescriptor(&my_desc_handle);
return (myEndingTime.tv_sec - myStartingTime.tv_sec +
(myEndingTime.tv_usec - myStartingTime.tv_usec) / 1000000.0);
}
// ----------------------------------------------------------------------------
double IPPBenchmark(const unsigned int aOrder) {
IppsFFTSpec_C_32fc* mySpec;
ippsFFTInitAlloc_C_32fc(&mySpec, aOrder, IPP_FFT_NODIV_BY_ANY,
ippAlgHintFast);
int myBufferSize;
ippsFFTGetBufSize_C_32fc(mySpec, &myBufferSize);
Ipp8u *myBuffer = ippsMalloc_8u(myBufferSize); // buffer used to speed up
const unsigned int myLength = (1 << aOrder);
Ipp32fc *myA = ippsMalloc_32fc(myLength);
Ipp32fc *myB = ippsMalloc_32fc(myLength);
struct timeval myStartingTime;
struct timeval myEndingTime;
gettimeofday(&myStartingTime, 0);
for (unsigned int myRepetition = 0; myRepetition < 1000; ++myRepetition) {
ippsFFTFwd_CToC_32fc(myA, myB, mySpec, myBuffer);
}
gettimeofday(&myEndingTime, 0);
ippsFree(myA);
ippsFree(myB);
ippsFree(myBuffer);
ippsFFTFree_C_32fc(mySpec);
return(myEndingTime.tv_sec - myStartingTime.tv_sec +
(myEndingTime.tv_usec - myStartingTime.tv_usec) / 1000000.0);
}
Link Copied
5 Replies
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
I'm naturally wondering what your timing is when you repeatedly call IPP FFTs
-a FFT with IPP
-a FFT with IPP
-a FFT with IPP
.
.
etc.
-Ozzer
-a FFT with IPP
-a FFT with IPP
-a FFT with IPP
.
.
etc.
-Ozzer
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Quoting - ozzer
I'm naturally wondering what your timing is when you repeatedly call IPP FFTs
-a FFT with IPP
-a FFT with IPP
-a FFT with IPP
.
.
etc.
-Ozzer
-a FFT with IPP
-a FFT with IPP
-a FFT with IPP
.
.
etc.
-Ozzer
IPP: 0.490795
IPP: 0.483755
IPP: 0.488267
IPP: 0.487029
IPP: 0.48782
IPP: 0.487979
IPP: 0.486707
IPP: 0.488581
Just mentioning:
IPP: 0.480533
MKL: 0.38359
IPP: 0.353908
IPP: 0.353522
IPP: 0.353394
IPP: 0.353599
IPP: 0.353558
IPP: 0.353552
IPP: 0.353574
But if compiling without using the option -ffast-math the previous results turn to:
IPP: 0.481412
IPP: 1.88403
IPP: 1.88419
IPP: 1.8837
IPP: 1.88367
IPP: 1.88404
IPP: 1.88424
IPP: 1.8842
and
IPP: 0.485796
MKL: 0.38246
IPP: 5.83902
IPP: 5.83847
IPP: 5.83876
IPP: 5.83838
IPP: 5.8386
IPP: 5.84151
IPP: 5.83828
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
1. Not initialize myA. (myA = ?)
2. See result value myB. ( cout<
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Quoting - ipp_agor
1. Not initialize myA. (myA = ?)
2. See result value myB. ( cout<
I don't initialize myA because I'm only interested in the FFT time; whatever is in memory is fine with me. Anyway, even after trying your suggestion nothing changes. I've verified the values of myB and they are numbers.
(I'm fbasile, I posted by mistake with this different account)
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Quoting - gmendolambigroup.it
I don't initialize myA because I'm only interested in the FFT time; whatever is in memory is fine with me. Anyway, even after trying your suggestion nothing changes. I've verified the values of myB and they are numbers.
(I'm fbasile, I posted by mistake with this different account)
Hi fbasile,
Just let you know
I've reproduced the problem. MKL FFT threading is responsible for the low efficiency.
The problem is fixed in the latest MKL 10.2 version. You are welcome to try it.
The download package can be obtained from <https://registrationcenter.intel.com/regcenter/register.aspx>
or get 30-days trial version from <http://software.intel.com/en-us/intel-mkl/>
Here're the details.
The IPP FFT runs slower after the MKL FFT has been called.
The IPP FFT uses two CPUs before the MKL FFT call, but only one CPU after it. (Test code attached.)
MKL calls ippSetNumThreads(1) internally and does not restore the IPP thread count afterwards. This is incorrect; we fixed it in MKL 10.2.
Thanks
Ying

Reply
Topic Options
- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page