- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi,
I am trying to perform 2D FFT on an Intel i5-6500 CPU@3.20 GHz.
I have written a program to benchmark FFTW against Intel MKL. Surprisingly, FFTW outperformed the Intel MKL FFT by about 3 times; that is, the FFTW run time is about 3 times shorter than the Intel MKL run time.
Here are the results:
2023-04-26T11:01:31+05:30
Running FFTWCompare.exe
Run on (4 X 3192 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x4)
L1 Instruction 32 KiB (x4)
L2 Unified 256 KiB (x4)
L3 Unified 6144 KiB (x1)
------------------------------------------------------
Benchmark Time CPU Iterations
------------------------------------------------------
BM_IntelFFT 915017 ns 920348 ns 747
BM_FFTW 390155 ns 392369 ns 1792
The CMake File:
# Build script for the FFTWCompare benchmark executable, which links FFTW,
# Intel MKL and Google Benchmark into a single program built from main.cpp.
cmake_minimum_required(VERSION 3.25)
project(Benchmarking)
set(CMAKE_CXX_STANDARD 23)
# Hard-codes the build type; any value given on the command line is overwritten.
set(CMAKE_BUILD_TYPE RELEASE)
# Google Benchmark; provides the benchmark::benchmark target linked below.
find_package(benchmark REQUIRED)
# These assignments replace (rather than append to) the C and C++ flag variables.
# NOTE(review): -Ofast / -march=native are GCC/Clang-style options - confirm the
# toolchain in use accepts them.
set(CMAKE_C_FLAGS "-Ofast -march=native")
set(CMAKE_CXX_FLAGS "-Ofast -march=native")
# Machine-specific FFTW location; used both as include directory and library directory.
set(FFTW C:/Users/IIAP-IPC/Documents/fftw-3)
# Three FFTW import libraries are linked.
# NOTE(review): main.cpp only calls fftw_* (double-precision) functions, so the
# "l" and "f" variants are presumably unnecessary - confirm before removing.
set(FFTWLINK ${FFTW}/libfftw3l-3.lib ${FFTW}/libfftw3f-3.lib ${FFTW}/libfftw3-3.lib)
# Machine-specific Intel oneAPI MKL 2023.1.0 library and include directories.
set(INTEL_FFTWLINK C:/PROGRA~2/Intel/oneAPI/mkl/2023.1.0/lib/intel64)
set(INTEL_FFTW_INCLUDE C:/PROGRA~2/Intel/oneAPI/mkl/2023.1.0/include)
# MKL link line: LP64 interface layer, core library and the sequential threading layer.
set(INTEL_FFTWLINKLIB ${INTEL_FFTWLINK}/mkl_intel_lp64.lib ${INTEL_FFTWLINK}/mkl_core.lib ${INTEL_FFTWLINK}/mkl_sequential.lib)
add_executable(FFTWCompare main.cpp)
# The MKL include directory is listed before the FFTW directory.
target_include_directories(FFTWCompare PRIVATE ${INTEL_FFTW_INCLUDE} ${FFTW})
# MKL libraries come before the FFTW libraries on the link line.
target_link_libraries(FFTWCompare PRIVATE ${INTEL_FFTWLINKLIB} ${FFTWLINK} benchmark::benchmark)
#include <chrono>
#include <complex>
#include <cstddef>
#include <cstdlib>
#include <fstream>
#include <iostream>

#include <mkl.h>
#include <fftw3.h>
#include <benchmark/benchmark.h>
// Edge length of the square NN x NN array that is transformed.
#define NN 256
// Number of complex values produced by a real-to-complex transform of an
// NN x NN array: NN * (NN / 2 + 1).
// The expansion is fully parenthesized so that the macro behaves as a single
// value in any expression (e.g. `x / NPIXFFT` or `x % NPIXFFT`).
#define NPIXFFT (NN * (1 + (NN / 2)))
using namespace std;
// Plain aggregate of two doubles: `re` first, `im` second.
struct mkl_double_complex {
    double re;
    double im;
};
// File-scope FFTW plan handles. BM_FFTW assigns them (real-to-complex and
// complex-to-real 2D plans respectively) and executes them in its timed loop.
fftw_plan PlanForward;
fftw_plan PlanInverse;
// Creates, configures and commits an MKL DFTI descriptor through *descHandle.
// Returns 0 on success and a negative code identifying the failed step
// otherwise. Defined after BENCHMARK_MAIN() further down in this file.
int getIntelFFTWPlans(DFTI_DESCRIPTOR_HANDLE *descHandle);
/**
 * Benchmark: one forward (real-to-complex) and one inverse (complex-to-real)
 * NN x NN double-precision 2D FFT per iteration, using FFTW.
 *
 * The plans are created directly on the buffers used in the timed loop, so the
 * arrays passed to the new-array execute functions trivially satisfy FFTW's
 * requirement of matching alignment and in-/out-of-place layout. All buffers
 * and both plans are released before returning.
 *
 * @param state Google Benchmark state object driving the timed loop.
 */
static void BM_FFTW(benchmark::State& state) {
    const std::size_t realCount = static_cast<std::size_t>(NN) * NN;
    const std::size_t complexCount = static_cast<std::size_t>(NPIXFFT);

    // fftw_malloc returns storage suitably aligned for FFTW's SIMD kernels.
    double *image = static_cast<double*>(fftw_malloc(sizeof(double) * realCount));
    double *recoveredImage = static_cast<double*>(fftw_malloc(sizeof(double) * realCount));
    fftw_complex *imageFT = static_cast<fftw_complex*>(fftw_malloc(sizeof(fftw_complex) * complexCount));

    PlanForward = nullptr;
    PlanInverse = nullptr;
    if (image != nullptr && recoveredImage != nullptr && imageFT != nullptr) {
        // FFTW_MEASURE overwrites the arrays while planning, so the plans are
        // created before the input image is filled in.
        PlanForward = fftw_plan_dft_r2c_2d(NN, NN, image, imageFT, FFTW_MEASURE);
        PlanInverse = fftw_plan_dft_c2r_2d(NN, NN, imageFT, recoveredImage, FFTW_MEASURE);
    }

    if (PlanForward != nullptr && PlanInverse != nullptr) {
        // Evaluate the polynomial in double precision: in 32-bit unsigned
        // arithmetic i*i + 2*i + 1 wraps around to 0 at i == 65535.
        for (std::size_t i = 0; i < realCount; ++i) {
            const double x = static_cast<double>(i);
            image[i] = x * x + x * 2.0 + 1.0;
        }
        for (auto _ : state) {
            // This code gets timed
            fftw_execute_dft_r2c(PlanForward, image, imageFT);
            // The multi-dimensional c2r transform destroys imageFT; the next
            // iteration's forward transform regenerates it.
            fftw_execute_dft_c2r(PlanInverse, imageFT, recoveredImage);
            benchmark::DoNotOptimize(recoveredImage);
            benchmark::ClobberMemory();
        }
    } else {
        state.SkipWithError("FFTW buffer allocation or plan creation failed");
    }

    // Release plans and buffers; reset the globals so no dangling plan remains.
    if (PlanForward != nullptr) {
        fftw_destroy_plan(PlanForward);
        PlanForward = nullptr;
    }
    if (PlanInverse != nullptr) {
        fftw_destroy_plan(PlanInverse);
        PlanInverse = nullptr;
    }
    if (imageFT != nullptr) {
        fftw_free(imageFT);
    }
    if (recoveredImage != nullptr) {
        fftw_free(recoveredImage);
    }
    if (image != nullptr) {
        fftw_free(image);
    }
}
/**
 * Benchmark: one forward and one backward NN x NN double-precision real 2D FFT
 * per iteration, using the Intel MKL DFTI interface.
 *
 * The descriptor is obtained from getIntelFFTWPlans(); if that fails, or a
 * buffer cannot be allocated, the benchmark is skipped with an error instead
 * of computing with an unusable descriptor. The buffers and the descriptor
 * are released before returning.
 *
 * @param state Google Benchmark state object driving the timed loop.
 */
static void BM_IntelFFT(benchmark::State& state) {
    DFTI_DESCRIPTOR_HANDLE descHandle = nullptr;
    if (getIntelFFTWPlans(&descHandle) != 0) {
        state.SkipWithError("Intel MKL DFTI descriptor setup failed");
        return;
    }

    const std::size_t realCount = static_cast<std::size_t>(NN) * NN;
    const std::size_t complexCount = static_cast<std::size_t>(NPIXFFT);

    // All buffers are 64-byte aligned via mkl_malloc.
    double *image = static_cast<double*>(mkl_malloc(sizeof(double) * realCount, 64));
    double *recoveredImage = static_cast<double*>(mkl_malloc(sizeof(double) * realCount, 64));
    fftw_complex *imageFT = static_cast<fftw_complex*>(
        mkl_malloc(complexCount * sizeof(fftw_complex), 64));

    if (image != nullptr && recoveredImage != nullptr && imageFT != nullptr) {
        // Evaluate the polynomial in double precision: in 32-bit unsigned
        // arithmetic i*i + 2*i + 1 wraps around to 0 at i == 65535.
        for (std::size_t i = 0; i < realCount; ++i) {
            const double x = static_cast<double>(i);
            image[i] = x * x + x * 2.0 + 1.0;
        }
        for (auto _ : state) {
            // This code gets timed; status codes are not inspected here so
            // that only the transforms themselves are measured.
            DftiComputeForward(descHandle, image, imageFT);
            DftiComputeBackward(descHandle, imageFT, recoveredImage);
            benchmark::DoNotOptimize(recoveredImage);
            benchmark::ClobberMemory();
        }
    } else {
        state.SkipWithError("Intel MKL buffer allocation failed");
    }

    // Release buffers and the committed descriptor.
    if (imageFT != nullptr) {
        mkl_free(imageFT);
    }
    if (recoveredImage != nullptr) {
        mkl_free(recoveredImage);
    }
    if (image != nullptr) {
        mkl_free(image);
    }
    DftiFreeDescriptor(&descHandle);
}
// Register both benchmark functions with Google Benchmark; the MKL benchmark
// is registered first, then the FFTW benchmark.
BENCHMARK(BM_IntelFFT);
BENCHMARK(BM_FFTW);
// Expand to the program's main(), which runs the registered benchmarks.
BENCHMARK_MAIN();
/**
 * Creates, configures and commits an MKL DFTI descriptor for an NN x NN
 * double-precision real 2D transform (out-of-place, complex-complex
 * conjugate-even storage in CCE format, thread limit 1).
 *
 * @param descHandle Receives the descriptor handle. On success the caller
 *                   owns it and must release it with DftiFreeDescriptor().
 *                   If a step after creation fails, the descriptor is freed
 *                   here and *descHandle is set to null.
 * @return 0 on success; -1 .. -9 identify the step that failed (creation,
 *         placement, thread limit, conjugate-even storage, packed format,
 *         input strides, output strides, packed-format query, commit).
 *         A message with the DFTI status code is printed to std::cout.
 */
int getIntelFFTWPlans(DFTI_DESCRIPTOR_HANDLE *descHandle){
    MKL_LONG lengths[2] = {NN, NN};
    MKL_LONG status = DftiCreateDescriptor(descHandle, DFTI_DOUBLE, DFTI_REAL, 2, lengths);
    if (status != 0) {
        // No descriptor exists yet, so there is nothing to release.
        std::cout << "DftiCreateDescriptor failed : " << status << std::endl;
        return -1;
    }

    // Reports a failed step, releases the descriptor so it is not leaked, and
    // yields the step's return code.
    const auto fail = [descHandle](const char *step, MKL_LONG code, int result) {
        std::cout << step << " failed : " << code << std::endl;
        DftiFreeDescriptor(descHandle);
        *descHandle = nullptr;
        return result;
    };

    status = DftiSetValue(*descHandle, DFTI_PLACEMENT, DFTI_NOT_INPLACE);
    if (status != 0) {
        return fail("DftiSetValue DFTI_PLACEMENT", status, -2);
    }

    // DftiSetValue is variadic: pass the limit as MKL_LONG explicitly rather
    // than as a plain int literal.
    status = DftiSetValue(*descHandle, DFTI_THREAD_LIMIT, static_cast<MKL_LONG>(1));
    if (status != 0) {
        return fail("DftiSetValue DFTI_THREAD_LIMIT", status, -3);
    }

    status = DftiSetValue(*descHandle, DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX);
    if (status != 0) {
        return fail("DftiSetValue DFTI_CONJUGATE_EVEN_STORAGE", status, -4);
    }

    status = DftiSetValue(*descHandle, DFTI_PACKED_FORMAT, DFTI_CCE_FORMAT);
    if (status != 0) {
        return fail("DftiSetValue DFTI_PACKED_FORMAT", status, -5);
    }

    // strides[0] is the offset of the first element; strides[1] and strides[2]
    // are the distances between consecutive elements along the first and
    // second dimension. The same triple is applied to input and output.
    // NOTE(review): stride 1 on the first dimension is not the usual row-major
    // layout ({0, NN, 1}); it is kept because BM_IntelFFT uses this single
    // descriptor for both directions, which needs matching input and output
    // strides. Confirm the intended layout; separate forward/backward
    // descriptors would be needed for true row-major strides.
    MKL_LONG strides[3] = {0, 1, NN};
    status = DftiSetValue(*descHandle, DFTI_INPUT_STRIDES, strides);
    if (status != 0) {
        return fail("DftiSetValue DFTI_INPUT_STRIDES", status, -6);
    }
    status = DftiSetValue(*descHandle, DFTI_OUTPUT_STRIDES, strides);
    if (status != 0) {
        return fail("DftiSetValue DFTI_OUTPUT_STRIDES", status, -7);
    }

    // Read the packed format back; only the status of the query is used.
    MKL_LONG format = 0;
    status = DftiGetValue(*descHandle, DFTI_PACKED_FORMAT, &format);
    if (status != 0) {
        return fail("DftiGetValue DFTI_PACKED_FORMAT", status, -8);
    }
    // std::cout << "DftiGetValue DFTI_PACKED_FORMAT : " << format << std::endl;

    status = DftiCommitDescriptor(*descHandle);
    if (status != 0) {
        return fail("DftiCommitDescriptor", status, -9);
    }
    return 0;
}
Link Copied
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi Harsh,
Thanks for posting in Intel communities.
We have tried running your code with the given cmakelists file after installing Google Benchmark and fftw3, but we are encountering errors (Please see the log file attached).
Could you please let us know what we are missing here?
Thanks and Regards,
Praneeth Achanta
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi Harsh,
Thanks for sharing the information. We were able to observe a similar issue at our end as well. We are looking into your issue internally and will get back to you soon with an update.
Thanks and Regards,
Praneeth Achanta
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
I am looking forward to your response.
Regards,
Harsh
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
After updating the strides as in the following example:
// Input Strides
strides[0] = 0;
strides[1] = 1;
strides[2] = NN;
//Output Strides
strides[0] = 0;
strides[1] = 1;
strides[2] = 1 + (NN / 2);
The benchmark improved slightly: the Intel MKL FFT now takes 0.6 ms, while FFTW, as before, takes 0.34 ms.
FFTW still beats Intel MKL by a factor of about 2.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi Harsh,
Thanks for sharing the information. As informed earlier, we are looking into your issue internally and will get back to you soon with an update.
Thanks and Regards,
Praneeth Achanta
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Along with the solution, I would also like to request a basic example program that performs a forward and a backward FFT of a 2D image stored in row-major order, similar to the one I have given.
The major issue with Intel MKL, I believe, is that there is a lot of inertia in getting started. Although there is plenty of information and documentation, few examples are provided for common use cases; instead, four-line snippets are present everywhere. The examples in the examples folder are also too convoluted to understand for someone who is not very familiar with programming. In FFTW, for example, we just create a plan and call an execute function, with no need to set strides or any other useless options; it is straightforward.
Sorry if my comment is a bit rude, but I think an API that is easy to grasp always charms programmers.
Thank you.
Regards,
Harsh
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi Harsh,
We found the example "basic_dp_complex_dft_2d.c" in the Intel MKL examples folder "C:\Program Files (x86)\Intel\oneAPI\mkl\2023.1.0\examples\examples_core_c.zip\c\dft\source". Could you please let us know if this example meets your requirement?
>>The examples given in the examples folder also are too convoluted to understand who is not so familiar with programming
Thank you for the input; we have passed on your feedback regarding sample quality to the dev team.
Thanks and Regards,
Praneeth Achanta
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi Harsh,
Thanks for helping us improve our products! We’ve submitted the feature request to the dev team; they will consider it based on multiple factors including, but not limited to, the priority and criticality of the feature. Once it is included in an upcoming release, it will be documented in the release notes.
Thanks and Regards,
Praneeth Achanta

- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page