How to configure intel MKL FFT for best performance?

HarshM · ‎04-25-2023

Hi,

I am trying to perform 2D FFT on an Intel i5-6500 CPU@3.20 GHz.

I have written a program to benchmark FFTW against Intel MKL. Surprisingly, FFTW outperformed the Intel MKL FFT by a factor of roughly 2 to 3; that is, the FFTW time is 2 to 3 times smaller than the Intel MKL time (about 2.3 times in the run shown below).

Here are the results:

2023-04-26T11:01:31+05:30
Running FFTWCompare.exe
Run on (4 X 3192 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x4)
L1 Instruction 32 KiB (x4)
L2 Unified 256 KiB (x4)
L3 Unified 6144 KiB (x1)
------------------------------------------------------
Benchmark Time CPU Iterations
------------------------------------------------------
BM_IntelFFT 915017 ns 920348 ns 747
BM_FFTW 390155 ns 392369 ns 1792

The CMake File:

# Build script for the FFTWCompare executable, which benchmarks FFTW against
# the Intel MKL FFT using Google Benchmark.
cmake_minimum_required(VERSION 3.25)
project(Benchmarking)

# Compile the C++ sources as C++23.
set(CMAKE_CXX_STANDARD 23)

# Unconditionally select the release configuration (overrides any value given
# on the command line).
set(CMAKE_BUILD_TYPE RELEASE)

# Google Benchmark must already be installed; this provides the
# benchmark::benchmark target linked below.
find_package(benchmark REQUIRED)

# Replace (not append to) the C and C++ compiler flags. These are
# GCC/Clang-style options requesting aggressive optimization for the build
# machine's own CPU.
set(CMAKE_C_FLAGS "-Ofast -march=native")
set(CMAKE_CXX_FLAGS "-Ofast -march=native")

# Machine-specific install locations of FFTW and Intel oneMKL 2023.1.0.
# FFTW doubles as both the include directory and the library directory.
set(FFTW C:/Users/IIAP-IPC/Documents/fftw-3)
# NOTE(review): three FFTW import libraries are linked; going by FFTW's naming
# these are the long-double, single and double precision builds, while main.cpp
# as shown only calls the double-precision fftw_* functions - confirm.
set(FFTWLINK ${FFTW}/libfftw3l-3.lib ${FFTW}/libfftw3f-3.lib ${FFTW}/libfftw3-3.lib)
set(INTEL_FFTWLINK C:/PROGRA~2/Intel/oneAPI/mkl/2023.1.0/lib/intel64)
set(INTEL_FFTW_INCLUDE C:/PROGRA~2/Intel/oneAPI/mkl/2023.1.0/include)
# MKL link line: LP64 interface + core + sequential layer.
# NOTE(review): the library name suggests the non-threaded MKL layer - confirm
# this is the intended configuration for the comparison.
set(INTEL_FFTWLINKLIB ${INTEL_FFTWLINK}/mkl_intel_lp64.lib ${INTEL_FFTWLINK}/mkl_core.lib ${INTEL_FFTWLINK}/mkl_sequential.lib)

# Single benchmark executable built from main.cpp.
add_executable(FFTWCompare main.cpp)
target_include_directories(FFTWCompare PRIVATE ${INTEL_FFTW_INCLUDE} ${FFTW})
# The MKL libraries are listed before the FFTW import libraries on the link line.
target_link_libraries(FFTWCompare PRIVATE ${INTEL_FFTWLINKLIB} ${FFTWLINK} benchmark::benchmark)

The C++ Code:

#include <iostream>
#include <fstream>
#include <mkl.h>
#include <complex>
#include <chrono>
#include <fftw3.h>
#include <benchmark/benchmark.h>
// Side length, in pixels, of the square NN x NN image that is transformed.
#define NN 256
// Number of complex elements allocated for each half-spectrum buffer:
// NN * (NN / 2 + 1). The whole expansion is parenthesized so the macro behaves
// as a single value inside any larger expression (e.g. `x / NPIXFFT`).
#define NPIXFFT (NN * (1 + (NN / 2)))
using namespace std;

// Plain aggregate of two doubles: the real and imaginary parts of one
// double-precision complex value.
struct mkl_double_complex {
    double re;  // real part
    double im;  // imaginary part
};

// File-scope FFTW plans. BM_FFTW assigns the real-to-complex (forward) and
// complex-to-real (inverse) 2-D plans to these before its timed loop.
fftw_plan PlanForward;
fftw_plan PlanInverse;

// Creates, configures and commits the MKL DFTI descriptor used by BM_IntelFFT.
// Defined after BENCHMARK_MAIN() further down in this file. Returns 0 on
// success and a negative step code (-1 .. -9) on failure.
int getIntelFFTWPlans(DFTI_DESCRIPTOR_HANDLE *descHandle);

/// Benchmarks one forward (real-to-complex) plus one inverse (complex-to-real)
/// 2-D FFTW transform of an NN x NN double-precision image per iteration.
///
/// All buffers are allocated with fftw_malloc and the plans are created
/// directly on the buffers that are transformed, so the arrays seen by the
/// planner and by the timed loop are the same. Every buffer and both plans are
/// released before returning.
///
/// @param state Google Benchmark state driving the timed loop.
static void BM_FFTW(benchmark::State& state) {
    PlanForward = nullptr;
    PlanInverse = nullptr;

    double *image = (double*) fftw_malloc(sizeof(double) * NN * NN);
    double *recoveredImage = (double*) fftw_malloc(sizeof(double) * NN * NN);
    fftw_complex *imageFT = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * NPIXFFT);

    bool ready = (image != nullptr) && (recoveredImage != nullptr) && (imageFT != nullptr);
    if (ready) {
        PlanForward = fftw_plan_dft_r2c_2d(NN, NN, image, imageFT, FFTW_MEASURE);
        PlanInverse = fftw_plan_dft_c2r_2d(NN, NN, imageFT, recoveredImage, FFTW_MEASURE);
        ready = (PlanForward != nullptr) && (PlanInverse != nullptr);
    }

    if (!ready) {
        state.SkipWithError("BM_FFTW: buffer allocation or plan creation failed");
    } else {
        // Fill the input only after planning: FFTW_MEASURE overwrites the
        // arrays it plans on. The polynomial is evaluated in double because
        // the last sample, (65535 + 1)^2 = 2^32, does not fit in a 32-bit
        // unsigned int and would wrap around to 0.
        for (unsigned int i = 0; i < NN * NN; i++) {
            const double x = i;
            image[i] = x * x + x * 2 + 1;
        }

        for (auto _ : state) {
            // This code gets timed. The inverse transform may overwrite
            // imageFT, which is harmless because the forward transform
            // recomputes it at the start of every iteration.
            fftw_execute(PlanForward);
            fftw_execute(PlanInverse);
            benchmark::DoNotOptimize(recoveredImage);
            benchmark::ClobberMemory();
        }
    }

    // Release plans and buffers; reset the globals so no dangling plan remains.
    if (PlanInverse != nullptr) {
        fftw_destroy_plan(PlanInverse);
        PlanInverse = nullptr;
    }
    if (PlanForward != nullptr) {
        fftw_destroy_plan(PlanForward);
        PlanForward = nullptr;
    }
    if (imageFT != nullptr) {
        fftw_free(imageFT);
    }
    if (recoveredImage != nullptr) {
        fftw_free(recoveredImage);
    }
    if (image != nullptr) {
        fftw_free(image);
    }
}

/// Benchmarks one forward plus one backward 2-D Intel MKL DFTI transform of an
/// NN x NN double-precision image per iteration.
///
/// The descriptor is built by getIntelFFTWPlans(); if that fails, or a buffer
/// cannot be allocated, the benchmark is reported as skipped instead of timing
/// calls on an unusable descriptor. The descriptor and all buffers are released
/// before returning.
///
/// @param state Google Benchmark state driving the timed loop.
static void BM_IntelFFT(benchmark::State& state) {

    DFTI_DESCRIPTOR_HANDLE descHandle = nullptr;
    const int planStatus = getIntelFFTWPlans(&descHandle);

    double *image = (double*)malloc(sizeof(double) * NN * NN);
    double *recoveredImage = (double*)malloc(sizeof(double) * NN * NN);
    fftw_complex *imageFT = (fftw_complex*) mkl_malloc(
        NPIXFFT * sizeof(fftw_complex), 64);

    if (planStatus != 0 || image == nullptr || recoveredImage == nullptr || imageFT == nullptr) {
        state.SkipWithError("BM_IntelFFT: descriptor setup or buffer allocation failed");
    } else {
        // Evaluated in double: the last sample, (65535 + 1)^2 = 2^32, does not
        // fit in a 32-bit unsigned int and would wrap around to 0.
        for (unsigned int i = 0; i < NN * NN; i++) {
            const double x = i;
            image[i] = x * x + x * 2 + 1;
        }

        for (auto _ : state) {
            // This code gets timed
            DftiComputeForward(descHandle, image, imageFT);
            DftiComputeBackward(descHandle, imageFT, recoveredImage);
            benchmark::DoNotOptimize(recoveredImage);
            benchmark::ClobberMemory();
        }
    }

    // The handle stays null only if descriptor creation itself never
    // produced one; otherwise it must be released even after a failed setup.
    if (descHandle != nullptr) {
        DftiFreeDescriptor(&descHandle);
    }
    if (imageFT != nullptr) {
        mkl_free(imageFT);
    }
    free(recoveredImage);
    free(image);
}
// Register both benchmark functions with Google Benchmark
// (the Intel MKL benchmark is registered first, then the FFTW one).
BENCHMARK(BM_IntelFFT);
BENCHMARK(BM_FFTW);
// Provide the program entry point that runs the registered benchmarks.
BENCHMARK_MAIN();

/**
 * Builds and commits the MKL DFTI descriptor for a 2-D, double-precision,
 * real-domain transform of size NN x NN.
 *
 * Steps, in order: create the descriptor, select out-of-place placement, set
 * the thread limit to 1, select complex-complex conjugate-even storage, select
 * the CCE packed format, set input and output strides to {0, 1, NN}, read the
 * packed format back, and commit.
 *
 * @param descHandle Receives the descriptor handle.
 * @return 0 when every step succeeds; otherwise -1 .. -9 identifying the first
 *         step that failed, after printing that step's DFTI status to stdout.
 */
int getIntelFFTWPlans(DFTI_DESCRIPTOR_HANDLE *descHandle) {
    // Reports whether a DFTI call failed; on failure prints the given message
    // followed by the status code.
    const auto failed = [](MKL_LONG rc, const char *message) -> bool {
        if (rc == 0) {
            return false;
        }
        std::cout << message << rc << std::endl;
        return true;
    };

    MKL_LONG dims[2] = {NN, NN};
    MKL_LONG rc = DftiCreateDescriptor(descHandle, DFTI_DOUBLE, DFTI_REAL, 2, dims);
    if (failed(rc, "DftiCreateDescriptor failed : ")) {
        return -1;
    }

    rc = DftiSetValue(*descHandle, DFTI_PLACEMENT, DFTI_NOT_INPLACE);
    if (failed(rc, "DftiSetValue DFTI_PLACEMENT failed : ")) {
        return -2;
    }

    rc = DftiSetValue(*descHandle, DFTI_THREAD_LIMIT, 1);
    if (failed(rc, "DftiSetValue DFTI_THREAD_LIMIT failed : ")) {
        return -3;
    }

    rc = DftiSetValue(*descHandle, DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX);
    if (failed(rc, "DftiSetValue DFTI_CONJUGATE_EVEN_STORAGE failed : ")) {
        return -4;
    }

    rc = DftiSetValue(*descHandle, DFTI_PACKED_FORMAT, DFTI_CCE_FORMAT);
    if (failed(rc, "DftiSetValue DFTI_PACKED_FORMAT failed : ")) {
        return -5;
    }

    // The same stride triple is applied to both the input and the output.
    // NOTE(review): stride 1 on the first index and NN on the second means
    // consecutive first-index elements are adjacent in memory - confirm this
    // matches the image layout the callers intend.
    MKL_LONG layout[3] = {0, 1, NN};

    rc = DftiSetValue(*descHandle, DFTI_INPUT_STRIDES, layout);
    if (failed(rc, "DftiSetValue DFTI_INPUT_STRIDES failed : ")) {
        return -6;
    }

    rc = DftiSetValue(*descHandle, DFTI_OUTPUT_STRIDES, layout);
    if (failed(rc, "DftiSetValue DFTI_OUTPUT_STRIDES failed : ")) {
        return -7;
    }

    // The value read back is not used further; only the call's status matters.
    MKL_LONG packedFormat;
    rc = DftiGetValue(*descHandle, DFTI_PACKED_FORMAT, &packedFormat);
    if (failed(rc, "DftiGetValue DFTI_PACKED_FORMAT failed : ")) {
        return -8;
    }

    rc = DftiCommitDescriptor(*descHandle);
    if (failed(rc, "DftiCommitDescriptor failed : ")) {
        return -9;
    }

    return static_cast<int>(rc);
}

PraneethA_Intel · ‎05-02-2023

Hi Harsh,

Thanks for posting in Intel communities.

We have tried running your code with the given cmakelists file after installing Google Benchmark and fftw3, but we are encountering errors (Please see the log file attached).

Could you please let us know what we are missing here?

Thanks and Regards,

Praneeth Achanta

HarshM · ‎05-02-2023

The error says that if you built the RELEASE version of Google Benchmark, then CMAKE_BUILD_TYPE must also be set to RELEASE. If you want to compile in Debug mode, build Google Benchmark in debug mode as well: just replace "release" with "debug" in the CMake commands when building it.

PraneethA_Intel · ‎05-09-2023

Hi Harsh,

Thanks for sharing the information. We were able to observe a similar issue at our end as well. We are looking into your issue internally and will get back to you soon with an update.

Thanks and Regards,

Praneeth Achanta

HarshM · ‎05-09-2023

I am glad to know you are able to run this example.

I am looking forward to your response.

Regards,
Harsh

HarshM · ‎05-17-2023

After updating the strides as in the following example:

https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Unable-to-perform-2D-FFT-with-NX-NY-128/m-p/1478771/emcs_t/S2h8ZW1haWx8dG9waWNfc3Vic2NyaXB0aW9ufExHUERWVEs1OFZPREY3fDE0Nzg3NzF8U1VCU0NSSVBUSU9OU3xoSw#M7988

// Input strides: element 0 is the offset, elements 1 and 2 are the strides of
// the first and second index.
// NOTE(review): the same `strides` array is reused for the output below, so
// these values must be passed to DftiSetValue before being overwritten; the
// snippet does not show those calls.
strides[0] = 0;
strides[1] = 1;
strides[2] = NN;

// Output strides: same convention, but the second index now advances by
// 1 + NN / 2 elements instead of NN.
strides[0] = 0;
strides[1] = 1;
strides[2] = 1 + (NN / 2);

The benchmark improved slightly: Intel MKL FFT now takes 0.6 ms, while FFTW, as before, takes 0.34 ms.

FFTW is still almost 2 times faster than Intel MKL.

PraneethA_Intel · ‎05-19-2023

Hi Harsh,

Thanks for sharing the information. As informed earlier, we are looking into your issue internally and will get back to you soon with an update.

Thanks and Regards,

Praneeth Achanta

HarshM · ‎05-19-2023

Also, along with the solution, I would like to request a basic example program that performs a forward and backward FFT of a 2D image stored in row-major order, similar to the one I have given.

The major issue with Intel MKL, I believe, is that there is a lot of inertia in getting started. Although there is plenty of information and documentation, few examples are provided for common use cases; instead, four-line snippets are present everywhere. The examples given in the examples folder are also too convoluted to understand for someone who is not very familiar with programming. For example, in FFTW we just create a plan and execute a function; there is no business of setting strides or any other useless options. It is straightforward.

Sorry my comment is a bit rude, but I think an API which is easy to grasp always charms programmers.

Thank you.

Regards,

Harsh

PraneethA_Intel · ‎05-26-2023

Hi Harsh,

We found the example "basic_dp_complex_dft_2d.c" in the Intel MKL examples folder "C:\Program Files (x86)\Intel\oneAPI\mkl\2023.1.0\examples\examples_core_c.zip\c\dft\source". Could you please let us know if this example meets your requirement?

>>The examples given in the examples folder also are too convoluted to understand who is not so familiar with programming

Thank you for the input; we have passed on your feedback regarding sample quality to the dev team.

Thanks and Regards,

Praneeth Achanta

PraneethA_Intel · ‎06-28-2023

Hi Harsh,

Thanks for helping us improve our products! We’ve submitted the feature request to the dev team, they will consider it based on multiple factors including, but not limited to priority and criticality of the feature. Once it is included in an upcoming release, it would be documented in the release notes.

Thanks and Regards,

Praneeth Achanta