- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
int _calc_fft_batch(int num_points, int batchsize, double* T, double* F) {
DFTI_DESCRIPTOR_HANDLE fft_handle = NULL;
MKL_LONG retval = DFTI_NO_ERROR;
const int alignment = 64;
retval = DftiCreateDescriptor(&fft_handle, DFTI_DOUBLE, DFTI_REAL, 1, (MKL_LONG)num_points);
//CheckDftiError(retval, "DftiCreateDescriptor");
retval = DftiSetValue(fft_handle, DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX);
retval = DftiSetValue(fft_handle, DFTI_NUMBER_OF_TRANSFORMS, (MKL_LONG)batchsize);
retval = DftiSetValue(fft_handle, DFTI_INPUT_DISTANCE, (MKL_LONG)num_points);
retval = DftiSetValue(fft_handle, DFTI_OUTPUT_DISTANCE, (MKL_LONG)num_points);
//retval = DftiSetValue(fft_handle, DFTI_INPUT_STRIDES, (MKL_LONG)1);
//retval = DftiSetValue(fft_handle, DFTI_OUTPUT_STRIDES, (MKL_LONG)1);
retval = DftiSetValue(fft_handle, DFTI_PLACEMENT, DFTI_NOT_INPLACE);
//CheckDftiError(retval, "DftiSetValue");
retval = DftiCommitDescriptor(fft_handle);
//CheckDftiError(retval, "DftiCommitDescriptor");
MKL_Complex16* C = (MKL_Complex16*)MKL_malloc(static_cast<size_t>(num_points * batchsize) * sizeof(MKL_Complex16), alignment);
retval = DftiComputeForward(fft_handle, T, C);
//CheckDftiError(retval, "DftiComputeForward");
retval = DftiFreeDescriptor(&fft_handle);
//CheckDftiError(retval, "DftiFreeDescriptor");
// unpack
//size_t index = 0;
//for (int i = 0; i < batchsize; i++) {
// for (int j = num_points / 2 + 1; j < num_points; j++) {
// index = static_cast<size_t>(i * num_points);
// C[index + j].real = C[index + num_points - j].real;
// C[index + j].imag = (-1) * C[index + num_points - j].imag;
// }
//}
// norm2 fft
vzAbs(num_points, C, F);
MKL_free(C);
return retval;
}
I have a double* c-array T, has shape as 1000*2048.I want to implement 1d-dft for 1000 times on T using DFTI_NUMBER_OF_TRANSFORMS instead of for loop, however only the first 2048 elements in T are computed correctly. The others are all 0.
- Tags:
- DFT FFT
Link Copied
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi,
Thanks for posting on Intel Communities.
Could you please share us a working reproducer and steps(if any) so that we could try reproducing the issue at our end.
Best Regards,
Shanmukh.SS
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Thank u for replying. I'm appreciated it.
Code blew is my function for the single 1d c-array.
int _calc_fft(int num_points, double *t, double *f) {
DFTI_DESCRIPTOR_HANDLE fft_handle = NULL;
MKL_LONG retval = DFTI_NO_ERROR;
const int alignment = 64;
retval = DftiCreateDescriptor(&fft_handle, DFTI_DOUBLE, DFTI_REAL, 1, (MKL_LONG)num_points);
//CheckDftiError(retval, "DftiCreateDescriptor");
retval = DftiSetValue(fft_handle, DFTI_PLACEMENT, DFTI_NOT_INPLACE);
retval = DftiSetValue(fft_handle, DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX);
//CheckDftiError(retval, "DftiSetValue");
retval = DftiCommitDescriptor(fft_handle);
//CheckDftiError(retval, "DftiCommitDescriptor");
MKL_Complex16* c = (MKL_Complex16*)MKL_malloc(num_points * sizeof(MKL_Complex16), alignment);
retval = DftiComputeForward(fft_handle, t, c);
//CheckDftiError(retval, "DftiComputeForward");
retval = DftiFreeDescriptor(&fft_handle);
//CheckDftiError(retval, "DftiFreeDescriptor");
// unpack
for (int i = num_points / 2 + 1; i < num_points; i++)
{
c[i].real = c[num_points - i].real;
c[i].imag = (-1) * c[num_points - i].imag;
}
// norm2 fft
vzAbs(num_points, c, f);
MKL_free(c);
return retval;
}
Here is example for how I use the function in for-loop.
h = 1000; w = 800; n = 2048;
Y21 is the input 1d double* c-array (1000*800*2048) read from the file.
Here is part of Y21 (10*2048), and the input array is correct.
In this case, n=2048 length input data is handled once we call this function.
double* F1 = (double*)MKL_malloc((h * w * n) * sizeof(double), 64);
t0 = std::chrono::high_resolution_clock::now();
iter = 0;
for (int i = 0; i < h; i++) {
for (int j = 0; j < w; j++) {
_calc_fft(n, &Y21[iter * n], &F1[iter * n]);
iter += 1;
}
}
t1 = std::chrono::high_resolution_clock::now();
dbg(std::chrono::duration_cast<std::chrono::duration<double>>(t1 - t0).count() * 1000 , "ms\n");
Here is an example of how I use DFTI_NUMBER_OF_TRANSFORMS function for the same input. In this case, w*n=800*2048 length input data is handled once we call this function.
t0 = std::chrono::high_resolution_clock::now();
for (int i = 0; i < h; i++) {
_calc_fft_batch(n, w, &Y21[i * (w * n)], &F2[i * (w * n)]);
}
t1 = std::chrono::high_resolution_clock::now();
dbg(std::chrono::duration_cast<std::chrono::duration<double>>(t1 - t0).count() * 1000 , "ms\n");
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Thank u for replying. Appreciate it.
Here is my function for one-time calculation.
int _calc_fft(int num_points, double *t, double *f) {
DFTI_DESCRIPTOR_HANDLE fft_handle = NULL;
MKL_LONG retval = DFTI_NO_ERROR;
const int alignment = 64;
retval = DftiCreateDescriptor(&fft_handle, DFTI_DOUBLE, DFTI_REAL, 1, (MKL_LONG)num_points);
//CheckDftiError(retval, "DftiCreateDescriptor");
// 参数设置
retval = DftiSetValue(fft_handle, DFTI_PLACEMENT, DFTI_NOT_INPLACE);
retval = DftiSetValue(fft_handle, DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX);
//CheckDftiError(retval, "DftiSetValue");
retval = DftiCommitDescriptor(fft_handle);
//CheckDftiError(retval, "DftiCommitDescriptor");
MKL_Complex16* c = (MKL_Complex16*)MKL_malloc(num_points * sizeof(MKL_Complex16), alignment);
retval = DftiComputeForward(fft_handle, t, c);
//CheckDftiError(retval, "DftiComputeForward");
retval = DftiFreeDescriptor(&fft_handle);
//CheckDftiError(retval, "DftiFreeDescriptor");
// unpack
for (int i = num_points / 2 + 1; i < num_points; i++)
{
c[i].real = c[num_points - i].real;
c[i].imag = (-1) * c[num_points - i].imag;
}
// norm2 fft
vzAbs(num_points, c, f);
MKL_free(c);
return retval;
}
Here is the example.
Y is 1d double*c-array with length=h*w*n, read from file, and [h, w, n] = [30, 1000, 2048].
Here is part of my input array Y (10*2048) .
double* F1 = (double*)MKL_malloc((h * w * n) * sizeof(double), 64);
t0 = std::chrono::high_resolution_clock::now();
iter = 0;
for (int i = 0; i < h; i++) {
for (int j = 0; j < w; j++) {
_calc_fft(n, &Y[iter * n], &F1[iter * n]);
iter += 1;
}
}
t1 = std::chrono::high_resolution_clock::now();
dbg(std::chrono::duration_cast<std::chrono::duration<double>>(t1 - t0).count() * 1000 , "ms\n");
Here is part of my output array F (10*2048) . U can see in for-loop it's correct.
Here is the example of using DFTI_NUMBER_OF_TRANSFORMS.
double* F2 = (double*)MKL_malloc((h * w * n) * sizeof(double), 64);
t0 = std::chrono::high_resolution_clock::now();
for (int i = 0; i < h; i++) {
_calc_fft_batch(n, w, &Y[i * (w * n)], &F2[i * (w * n)]);
}
t1 = std::chrono::high_resolution_clock::now();
dbg(std::chrono::duration_cast<std::chrono::duration<double>>(t1 - t0).count() * 1000 , "ms\n");
Here is part of my output array F (10*2048). Only the 1st 2048 output data is correct and others remain 0.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi,
Thank you for sharing the code snippets.
Could you please share us your environment details and VS project file or a working code, as with the shared snippets we are unable to compile and build the entire code.
Best Regards,
Shanmukh.SS
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi,
A gentle reminder:
Could you please share us your environment details and VS project file or a working code, as with the shared snippets we are unable to compile and build the entire code.
Best Regards,
Shanmukh.SS
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
My env:
os: WIN10
VS: 2022
MKL: 2022.0.2
CMake: 3.22
I use CMake to build the project.
Btw, I use fftw_plan_many_dft_r2c in MKL successfully, but I am still confused about why it fails when using dfti DFTI_NUMBER_OF_TRANSFORMS for multiple-arrays.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi,
Thanks for sharing the environment details.
We would request you to share with us a working reproducer or a Visual Studio project file so that we could look into your issue further.
In addition, Could you please check your configuration settings/storage schemes and the type of array of elements which are been computed as mentioned in the below link?
Best Regards,
Shanmukh.SS
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi,
A gentle reminder:
We would request you to share us a working reproducer or a Visual studio working project file, so that we could investigate the issue further.
Best Regards,
Shanmukh.SS
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi,
I am also having the OP's problem.
My descriptor is as follows:
auto status = DftiCreateDescriptor(&rangeHandle, DFTI_DOUBLE, DFTI_REAL, 1, rangePadding);
status = DftiSetValue(rangeHandle, DFTI_NUMBER_OF_TRANSFORMS, rxNum);
status = DftiSetValue(rangeHandle, DFTI_INPUT_DISTANCE, rangePadding);
status = DftiSetValue(rangeHandle, DFTI_OUTPUT_DISTANCE, rangePadding);
status = DftiSetValue(rangeHandle, DFTI_PLACEMENT, DFTI_NOT_INPLACE);
status = DftiCommitDescriptor(rangeHandle);
where rangePadding = 4096, rxNum = 8.
The input is flat array of doubles, its size is rangePadding * rxNum;
The output is flat array of std::complex (I tried with Ipp64fc, makes no difference), its size is rangePadding * rxNum.
What I am experiencing is that only half of the array is filled.
What could be the problem here?
Am I missing something about the way OUTPUT_DISTANCE works?
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
// unpack
for (int i = num_points / 2 + 1; i < num_points; i++)
{
c[i].real = c[num_points - i].real;
c[i].imag = (-1) * c[num_points - i].imag;
}
U may need this. The rest of the conjugate parts are not calculated yet for speeding up.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi,
Thanks for the information. I will test this.
For those who don't quite follow: this link will explain what's up.
UPD: I posted a solution to my (and probably this) problem in a separate thread .
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi,
@Nevermore, We assume that your issue is resolved. If you need any additional information, please post a new question as this thread will no longer be monitored by Intel.
@sudoLife, We would like to request you to share the required details in a new thread and a reproducer, so that we could help you in assisting the query.
Best Regards,
Shanmukh,SS
- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page