Community
cancel
Showing results for 
Search instead for 
Did you mean: 
Highlighted
21 Views

Poor MKL Dfti complex to complex performance

Hello,

I'm new to MIC programming and trying to get a grip on how to do things with the beast. I stumbled across very bad FFT performance (using a matrix size often used at our institution) for dfti complex to complex transforms. In the following, no OMP, KMP, MKL variables are set, except when stated. Setting the number of threads or specifying the placement does not change much for this comparison: The mic is much slower than the host!

Any hints how to improve the situation?

Sincerely,

HC

PS: I followed some FFT guide for mic from Intel giving this weird padding advice (without stating the reason, but I suspect it has to do with the mic cache properties)

[l_stadler_h@merlinx01 fft-test]$ export MIC_ENV_PREFIX=PHI
[l_stadler_h@merlinx01 fft-test]$ ./ft-off 800 10000 s     <<< compiled with -DUSE_OFFLOAD (native version is not much different)
single precision
initializing ...
dimensions: 800x800
adjusted dimension: 808
offload ...
Create descriptor
Setup descriptor
fft loop: 10000
calculate error ...
cleanup ...
time: 39.6966s / error: 0.15946

[l_stadler_h@merlinx01 fft-test]$ ./ft 800 10000 s              <<<< compiled for the host
single precision
initializing ...
dimensions: 800x800
adjusted dimension: 808
Create descriptor
Setup descriptor
fft loop: 10000
calculate error ...
cleanup ...
time: 9.55195s / error: 0.180659

[l_stadler_h@merlinx01 fft-test]$ cat fft-test.cc
// icc -finline-functions -fno-exceptions -fno-alias -mkl -qopenmp -Ofast (-mmic|-xHost) [-DUSE_OFFLOAD -wd2568]
#include <algorithm>
#include <cmath>
#include <complex>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <sstream>
#include "omp.h"
#include "mkl.h"

using namespace std;

namespace {
  const unsigned int alignment=64;

  template <typename fft_t>
  struct fft_type { // provide dfti type id and rng for fft type
  };

  template <>
  struct fft_type<float> {
    const static DFTI_CONFIG_VALUE id=DFTI_SINGLE;
    inline static int rng (const MKL_INT t, VSLStreamStatePtr sp, const MKL_INT n, float p[], const float a, const float b) { return vsRngUniform(t, sp, n, p, a, b); }
  };

  template <>
  struct fft_type<double> {
    const static DFTI_CONFIG_VALUE id=DFTI_DOUBLE;
    inline static int rng (const MKL_INT t, VSLStreamStatePtr sp, const MKL_INT n, double p[], const double a, const double b) { return vdRngUniform(t, sp, n, p, a, b); }
  };
 
  template <typename fft_t>
  int fft_test (unsigned int dim, unsigned int count)
  {
    unsigned int i, j;
    MKL_LONG status;

    cout << "initializing ..." << endl;
    cout << "dimensions: " << dim << 'x' << dim << '\n';

    // (adim * sizeof(fft_t)) divisible by alignment, but not by (2 * alignment)
    const unsigned int adim = (((dim * sizeof(complex<fft_t>) + alignment-1) / alignment) | 1) * alignment / sizeof(complex<fft_t>);
    cout << "adjusted dimension: " << adim << '\n';

    complex<fft_t> *a = (complex<fft_t> *)mkl_malloc(dim*adim*sizeof(complex<fft_t>), alignment);
    complex<fft_t> * const b = (complex<fft_t> *)mkl_malloc(dim*adim*sizeof(complex<fft_t>), alignment);
    __assume_aligned(a, alignment);
    __assume_aligned(b, alignment);

    {
      VSLStreamStatePtr rng_stream;
      status = vslNewStream(&rng_stream, VSL_BRNG_MCG31, 1);
      if (status != VSL_STATUS_OK) {
        cerr << "rng stream creation error!\n";
        return EXIT_FAILURE;
      }

      status = fft_type<fft_t>::rng(VSL_RNG_METHOD_UNIFORM_STD, rng_stream, 2*dim*adim, (fft_t *)a, -1., 1.);
      if (status != VSL_STATUS_OK) {
        cerr << "rng error!\n";
        return EXIT_FAILURE;
      }

      vslDeleteStream(&rng_stream);
    }
    copy(a, a+dim*adim, b);

    double time;
    const unsigned int buf_len = 1024;
    char buf[buf_len];
#ifdef USE_OFFLOAD
    cout << "offload ...\n";
#pragma offload target(mic:-1) in(dim, adim) out(time, buf) inout(a: length(dim*adim))
#endif
    {
      int nc = 0;
      char *msg = buf, *end = buf+buf_len;
      *msg = 0;
      
      msg += snprintf(msg, end-msg, "Create descriptor\n");
      DFTI_DESCRIPTOR_HANDLE fft_handle;
      {
        MKL_LONG fft_dim=2, fft_length[2]={dim, dim};
        status = DftiCreateDescriptor(&fft_handle, fft_type<fft_t>::id, DFTI_COMPLEX, fft_dim, fft_length);
      }
      if (status != DFTI_NO_ERROR) {
        msg += snprintf(msg, end-msg, "dfti create error!\n");
        goto finished;
      }

      msg += snprintf(msg, end-msg, "Setup descriptor\n");
      {
        fft_t scale = 1. / sqrt(1. * dim * dim);
        status = DftiSetValue(fft_handle, DFTI_FORWARD_SCALE, scale);
        status |= DftiSetValue(fft_handle, DFTI_BACKWARD_SCALE, scale);
      }
      {
        MKL_LONG fft_stride[3]={0, adim, 1};
        status |= DftiSetValue(fft_handle, DFTI_INPUT_STRIDES, fft_stride);
      }
      status |= DftiCommitDescriptor(fft_handle);
      if (status != DFTI_NO_ERROR) {
        msg += snprintf(msg, end-msg, "dfti setup error!\n");
        goto cleanup;
      }

      msg += snprintf(msg, end-msg, "fft loop: %u\n", count);
      time = omp_get_wtime();
      for (i=0; i<count; i++) {
        status = DftiComputeForward(fft_handle, a);
        status |= DftiComputeBackward(fft_handle, a);
        if (status != DFTI_NO_ERROR) {
          msg += snprintf(msg, end-msg, "dfti compute error!\n");
          goto cleanup;
        }
      }
      time = omp_get_wtime() - time;

    cleanup:
      status = DftiFreeDescriptor(&fft_handle);
    finished:
      ;
    } // end pragma offload

    cout << buf;
    if (status != DFTI_NO_ERROR) {
      cout << "error in offload region!\n";
      return EXIT_FAILURE;
    }

    cout << "calculate error ..." << endl;
    double error = .0;
#pragma omp parallel for collapse(2) reduction(+: error)
    for (i=0; i<dim; i++) {
      for (j=0; j<dim; j++) {
        error += norm(b[i*adim+j] - a[i*adim+j]);
      }
    }

    cout << "cleanup ..." << endl;
    mkl_free(b);
    mkl_free(a);

    cout << "time: " << time << "s / error: " << error << endl;
    return EXIT_SUCCESS;
  }
} // end namespace

int main (int argc, char *argv[])
{
  if (argc != 4) {
    cerr << "argument error, use: " << argv[0] << " N I d|s\n"
         << "      N  matrix dimesnion NxN\n"
         << "      I  number of iterations\n"
         << "      s  use single precision\n"
         << "      d  use double precision\n";
    return EXIT_FAILURE;
  }

  unsigned int n_iterations, matrix_dim;
  {
    istringstream iss(argv[1]);
    if (! (iss >> matrix_dim)) {
      cerr << "arg1 error: invalid matrix dimension\n";
      return EXIT_FAILURE;
    }
  }
  {
    istringstream iss(argv[2]);
    if (! (iss >> n_iterations)) {
      cerr << "arg2 error: invalid iteration count\n";
      return EXIT_FAILURE;
    }
  }

  switch (argv[3][0]) {
  case 's':
    cout << "single precision\n";
    return fft_test<float>(matrix_dim, n_iterations);
  case 'd':
    cout << "double precision\n";
    return fft_test<double>(matrix_dim, n_iterations);
  default:
    cerr << "arg3 error: use s for single and d for double precision\n";
  }
  return EXIT_FAILURE;
}

- The MIC is a 7120P (there are actually 2 of them in the system)

- The CPUs are 2 x

Intel(R) Xeon(R) CPU E5-2660 v3 @ 2.60GHz  (20 cores in total)

0 Kudos
7 Replies
Highlighted
Employee
21 Views

Hello Hans-Christian,

For the dimensions you are using for DFT with Intel(R) MKL, the default number of threads is chosen to be 32. This is because MKL_DYNAMIC is set to "true" by default, and depending on the dimensions, the number of threads is chosen to be the best suitable power of 2.

For your reference, I would like to recommend you to read about tuning Intel MKL DFT functions in the following thread. You can also refer to the following articles for getting improved performance with Intel MKL DFT functions.

https://software.intel.com/en-us/articles/different-parallelization-techniques-and-intel-mkl-fft

https://software.intel.com/en-us/articles/intel-math-kernel-library-intel-mkl-intel-mkl-100-threadin...

If required, you can override the MKL_DYNAMIC environment variable as follows:

export MKL_DYNAMIC=false
export OMP_NUM_THREADS=64

For the dimensions chosen, the performance improves to certain extent by changing the default dynamic number of threads to 64 from 32 but again starts to drop when changed to 128 threads.

I hope this helps.

0 Kudos
Highlighted
21 Views

Hello Sunny,

thanks for the reply. I actually followed the recommendations in the cited thread about MKL DFT functions. I'm also familiar with choosing the number of threads in MKL. I even tried the code with different number of threads and thread placements. However - the fact remains, that my MIC is much slower than my Host CPUs. Even for a 1024x1024 matrix, the MIC is very slow compared to the normal CPUs. The situation gets slowly better when the matrix sizes increase, but I never saw speed on par with the CPUs. Especially with the matrix sizes we use at our institution (they are determined by the number of pixels of photon detectors), the MIC is a real FFT snail.

It simply looks like the MKL team at Intel has not optimized complex to complex FFT on the MIC well, or even worse: that the MIC is slow by architecture for this task. When I do not set any MKL/OMP/KMP variables I would expect that the library uses this freedom to choose the settings optimally, but as you also pointed out, this is not the case.

Good FFT performance is important to us, because we work a lot with photon detectors. Slow FFT performance simply means we will not put MICs into our future compute clusters. If anyone has data for complex to complex FFT on modern Nvidia GPUs, I would be happy to hear about it, because this would be another option to accelerate our codes.

 

0 Kudos
Highlighted
21 Views

Hi Hans-Christian S.

On MIC, Intel MKL FFTs perform best with the default scale factor.

Switching to the default scale factor improves performance more than 4x, so that MIC and Xeon show the similar performance.

If we set affinity to "scatter,granularity=fine", number of threads to 122, and MIC_USE_2MB_BUFFERS to 64K, then MIC is 1.1x faster than the host.

If we warm up the MIC OMP runtime before FFTs, then MIC is 1.25x faster than the host.

If use of custom scale factors is critical for your application, please submit a request at premier.intel.com.

Thank you for trying Intel MKL on MIC.

Evgueni.

 

0 Kudos
Highlighted
21 Views

Thanks a lot Evgueni,

after following your excellent post, I modified the code to scale the code outside of the MKL FFT transforms. Indeed the situation looks now much better for the single precision case. I haven't found a way to tame the double precision case yet, however.

Thanks again, HC

l_stadler_h@merlinx01-mic1:~/fft-test$ env | grep -E 'KMP|OMP|MIC'                                                                                                                                    
KMP_PLACE_THREADS=60
MIC_USE_2MB_BUFFERS=64K
KMP_AFFINITY=scatter,granularity=fine
OMP_NUM_THREADS=120
OMP_SCHEDULE=guided
l_stadler_h@merlinx01-mic1:~/fft-test$ ./fft-test-2-mic 800 10000 s                                                                                                                                   
single precision
initializing ...
dimensions: 800x800
adjusted dimension: 808
Create descriptor
Setup descriptor
fft loop: 10000
calculate error ...
cleanup ...
time: (9.95216, 8.59827, 1.33985)s / error: 0.195753
l_stadler_h@merlinx01-mic1:~/fft-test$ ./fft-test-2-mic 800 10000 d
double precision
initializing ...
dimensions: 800x800
adjusted dimension: 804
Create descriptor
Setup descriptor
fft loop: 10000
calculate error ...
cleanup ...
time: (40.4342, 27.8215, 12.5847)s / error: 3.36794e-18

[l_stadler_h@merlinx01 fft-test]$ printenv | grep -E 'KMP|OMP'                                                                                                                                        
KMP_AFFINITY=scatter,granularity=fine
OMP_NUM_THREADS=10
OMP_SCHEDULE=static
[l_stadler_h@merlinx01 fft-test]$ ./fft-test-2 800 10000 s                                                                                                                                            
single precision
initializing ...
dimensions: 800x800
adjusted dimension: 808
Create descriptor
Setup descriptor
fft loop: 10000
calculate error ...
cleanup ...
time: (9.4969, 8.49957, 0.995746)s / error: 0.214589
[l_stadler_h@merlinx01 fft-test]$ ./fft-test-2 800 10000 d
double precision
initializing ...
dimensions: 800x800
adjusted dimension: 804
Create descriptor
Setup descriptor
fft loop: 10000
calculate error ...
cleanup ...
time: (17.2939, 15.7528, 1.53992)s / error: 3.53584e-18

0 Kudos
Highlighted
21 Views

To make Intel offload runtime set an environment variable X on MIC, we should set MIC_X on host.

Please check Intel compiler documentation below.

Setting Environment Variables on the CPU to Modify the Coprocessor's Execution Environment

0 Kudos
Highlighted
21 Views

Hm, I don't understand your comment right now.

I was not using offload in my previous post, I compiled and ran the code natively on the mic and also natively on the host (not defining the USE_OFFLOAD macro, thus filtering out the offload pragma). Therefore, the MIC prefix should be irrelevant in this case, as far as I understand the doc.

I guess I need to read the System Software Developers Guide and the Instruction Set Architecture Reference Manual to really understand where the bottleneck is in the double precision case.

HC

 

0 Kudos
Highlighted
21 Views

I assumed that you define USE_OFFLOAD.

If the benchmark runs in the native mode, then your environment setting is OK.

MIC_USE_2MB_BUFFERS can be omitted for native runs because it is meant for offload.

0 Kudos