Recently, when I tried to use MKL BLAS together with TBB, I found a conflict between the thread-local storage (TLS) provided by TBB and the MKL BLAS APIs. The details are as follows. I call MKL CBLAS routines inside a tbb::parallel_for to do some computation, and I use TLS (tbb::enumerable_thread_specific) within the same parallel_for to store intermediate results. MKL is built with TBB threading. The final result differs substantially from the result of a serial implementation (no parallel_for) of the same algorithm. However, if I replace the TLS with a std::vector that is local to each parallel_for task, the result matches the serial implementation. Likewise, if I use MKL without threading but keep the parallel_for, the result also matches the serial implementation.
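As a side note on switching between the two MKL configurations: when MKL is used through its single dynamic library (mkl_rt), the threading layer can also be selected at run time instead of at link time. A minimal sketch, assuming mkl_rt is linked (illustrative only; in my tests the threading layer was fixed at build time):

#include <mkl.h>
#include <cstdio>

// Illustrative only, assuming linkage against the MKL single dynamic library (mkl_rt):
// pick which internal threading MKL uses before the first BLAS call is made.
int main() {
    mkl_set_threading_layer(MKL_THREADING_TBB);            // TBB-threaded MKL
    // mkl_set_threading_layer(MKL_THREADING_SEQUENTIAL);  // or sequential MKL for comparison
    printf("MKL max threads: %d\n", mkl_get_max_threads());
    return 0;
}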
The code is attached below. I have tested it with g++ 5.4 and g++ 7.1 on Ubuntu 16.04. When MKL is built with TBB threading, the output is as follows:
L2 norm of difference:
cache aligned allocator vs No TLS: 9.3337e+36
std allocator vs No TLS: 7.89346e+36
serial vs No TLS: 0
Any ideas?
#include <mkl.h>
#include <tbb/tbb.h>
#include <cstring>
#include <cstdlib>
#include <iostream>
#include <fstream>
#include <vector>

using namespace std;

const int matrixN = 60;
const int num = 10000;

// given an array of matrices A and an array of matrices B,
// for each Ai and Bi, compute Di = Ai^T Bi Ai

// TLS version
template<typename Allocator>
void testProcedural(const std::vector<double> &A, const std::vector<double> &B,
                    std::vector<double> &D) {
    tbb::enumerable_thread_specific<std::vector<double, Allocator>> tls(
        std::vector<double, Allocator>(matrixN * matrixN * 2));
    tbb::parallel_for(tbb::blocked_range<int>(0, num),
                      [&](const tbb::blocked_range<int> &rng) {
        std::vector<double, Allocator> &C = tls.local();
        double *I = C.data();
        double *J = I + matrixN * matrixN;
        for (int i = rng.begin(); i != rng.end(); i++) {
            // I = Bi * Ai
            cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                        matrixN, matrixN, matrixN, 1,
                        B.data() + i * matrixN * matrixN, matrixN,
                        A.data() + i * matrixN * matrixN, matrixN,
                        0, I, matrixN);
            // J = I * Ai
            cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                        matrixN, matrixN, matrixN, 1,
                        I, matrixN,
                        A.data() + i * matrixN * matrixN, matrixN,
                        0, J, matrixN);
            // Di = Ai^T * J
            cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
                        matrixN, matrixN, matrixN, 1,
                        A.data() + i * matrixN * matrixN, matrixN,
                        J, matrixN,
                        0, D.data() + i * matrixN * matrixN, matrixN);
        }
    });
}

// no-TLS version: a scratch buffer local to each task body
void testProceduralNoTLS(const std::vector<double> &A, const std::vector<double> &B,
                         std::vector<double> &D) {
    tbb::parallel_for(tbb::blocked_range<int>(0, num),
                      [&](const tbb::blocked_range<int> &rng) {
        std::vector<double> C(matrixN * matrixN * 2);
        double *I = C.data();
        double *J = I + matrixN * matrixN;
        for (int i = rng.begin(); i != rng.end(); i++) {
            // I = Bi * Ai
            cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                        matrixN, matrixN, matrixN, 1,
                        B.data() + i * matrixN * matrixN, matrixN,
                        A.data() + i * matrixN * matrixN, matrixN,
                        0, I, matrixN);
            // J = I * Ai
            cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                        matrixN, matrixN, matrixN, 1,
                        I, matrixN,
                        A.data() + i * matrixN * matrixN, matrixN,
                        0, J, matrixN);
            // Di = Ai^T * J
            cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
                        matrixN, matrixN, matrixN, 1,
                        A.data() + i * matrixN * matrixN, matrixN,
                        J, matrixN,
                        0, D.data() + i * matrixN * matrixN, matrixN);
        }
    });
}

// serial version
void testProceduralSerial(const std::vector<double> &A, const std::vector<double> &B,
                          std::vector<double> &D) {
    std::vector<double> C(matrixN * matrixN * 2);
    double *I = C.data();
    double *J = I + matrixN * matrixN;
    for (int i = 0; i != num; i++) {
        // I = Bi * Ai
        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                    matrixN, matrixN, matrixN, 1,
                    B.data() + i * matrixN * matrixN, matrixN,
                    A.data() + i * matrixN * matrixN, matrixN,
                    0, I, matrixN);
        // J = I * Ai
        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                    matrixN, matrixN, matrixN, 1,
                    I, matrixN,
                    A.data() + i * matrixN * matrixN, matrixN,
                    0, J, matrixN);
        // Di = Ai^T * J
        cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
                    matrixN, matrixN, matrixN, 1,
                    A.data() + i * matrixN * matrixN, matrixN,
                    J, matrixN,
                    0, D.data() + i * matrixN * matrixN, matrixN);
    }
}

int main() {
    std::vector<double> A(matrixN * matrixN * num);
    std::vector<double> B(matrixN * matrixN * num);
    std::vector<double> D1(matrixN * matrixN * num);
    std::vector<double> D2(matrixN * matrixN * num);
    std::vector<double> D3(matrixN * matrixN * num);
    std::vector<double> D4(matrixN * matrixN * num);

    // fill A and B with random data
    for (int i = 0; i < num; i++) {
        double *data = A.data() + i * matrixN * matrixN;
        for (int j = 0; j < matrixN; j++) {
            for (int k = 0; k < matrixN; k++) {
                data[j * matrixN + k] = rand();
            }
        }
    }
    for (int i = 0; i < num; i++) {
        double *data = B.data() + i * matrixN * matrixN;
        for (int j = 0; j < matrixN; j++) {
            for (int k = 0; k < matrixN; k++) {
                data[j * matrixN + k] = rand() / 1000000.0;
            }
        }
    }

    testProcedural<tbb::cache_aligned_allocator<double>>(A, B, D1);
    testProcedural<std::allocator<double>>(A, B, D2);
    testProceduralNoTLS(A, B, D3);
    testProceduralSerial(A, B, D4);

    // report the L2 norm of each result's difference from the no-TLS result
    cblas_daxpy((int)D1.size(), -1, D3.data(), 1, D1.data(), 1);
    double nrm2 = cblas_dnrm2((int)D1.size(), D1.data(), 1);
    cout << "L2 norm of difference:\n";
    cout << "cache aligned allocator vs No TLS: " << nrm2 << endl;

    cblas_daxpy((int)D2.size(), -1, D3.data(), 1, D2.data(), 1);
    nrm2 = cblas_dnrm2((int)D2.size(), D2.data(), 1);
    cout << "std allocator vs No TLS: " << nrm2 << endl;

    cblas_daxpy((int)D4.size(), -1, D3.data(), 1, D4.data(), 1);
    nrm2 = cblas_dnrm2((int)D4.size(), D4.data(), 1);
    cout << "serial vs No TLS: " << nrm2 << endl;

    return 0;
}
Hi Bohan,
The root cause of the issue is that the Intel TBB scheduler is allowed to pick up other tasks at synchronization points, i.e. whenever a blocking API such as tbb::parallel_for is called (which happens inside TBB-threaded MKL). As a result, another iteration of the outer parallel loop can start on the same thread before the previous iteration has completed, so the memory obtained from the TLS is used by two iterations simultaneously. This problem is discussed in the Task Isolation section of the Intel TBB documentation.
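A minimal sketch of a possible workaround, based on your testProcedural (untested, and assuming a TBB version that provides tbb::this_task_arena::isolate; the three cblas_dgemm calls are elided for brevity):

// Sketch only: wrap the work that uses the thread-local buffer in
// this_task_arena::isolate, so a thread that blocks inside MKL's nested TBB
// parallelism cannot pick up another chunk of the outer loop and reuse the
// same TLS buffer.
template<typename Allocator>
void testProceduralIsolated(const std::vector<double> &A, const std::vector<double> &B,
                            std::vector<double> &D) {
    tbb::enumerable_thread_specific<std::vector<double, Allocator>> tls(
        std::vector<double, Allocator>(matrixN * matrixN * 2));
    tbb::parallel_for(tbb::blocked_range<int>(0, num),
                      [&](const tbb::blocked_range<int> &rng) {
        tbb::this_task_arena::isolate([&] {
            std::vector<double, Allocator> &C = tls.local();
            double *I = C.data();
            double *J = I + matrixN * matrixN;
            for (int i = rng.begin(); i != rng.end(); i++) {
                // ... the same three cblas_dgemm calls as in testProcedural ...
            }
        });
    });
}

The isolation guarantees that while a thread waits inside MKL's nested parallelism, it only executes tasks spawned within the isolated region, so it cannot start another outer-loop iteration that would reuse the same thread-local buffer.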
In addition, you may find these similar forum topics interesting:
- Very disappointing behavior (I would say bug). See example
- Invoking parallel_for while holding a lock
- Patterns of when and where to use this_task_arena::isolate
Regards,
Alex