Recently, when I tried to use MKL BLAS together with TBB, I found a conflict between the thread-local storage (TLS) provided by TBB and the MKL BLAS APIs. The details are as follows. I call MKL CBLAS routines inside a tbb::parallel_for to do some computation, and I use TLS (tbb::enumerable_thread_specific) within the same parallel_for to store intermediate results. MKL is built with TBB threading. The final result differs substantially from the result of a serial implementation (no parallel_for) of the same algorithm. However, if I replace the TLS with a std::vector that is local to each parallel_for task, the result matches the serial implementation. Likewise, if I use MKL without threading but keep the parallel_for, the result also matches the serial implementation.
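As a side note on switching between the two MKL configurations: when MKL is used through its single dynamic library (mkl_rt), the threading layer can also be selected at run time instead of at link time. A minimal sketch, assuming mkl_rt is linked (illustrative only; in my tests the threading layer was fixed at build time):

#include <mkl.h>
#include <cstdio>

// Illustrative only, assuming linkage against the MKL single dynamic library (mkl_rt):
// pick which internal threading MKL uses before the first BLAS call is made.
int main() {
    mkl_set_threading_layer(MKL_THREADING_TBB);            // TBB-threaded MKL
    // mkl_set_threading_layer(MKL_THREADING_SEQUENTIAL);  // or sequential MKL for comparison
    printf("MKL max threads: %d\n", mkl_get_max_threads());
    return 0;
}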
The code is attached below. I have tested it with g++ 5.4 and g++ 7.1 on Ubuntu 16.04. When MKL is built with TBB threading, the output is as follows:
L2 norm of difference:
cache aligned allocator vs No TLS: 9.3337e+36
std allocator vs No TLS: 7.89346e+36
serial vs No TLS: 0
Any ideas?
#include <mkl.h>
#include <tbb/tbb.h>
#include <cstring>
#include <cstdlib>
#include <iostream>
#include <fstream>
#include <vector>

using namespace std;

const int matrixN = 60;
const int num = 10000;

// given an array of matrices A and an array of matrices B,
// for each Ai and Bi, compute Di = Ai^T Bi Ai

// TLS version
template<typename Allocator>
void testProcedural(const std::vector<double> &A, const std::vector<double> &B,
                    std::vector<double> &D) {
    tbb::enumerable_thread_specific<std::vector<double, Allocator>> tls(
        std::vector<double, Allocator>(matrixN * matrixN * 2));
    tbb::parallel_for(tbb::blocked_range<int>(0, num),
                      [&](const tbb::blocked_range<int> &rng) {
        std::vector<double, Allocator> &C = tls.local();
        double *I = C.data();
        double *J = I + matrixN * matrixN;
        for (int i = rng.begin(); i != rng.end(); i++) {
            // I = Bi * Ai
            cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                        matrixN, matrixN, matrixN, 1,
                        B.data() + i * matrixN * matrixN, matrixN,
                        A.data() + i * matrixN * matrixN, matrixN,
                        0, I, matrixN);
            // J = I * Ai
            cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                        matrixN, matrixN, matrixN, 1,
                        I, matrixN,
                        A.data() + i * matrixN * matrixN, matrixN,
                        0, J, matrixN);
            // Di = Ai^T * J
            cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
                        matrixN, matrixN, matrixN, 1,
                        A.data() + i * matrixN * matrixN, matrixN,
                        J, matrixN,
                        0, D.data() + i * matrixN * matrixN, matrixN);
        }
    });
}

// no-TLS version: a scratch buffer local to each task body
void testProceduralNoTLS(const std::vector<double> &A, const std::vector<double> &B,
                         std::vector<double> &D) {
    tbb::parallel_for(tbb::blocked_range<int>(0, num),
                      [&](const tbb::blocked_range<int> &rng) {
        std::vector<double> C(matrixN * matrixN * 2);
        double *I = C.data();
        double *J = I + matrixN * matrixN;
        for (int i = rng.begin(); i != rng.end(); i++) {
            // I = Bi * Ai
            cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                        matrixN, matrixN, matrixN, 1,
                        B.data() + i * matrixN * matrixN, matrixN,
                        A.data() + i * matrixN * matrixN, matrixN,
                        0, I, matrixN);
            // J = I * Ai
            cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                        matrixN, matrixN, matrixN, 1,
                        I, matrixN,
                        A.data() + i * matrixN * matrixN, matrixN,
                        0, J, matrixN);
            // Di = Ai^T * J
            cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
                        matrixN, matrixN, matrixN, 1,
                        A.data() + i * matrixN * matrixN, matrixN,
                        J, matrixN,
                        0, D.data() + i * matrixN * matrixN, matrixN);
        }
    });
}

// serial version
void testProceduralSerial(const std::vector<double> &A, const std::vector<double> &B,
                          std::vector<double> &D) {
    std::vector<double> C(matrixN * matrixN * 2);
    double *I = C.data();
    double *J = I + matrixN * matrixN;
    for (int i = 0; i != num; i++) {
        // I = Bi * Ai
        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                    matrixN, matrixN, matrixN, 1,
                    B.data() + i * matrixN * matrixN, matrixN,
                    A.data() + i * matrixN * matrixN, matrixN,
                    0, I, matrixN);
        // J = I * Ai
        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                    matrixN, matrixN, matrixN, 1,
                    I, matrixN,
                    A.data() + i * matrixN * matrixN, matrixN,
                    0, J, matrixN);
        // Di = Ai^T * J
        cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
                    matrixN, matrixN, matrixN, 1,
                    A.data() + i * matrixN * matrixN, matrixN,
                    J, matrixN,
                    0, D.data() + i * matrixN * matrixN, matrixN);
    }
}

int main() {
    std::vector<double> A(matrixN * matrixN * num);
    std::vector<double> B(matrixN * matrixN * num);
    std::vector<double> D1(matrixN * matrixN * num);
    std::vector<double> D2(matrixN * matrixN * num);
    std::vector<double> D3(matrixN * matrixN * num);
    std::vector<double> D4(matrixN * matrixN * num);

    // fill A and B with random data
    for (int i = 0; i < num; i++) {
        double *data = A.data() + i * matrixN * matrixN;
        for (int j = 0; j < matrixN; j++) {
            for (int k = 0; k < matrixN; k++) {
                data[j * matrixN + k] = rand();
            }
        }
    }
    for (int i = 0; i < num; i++) {
        double *data = B.data() + i * matrixN * matrixN;
        for (int j = 0; j < matrixN; j++) {
            for (int k = 0; k < matrixN; k++) {
                data[j * matrixN + k] = rand() / 1000000.0;
            }
        }
    }

    testProcedural<tbb::cache_aligned_allocator<double>>(A, B, D1);
    testProcedural<std::allocator<double>>(A, B, D2);
    testProceduralNoTLS(A, B, D3);
    testProceduralSerial(A, B, D4);

    // report the L2 norm of each result's difference from the no-TLS result
    cblas_daxpy((int)D1.size(), -1, D3.data(), 1, D1.data(), 1);
    double nrm2 = cblas_dnrm2((int)D1.size(), D1.data(), 1);
    cout << "L2 norm of difference:\n";
    cout << "cache aligned allocator vs No TLS: " << nrm2 << endl;

    cblas_daxpy((int)D2.size(), -1, D3.data(), 1, D2.data(), 1);
    nrm2 = cblas_dnrm2((int)D2.size(), D2.data(), 1);
    cout << "std allocator vs No TLS: " << nrm2 << endl;

    cblas_daxpy((int)D4.size(), -1, D3.data(), 1, D4.data(), 1);
    nrm2 = cblas_dnrm2((int)D4.size(), D4.data(), 1);
    cout << "serial vs No TLS: " << nrm2 << endl;

    return 0;
}
Hi Bohan,
The root cause of the issue is that the Intel TBB scheduler is allowed to pick up other tasks at synchronization points, i.e. whenever a blocking API such as tbb::parallel_for is called (which happens inside TBB-threaded MKL). As a result, another iteration of the outer parallel loop can start on the same thread before the previous iteration has completed, so the memory obtained from the TLS is used by two iterations simultaneously. This problem is discussed in the Task Isolation section of the Intel TBB documentation.
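A minimal sketch of a possible workaround, based on your testProcedural (untested, and assuming a TBB version that provides tbb::this_task_arena::isolate; the three cblas_dgemm calls are elided for brevity):

// Sketch only: wrap the work that uses the thread-local buffer in
// this_task_arena::isolate, so a thread that blocks inside MKL's nested TBB
// parallelism cannot pick up another chunk of the outer loop and reuse the
// same TLS buffer.
template<typename Allocator>
void testProceduralIsolated(const std::vector<double> &A, const std::vector<double> &B,
                            std::vector<double> &D) {
    tbb::enumerable_thread_specific<std::vector<double, Allocator>> tls(
        std::vector<double, Allocator>(matrixN * matrixN * 2));
    tbb::parallel_for(tbb::blocked_range<int>(0, num),
                      [&](const tbb::blocked_range<int> &rng) {
        tbb::this_task_arena::isolate([&] {
            std::vector<double, Allocator> &C = tls.local();
            double *I = C.data();
            double *J = I + matrixN * matrixN;
            for (int i = rng.begin(); i != rng.end(); i++) {
                // ... the same three cblas_dgemm calls as in testProcedural ...
            }
        });
    });
}

The isolation guarantees that while a thread waits inside MKL's nested parallelism, it only executes tasks spawned within the isolated region, so it cannot start another outer-loop iteration that would reuse the same thread-local buffer.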
In addition, you may find these similar forum topics interesting:
- Very disappointing behavior (I would say bug). See example
- Invoking parallel_for while holding a lock
- Patterns of when and where to use this_task_arena::isolate
Regards,
Alex