I'd like to optimize matrix multiplication using BOTH vectorization and parallelization; however, I get parallel dependence remarks in the optimization reports.
I should mention that I am somewhat new to auto-vectorization and parallelization (though I have studied the Compiler Developer's Guide and I know OpenMP 3.0 pretty well).
Below you can find the source code, compile command, and the optimization reports.
small_matrix.h:
#ifndef __SMALL_MATRIX_H__
#define __SMALL_MATRIX_H__

#define printvar(a) // std::cout << #a << " = " << a << std::endl;
#define printline // std::cout << "FILE " << __FILE__ << " LINE " << __LINE__ << std::endl;

#include <iostream>
#include <iomanip>
#include <cstdlib>
#include <cmath>
#include <ctime>
#include <immintrin.h> // needed for _mm_malloc / _mm_free

#define MEM_ALIGNE 64
#define dt_real double
// #include "ark_data_consts.h"

namespace ark{
    template <class V>
    class matrix;
}

using std::cout;
using std::endl;
template <class V>
class matrix{
public:
    enum direction {ROW=1, COL=2};
    // matrix fields are made public to optimize code
    V * data;
    int nrows, ncols;
    unsigned char attribs;
protected:
    inline void minit(int nr, int nc){
        printline
        nrows = nr;
        ncols = nc;
        if (numel()){
            data = (V*) _mm_malloc(sizeof(V)*numel(), MEM_ALIGNE);
        } else {
            data = (V*) NULL;
        }
    }
    inline void swap(matrix<V>* a, matrix<V>* b){
        printline
        int tmpi;
        tmpi = a->nrows; a->nrows = b->nrows; b->nrows = tmpi;
        tmpi = a->ncols; a->ncols = b->ncols; b->ncols = tmpi;
        V* tmpr;
        tmpr = a->data; a->data = b->data; b->data = tmpr;
    }
public:
    inline int numel() const{
        printline
        return nrows*ncols;
    }
    matrix(int nr, int nc, const V& val){
        printline
        minit(nr,nc);
        #pragma omp parallel for
        for(int i = 0; i < numel() ; i++)
            csi0(i) = val;
    }
    // deep-copy constructor: without it, the implicit shallow copy would alias 'data' and double-free it
    matrix(const matrix<V>& m){
        minit(m.nrows, m.ncols);
        #pragma omp parallel for
        for(int i = 0; i < numel(); i++)
            csi0(i) = m.csi0(i);
    }
    inline virtual V& csi0(int i) const{ return cij0(i%this->nrows, i/this->nrows); }
    inline virtual V& cij0(int i, int j) const{ return this->data[i+j*this->nrows]; }
    matrix<V>& operator=(const matrix<V>& m){
        printline
        if(this == &m)
            return *this;
        printline
        if(numel() == m.numel()){
            printline
            nrows = m.nrows;
            ncols = m.ncols;
            #pragma omp parallel for
            for(int i = 0; i < m.numel(); i++){
                csi0(i) = m.csi0(i);
            }
            printline
        }else{
            printline
            matrix<V> tmp(m);
            swap(this,&tmp);
            printline
        }
        printline
        return *this;
    }
    void print(std::ostream & out = std::cerr) const{
        printline
        out << "matrix<" << ">(" << nrows << "x" << ncols << ")" << std::endl;
        for(int i = 0; i < nrows; i++){
            for(int j = 0; j < ncols; j++){
                out << cij0(i,j);
                if(j<ncols-1)
                    out << " ";
            }
            if(i < nrows-1)
                out << std::endl;
        }
        if(numel() == 0)
            out << "[]"; // was 'cout': write to the requested stream, not stdout
        out << std::endl;
    }
    friend std::ostream & operator<<(std::ostream & os, const matrix<V>& m) {
        printline
        m.print(os);
        return os;
    }
    template <class V1> friend matrix<V> operator*(const matrix<V> & a, const matrix<V1> & b){
        printline
        matrix<V> c(a.nrows, b.ncols, 0);
        const int a_nrows = a.nrows;
        const int b_nrows = b.nrows;
        const int c_nrows = c.nrows;
        int a_size_1 = a.nrows;
        int a_size_2 = a.ncols;
        int b_size_2 = b.ncols;
        __assume_aligned(a.data, MEM_ALIGNE);
        __assume_aligned(b.data, MEM_ALIGNE);
        __assume_aligned(c.data, MEM_ALIGNE);
        #define CIJ0(m,i,j,m_nrows) m.data[i+j*m_nrows]
        #pragma omp parallel for
        #pragma ivdep
        #pragma vector always
        for (int i = 0 ; i < a_size_1; i++){
            for(int j = 0; j < b_size_2; j++){
                V vsum = 0;
                #pragma omp simd reduction(+:vsum)
                for(int k = 0; k < a_size_2 ; k++){
                    // c.cij0(i,j) += (V)(a.cij0(i,k) * b.cij0(k,j));
                    vsum += (V)(CIJ0(a,i,k,a_nrows) * CIJ0(b,k,j,b_nrows));
                }
                CIJ0(c,i,j,c_nrows) = vsum;
                // CIJ0(c,i,j) = vsum;
            }
            printvar(i)
        }
        printline
        return c;
    }
    matrix<dt_real> sum(int d = -1) const{
        printline
        if(d == 1 || (d == -1 && nrows != 1)){
            printline
            matrix<dt_real> s(1,ncols,0);
            #pragma omp parallel for
            for(int j = 0; j < ncols ; j++)
                for(int i = 0; i < nrows; i++)
                    s.csi0(j) += cij0(i,j);
            printvar(s)
            return s;
        }else if(d == 2 || d == -1){
            printline
            matrix<dt_real> s(nrows,1,0);
            #pragma omp parallel for
            for(int i = 0; i < nrows ; i++)
                for(int j = 0; j < ncols; j++)
                    s.csi0(i) += cij0(i,j);
            return s;
        }
        printline
        return matrix<dt_real>(0,0,0.f); // never reached
    }
    virtual ~matrix(){
        if(data){
            _mm_free(data);
            nrows = ncols = 0;
        }
    }
};
#define MAXARRAY_SIZE (1024)
void add_arrays(int c[MAXARRAY_SIZE], int a[MAXARRAY_SIZE], int b[MAXARRAY_SIZE])
{
    #pragma omp simd
    for(int i = 0; i < MAXARRAY_SIZE; i++){
        c[i] = a[i] + b[i];
    }
}

#endif /* __SMALL_MATRIX_H__ */
test_dsift.cpp:
#include <iostream>
#include <ctime>
#include <cstdio> // needed for printf
#include <small_matrix.h>
// #include <ark_matrix.h>

#define MSIZE (1024)
#define KSIZE (10*1024)
#define NSIZE (1024)
#define printvar(a) // std::cout << #a << " = " << a << std::endl;
#define printline // std::cout << "FILE " << __FILE__ << " LINE " << __LINE__ << std::endl;

int main(){
    printline
    matrix<float> a(MSIZE, KSIZE, 1.f);
    printline
    // printvar(a(1 _ 5, 1 _ 5))
    matrix<float> b(KSIZE, NSIZE, 1.f);
    printline
    double st = clock();
    printline
    matrix<float> c = a * b;
    printline
    double et = clock();
    printline
    printf("Time taken: %f \n", (et - st)/CLOCKS_PER_SEC);
    printf("Expected sum: %f \n", float(MSIZE)*KSIZE*NSIZE*1.0);
    printf("Evaluated sum: %f \n", c.sum(1).sum(2).csi0(0));
    return 0;
}
Compile Command
icl -I"C:/Program Files (x86)/IntelSWTools/sw_dev_tools/compilers_and_libraries_2020.2.254/windows/compiler/include" -Qstd=c++11 -fast -Qopenmp-simd -Qipo -Qip -Qparallel -Qopt-dynamic-align -fp:fast=2 -arch:core-avx2 -Qopt-report:5 -Qopt-report-phase:all -Qopt-report-per-object -Qdiag-error-limit:3 -O2 -MD -arch:sse2 -arch:AVX -Qopenmp -Fo:DEBUG/obj/test_dsift.obj -c test_dsift.cpp
test_dsift.cpp
xilink /OPT:REF /OPT:ICF /DEBUG:FASTLINK /TLBID:1 -out:Debug/out.exe DEBUG/obj/test_dsift.obj
LOOP BEGIN at small_matrix.h(69,3) inlined into test_dsift.cpp(22,25)
remark #17104: loop was not parallelized: existence of parallel dependence
remark #17106: parallel dependence: assumed ANTI dependence between Ud_V->__vptr (74:68) and at (71:4)
remark #17106: parallel dependence: assumed FLOW dependence between at (71:4) and Ud_V->__vptr (74:68)
remark #15382: vectorization support: call to function (Indirect call) cannot be vectorized [ small_matrix.h(74,68) ]
remark #15344: loop was not vectorized: vector dependence prevents vectorization
remark #15346: vector dependence: assumed ANTI dependence between Ud_V->__vptr (74:68) and at (71:4)
remark #15346: vector dependence: assumed FLOW dependence between at (71:4) and Ud_V->__vptr (74:68)
remark #25456: Number of Array Refs Scalar Replaced In Loop: 1
LOOP END
LOOP BEGIN at small_matrix.h(142,3) inlined into test_dsift.cpp(22,25)
remark #25096: Loop Interchange not done due to: Imperfect Loop Nest (Either at Source or due to other Compiler Transformations)
remark #25451: Advice: Loop Interchange, if possible, might help loopnest. Suggested Permutation : ( 1 2 ) --> ( 2 1 )
remark #17104: loop was not parallelized: existence of parallel dependence
remark #17106: parallel dependence: assumed OUTPUT dependence between __p1[i+j*c_nrows] (150:5) and __p1[i+j*c_nrows] (150:5)
remark #17106: parallel dependence: assumed OUTPUT dependence between __p1[i+j*c_nrows] (150:5) and __p1[i+j*c_nrows] (150:5)
remark #15542: loop was not vectorized: inner loop was already vectorized
LOOP BEGIN at small_matrix.h(143,4) inlined into test_dsift.cpp(22,25)
remark #17104: loop was not parallelized: existence of parallel dependence
remark #17106: parallel dependence: assumed ANTI dependence between a (148:18) and __p1[i+j*c_nrows] (150:5)
remark #17106: parallel dependence: assumed FLOW dependence between __p1[i+j*c_nrows] (150:5) and a (148:18)
remark #15542: loop was not vectorized: inner loop was already vectorized
LOOP BEGIN at small_matrix.h(146,5) inlined into test_dsift.cpp(22,25)
<Peeled loop for vectorization>
remark #25456: Number of Array Refs Scalar Replaced In Loop: 1
remark #25015: Estimate of max trip count of loop=7
LOOP END
LOOP BEGIN at small_matrix.h(146,5) inlined into test_dsift.cpp(22,25)
remark #17108: loop was not parallelized: insufficient computational work
remark #15388: vectorization support: reference b[k+j*b_nrows] has aligned access [ small_matrix.h(148,40) ]
remark #15328: vectorization support: non-unit strided load was emulated for the variable <a[i+k*a_nrows]>, stride is unknown to compiler [ small_matrix.h(148,18) ]
remark #15305: vectorization support: vector length 8
remark #15399: vectorization support: unroll factor set to 2
remark #15309: vectorization support: normalized vectorization overhead 1.354
remark #15355: vectorization support: vsum is float type reduction [ small_matrix.h(144,12) ]
remark #15301: SIMD LOOP WAS VECTORIZED
remark #15442: entire loop may be executed in remainder
remark #15448: unmasked aligned unit stride loads: 1
remark #15452: unmasked strided loads: 1
remark #15475: --- begin vector cost summary ---
remark #15476: scalar cost: 8
remark #15477: vector cost: 3.000
remark #15478: estimated potential speedup: 2.340
remark #15488: --- end vector cost summary ---
remark #25456: Number of Array Refs Scalar Replaced In Loop: 1
LOOP END
LOOP BEGIN at small_matrix.h(146,5) inlined into test_dsift.cpp(22,25)
<Remainder loop for vectorization>
remark #25456: Number of Array Refs Scalar Replaced In Loop: 1
LOOP END
LOOP END
LOOP END
I seem to be able to vectorize the multiplication fairly well, though I am not sure why it is not parallelized.
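My only guess so far concerns the first report: the __vptr dependences seem to point at the virtual csi0()/cij0() accessors. An untested workaround I was considering is a non-virtual helper that touches the data through a raw pointer, so there is no virtual dispatch inside the loop (the name fill is just for illustration):
template <class V>
void fill(matrix<V>& m, const V& val){
    V* __restrict p = m.data;  // raw pointer: no virtual call, no vtable access in the loop
    const int n = m.numel();   // hoist the trip count out of the loop condition
    #pragma omp parallel for
    for(int i = 0; i < n; i++)
        p[i] = val;
}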
I would appreciate your comments ....
Regards
Hi Mike,
Thanks for reaching out to us.
We are looking into your problem internally and will get back to you with more details.
Warm Regards,
Abhishek
From those remarks, there are dependencies in the code, which result in the loop not being vectorized/parallelized.
You could use Intel Advisor to get more details on those dependencies, along with advice/suggestions on how to change your code so that it can be vectorized/parallelized.
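To illustrate the kind of remark you are seeing (a toy sketch, not your code; scale is just an illustrative name): in the loop below the compiler must assume a and b may overlap, so it reports assumed ANTI/FLOW dependences unless you assert otherwise:
void scale(float* a, const float* b, int n){
    // a and b might alias, so the compiler assumes a dependence between iterations
    #pragma ivdep  // Intel directive: ignore assumed (unproven) dependences
    for(int i = 0; i < n; i++)
        a[i] = 2.0f * b[i];
}
Intel Advisor's Dependencies analysis can tell you whether such an assumed dependence is real before you assert it away with a directive.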
Thanks,
I guess I just need some sort of directive to tell the compiler there is no parallel dependence ...
How do I do that?
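Something like the following is what I have in mind (a toy loop, not my matrix code; axpy is just an illustrative name):
void axpy(int n, float a, const float* x, float* y){
    // OpenMP combined construct: asserts that the iterations are independent,
    // so the compiler both threads and vectorizes the loop
    #pragma omp parallel for simd
    for(int i = 0; i < n; i++)
        y[i] += a * x[i];
}
But I am not sure whether this is the right way to do it in my matrix code.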
I believe a Black Belt can clarify this better ...
Regards
Hi Mike,
Since this is not a compiler bug, Intel will not monitor this topic. Hopefully others will have some suggestions.
Thanks,