How to both vectorize and parallelize matrix multiplication assuming variable matrix dimensions.

mikeitexpert · ‎04-08-2021

I'd like to optimize matrix multiplication using BOTH vectorization and parallelization ... however I get parallel dependence remarks in optimization reports.

One thing is that I am somewhat new to auto-vectorization and parallelization (though I have studied Compiler Developer's Guide and I know OpenMP 3.0 pretty well.

Below you can find the source code, compile command, and the optimization reports.

#ifndef __SMALL_MATRIX_H__
#define __SMALL_MATRIX_H__

#define printvar(a) //	std::cout << #a << " = " << a << std::endl;
#define printline // std::cout << "FILE " << __FILE__ << " LINE " << __LINE__ << std::endl;


#include<iostream>
#include <iomanip>

#include<cstdlib>
#include<cmath>
#include<ctime>

#define MEM_ALIGNE 64
#define dt_real double
// #include "ark_data_consts.h"

namespace ark{
template <class V>
class matrix;
}

using std::cout;
using std::endl;

template <class V>
class matrix{
public:
	enum direction {ROW=1, COL=2};

	// matrix fields are made public to optimize codes
	V * data;	

	int nrows, ncols;
	unsigned char attribs;

protected:
	inline void minit(int nr, int nc){
		printline
		nrows = nr;
		ncols = nc;

	    if (numel()){
	        data = (V*) _mm_malloc(sizeof(V)*numel(), MEM_ALIGNE);
	    } else {
	        data = (V*) NULL;
	    }
	}
	inline void swap(matrix<V>* a, matrix<V>* b){
		printline
		int tmpi;
		tmpi = a->nrows; a->nrows = b->nrows; b->nrows = tmpi;
		tmpi = a->ncols; a->ncols = b->ncols; b->ncols = tmpi;

		V* tmpr;
		tmpr = a->data; a->data = b->data; b->data = tmpr;
	}

public:
	inline int numel() const{ 
		printline
		return nrows*ncols; 
	}
	
	matrix(int nr, int nc, const V& val){
		printline
		minit(nr,nc);
		#pragma omp parallel for
		for(int i = 0; i < numel() ; i++)
			csi0(i) = val;
	}
	
	inline virtual V& csi0(int i) const{ return cij0(i%this->nrows, i/this->nrows); }
	inline virtual V& cij0(int i, int j) const{ return this->data[i+j*this->nrows]; }

	matrix<V>& operator=(const matrix<V>& m){
		printline
	    if(this == &m)
			return *this;
		printline
		if(numel() == m.numel()){
			printline
            nrows = m.nrows;
			ncols = m.ncols;
			#pragma omp parallel for
			for(int i = 0; i < m.numel(); i++){
				csi0(i) = m.csi0(i);
			}
			printline
		}else{
			printline
		    matrix<V> tmp(m);
			swap(this,&tmp);
			printline
		}
		printline
		return *this;
	}
	
	void print(std::ostream & out = std::cerr) const{
		printline
		out << "matrix<" << ">(" << nrows << "x" << ncols << ")" << std::endl;
		for(int i = 0; i < nrows; i++){
			for(int j = 0; j < ncols; j++){
				out << cij0(i,j);
				if(j<ncols-1)
					out << " ";
			}
			if(i < nrows-1)
				out << std::endl;
		}
		if(numel() == 0)
			cout << "[]";
		out << std::endl;
	}
	friend std::ostream & operator<<(std::ostream & os, const matrix<V>& m) {
		printline
		m.print(os);
		return os;
	}

	template <class V1> friend matrix<V> operator*(const matrix<V> & a, const matrix<V1> & b){	
		printline
		matrix<V> c(a.nrows, b.ncols, 0);
		const int a_nrows = a.nrows;
		const int b_nrows = b.nrows;
		const int c_nrows = c.nrows;
		int a_size_1 = a.nrows;
		int a_size_2 = a.ncols;
		int b_size_2 = b.ncols;
			
		__assume_aligned(a.data, MEM_ALIGNE);
		__assume_aligned(b.data, MEM_ALIGNE);
		__assume_aligned(c.data, MEM_ALIGNE);

		#define CIJ0(m,i,j,m_nrows) m.data[i+j*m_nrows]
		
		#pragma omp parallel for
		#pragma ivdep
		#pragma vector always
		for (int i = 0 ; i < a_size_1; i++){
			for(int j = 0; j < b_size_2; j++){
				V vsum = 0;
				#pragma omp simd reduction(+:vsum)
				for(int k = 0; k < a_size_2 ; k++){
					// c.cij0(i,j) += (V)(a.cij0(i,k) * b.cij0(k,j));						 
					vsum += (V)(CIJ0(a,i,k,a_nrows) * CIJ0(b,k,j,b_nrows));						 
				}
				CIJ0(c,i,j,c_nrows) = vsum;
				// CIJ0(c,i,j) = vsum;
			}
			printvar(i)		
		}
		printline
		return c;
	}


	matrix<dt_real> sum(int d = -1) const{
		printline
		if(d == 1 || (d == -1 && nrows != 1)){
			printline
			matrix<dt_real> s(1,ncols,0);
			#pragma omp parallel for
			for(int j = 0; j < ncols ; j++)
				for(int i = 0; i < nrows; i++)
					s.csi0(j) += cij0(i,j);
			printvar(s)
			return s;
		}else if(d == 2 || d == -1){
			printline
			matrix<dt_real> s(nrows,1,0);
			#pragma omp parallel for
			for(int i = 0; i < nrows ; i++)
				for(int j = 0; j < ncols; j++)
					s.csi0(i) += cij0(i,j);
			return s;
		}
		printline
		return matrix<dt_real>(0,0,0.f); // never is reached
	}

	virtual ~matrix(){
		if(data){
			_mm_free(data);
			nrows = ncols = 0;
		}
	}

};

#define MAXARRAY_SIZE (1024)
void add_arrays(int c[MAXARRAY_SIZE], int a[MAXARRAY_SIZE], int b[MAXARRAY_SIZE])
{
	#pragma omp simd
	for(int i = 0; i < MAXARRAY_SIZE; i++){
		c[i] = a[i] + b[i];
	}
}

#endif /* __SMALL_MATRIX_H__ */

#include <iostream>
#include <ctime>
#include <small_matrix.h>
// #include <ark_matrix.h>

#define MSIZE (1024)
#define KSIZE (10*1024)
#define NSIZE (1024)

#define printvar(a) // std::cout << #a << " = " << a << std::endl;
#define printline // std::cout << "FILE " << __FILE__ << " LINE " << __LINE__ << std::endl;

int main(){
	printline
    matrix<float> a(MSIZE, KSIZE, 1.f);
    printline	
    // printvar(a(1 _ 5, 1 _ 5))
    matrix<float> b(KSIZE, NSIZE, 1.f);
    printline
    double st = clock();  
    printline
    matrix<float> c = a * b;
    printline
    double et = clock();
    printline
    printf("Time taken: %f \n", (et - st)/CLOCKS_PER_SEC);
	printf("Expected sum: %f \n", float(MSIZE)*KSIZE*NSIZE*1.0);    	
	printf("Evaluated sum: %f \n", c.sum(1).sum(2).csi0(0));

	return 0;
}

Compile Command

icl -I"C:/Program Files (x86)/IntelSWTools/sw_dev_tools/compilers_and_libraries_2020.2.254/windows/compiler/include" -Qstd=c++11 -fast -Qopenmp-simd  -Qipo -Qip -Qparallel  -Qopt-dynamic-align -fp:fast=2 -arch:core-avx2 -Qopt-report:5 -Qopt-report-phase:all -Qopt-report-per-object   -Qdiag-error-limit:3 -O2 -MD -arch:sse2  -arch:AVX   -Qopenmp -Fo:DEBUG/obj/test_dsift.obj -c test_dsift.cpp
test_dsift.cpp

xilink /OPT:REF /OPT:ICF /DEBUG:FASTLINK /TLBID:1 -out:Debug/out.exe DEBUG/obj/test_dsift.obj

LOOP BEGIN at small_matrix.h(69,3) inlined into test_dsift.cpp(22,25)
   remark #17104: loop was not parallelized: existence of parallel dependence
   remark #17106: parallel dependence: assumed ANTI dependence between Ud_V->__vptr (74:68) and at (71:4)
   remark #17106: parallel dependence: assumed FLOW dependence between at (71:4) and Ud_V->__vptr (74:68)
   remark #15382: vectorization support: call to function (Indirect call) cannot be vectorized   [ small_matrix.h(74,68) ]
   remark #15344: loop was not vectorized: vector dependence prevents vectorization
   remark #15346: vector dependence: assumed ANTI dependence between Ud_V->__vptr (74:68) and at (71:4)
   remark #15346: vector dependence: assumed FLOW dependence between at (71:4) and Ud_V->__vptr (74:68)
   remark #25456: Number of Array Refs Scalar Replaced In Loop: 1
LOOP END

LOOP BEGIN at small_matrix.h(142,3) inlined into test_dsift.cpp(22,25)
   remark #25096: Loop Interchange not done due to: Imperfect Loop Nest (Either at Source or due to other Compiler Transformations)
   remark #25451: Advice: Loop Interchange, if possible, might help loopnest. Suggested Permutation : ( 1 2 ) --> ( 2 1 ) 
   remark #17104: loop was not parallelized: existence of parallel dependence
   remark #17106: parallel dependence: assumed OUTPUT dependence between __p1[i+j*c_nrows] (150:5) and __p1[i+j*c_nrows] (150:5)
   remark #17106: parallel dependence: assumed OUTPUT dependence between __p1[i+j*c_nrows] (150:5) and __p1[i+j*c_nrows] (150:5)
   remark #15542: loop was not vectorized: inner loop was already vectorized

   LOOP BEGIN at small_matrix.h(143,4) inlined into test_dsift.cpp(22,25)
      remark #17104: loop was not parallelized: existence of parallel dependence
      remark #17106: parallel dependence: assumed ANTI dependence between a (148:18) and __p1[i+j*c_nrows] (150:5)
      remark #17106: parallel dependence: assumed FLOW dependence between __p1[i+j*c_nrows] (150:5) and a (148:18)
      remark #15542: loop was not vectorized: inner loop was already vectorized

      LOOP BEGIN at small_matrix.h(146,5) inlined into test_dsift.cpp(22,25)
      <Peeled loop for vectorization>
         remark #25456: Number of Array Refs Scalar Replaced In Loop: 1
         remark #25015: Estimate of max trip count of loop=7
      LOOP END

      LOOP BEGIN at small_matrix.h(146,5) inlined into test_dsift.cpp(22,25)
         remark #17108: loop was not parallelized: insufficient computational work
         remark #15388: vectorization support: reference b[k+j*b_nrows] has aligned access   [ small_matrix.h(148,40) ]
         remark #15328: vectorization support: non-unit strided load was emulated for the variable <a[i+k*a_nrows]>, stride is unknown to compiler   [ small_matrix.h(148,18) ]
         remark #15305: vectorization support: vector length 8
         remark #15399: vectorization support: unroll factor set to 2
         remark #15309: vectorization support: normalized vectorization overhead 1.354
         remark #15355: vectorization support: vsum is float type reduction   [ small_matrix.h(144,12) ]
         remark #15301: SIMD LOOP WAS VECTORIZED
         remark #15442: entire loop may be executed in remainder
         remark #15448: unmasked aligned unit stride loads: 1 
         remark #15452: unmasked strided loads: 1 
         remark #15475: --- begin vector cost summary ---
         remark #15476: scalar cost: 8 
         remark #15477: vector cost: 3.000 
         remark #15478: estimated potential speedup: 2.340 
         remark #15488: --- end vector cost summary ---
         remark #25456: Number of Array Refs Scalar Replaced In Loop: 1
      LOOP END

      LOOP BEGIN at small_matrix.h(146,5) inlined into test_dsift.cpp(22,25)
      <Remainder loop for vectorization>
         remark #25456: Number of Array Refs Scalar Replaced In Loop: 1
      LOOP END
   LOOP END
LOOP END

I seem to be able to vectorize the multiplication fairly, though I am not sure why it is not parallelized.

I would appreciate your comments ....

Regards

AbhishekD_Intel · ‎04-09-2021

Hi Mike,

Thanks for reaching out to us.

We are looking into your problem internally and will get back to you with more details.

Warm Regards,

Abhishek

Viet_H_Intel · ‎04-09-2021

From those remarks, there were dependencies in the code, which resulting in the loop was not vectorized/parallelized.

You would use Intel Advisor to get more details on those dependencies, and get some advises/suggestions how to change your code to be suitable for vectorized/parallelized.

Thanks,

mikeitexpert · ‎04-09-2021

I guess I just need some sort of directive to clarify there is no parallel dependence ....

How do I do that?

mikeitexpert · ‎04-09-2021

I believe a black belt can clarify better ...

Regards

Viet_H_Intel · ‎04-30-2021

Hi Mike,

Since this is not a compiler bug, Intel will not monitor this topic. Hopefully others will have some suggestions.

Thanks,