Intel® C++ Compiler
Support and discussions for creating C++ code that runs on platforms based on Intel® processors.
7650 Discussions

How to both vectorize and parallelize matrix multiplication assuming variable matrix dimensions.

mikeitexpert
New Contributor II
427 Views

I'd like to optimize matrix multiplication using BOTH vectorization and parallelization ... however I get parallel dependence remarks in optimization reports.

One thing is that I am somewhat new to auto-vectorization and parallelization (though I have studied Compiler Developer's Guide and  I know OpenMP 3.0 pretty well.

Below you can find the source code, compile command, and the optimization reports.

#ifndef __SMALL_MATRIX_H__
#define __SMALL_MATRIX_H__

#define printvar(a) //	std::cout << #a << " = " << a << std::endl;
#define printline // std::cout << "FILE " << __FILE__ << " LINE " << __LINE__ << std::endl;


#include<iostream>
#include <iomanip>

#include<cstdlib>
#include<cmath>
#include<ctime>

#define MEM_ALIGNE 64
#define dt_real double
// #include "ark_data_consts.h"

namespace ark{
template <class V>
class matrix;
}

using std::cout;
using std::endl;

template <class V>
class matrix{
public:
	enum direction {ROW=1, COL=2};

	// matrix fields are made public to optimize codes
	V * data;	

	int nrows, ncols;
	unsigned char attribs;

protected:
	inline void minit(int nr, int nc){
		printline
		nrows = nr;
		ncols = nc;

	    if (numel()){
	        data = (V*) _mm_malloc(sizeof(V)*numel(), MEM_ALIGNE);
	    } else {
	        data = (V*) NULL;
	    }
	}
	inline void swap(matrix<V>* a, matrix<V>* b){
		printline
		int tmpi;
		tmpi = a->nrows; a->nrows = b->nrows; b->nrows = tmpi;
		tmpi = a->ncols; a->ncols = b->ncols; b->ncols = tmpi;

		V* tmpr;
		tmpr = a->data; a->data = b->data; b->data = tmpr;
	}

public:
	inline int numel() const{ 
		printline
		return nrows*ncols; 
	}
	
	matrix(int nr, int nc, const V& val){
		printline
		minit(nr,nc);
		#pragma omp parallel for
		for(int i = 0; i < numel() ; i++)
			csi0(i) = val;
	}
	
	inline virtual V& csi0(int i) const{ return cij0(i%this->nrows, i/this->nrows); }
	inline virtual V& cij0(int i, int j) const{ return this->data[i+j*this->nrows]; }

	matrix<V>& operator=(const matrix<V>& m){
		printline
	    if(this == &m)
			return *this;
		printline
		if(numel() == m.numel()){
			printline
            nrows = m.nrows;
			ncols = m.ncols;
			#pragma omp parallel for
			for(int i = 0; i < m.numel(); i++){
				csi0(i) = m.csi0(i);
			}
			printline
		}else{
			printline
		    matrix<V> tmp(m);
			swap(this,&tmp);
			printline
		}
		printline
		return *this;
	}
	
	void print(std::ostream & out = std::cerr) const{
		printline
		out << "matrix<" << ">(" << nrows << "x" << ncols << ")" << std::endl;
		for(int i = 0; i < nrows; i++){
			for(int j = 0; j < ncols; j++){
				out << cij0(i,j);
				if(j<ncols-1)
					out << " ";
			}
			if(i < nrows-1)
				out << std::endl;
		}
		if(numel() == 0)
			cout << "[]";
		out << std::endl;
	}
	friend std::ostream & operator<<(std::ostream & os, const matrix<V>& m) {
		printline
		m.print(os);
		return os;
	}

	template <class V1> friend matrix<V> operator*(const matrix<V> & a, const matrix<V1> & b){	
		printline
		matrix<V> c(a.nrows, b.ncols, 0);
		const int a_nrows = a.nrows;
		const int b_nrows = b.nrows;
		const int c_nrows = c.nrows;
		int a_size_1 = a.nrows;
		int a_size_2 = a.ncols;
		int b_size_2 = b.ncols;
			
		__assume_aligned(a.data, MEM_ALIGNE);
		__assume_aligned(b.data, MEM_ALIGNE);
		__assume_aligned(c.data, MEM_ALIGNE);

		#define CIJ0(m,i,j,m_nrows) m.data[i+j*m_nrows]
		
		#pragma omp parallel for
		#pragma ivdep
		#pragma vector always
		for (int i = 0 ; i < a_size_1; i++){
			for(int j = 0; j < b_size_2; j++){
				V vsum = 0;
				#pragma omp simd reduction(+:vsum)
				for(int k = 0; k < a_size_2 ; k++){
					// c.cij0(i,j) += (V)(a.cij0(i,k) * b.cij0(k,j));						 
					vsum += (V)(CIJ0(a,i,k,a_nrows) * CIJ0(b,k,j,b_nrows));						 
				}
				CIJ0(c,i,j,c_nrows) = vsum;
				// CIJ0(c,i,j) = vsum;
			}
			printvar(i)		
		}
		printline
		return c;
	}


	matrix<dt_real> sum(int d = -1) const{
		printline
		if(d == 1 || (d == -1 && nrows != 1)){
			printline
			matrix<dt_real> s(1,ncols,0);
			#pragma omp parallel for
			for(int j = 0; j < ncols ; j++)
				for(int i = 0; i < nrows; i++)
					s.csi0(j) += cij0(i,j);
			printvar(s)
			return s;
		}else if(d == 2 || d == -1){
			printline
			matrix<dt_real> s(nrows,1,0);
			#pragma omp parallel for
			for(int i = 0; i < nrows ; i++)
				for(int j = 0; j < ncols; j++)
					s.csi0(i) += cij0(i,j);
			return s;
		}
		printline
		return matrix<dt_real>(0,0,0.f); // never is reached
	}

	virtual ~matrix(){
		if(data){
			_mm_free(data);
			nrows = ncols = 0;
		}
	}

};

#define MAXARRAY_SIZE (1024)
void add_arrays(int c[MAXARRAY_SIZE], int a[MAXARRAY_SIZE], int b[MAXARRAY_SIZE])
{
	#pragma omp simd
	for(int i = 0; i < MAXARRAY_SIZE; i++){
		c[i] = a[i] + b[i];
	}
}

#endif /* __SMALL_MATRIX_H__ */

 

#include <iostream>
#include <ctime>
#include <small_matrix.h>
// #include <ark_matrix.h>

#define MSIZE (1024)
#define KSIZE (10*1024)
#define NSIZE (1024)

#define printvar(a) // std::cout << #a << " = " << a << std::endl;
#define printline // std::cout << "FILE " << __FILE__ << " LINE " << __LINE__ << std::endl;

int main(){
	printline
    matrix<float> a(MSIZE, KSIZE, 1.f);
    printline	
    // printvar(a(1 _ 5, 1 _ 5))
    matrix<float> b(KSIZE, NSIZE, 1.f);
    printline
    double st = clock();  
    printline
    matrix<float> c = a * b;
    printline
    double et = clock();
    printline
    printf("Time taken: %f \n", (et - st)/CLOCKS_PER_SEC);
	printf("Expected sum: %f \n", float(MSIZE)*KSIZE*NSIZE*1.0);    	
	printf("Evaluated sum: %f \n", c.sum(1).sum(2).csi0(0));

	return 0;
}

 

Compile Command

icl -I"C:/Program Files (x86)/IntelSWTools/sw_dev_tools/compilers_and_libraries_2020.2.254/windows/compiler/include" -Qstd=c++11 -fast -Qopenmp-simd  -Qipo -Qip -Qparallel  -Qopt-dynamic-align -fp:fast=2 -arch:core-avx2 -Qopt-report:5 -Qopt-report-phase:all -Qopt-report-per-object   -Qdiag-error-limit:3 -O2 -MD -arch:sse2  -arch:AVX   -Qopenmp -Fo:DEBUG/obj/test_dsift.obj -c test_dsift.cpp
test_dsift.cpp

xilink /OPT:REF /OPT:ICF /DEBUG:FASTLINK /TLBID:1 -out:Debug/out.exe DEBUG/obj/test_dsift.obj 

 

LOOP BEGIN at small_matrix.h(69,3) inlined into test_dsift.cpp(22,25)
   remark #17104: loop was not parallelized: existence of parallel dependence
   remark #17106: parallel dependence: assumed ANTI dependence between Ud_V->__vptr (74:68) and at (71:4)
   remark #17106: parallel dependence: assumed FLOW dependence between at (71:4) and Ud_V->__vptr (74:68)
   remark #15382: vectorization support: call to function (Indirect call) cannot be vectorized   [ small_matrix.h(74,68) ]
   remark #15344: loop was not vectorized: vector dependence prevents vectorization
   remark #15346: vector dependence: assumed ANTI dependence between Ud_V->__vptr (74:68) and at (71:4)
   remark #15346: vector dependence: assumed FLOW dependence between at (71:4) and Ud_V->__vptr (74:68)
   remark #25456: Number of Array Refs Scalar Replaced In Loop: 1
LOOP END

LOOP BEGIN at small_matrix.h(142,3) inlined into test_dsift.cpp(22,25)
   remark #25096: Loop Interchange not done due to: Imperfect Loop Nest (Either at Source or due to other Compiler Transformations)
   remark #25451: Advice: Loop Interchange, if possible, might help loopnest. Suggested Permutation : ( 1 2 ) --> ( 2 1 ) 
   remark #17104: loop was not parallelized: existence of parallel dependence
   remark #17106: parallel dependence: assumed OUTPUT dependence between __p1[i+j*c_nrows] (150:5) and __p1[i+j*c_nrows] (150:5)
   remark #17106: parallel dependence: assumed OUTPUT dependence between __p1[i+j*c_nrows] (150:5) and __p1[i+j*c_nrows] (150:5)
   remark #15542: loop was not vectorized: inner loop was already vectorized

   LOOP BEGIN at small_matrix.h(143,4) inlined into test_dsift.cpp(22,25)
      remark #17104: loop was not parallelized: existence of parallel dependence
      remark #17106: parallel dependence: assumed ANTI dependence between a (148:18) and __p1[i+j*c_nrows] (150:5)
      remark #17106: parallel dependence: assumed FLOW dependence between __p1[i+j*c_nrows] (150:5) and a (148:18)
      remark #15542: loop was not vectorized: inner loop was already vectorized

      LOOP BEGIN at small_matrix.h(146,5) inlined into test_dsift.cpp(22,25)
      <Peeled loop for vectorization>
         remark #25456: Number of Array Refs Scalar Replaced In Loop: 1
         remark #25015: Estimate of max trip count of loop=7
      LOOP END

      LOOP BEGIN at small_matrix.h(146,5) inlined into test_dsift.cpp(22,25)
         remark #17108: loop was not parallelized: insufficient computational work
         remark #15388: vectorization support: reference b[k+j*b_nrows] has aligned access   [ small_matrix.h(148,40) ]
         remark #15328: vectorization support: non-unit strided load was emulated for the variable <a[i+k*a_nrows]>, stride is unknown to compiler   [ small_matrix.h(148,18) ]
         remark #15305: vectorization support: vector length 8
         remark #15399: vectorization support: unroll factor set to 2
         remark #15309: vectorization support: normalized vectorization overhead 1.354
         remark #15355: vectorization support: vsum is float type reduction   [ small_matrix.h(144,12) ]
         remark #15301: SIMD LOOP WAS VECTORIZED
         remark #15442: entire loop may be executed in remainder
         remark #15448: unmasked aligned unit stride loads: 1 
         remark #15452: unmasked strided loads: 1 
         remark #15475: --- begin vector cost summary ---
         remark #15476: scalar cost: 8 
         remark #15477: vector cost: 3.000 
         remark #15478: estimated potential speedup: 2.340 
         remark #15488: --- end vector cost summary ---
         remark #25456: Number of Array Refs Scalar Replaced In Loop: 1
      LOOP END

      LOOP BEGIN at small_matrix.h(146,5) inlined into test_dsift.cpp(22,25)
      <Remainder loop for vectorization>
         remark #25456: Number of Array Refs Scalar Replaced In Loop: 1
      LOOP END
   LOOP END
LOOP END

 

I seem to be able to vectorize the multiplication fairly, though I am not sure why it is not parallelized.

I would appreciate your comments ....

Regards

0 Kudos
5 Replies
AbhishekD_Intel
Moderator
379 Views

Hi Mike,


Thanks for reaching out to us.

We are looking into your problem internally and will get back to you with more details.



Warm Regards,

Abhishek


Viet_H_Intel
Moderator
368 Views

From those remarks, there were dependencies in the code, which resulting in the loop was not vectorized/parallelized.

You would use Intel Advisor to get more details on those dependencies, and get some advises/suggestions how to change your code to be suitable for vectorized/parallelized.

Thanks,


mikeitexpert
New Contributor II
360 Views

I guess I just need some sort of directive to clarify there is no parallel dependence ....

How do I do that?

 

mikeitexpert
New Contributor II
358 Views

I believe a black belt can clarify better ... 

 

Regards

Viet_H_Intel
Moderator
296 Views

Hi Mike,


Since this is not a compiler bug, Intel will not monitor this topic. Hopefully others will have some suggestions.


Thanks,


Reply