Intel® C++ Compiler
Support and discussions for creating C++ code that runs on platforms based on Intel® processors.
7646 Discussions

SIMD enabled function not parallelized

mikeitexpert
New Contributor II
514 Views

Hi Everyone,

I have a matrix class with one virtual method (csi0) to access elements of the column-major data pointer. I have declared it as a SIMD-enabled function; however, when I check the IPO reports, the overloaded binary operators are not vectorized. The compiler complains that there is a vector dependence in the code.

Below you can find snippets from the class along with the optimization report.

 

 

#  define MAT_ASSERT(cond, msg) if(cond){ } else { std::cerr << std::endl << "Error, assertion failed in file:'" << __FILE__ << "' at line:" << 
#  define MAT_ASSERT_SHORT(cond) MAT_ASSERT(cond, #cond)


// Excerpt of the poster's matrix class template: elements live behind the raw
// pointer `data`, with the dimensions kept in `nrows` / `ncols`. The post
// describes the storage as column major; nothing in this excerpt depends on
// the ordering because every access shown here is linear (data[i]).
template <class V>
class matrix{
	V * data;
	int nrows, ncols;
	unsigned char attribs;
// some constructor definitions

        // Total number of stored elements (nrows * ncols).
        inline int numel() const{ return nrows*ncols; }
	// Extent along dimension d: d == 1 yields the row count, any other value
	// yields the column count.
	inline int size(int d) const{ return (d==1) ? nrows : ncols; }
	

        // Linear element accessor: returns a reference to data[i].
        // Three OpenMP `declare simd` variants are requested for it, but the
        // function is also `virtual`; the optimization report quoted in this
        // thread states that the virtual call to csi0 "cannot be vectorized".
        #pragma omp declare simd 
	#pragma omp declare simd uniform(this)
	#pragma omp declare simd uniform(this) simdlen(4) linear(i)		
	virtual inline V& csi0(int i) const{
		return data[i];
	}


	// Element-wise subtraction a - b, returning a new matrix<V>.
	// Three cases are handled, selected by helpers that are not part of this
	// excerpt (isscalar, samesize):
	//   * a is scalar : c[i] = a[0] - b[i], result shaped like b
	//   * b is scalar : c[i] = a[i] - b[0], result shaped like a
	//   * same size   : c[i] = a[i] - b[i]
	// Any other combination ends in MAT_ABORT. Each loop body reaches the
	// elements through csi0(), i.e. through a virtual call per element.
	template <class V1> friend matrix<V> operator-(const matrix<V> & a, const matrix<V1> & b) {
	    DEBUG_VERBOSE("op matrix<V> - matrix<V1>");
		if(a.isscalar()){
			// Scalar minus matrix: read the scalar once, outside the loop.
			matrix<V> c(b.size(1),b.size(2));
			V v0 = (V)a.csi0(0);
			#pragma omp parallel for simd
			for (int i = 0 ; i < b.numel(); i++)
				c.csi0(i) = v0 - b.csi0(i);
			return c.set_attribute(TEMPORARY);
		}else if(b.isscalar()){
			// Matrix minus scalar: read the scalar once, outside the loop.
			matrix<V> c(a.size(1),a.size(2));
			V v0 = (V)b.csi0(0);
			#pragma omp parallel for simd
			for (int i = 0 ; i < a.numel(); i++)
				c.csi0(i) = a.csi0(i) - v0;
			return c.set_attribute(TEMPORARY);
		}else if(a.samesize(b)){
			// Matrix minus matrix of identical shape.
			matrix<V> c(a.size(1),a.size(2));
			#pragma omp parallel for simd
			for (int i = 0 ; i < a.numel(); i++)
				c.csi0(i) = a.csi0(i) - b.csi0(i);
			return c.set_attribute(TEMPORARY);
		}else
			MAT_ABORT("Matrix dimension mismatch in operation '-'.");
		return matrix<V>::empty(); // never is reached
	}
};

 

 

Here is the report ...

 

 


===========================================================================

Begin optimization report for: operator-<double>(matrix<double> *, const matrix<double> &, const matrix<double> &)

    Report from: Interprocedural optimizations [ipo]

INLINE REPORT: (operator-<double>(matrix<double> *, const matrix<double> &, const matrix<double> &)) [30/1774=1.7%] C:\Users\Mehdi-laptop\sico\pplab\pplab\atrix\atrix.h(694,92)
  -> INLINE (MANUAL): (696,3) matrix<double>::isscalar(const matrix<double> *) const (isz = 5) (sz = 12)
    -> INLINE (MANUAL): (1238,3) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
  -> INLINE (MANUAL): (697,14) matrix<double>::size(const matrix<double> *, int) const (isz = 3) (sz = 12)
  -> INLINE (MANUAL): (697,14) matrix<double>::matrix(matrix<double> *, int, int, attribute) (isz = 59) (sz = 72)
    -> INLINE (MANUAL): (409,3) matrix<double>::minit(matrix<double> *, int, int, attribute) (isz = 55) (sz = 66)
      -> INLINE (MANUAL): (343,3) matrix<double>::isempty(const matrix<double> *) const (isz = 5) (sz = 12)
        -> INLINE (MANUAL): (1241,3) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
      -> INLINE (MANUAL): (344,7) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
      -> INLINE (MANUAL): (348,11) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
      -> (348,11) alloc(unsigned int) (isz = 601) (sz = 609)
         [[ Inlining would exceed -Qinline-max-size value (609>230) <1>]]
      -> INLINE: (356,4) mem_profile_new(int, bool) (isz = 10) (sz = 17)
        -> DELETED: (105,2) getProcId() (isz = 0) (sz = 4)
        -> DELETED: (106,2) getProcId() (isz = 0) (sz = 4)
      -> INLINE (MANUAL): (356,4) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
  -> INLINE (MANUAL): (697,14) matrix<double>::size(const matrix<double> *, int) const (isz = 3) (sz = 12)
  -> INDIRECT- (VIRTUAL): (698,9)  matrix<double>::csi0(const matrix<double> *, int) const  (isz = 1) (sz = 9)
     [[ Callee not m with inlining pragma  <2>]]
  -> INLINE (MANUAL): (700,15) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
  -> INLINE (MANUAL): (701,5) matrix<double>::csi0(const matrix<double> *, int) const (isz = 1) (sz = 9)
  -> INDIRECT- (VIRTUAL): (701,5)  matrix<double>::csi0(const matrix<double> *, int) const  (isz = 1) (sz = 9)
     [[ Callee not m with inlining pragma  <2>]]
  -> INLINE (MANUAL): (702,4) matrix<double>::matrix(matrix<double> *, const matrix<double> &) (isz = 130) (sz = 139)
    -> INLINE (MANUAL): (446,13) matrix<double>::size(const matrix<double> *, int) const (isz = 3) (sz = 12)
    -> INLINE (MANUAL): (446,13) matrix<double>::minit(matrix<double> *, int, int, attribute) (isz = 55) (sz = 66)
      -> INLINE (MANUAL): (343,3) matrix<double>::isempty(const matrix<double> *) const (isz = 5) (sz = 12)
        -> INLINE (MANUAL): (1241,3) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
      -> INLINE (MANUAL): (344,7) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
      -> INLINE (MANUAL): (348,11) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
      -> (348,11) alloc(unsigned int) (isz = 601) (sz = 609)
         [[ Inlining would exceed -Qinline-max-size value (609>230) <1>]]
      -> INLINE: (356,4) mem_profile_new(int, bool) (isz = 10) (sz = 17)
        -> DELETED: (105,2) getProcId() (isz = 0) (sz = 4)
        -> DELETED: (106,2) getProcId() (isz = 0) (sz = 4)
      -> INLINE (MANUAL): (356,4) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
    -> INLINE (MANUAL): (446,13) matrix<double>::size(const matrix<double> *, int) const (isz = 3) (sz = 12)
    -> INLINE (MANUAL): (448,23) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
    -> INLINE (MANUAL): (449,17) matrix<double>::csi0(const matrix<double> *, int) const (isz = 1) (sz = 9)
    -> INDIRECT- (VIRTUAL): (449,17)  matrix<double>::csi0(const matrix<double> *, int) const  (isz = 1) (sz = 9)
       [[ Callee not m with inlining pragma  <2>]]
  -> INLINE (MANUAL): (702,4) matrix<double>::set_attribute(matrix<double> *, attribute) (isz = 5) (sz = 14)
  -> INLINE (MANUAL): (702,4) matrix<double>::~matrix(matrix<double> *) (isz = 39) (sz = 44)
    -> (509,7) ree(void *, unsigned int) (isz = 52) (sz = 59)
       [[ Callee not m with inlining pragma  <2>]]
    -> INLINE (MANUAL): (509,7) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
    -> INLINE (MANUAL): (512,4) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
    -> INLINE: (512,4) mem_profile_del(int, bool) (isz = 10) (sz = 17)
      -> DELETED: (109,2) getProcId() (isz = 0) (sz = 4)
      -> DELETED: (110,2) getProcId() (isz = 0) (sz = 4)
  -> INLINE (MANUAL): (703,9) matrix<double>::isscalar(const matrix<double> *) const (isz = 5) (sz = 12)
    -> INLINE (MANUAL): (1238,3) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
  -> INLINE (MANUAL): (704,14) matrix<double>::size(const matrix<double> *, int) const (isz = 3) (sz = 12)
  -> INLINE (MANUAL): (704,14) matrix<double>::matrix(matrix<double> *, int, int, attribute) (isz = 59) (sz = 72)
    -> INLINE (MANUAL): (409,3) matrix<double>::minit(matrix<double> *, int, int, attribute) (isz = 55) (sz = 66)
      -> INLINE (MANUAL): (343,3) matrix<double>::isempty(const matrix<double> *) const (isz = 5) (sz = 12)
        -> INLINE (MANUAL): (1241,3) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
      -> INLINE (MANUAL): (344,7) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
      -> INLINE (MANUAL): (348,11) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
      -> (348,11) alloc(unsigned int) (isz = 601) (sz = 609)
         [[ Inlining would exceed -Qinline-max-size value (609>230) <1>]]
      -> INLINE: (356,4) mem_profile_new(int, bool) (isz = 10) (sz = 17)
        -> DELETED: (105,2) getProcId() (isz = 0) (sz = 4)
        -> DELETED: (106,2) getProcId() (isz = 0) (sz = 4)
      -> INLINE (MANUAL): (356,4) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
  -> INLINE (MANUAL): (704,14) matrix<double>::size(const matrix<double> *, int) const (isz = 3) (sz = 12)
  -> INDIRECT- (VIRTUAL): (705,9)  matrix<double>::csi0(const matrix<double> *, int) const  (isz = 1) (sz = 9)
     [[ Callee not m with inlining pragma  <2>]]
  -> INLINE (MANUAL): (707,15) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
  -> INDIRECT- (VIRTUAL): (708,5)  matrix<double>::csi0(const matrix<double> *, int) const  (isz = 1) (sz = 9)
     [[ Callee not m with inlining pragma  <2>]]
  -> INLINE (MANUAL): (708,5) matrix<double>::csi0(const matrix<double> *, int) const (isz = 1) (sz = 9)
  -> INLINE (MANUAL): (709,4) matrix<double>::set_attribute(matrix<double> *, attribute) (isz = 5) (sz = 14)
  -> INLINE (MANUAL): (709,4) matrix<double>::matrix(matrix<double> *, const matrix<double> &) (isz = 130) (sz = 139)
    -> INLINE (MANUAL): (446,13) matrix<double>::size(const matrix<double> *, int) const (isz = 3) (sz = 12)
    -> INLINE (MANUAL): (446,13) matrix<double>::minit(matrix<double> *, int, int, attribute) (isz = 55) (sz = 66)
      -> INLINE (MANUAL): (343,3) matrix<double>::isempty(const matrix<double> *) const (isz = 5) (sz = 12)
        -> INLINE (MANUAL): (1241,3) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
      -> INLINE (MANUAL): (344,7) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
      -> INLINE (MANUAL): (348,11) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
      -> (348,11) alloc(unsigned int) (isz = 601) (sz = 609)
         [[ Inlining would exceed -Qinline-max-size value (609>230) <1>]]
      -> INLINE: (356,4) mem_profile_new(int, bool) (isz = 10) (sz = 17)
        -> DELETED: (105,2) getProcId() (isz = 0) (sz = 4)
        -> DELETED: (106,2) getProcId() (isz = 0) (sz = 4)
      -> INLINE (MANUAL): (356,4) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
    -> INLINE (MANUAL): (446,13) matrix<double>::size(const matrix<double> *, int) const (isz = 3) (sz = 12)
    -> INLINE (MANUAL): (448,23) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
    -> INLINE (MANUAL): (449,17) matrix<double>::csi0(const matrix<double> *, int) const (isz = 1) (sz = 9)
    -> INDIRECT- (VIRTUAL): (449,17)  matrix<double>::csi0(const matrix<double> *, int) const  (isz = 1) (sz = 9)
       [[ Callee not m with inlining pragma  <2>]]
  -> INLINE (MANUAL): (709,4) matrix<double>::~matrix(matrix<double> *) (isz = 39) (sz = 44)
    -> (509,7) ree(void *, unsigned int) (isz = 52) (sz = 59)
       [[ Callee not m with inlining pragma  <2>]]
    -> INLINE (MANUAL): (509,7) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
    -> INLINE (MANUAL): (512,4) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
    -> INLINE: (512,4) mem_profile_del(int, bool) (isz = 10) (sz = 17)
      -> DELETED: (109,2) getProcId() (isz = 0) (sz = 4)
      -> DELETED: (110,2) getProcId() (isz = 0) (sz = 4)
  -> INLINE (MANUAL): (710,9) matrix<double>::samesize<double>(const matrix<double> *, const matrix<double> &) const (isz = 21) (sz = 30)
    -> INLINE (MANUAL): (1244,3) matrix<double>::size(const matrix<double> *, int) const (isz = 3) (sz = 12)
    -> INLINE (MANUAL): (1244,3) matrix<double>::size(const matrix<double> *, int) const (isz = 3) (sz = 12)
  -> INLINE (MANUAL): (711,14) matrix<double>::size(const matrix<double> *, int) const (isz = 3) (sz = 12)
  -> INLINE (MANUAL): (711,14) matrix<double>::size(const matrix<double> *, int) const (isz = 3) (sz = 12)
  -> INLINE (MANUAL): (711,14) matrix<double>::matrix(matrix<double> *, int, int, attribute) (isz = 59) (sz = 72)
    -> INLINE (MANUAL): (409,3) matrix<double>::minit(matrix<double> *, int, int, attribute) (isz = 55) (sz = 66)
      -> INLINE (MANUAL): (343,3) matrix<double>::isempty(const matrix<double> *) const (isz = 5) (sz = 12)
        -> INLINE (MANUAL): (1241,3) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
      -> INLINE (MANUAL): (344,7) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
      -> INLINE (MANUAL): (348,11) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
      -> (348,11) alloc(unsigned int) (isz = 601) (sz = 609)
         [[ Inlining would exceed -Qinline-max-size value (609>230) <1>]]
      -> INLINE: (356,4) mem_profile_new(int, bool) (isz = 10) (sz = 17)
        -> DELETED: (105,2) getProcId() (isz = 0) (sz = 4)
        -> DELETED: (106,2) getProcId() (isz = 0) (sz = 4)
      -> INLINE (MANUAL): (356,4) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
  -> INLINE (MANUAL): (713,15) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
  -> INLINE (MANUAL): (714,5) matrix<double>::csi0(const matrix<double> *, int) const (isz = 1) (sz = 9)
  -> INDIRECT- (VIRTUAL): (714,5)  matrix<double>::csi0(const matrix<double> *, int) const  (isz = 1) (sz = 9)
     [[ Callee not m with inlining pragma  <2>]]
  -> INDIRECT- (VIRTUAL): (714,5)  matrix<double>::csi0(const matrix<double> *, int) const  (isz = 1) (sz = 9)
     [[ Callee not m with inlining pragma  <2>]]
  -> INLINE (MANUAL): (715,4) matrix<double>::set_attribute(matrix<double> *, attribute) (isz = 5) (sz = 14)
  -> INLINE (MANUAL): (715,4) matrix<double>::~matrix(matrix<double> *) (isz = 39) (sz = 44)
    -> (509,7) ree(void *, unsigned int) (isz = 52) (sz = 59)
       [[ Callee not m with inlining pragma  <2>]]
    -> INLINE (MANUAL): (509,7) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
    -> INLINE (MANUAL): (512,4) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
    -> INLINE: (512,4) mem_profile_del(int, bool) (isz = 10) (sz = 17)
      -> DELETED: (109,2) getProcId() (isz = 0) (sz = 4)
      -> DELETED: (110,2) getProcId() (isz = 0) (sz = 4)
  -> INLINE (MANUAL): (715,4) matrix<double>::matrix(matrix<double> *, const matrix<double> &) (isz = 130) (sz = 139)
    -> INLINE (MANUAL): (446,13) matrix<double>::size(const matrix<double> *, int) const (isz = 3) (sz = 12)
    -> INLINE (MANUAL): (446,13) matrix<double>::minit(matrix<double> *, int, int, attribute) (isz = 55) (sz = 66)
      -> INLINE (MANUAL): (343,3) matrix<double>::isempty(const matrix<double> *) const (isz = 5) (sz = 12)
        -> INLINE (MANUAL): (1241,3) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
      -> INLINE (MANUAL): (344,7) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
      -> INLINE (MANUAL): (348,11) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
      -> (348,11) alloc(unsigned int) (isz = 601) (sz = 609)
         [[ Inlining would exceed -Qinline-max-size value (609>230) <1>]]
      -> INLINE: (356,4) mem_profile_new(int, bool) (isz = 10) (sz = 17)
        -> DELETED: (105,2) getProcId() (isz = 0) (sz = 4)
        -> DELETED: (106,2) getProcId() (isz = 0) (sz = 4)
      -> INLINE (MANUAL): (356,4) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
    -> INLINE (MANUAL): (446,13) matrix<double>::size(const matrix<double> *, int) const (isz = 3) (sz = 12)
    -> INLINE (MANUAL): (448,23) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
    -> INLINE (MANUAL): (449,17) matrix<double>::csi0(const matrix<double> *, int) const (isz = 1) (sz = 9)
    -> INDIRECT- (VIRTUAL): (449,17)  matrix<double>::csi0(const matrix<double> *, int) const  (isz = 1) (sz = 9)
       [[ Callee not m with inlining pragma  <2>]]
  -> INLINE (MANUAL): (718,3) matrix<double>::empty(const matrix<double> *) (isz = 63) (sz = 70)
    -> INLINE (MANUAL): (1156,3) matrix<double>::matrix(matrix<double> *, int, int, attribute) (isz = 59) (sz = 72)
      -> INLINE (MANUAL): (409,3) matrix<double>::minit(matrix<double> *, int, int, attribute) (isz = 55) (sz = 66)
        -> INLINE (MANUAL): (343,3) matrix<double>::isempty(const matrix<double> *) const (isz = 5) (sz = 12)
          -> INLINE (MANUAL): (1241,3) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
        -> INLINE (MANUAL): (344,7) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
        -> INLINE (MANUAL): (348,11) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
        -> (348,11) alloc(unsigned int) (isz = 601) (sz = 609)
           [[ Inlining would exceed -Qinline-max-size value (609>230) <1>]]
        -> INLINE (MANUAL): (356,4) matrix<double>::numel(const matrix<double> *) const (isz = 2) (sz = 8)
        -> INLINE: (356,4) mem_profile_new(int, bool) (isz = 10) (sz = 17)
          -> DELETED: (105,2) getProcId() (isz = 0) (sz = 4)
          -> DELETED: (106,2) getProcId() (isz = 0) (sz = 4)


    Report from: Loop nest, Vector & Auto-parallelization optimizations [loop, vec, par]


LOOP BEGIN at C:\prjdir\matrix.h(699,4)
   rem17104: loop was not parallelized: existence of parallel dependence
   rem15382: vectorization support: call to function virtual call: matrix<double>::csi0(const matrix<double> *, int) const cannot be vectorized   [ C:\prjdir\matrix.h(701,5) ]
   rem15344: loop was not vectorized: vector dependence prevents vectorization
   rem25456: Number of Array Refs Scalar Replaced In Loop: 1
LOOP END

LOOP BEGIN at C:\prjdir\matrix.h(447,13) inlined into C:\prjdir\matrix.h(703,4)
   rem17104: loop was not parallelized: existence of parallel dependence
   rem15382: vectorization support: call to function virtual call: matrix<double>::csi0(const matrix<double> *, int) const cannot be vectorized   [ C:\prjdir\matrix.h(449,17) ]
   rem15344: loop was not vectorized: vector dependence prevents vectorization
   rem25456: Number of Array Refs Scalar Replaced In Loop: 1
LOOP END

LOOP BEGIN at C:\prjdir\matrix.h(706,4)
   rem17104: loop was not parallelized: existence of parallel dependence
   rem15382: vectorization support: call to function virtual call: matrix<double>::csi0(const matrix<double> *, int) const cannot be vectorized   [ C:\prjdir\matrix.h(708,5) ]
   rem15344: loop was not vectorized: vector dependence prevents vectorization
   rem25456: Number of Array Refs Scalar Replaced In Loop: 1
LOOP END

LOOP BEGIN at C:\prjdir\matrix.h(447,13) inlined into C:\prjdir\matrix.h(710,4)
   rem17104: loop was not parallelized: existence of parallel dependence
   rem15382: vectorization support: call to function virtual call: matrix<double>::csi0(const matrix<double> *, int) const cannot be vectorized   [ C:\prjdir\matrix.h(449,17) ]
   rem15344: loop was not vectorized: vector dependence prevents vectorization
   rem25456: Number of Array Refs Scalar Replaced In Loop: 1
LOOP END

LOOP BEGIN at C:\prjdir\matrix.h(712,4)
   rem17104: loop was not parallelized: existence of parallel dependence
   rem15382: vectorization support: call to function virtual call: matrix<double>::csi0(const matrix<double> *, int) const cannot be vectorized   [ C:\prjdir\matrix.h(714,5) ]
   rem15382: vectorization support: call to function virtual call: matrix<double>::csi0(const matrix<double> *, int) const cannot be vectorized   [ C:\prjdir\matrix.h(714,5) ]
   rem15344: loop was not vectorized: vector dependence prevents vectorization
   rem25456: Number of Array Refs Scalar Replaced In Loop: 1
LOOP END

LOOP BEGIN at C:\prjdir\matrix.h(447,13) inlined into C:\prjdir\matrix.h(716,4)
   rem17104: loop was not parallelized: existence of parallel dependence
   rem15382: vectorization support: call to function virtual call: matrix<double>::csi0(const matrix<double> *, int) const cannot be vectorized   [ C:\prjdir\matrix.h(449,17) ]
   rem15344: loop was not vectorized: vector dependence prevents vectorization
   rem25456: Number of Array Refs Scalar Replaced In Loop: 1
LOOP END

 

 

I have also studied the below section as discussed here ...

mikeitexpert_0-1621251366391.png

Nothing seems out of order to me ...

 

Any comments are much appreciated.

0 Kudos
1 Solution
jimdempseyatthecove
Black Belt
390 Views

Try this untested code:


#  define MAT_ASSERT(cond, msg) if(cond){ } else { std::cerr << std::endl << "Error, assertion failed in file:'" << __FILE__ << "' at line:" << 
#  define MAT_ASSERT_SHORT(cond) MAT_ASSERT(cond, #cond)


// Suggested rework of the matrix class excerpt (posted as untested): the
// class members are unchanged, but operator- now fetches raw element pointers
// once per branch and indexes them directly inside the OpenMP loops, so the
// loop bodies no longer call the virtual csi0().
template <class V>
class matrix{
	V * data;
	int nrows, ncols;
	unsigned char attribs;
// some constructor definitions

        // Total number of stored elements (nrows * ncols).
        inline int numel() const{ return nrows*ncols; }
	// Extent along dimension d: d == 1 yields the row count, any other value
	// yields the column count.
	inline int size(int d) const{ return (d==1) ? nrows : ncols; }
	

        // Linear element accessor: returns a reference to data[i]. It is
        // virtual and carries three OpenMP `declare simd` variants.
        #pragma omp declare simd 
	#pragma omp declare simd uniform(this)
	#pragma omp declare simd uniform(this) simdlen(4) linear(i)		
	virtual inline V& csi0(int i) const{
		return data[i];
	}


	// Element-wise subtraction a - b, returning a new matrix<V>.
	// Cases (selected by helpers outside this excerpt): a scalar, b scalar,
	// or both the same size; anything else ends in MAT_ABORT.
	// In every branch &x.csi0(0) is taken before the loop and the loop then
	// uses plain pointer indexing.
	// NOTE(review): indexing from &csi0(0) assumes csi0(i) refers to
	// consecutive elements, as this base implementation (data[i]) does; an
	// override of the virtual csi0 could behave differently - confirm.
	// NOTE(review): `b` is a matrix<V1>, so &b.csi0(0) has type V1*, while
	// `_b` is declared V*; this looks like it only compiles when V1 is V -
	// confirm before reusing for mixed element types.
	template <class V1> friend matrix<V> operator-(const matrix<V> & a, const matrix<V1> & b) {
	    DEBUG_VERBOSE("op matrix<V> - matrix<V1>");
		if(a.isscalar()){
			// Scalar minus matrix: result shaped like b.
			matrix<V> c(b.size(1),b.size(2));
			V v0 = (V)a.csi0(0);
			V * _c = &c.csi0(0);
			V * _b = &b.csi0(0);
			#pragma omp parallel for simd
			for (int i = 0 ; i < b.numel(); i++)
				_c[i] = v0 - _b[i];
			return c.set_attribute(TEMPORARY);
		}else if(b.isscalar()){
			// Matrix minus scalar: result shaped like a.
			matrix<V> c(a.size(1),a.size(2));
			V v0 = (V)b.csi0(0);
			V * _c = &c.csi0(0);
			V * _a = &a.csi0(0);
			#pragma omp parallel for simd
			for (int i = 0 ; i < a.numel(); i++)
				_c[i] = _a[i] - v0;
			return c.set_attribute(TEMPORARY);
		}else if(a.samesize(b)){
			// Matrix minus matrix of identical shape.
			matrix<V> c(a.size(1),a.size(2));
			V * _c = &c.csi0(0);
			V * _a = &a.csi0(0);
			V * _b = &b.csi0(0);
			#pragma omp parallel for simd
			for (int i = 0 ; i < a.numel(); i++)
				_c[i] = _a[i] - _b[i];
			return c.set_attribute(TEMPORARY);
		}else
			MAT_ABORT("Matrix dimension mismatch in operation '-'.");
		return matrix<V>::empty(); // never is reached
	}
};


Note, by making data public, you could use c.data[i] = a.data[i] - b.data[i];

etc... for the +/*,...

 

Jim Dempsey

View solution in original post

7 Replies
jimdempseyatthecove
Black Belt
490 Views

I think the compiler can assume the &'s (references) can (potentially) be aliased; had you used * (pointer), you could apply the "restrict" attribute. Try using a temporary pointer to the array, then call an inlined templated function that uses restrict on the pointers to perform the operation. I am not sure whether you can use restrict on a reference.

Jim Dempsey

mikeitexpert
New Contributor II
466 Views

Thank you for the clue ... however, the code is pretty big and changing signatures would be cumbersome at this point. Interestingly enough, I can get pretty smooth vectorization using simplified examples (I was hoping I could reproduce the issue, but that apparently failed).

So I am guessing there are other parts of the application that could possibly cause the issue (perhaps some sort of side effect, maybe caused by template instantiation). In other words, in the smaller example I can vectorize the operator+/- just fine, but in the main application vectorization fails due to vector dependence.

Do you think inlining would be a hindrance for vectorization (perhaps causing vector dependence)? Hypothetically, let's say the overloaded operator+/- is hinted to have a SIMD-able loop (i.e., using #pragma omp parallel for simd, and #pragma omp declare simd for all functions called inside the operator's main for loop and all other functions called, nested, within the loop), and I forgot to declare different SIMD variants of the inner functions (using #pragma omp declare simd), so the compiler tries to choose from a smaller set of SIMD-enabled inner function variants and then tries to inline the operator loop throughout the application. Then it realizes that it has to give up vectorization in order to inline the operator's code, so that the operator+/- main loop can be inlined across the application. Would such a (probably ill-posed) scenario potentially cause vector dependence?

Many thanks for your valuable comments.

 

 

VidyalathaB_Intel
Moderator
407 Views

Hi,

Could you please provide us with a minimal reproducer so that we can work on it from our end?

Regards,

Vidya.


jimdempseyatthecove
Black Belt
391 Views

Try this untested code:


#  define MAT_ASSERT(cond, msg) if(cond){ } else { std::cerr << std::endl << "Error, assertion failed in file:'" << __FILE__ << "' at line:" << 
#  define MAT_ASSERT_SHORT(cond) MAT_ASSERT(cond, #cond)


// Suggested rework of the matrix class excerpt (posted as untested): the
// class members are unchanged, but operator- now fetches raw element pointers
// once per branch and indexes them directly inside the OpenMP loops, so the
// loop bodies no longer call the virtual csi0().
template <class V>
class matrix{
	V * data;
	int nrows, ncols;
	unsigned char attribs;
// some constructor definitions

        // Total number of stored elements (nrows * ncols).
        inline int numel() const{ return nrows*ncols; }
	// Extent along dimension d: d == 1 yields the row count, any other value
	// yields the column count.
	inline int size(int d) const{ return (d==1) ? nrows : ncols; }
	

        // Linear element accessor: returns a reference to data[i]. It is
        // virtual and carries three OpenMP `declare simd` variants.
        #pragma omp declare simd 
	#pragma omp declare simd uniform(this)
	#pragma omp declare simd uniform(this) simdlen(4) linear(i)		
	virtual inline V& csi0(int i) const{
		return data[i];
	}


	// Element-wise subtraction a - b, returning a new matrix<V>.
	// Cases (selected by helpers outside this excerpt): a scalar, b scalar,
	// or both the same size; anything else ends in MAT_ABORT.
	// In every branch &x.csi0(0) is taken before the loop and the loop then
	// uses plain pointer indexing.
	// NOTE(review): indexing from &csi0(0) assumes csi0(i) refers to
	// consecutive elements, as this base implementation (data[i]) does; an
	// override of the virtual csi0 could behave differently - confirm.
	// NOTE(review): `b` is a matrix<V1>, so &b.csi0(0) has type V1*, while
	// `_b` is declared V*; this looks like it only compiles when V1 is V -
	// confirm before reusing for mixed element types.
	template <class V1> friend matrix<V> operator-(const matrix<V> & a, const matrix<V1> & b) {
	    DEBUG_VERBOSE("op matrix<V> - matrix<V1>");
		if(a.isscalar()){
			// Scalar minus matrix: result shaped like b.
			matrix<V> c(b.size(1),b.size(2));
			V v0 = (V)a.csi0(0);
			V * _c = &c.csi0(0);
			V * _b = &b.csi0(0);
			#pragma omp parallel for simd
			for (int i = 0 ; i < b.numel(); i++)
				_c[i] = v0 - _b[i];
			return c.set_attribute(TEMPORARY);
		}else if(b.isscalar()){
			// Matrix minus scalar: result shaped like a.
			matrix<V> c(a.size(1),a.size(2));
			V v0 = (V)b.csi0(0);
			V * _c = &c.csi0(0);
			V * _a = &a.csi0(0);
			#pragma omp parallel for simd
			for (int i = 0 ; i < a.numel(); i++)
				_c[i] = _a[i] - v0;
			return c.set_attribute(TEMPORARY);
		}else if(a.samesize(b)){
			// Matrix minus matrix of identical shape.
			matrix<V> c(a.size(1),a.size(2));
			V * _c = &c.csi0(0);
			V * _a = &a.csi0(0);
			V * _b = &b.csi0(0);
			#pragma omp parallel for simd
			for (int i = 0 ; i < a.numel(); i++)
				_c[i] = _a[i] - _b[i];
			return c.set_attribute(TEMPORARY);
		}else
			MAT_ABORT("Matrix dimension mismatch in operation '-'.");
		return matrix<V>::empty(); // never is reached
	}
};


Note, by making data public, you could use c.data[i] = a.data[i] - b.data[i];

etc... for the +/*,...

 

Jim Dempsey

VidyalathaB_Intel
Moderator
319 Views

Hi Mike,

Could you please confirm whether your issue is resolved and whether we can close this thread?

Regards,

Vidya


mikeitexpert
New Contributor II
289 Views
VidyalathaB_Intel
Moderator
275 Views

Hi Mike,

Thanks for the confirmation!

As this issue has been resolved, we will no longer respond to this thread. 

If you require any additional assistance from Intel, please start a new thread. 

Any further interaction in this thread will be considered community only. 

Have a Good day.

Regards,

Vidya.


Reply