- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Email to a Friend
- Report Inappropriate Content

Hi Everyone,

I am familiarizing myself with SIMD-enabled functions. While testing the simdlen clause, I came across an observation that I have a hard time explaining. Here is what is going on ...

I have a matrix class which overloads binary operator + and I have **csi0(int i)** method returning the i-th element of the matrix assuming it is stored in a column major manner.

To benefit from vectorization, I declare **csi0** SIMD-enabled using **"#pragma omp declare simd simdlen(32)"**, and for operator+ I use **"#pragma omp parallel for simd"** right before the addition loop, as you can see below:

**test_vec.cpp**

```
#include <iostream>
#include <ctime>
#include "data_types.h"
using std::ostream;
using std::cout;
using std::endl;
#define MEM_ALIGN (512/8)
#define USE_ALIGNED_MEM
// DEBUG_VERBOSE(msg): when MAT_DEBUG_VERBOSE is defined, streams `msg` followed by the
// current source line to std::cerr; otherwise it expands to nothing.
// The macro used to be defined twice in a row with different bodies (an invalid
// redefinition); only the later, line-only form is kept because that is the one
// compilers that merely warn end up using.
#ifdef MAT_DEBUG_VERBOSE
# define DEBUG_VERBOSE(msg) std::cerr << std::endl << msg << " line:" << __LINE__ << std::endl
#else
# define DEBUG_VERBOSE(msg)
#endif
// Bit flags describing a matrix; individual flags occupy distinct bits so they
// can be OR-ed together into a single byte.
enum attribute{
    // no attributes set
    NONE      = 0,
    // data type attributes
    COMPLEX   = 1 << 0,
    SPARSE    = 1 << 1,
    UNUSED_1  = 1 << 2,
    UNUSED_2  = 1 << 3,
    // internal operational attributes
    DOT       = 1 << 4,
    LEFT      = 1 << 5,
    TEMPORARY = 1 << 6,
    UNUSED_3  = 1 << 7
};
// Dense matrix of V whose elements live in one contiguous buffer obtained from
// _mm_malloc with MEM_ALIGN alignment. Elements are addressed linearly through
// csi0(i). The object owns its buffer: it is released in the destructor and
// deep-copied on copy construction / copy assignment.
template <class V>
class matrix{
public:
    // typedef __declspec(align(64)) V VA;
    typedef V VA;
    VA * data;              // element buffer; NULL when the matrix has no elements
    int nrows, ncols;
    unsigned char attribs;  // bitwise OR of `attribute` flags

    // Sets the given flag and returns *this so calls can be chained.
    inline matrix<V>& set_attribute(attribute attrib){
        attribs |= (unsigned char) attrib;
        return *this;
    }
    // Clears the given flag and returns *this so calls can be chained.
    inline matrix<V>& clear_attribute(attribute attrib){
        attribs &= ~((unsigned char) attrib);
        return *this;
    }
    // Returns true when the given flag is set.
    inline bool get_attribute(attribute attrib) const{
        return (attribs & (unsigned char)attrib) != 0;
    }

    // Records the shape, resets the flags and allocates an nr-by-nc buffer
    // (or sets data to NULL when there are no elements).
    // Does NOT release a buffer the object may already hold; call mfree() first.
    inline void minit(int nr, int nc){
        nrows = nr;
        ncols = nc;
        attribs = 0;  // no attributes set; previously left uninitialized
        if (numel()){
            // sizeof(V) comes first so the product is evaluated in size_t
            // rather than overflowing in int for large shapes.
            data = (VA*) _mm_malloc(sizeof(V) * nr * nc, MEM_ALIGN);
        } else {
            data = NULL;
        }
    }

    // Releases the buffer (if any) and leaves the matrix empty.
    inline void mfree(){
        if (data != NULL){
            _mm_free(data);
        }
        data = NULL;
        nrows = 0;
        ncols = 0;
    }

    // default constructor: empty 0x0 matrix
    matrix(){
        minit(0,0);
    }
    // nr-by-nc matrix with uninitialized elements
    matrix(int nr, int nc){
        minit(nr, nc);
    }
    // nr-by-nc matrix with every element set to v0
    matrix(int nr, int nc, V v0){
        minit(nr, nc);
        const int n = numel();
        for(int i = 0; i < n; i++){
            data[i] = v0;
        }
    }
    // copy constructor: deep copy of shape, flags and elements
    matrix(const matrix<V>& other){
        minit(other.nrows, other.ncols);
        attribs = other.attribs;
        const int n = numel();
        for(int i = 0; i < n; i++){
            data[i] = other.data[i];
        }
    }
    // copy assignment: deep copy; the existing buffer is reused when the
    // element counts already match
    matrix<V>& operator=(const matrix<V>& other){
        if (this != &other){
            if (numel() != other.numel()){
                mfree();
                minit(other.nrows, other.ncols);
            } else {
                nrows = other.nrows;
                ncols = other.ncols;
            }
            attribs = other.attribs;
            const int n = numel();
            for(int i = 0; i < n; i++){
                data[i] = other.data[i];
            }
        }
        return *this;
    }
    // virtual because the class already exposes a virtual member (csi0)
    virtual ~matrix(){
        mfree();
    }

    // Returns a reference to the i-th element of the linear buffer (0-based).
    // No bounds checking is performed.
    #pragma omp declare simd uniform(this) linear(i:1) simdlen(32)
    virtual inline V& csi0(int i) const{
        return data[i];
    }

    // binary +
    // Element-wise sum. If either operand holds exactly one element it is
    // added to every element of the other operand; otherwise both operands
    // must have the same shape. The result carries the TEMPORARY flag.
    // An empty matrix is returned when the shapes are incompatible.
    template <class V1> friend matrix<V> operator+(const matrix<V> & a, const matrix<V1> & b) {
        DEBUG_VERBOSE("op matrix<V> + matrix<V1>");
        if(a.isscalar()){
            matrix<V> c(b.size(1),b.size(2));
            const V v0 = (V)a.csi0(0);
            const int n = b.numel();
            #pragma omp parallel for simd
            for (int i = 0 ; i < n; i++)
                c.csi0(i) = v0 + b.csi0(i) ;
            c.set_attribute(TEMPORARY);
            return c;  // returned by name so the copy can be elided
        }else if(b.isscalar()){
            matrix<V> c(a.size(1), a.size(2));
            const V v0 = (V)b.csi0(0);
            const int n = a.numel();
            #pragma omp parallel for simd
            for (int i = 0 ; i < n; i++)
                c.csi0(i) = a.csi0(i) + v0;
            c.set_attribute(TEMPORARY);
            return c;
        }else if(a.samesize(b)){
            matrix<V> c(a.size(1), a.size(2));
            const int n = a.numel();
            #pragma omp parallel for simd
            for (int i = 0 ; i < n; i++)
                c.csi0(i) = a.csi0(i) + b.csi0(i);
            c.set_attribute(TEMPORARY);
            return c;
        }else
            return matrix<V>::empty(); // reached when neither operand is scalar and the shapes differ
    }

    // Total number of elements.
    inline int numel() const{ return nrows*ncols; }
    // Extent along dimension d: rows for d == 1, columns for any other d.
    inline int size(int d) const{ return (d==1) ? nrows : ncols; }
    // True when the matrix has no elements.
    inline dt_bool isempty() const{
        return numel() == 0;
    }
    // Returns a 0x0 matrix.
    static inline const matrix<V> empty() {
        return matrix<V>(0,0);
    }
    // True when m has the same number of rows and columns as this matrix.
    template <class V1> inline dt_bool samesize(const matrix<V1>& m) const{
        return nrows == m.size(1) && ncols == m.size(2);
    }
    // True when the matrix holds exactly one element.
    inline dt_bool isscalar() const{
        return numel() == 1;
    }
};
// Prints the current source line number as a coarse progress marker.
#define printline std::cout << " LINE: " << __LINE__ << std::endl;
// Dimensions of the matrices built in main().
#define NROW 1400
#define NCOL 1400
// Builds two NROW x NCOL matrices, adds them once and reports the clock() time
// spent between entering main and leaving the inner scope.
int main(){
    const clock_t t_begin = clock();
    printline
    {
        // Operands filled with 1 and 2 respectively; C receives their sum.
        matrix<dt_single> A(NROW, NCOL, 1);
        // std::cout << A << std::endl;
        matrix<dt_single> B(NROW, NCOL, 2);
        matrix<dt_single> C;
        printline
        C = A + B;
    }
    const clock_t t_end = clock();
    const double elapsed_sec = (t_end - t_begin) / double(CLOCKS_PER_SEC);
    printf("Time ellapsed: %f \n", elapsed_sec);
    printline
    // std::cout << C << std::endl;
    std::cout << "Program Done!" << std::endl;
    return 0;
}
```

And I build it using the command below:

```
C:\prjdir\tmp\simdtests>make clean all && test_vec.exe
rm -f *.exp *.lib *.supp *.exe *.obj *.optrpt
icl.exe -Qopt-report:5 -Qopt-report-phase:all -Qopenmp -Qsimd -Qopenmp-simd -arch:avx -arch:core-avx2 -axcode:core-i7-avx -Qdiag-error-limit:5 -I. -DUSE_LOCAL_HEAP -DUSE_MM_MALLOC -c test_vec.cpp -Fo:test_vec.obj
Intel(R) C++ Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 19.1.2.254 Build 20200623
Copyright (C) 1985-2020 Intel Corporation. All rights reserved.
icl: command line warning #10121: overriding '/archavx' with '/archcore-avx2'
icl: command line warning #10006: ignoring unknown option '/axcode:core-i7-avx'
icl: remark #10397: optimization reports are generated in *.optrpt files in the output location
test_vec.cpp
xilink.exe test_vec.obj -LIBPATH:../../Debug/lib -out:test_vec.exe libutils.lib libmemory.lib
xilink: executing 'link'
Microsoft (R) Incremental Linker Version 14.28.29913.0
Copyright (C) Microsoft Corporation. All rights reserved.
test_vec.obj
-LIBPATH:../../Debug/lib
-out:test_vec.exe
libutils.lib
libmemory.lib
LINE: 140
LINE: 146
Time ellapsed: 0.025000
LINE: 152
Program Done!
C:\prjdir\tmp\simdtests>
```

**Here is the optimization report file**

```
LOOP BEGIN at C:\prjdir\tmp\simdtests\test_vec.cpp(103,13) inlined into C:\prjdir\tmp\simdtests\test_vec.cpp(147,15)
remark #15388: vectorization support: reference c has aligned access [ C:\prjdir\tmp\simdtests\test_vec.cpp(105,19) ]
remark #15415: vectorization support: indirect load was generated for the variable <*__dereg>, 64-bit indexed, part of address is result of call to function [ C:\prjdir\tmp\simdtests\test_vec.cpp(105,36) ]
remark #15415: vectorization support: indirect load was generated for the variable <at (105:48)>, 64-bit indexed, part of address is result of call to function [ C:\prjdir\tmp\simdtests\test_vec.cpp(105,48) ]
remark #15305: vectorization support: vector length 32
remark #15309: vectorization support: normalized vectorization overhead 0.051
remark #15301: SIMD LOOP WAS VECTORIZED
remark #15442: entire loop may be executed in remainder
remark #15449: unmasked aligned unit stride stores: 1
remark #15462: unmasked indexed (or gather) loads: 2
remark #15475: --- begin vector cost summary ---
remark #15476: scalar cost: 210
remark #15477: vector cost: 35.710
remark #15478: estimated potential speedup: 4.890
remark #15484: vector function calls: 2
remark #15488: --- end vector cost summary ---
remark #15489: --- begin vector function matching report ---
remark #15490: Function call: (unknown virtual call) with simdlen=32, actual parameter types: (uniform,linear:1) [ C:\prjdir\tmp\simdtests\test_vec.cpp(105,36) ]
remark #15492: A suitable vector variant was found (out of 2) with xmm, simdlen=32, unmasked, formal parameter types: (uniform,linear:1)
remark #26014: The function ISA does not match the compilation target. For better SIMD performance, consider using -Qvecabi=cmdtarget compiler switch or "processor" clause in vector function declaration
remark #15490: Function call: (unknown virtual call) with simdlen=32, actual parameter types: (uniform,linear:1) [ C:\prjdir\tmp\simdtests\test_vec.cpp(105,48) ]
remark #15492: A suitable vector variant was found (out of 2) with xmm, simdlen=32, unmasked, formal parameter types: (uniform,linear:1)
remark #26014: The function ISA does not match the compilation target. For better SIMD performance, consider using -Qvecabi=cmdtarget compiler switch or "processor" clause in vector function declaration
remark #15493: --- end vector function matching report ---
remark #25456: Number of Array Refs Scalar Replaced In Loop: 1
LOOP END
LOOP BEGIN at C:\prjdir\tmp\simdtests\test_vec.cpp(103,13) inlined into C:\prjdir\tmp\simdtests\test_vec.cpp(147,15)
<Remainder loop for vectorization>
remark #25456: Number of Array Refs Scalar Replaced In Loop: 1
LOOP END
```

When I check the optimization reports generated during compiling and linking, I see **simdlen=32**, which is really odd because __my laptop CPU supports only AVX2__, which has only 256 bits per SIMD register. If the SIMD length is 32 after optimization, this would mean that each SIMD register is 1024 bits long, which sounds odd.

**SIMD register size (in bits) = 32 lanes × sizeof(float) × 8 bits per byte = 32 × 4 × 8 = 2^10 = 1024.**

I would appreciate if an expert elaborate on this.

Link Copied

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Email to a Friend
- Report Inappropriate Content

Hi,

Thanks for reaching out to us.

Could you please provide us the details of **data_types.h** file so that we can try it from our end.

Regards,

Vidya.

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Email to a Friend
- Report Inappropriate Content

Hi,

**data_types.h** is not relevant; it only contains macro definitions for dt_bool.

Thanks for the follow up.

```
#ifndef __DATA_TYPES_H__
#define __DATA_TYPES_H__
#define dt_bool bool
#define dt_logical bool
#endif /* __DATA_TYPES_H__ */
```

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Email to a Friend
- Report Inappropriate Content

You mentioned dt_bool data type is only a byte, but on your calculation you used a float.

Have you tested with other compilers to see if it does any difference?

For some reason, I got an error ("dt_single" is undefined) when trying to compile your code. Can you attach a preprocessed file?

Thanks,

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Email to a Friend
- Report Inappropriate Content

I am not sure if your calculation is correct; but if your vector length is big, you would see a warning.

$ gcc -fopenmp test3.c

$ icc -fopenmp test3.c

$ gcc -fopenmp test3.c -D_LEN64

test3.c:11:8: warning: unsupported simdlen 64

double temp ( float *A, float *B , int i ) {

^~~~

$ icc -fopenmp test3.c -D_LEN64

test3.c(11): (col. 44) remark: function can't be vectorized: too many registers required to return value (big vector length)

test3.c(11): (col. 44) error #13397: vector function was not vectorized

test3.c(11): (col. 44) remark: function can't be vectorized: too many registers required to return value (big vector length)

test3.c(11): (col. 44) error #13397: vector function was not vectorized

compilation aborted for test3.c (code 1)

$ cat test3.c

#include <stdio.h>

#include <math.h>

#define SIZE 512*512

/* Returns the reciprocal of A squared, 1 / (A * A), in single precision. */
float foo(float A ) {
    const float squared = A * A;
    return 1.0f / squared;
}

/* Vector length requested for temp(): 64 when built with -D_LEN64, otherwise 32.
   Only the simdlen value differs between the two builds, so the function is
   defined once and the length is selected here. */
#ifdef _LEN64
#define TEMP_SIMDLEN 64
#else
#define TEMP_SIMDLEN 32
#endif

/* Returns B[i] * foo(A[i]) widened to double.
   A and B are the same for every SIMD lane (uniform); i advances by 1 per lane. */
#pragma omp declare simd uniform(A, B) linear(i : 1) simdlen(TEMP_SIMDLEN)
double temp ( float *A, float *B , int i ) {
    return B[i] * foo(A[i]) ;
}

/* Driver: evaluates temp() for every index so the compiler has a loop that
   calls the SIMD-enabled function. */
int main ( ) {
    /* Static storage: the three arrays total 4 MiB, which can exceed the
       default stack size on some platforms when declared as automatics. */
    static float A[SIZE], B[SIZE];
    static double C[SIZE];

    /* Give the inputs defined, non-zero values; the original read
       uninitialized automatic arrays, which is undefined behaviour. */
    for ( int i = 0 ; i < SIZE ; i ++){
        A[i] = 1.0f;
        B[i] = (float) i;
    }

    for ( int i = 0 ; i < SIZE ; i ++){
        C[i] = temp (A, B , i ) ;
    }

    return 0;
}

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Email to a Friend
- Report Inappropriate Content

Thanks for the example ... though I am not sure how it explains my case, because there is no error and, at least based on the reports, the loop in my code is vectorized just fine. I am not sure if it is just something showing up in the report or whether there is some sort of abstraction layer between the hardware and the compiler-generated code that virtually simulates larger vector sizes ... I am a software guy with a couple of years of embedded programming experience (though not an expert in hardware standards, abstractions, etc.)

I hope @jimdempseyatthe

Regards

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Email to a Friend
- Report Inappropriate Content

Hi Mike,

I don't have any more input on this topic and hope others can give us some feedback.

Thanks,

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Email to a Friend
- Report Inappropriate Content

Hi Mike,

Is it OK to close this thread from our end?

Thanks,

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Email to a Friend
- Report Inappropriate Content

Hi Mike,

Thanks for the kudos. We will no longer respond to this thread. If you require additional assistance from Intel, please start a new thread. Any further interaction in this thread will be considered community only.

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Email to a Friend
- Report Inappropriate Content

*>>>When I check the optimization reports generated during compiling and linking, I get to see simdlen=32 in which really odd because my laptop CPU supports only AVX2 which has only 256 bits per SIMD Registers. If simd len is 32 after optimization this means that each simd register is 1024 bit long which sounds odd.>>>*

Architecturally, each register in the *PRF is probably 256 bits wide, divided into two 128-bit lanes. There are no 1024-bit software-visible registers on either current Intel or current AMD processors. Large PRFs were more common in the era of vector processors (you may look up an Alpha extension, i.e. the Tarantula VP).

Now in regards to "simdlen=32" I suppose, that it might have been used mainly for loop unrolling purpose (e.g. 4x unrolling).

*Physical Register File

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Email to a Friend
- Report Inappropriate Content

Thanks for the valuable comments on Tarantula extensions. I am curious how comfortable they are to work with. Is there any good compiler for them? How well are they supported by intel community (or perhaps other popular vendors)?

I doubt it has anything to do with unrolling cuz they are reported separately as part of vectorization report. (see below)

```
LOOP BEGIN at matrix.h(1645,3) inlined into myapp.h(732,9)
remark #25084: Preprocess Loopnests: Moving Out Store [ matrix.h(1645,32) ]
remark #15389: vectorization support: reference this->data[i] has unaligned access [ matrix.h(1646,4) ]
remark #15389: vectorization support: reference this->data[i] has unaligned access [ matrix.h(1646,4) ]
remark #15381: vectorization support: unaligned access used inside loop body
remark #15305: vectorization support: vector length 8
remark #15399: vectorization support: unroll factor set to 2
remark #15309: vectorization support: normalized vectorization overhead 1.067
remark #15355: vectorization support: nrm is double type reduction [ matrix.h(1646,4) ]
remark #15301: SIMD LOOP WAS VECTORIZED
remark #15442: entire loop may be executed in remainder
remark #15450: unmasked unaligned unit stride loads: 1
remark #15475: --- begin vector cost summary ---
remark #15476: scalar cost: 7
remark #15477: vector cost: 1.870
remark #15478: estimated potential speedup: 3.200
remark #15488: --- end vector cost summary ---
remark #25456: Number of Array Refs Scalar Replaced In Loop: 1
LOOP END
```

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Email to a Friend
- Report Inappropriate Content

@mikeitexpert wrote:

Thanks for the valuable comments on Tarantula extensions. I am curious how comfortable they are to work with. Is there any good compiler for them? How well are they supported by intel community (or perhaps other popular vendors)?

I doubt it has anything to do with unrolling cuz they are reported separately as part of vectorization report. (see below)

`LOOP BEGIN at matrix.h(1645,3) inlined into myapp.h(732,9) remark #25084: Preprocess Loopnests: Moving Out Store [ matrix.h(1645,32) ] remark #15389: vectorization support: reference this->data[i] has unaligned access [ matrix.h(1646,4) ] remark #15389: vectorization support: reference this->data[i] has unaligned access [ matrix.h(1646,4) ] remark #15381: vectorization support: unaligned access used inside loop body remark #15305: vectorization support: vector length 8 remark #15399: vectorization support: unroll factor set to 2 remark #15309: vectorization support: normalized vectorization overhead 1.067 remark #15355: vectorization support: nrm is double type reduction [ matrix.h(1646,4) ] remark #15301: SIMD LOOP WAS VECTORIZED remark #15442: entire loop may be executed in remainder remark #15450: unmasked unaligned unit stride loads: 1 remark #15475: --- begin vector cost summary --- remark #15476: scalar cost: 7 remark #15477: vector cost: 1.870 remark #15478: estimated potential speedup: 3.200 remark #15488: --- end vector cost summary --- remark #25456: Number of Array Refs Scalar Replaced In Loop: 1 LOOP END`

@mikeitexpert wrote:

Thanks for the valuable comments on Tarantula extensions. I am curious how comfortable they are to work with. Is there any good compiler for them? How well are they supported by intel community (or perhaps other popular vendors)?

I doubt it has anything to do with unrolling cuz they are reported separately as part of vectorization report. (see below)

`LOOP BEGIN at matrix.h(1645,3) inlined into myapp.h(732,9) remark #25084: Preprocess Loopnests: Moving Out Store [ matrix.h(1645,32) ] remark #15389: vectorization support: reference this->data[i] has unaligned access [ matrix.h(1646,4) ] remark #15389: vectorization support: reference this->data[i] has unaligned access [ matrix.h(1646,4) ] remark #15381: vectorization support: unaligned access used inside loop body remark #15305: vectorization support: vector length 8 remark #15399: vectorization support: unroll factor set to 2 remark #15309: vectorization support: normalized vectorization overhead 1.067 remark #15355: vectorization support: nrm is double type reduction [ matrix.h(1646,4) ] remark #15301: SIMD LOOP WAS VECTORIZED remark #15442: entire loop may be executed in remainder remark #15450: unmasked unaligned unit stride loads: 1 remark #15475: --- begin vector cost summary --- remark #15476: scalar cost: 7 remark #15477: vector cost: 1.870 remark #15478: estimated potential speedup: 3.200 remark #15488: --- end vector cost summary --- remark #25456: Number of Array Refs Scalar Replaced In Loop: 1 LOOP END`

@mikeitexpert wrote:

`LOOP BEGIN at matrix.h(1645,3) inlined into myapp.h(732,9) remark #25084: Preprocess Loopnests: Moving Out Store [ matrix.h(1645,32) ] remark #15389: vectorization support: reference this->data[i] has unaligned access [ matrix.h(1646,4) ] remark #15389: vectorization support: reference this->data[i] has unaligned access [ matrix.h(1646,4) ] remark #15381: vectorization support: unaligned access used inside loop body remark #15305: vectorization support: vector length 8 remark #15399: vectorization support: unroll factor set to 2 remark #15309: vectorization support: normalized vectorization overhead 1.067 remark #15355: vectorization support: nrm is double type reduction [ matrix.h(1646,4) ] remark #15301: SIMD LOOP WAS VECTORIZED remark #15442: entire loop may be executed in remainder remark #15450: unmasked unaligned unit stride loads: 1 remark #15475: --- begin vector cost summary --- remark #15476: scalar cost: 7 remark #15477: vector cost: 1.870 remark #15478: estimated potential speedup: 3.200 remark #15488: --- end vector cost summary --- remark #25456: Number of Array Refs Scalar Replaced In Loop: 1 LOOP END`

The Tarantula vector extension was probably never implemented in working silicon (it is an academic work). These extensions are not supported by the Intel community. The closest Intel product (to those extensions) was/is Xeon Phi. Bear in mind that Tarantula operated on a large 8 KiB vector register file and was a single-core unit.

The vectorization report states: "2x unroll factor". In regards to "simdlen" integer argument (a large ones) one possibility is to look at the OpenMP implementation and search for keyword "simdlen", maybe source code will have some clues.

- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page