Intel® C++ Compiler
Community support and assistance for creating C++ code that runs on platforms based on Intel® processors.
7956 Discussions

How is simdlen=32 even possible?!!!

mikeitexpert
New Contributor II
1,993 Views

Hi Everyone, 

 

I am familiarizing myself with simd-enabled functions. While I was testing the simdlen clause, I came across an observation that I have a hard time justifying to myself. Here is what is going on ...

 

I have a matrix class which overloads binary operator + and I have csi0(int i) method returning the i-th element of the matrix assuming it is stored in a column major manner. 

To benefit from vectorization, I declare csi0 simd-enabled using "#pragma omp declare simd simdlen(32)" and for operator+, I use "#pragma omp parallel for simd" right before the addition loop as you see below:

test_vec.cpp

 

 

 

#include <cstdio>    // printf in main()
#include <ctime>
#include <iostream>

#include "data_types.h"

using std::ostream;
using std::cout;
using std::endl;

// Alignment (in bytes) for matrix element storage: 512 bits = 64 bytes,
// i.e. one full cache line / AVX-512 register width.
#define MEM_ALIGN (512/8)
#define USE_ALIGNED_MEM
#ifdef MAT_DEBUG_VERBOSE
// BUGFIX: DEBUG_VERBOSE was defined twice in a row; the second definition
// silently overrode the first (the file+line variant was dead code and the
// redefinition triggers a compiler warning). Keep the definition that was
// actually in effect.
#  define DEBUG_VERBOSE(msg) std::cerr << std::endl << msg << " line:" << __LINE__ << std::endl
#else
// Release builds: debug tracing compiles away to nothing.
#  define DEBUG_VERBOSE(msg)
#endif

// Bit flags describing a matrix; OR-ed together into matrix::attribs.
// Values are powers of two so each flag occupies its own bit.
enum attribute{
    NONE      = 0,    // no attributes set

    // data type attributes
    COMPLEX   = 1,
    SPARSE    = 2,
    UNUSED_1  = 4,
    UNUSED_2  = 8,

    // internal operational attributes
    DOT       = 16,
    LEFT      = 32,
    TEMPORARY = 64,
    UNUSED_3  = 128
};


template <class V>
class matrix{
public:
    // typedef __declspec(align(64)) V VA;
    typedef V VA;
    VA * data;   
    int nrows, ncols;
    unsigned char attribs;

    inline matrix<V>& set_attribute(attribute attrib){
        attribs |= (unsigned char) attrib;
        return *this;
    }
    inline matrix<V>& clear_attribute(attribute attrib){
        attribs &= ~((unsigned char) attrib);
        return *this;
    }
    inline bool get_attribute(attribute attrib) const{
           return (attribs & (unsigned char)attrib) != 0;
    }


    inline void minit(int nr, int nc){        
        nrows = nr;
        ncols = nc;
        if (numel()){
            data = (VA*) _mm_malloc(nr*nc*sizeof(V), MEM_ALIGN);
        } else {
            data = NULL;
        }
    }

    // default constructor
    matrix(){
        minit(0,0);
    }

    matrix(int nr, int nc){
        minit(nr, nc);
        nrows = nr;
    	ncols = nc;
    }

    matrix(int nr, int nc, V v0){
        matrix<V> m(nr, nc);
        for(int i = 0; i < m.numel(); i++){
            m.csi0(i) = v0;
        }
        nrows = m.nrows;
        ncols = m.ncols;
        data = m.data;
    }

    #pragma omp declare simd uniform(this) linear(i:1) simdlen(32)
    virtual inline V& csi0(int i) const{
        return data[i];
    }

    // binary +
    template <class V1> friend matrix<V> operator+(const matrix<V> & a, const matrix<V1> & b) {
            DEBUG_VERBOSE("op matrix<V> + matrix<V1>");
        if(a.isscalar()){
            matrix<V> c(b.size(1),b.size(2));
            V v0 = (V)a.csi0(0);
            #pragma omp parallel for simd
            for (int i = 0 ; i < b.numel(); i++)
                c.csi0(i) = v0 + b.csi0(i) ;
            return c.set_attribute(TEMPORARY);
        }else if(b.isscalar()){
            matrix<V> c(a.size(1), a.size(2));
            V v0 = (V)b.csi0(0);
            #pragma omp parallel for simd
            for (int i = 0 ; i < a.numel(); i++)
                c.csi0(i) = a.csi0(i) + v0;
                        return c.set_attribute(TEMPORARY);
        }else if(a.samesize(b)){
            matrix<V> c(a.size(1), a.size(2));
            #pragma omp parallel for simd
            for (int i = 0 ; i < a.numel(); i++)
                c.csi0(i) = a.csi0(i) + b.csi0(i);
            return c.set_attribute(TEMPORARY);
        }else
            return matrix<V>::empty(); // never is reached
    }

    inline int numel() const{ return nrows*ncols; }
    inline int size(int d) const{ return (d==1) ? nrows : ncols; }
    inline dt_bool isempty() const{
        return (numel() == 0) ? 1 : 0;
    }
    
    static inline const matrix<V> empty() {
        return matrix<V>(0,0);
    }

    template <class V1> inline dt_bool samesize(const matrix<V1>& m) const{
        return nrows == m.size(1) && ncols == m.size(2);
    }

    inline dt_bool isscalar() const{
        return (numel() == 1) ? 1 : 0;
    }

    
};


#define printline	std::cout << " LINE: " << __LINE__ << std::endl;
#define NROW 1400
#define NCOL 1400


// Driver: builds two NROW x NCOL matrices, adds them, and prints the CPU
// time spent. NOTE(review): dt_single is not defined in the data_types.h
// shown in this thread — presumably a float typedef/macro; confirm.
int main(){
    // NOTE(review): clock() measures CPU time, not wall time; with
    // "parallel for simd" it accumulates time across all OpenMP threads,
    // so the printed figure overstates elapsed wall time.
    clock_t st = clock();
    printline
    {
        matrix<dt_single> A(NROW, NCOL, 1);
        // std::cout << A << std::endl;
        matrix<dt_single> B(NROW, NCOL, 2);
        matrix<dt_single> C;
        printline
        C = A + B;
    }   // A, B, C go out of scope here (buffers leak: matrix has no destructor)
    clock_t et = clock();
    // BUGFIX: corrected "ellapsed" -> "elapsed" in the output message.
    printf("Time elapsed: %f \n", (et-st)/double(CLOCKS_PER_SEC));

    printline
    // std::cout << C << std::endl;  // NOTE(review): C is out of scope here
    std::cout << "Program Done!" << std::endl;
    return 0;
}

 

 

 

 

And I build it using the below comand:

 

 

 

C:\prjdir\tmp\simdtests>make clean all && test_vec.exe
rm -f *.exp *.lib *.supp *.exe *.obj *.optrpt
icl.exe -Qopt-report:5 -Qopt-report-phase:all  -Qopenmp -Qsimd -Qopenmp-simd  -arch:avx -arch:core-avx2 -axcode:core-i7-avx -Qdiag-error-limit:5 -I. -DUSE_LOCAL_HEAP -DUSE_MM_MALLOC -c test_vec.cpp -Fo:test_vec.obj
Intel(R) C++ Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 19.1.2.254 Build 20200623
Copyright (C) 1985-2020 Intel Corporation.  All rights reserved.
icl: command line warning #10121: overriding '/archavx' with '/archcore-avx2'
icl: command line warning #10006: ignoring unknown option '/axcode:core-i7-avx'

icl: remark #10397: optimization reports are generated in *.optrpt files in the output location
test_vec.cpp
xilink.exe test_vec.obj -LIBPATH:../../Debug/lib -out:test_vec.exe libutils.lib libmemory.lib
xilink: executing 'link'
Microsoft (R) Incremental Linker Version 14.28.29913.0
Copyright (C) Microsoft Corporation.  All rights reserved.

test_vec.obj
-LIBPATH:../../Debug/lib
-out:test_vec.exe
libutils.lib
libmemory.lib
 LINE: 140
 LINE: 146
Time ellapsed: 0.025000
 LINE: 152
Program Done!

C:\prjdir\tmp\simdtests>

 

 

 

 

Here is the optimization report file

 

 

 

 

LOOP BEGIN at C:\prjdir\tmp\simdtests\test_vec.cpp(103,13) inlined into C:\prjdir\tmp\simdtests\test_vec.cpp(147,15)
   remark #15388: vectorization support: reference c has aligned access   [ C:\prjdir\tmp\simdtests\test_vec.cpp(105,19) ]
   remark #15415: vectorization support: indirect load was generated for the variable <*__dereg>, 64-bit indexed, part of address is result of call to function   [ C:\prjdir\tmp\simdtests\test_vec.cpp(105,36) ]
   remark #15415: vectorization support: indirect load was generated for the variable <at (105:48)>, 64-bit indexed, part of address is result of call to function   [ C:\prjdir\tmp\simdtests\test_vec.cpp(105,48) ]
   remark #15305: vectorization support: vector length 32
   remark #15309: vectorization support: normalized vectorization overhead 0.051
   remark #15301: SIMD LOOP WAS VECTORIZED
   remark #15442: entire loop may be executed in remainder
   remark #15449: unmasked aligned unit stride stores: 1 
   remark #15462: unmasked indexed (or gather) loads: 2 
   remark #15475: --- begin vector cost summary ---
   remark #15476: scalar cost: 210 
   remark #15477: vector cost: 35.710 
   remark #15478: estimated potential speedup: 4.890 
   remark #15484: vector function calls: 2 
   remark #15488: --- end vector cost summary ---
   remark #15489: --- begin vector function matching report ---
   remark #15490: Function call: (unknown virtual call) with simdlen=32, actual parameter types: (uniform,linear:1)   [ C:\prjdir\tmp\simdtests\test_vec.cpp(105,36) ]
   remark #15492: A suitable vector variant was found (out of 2) with xmm, simdlen=32, unmasked, formal parameter types: (uniform,linear:1)
   remark #26014: The function ISA does not match the compilation target. For better SIMD performance, consider using -Qvecabi=cmdtarget compiler switch or "processor" clause in vector function declaration
   remark #15490: Function call: (unknown virtual call) with simdlen=32, actual parameter types: (uniform,linear:1)   [ C:\prjdir\tmp\simdtests\test_vec.cpp(105,48) ]
   remark #15492: A suitable vector variant was found (out of 2) with xmm, simdlen=32, unmasked, formal parameter types: (uniform,linear:1)
   remark #26014: The function ISA does not match the compilation target. For better SIMD performance, consider using -Qvecabi=cmdtarget compiler switch or "processor" clause in vector function declaration
   remark #15493: --- end vector function matching report ---
   remark #25456: Number of Array Refs Scalar Replaced In Loop: 1
LOOP END

LOOP BEGIN at C:\prjdir\tmp\simdtests\test_vec.cpp(103,13) inlined into C:\prjdir\tmp\simdtests\test_vec.cpp(147,15)
<Remainder loop for vectorization>
   remark #25456: Number of Array Refs Scalar Replaced In Loop: 1
LOOP END

 

 

When I check the optimization reports generated during compiling and linking, I get to see simdlen=32, which is really odd because my laptop CPU supports only AVX2, which has only 256 bits per SIMD register. If the simd length is 32 after optimization, this means that each SIMD register is 1024 bits long, which sounds odd.

SIMD register size (in bits) = 32 × sizeof(float) × 8 bits/byte = 32 × 4 × 8 = 2^10 = 1024.

 

I would appreciate if an expert elaborate on this.

 

 

 

 

0 Kudos
11 Replies
VidyalathaB_Intel
Moderator
1,950 Views

Hi,

Thanks for reaching out to us.

Could you please provide us the details of data_types.h file so that we can try it from our end.

Regards,

Vidya.

 

0 Kudos
mikeitexpert
New Contributor II
1,937 Views

Hi,

data_types.h is not relevant — it is just macro definitions for dt_bool.

 

Thanks for the follow up.

 

#ifndef __DATA_TYPES_H__
#define __DATA_TYPES_H__

#define dt_bool    bool
#define dt_logical bool

#endif /* __DATA_TYPES_H__ */

 

0 Kudos
Viet_H_Intel
Moderator
1,934 Views

You mentioned dt_bool data type is only a byte, but on your calculation you used a float.

Have you tested with other compilers to see if it does any difference?

For some reasons, I got error ("dt_single" is undefined) when trying to compiler your code. Can you attach a preprocessed file?

 

Thanks,

 

0 Kudos
Viet_H_Intel
Moderator
1,920 Views

I am not sure if your calculation is correct; but if your vector length is big, you would see a warning.

$ gcc  -fopenmp test3.c

$ icc  -fopenmp test3.c

$ gcc  -fopenmp test3.c -D_LEN64

test3.c:11:8: warning: unsupported simdlen 64

 double temp ( float *A, float *B , int i ) {

    ^~~~

$ icc  -fopenmp test3.c -D_LEN64

test3.c(11): (col. 44) remark: function can't be vectorized: too many registers required to return value (big vector length)

test3.c(11): (col. 44) error #13397: vector function was not vectorized

test3.c(11): (col. 44) remark: function can't be vectorized: too many registers required to return value (big vector length)

test3.c(11): (col. 44) error #13397: vector function was not vectorized

compilation aborted for test3.c (code 1)

$ cat test3.c

#include <stdio.h>

#include <math.h>

 

#define SIZE 512*512

 

/* Reciprocal of the square: foo(A) == 1 / (A*A). */
float foo(float A ) {
    const float squared = A * A;
    return 1.0f / squared;
}

#ifdef _LEN64

/* Demo case: simdlen(64) on a function returning double. 64 doubles =
 * 4096 bits per "vector" — far beyond any register width, which is why
 * gcc warns "unsupported simdlen 64" and icc errors out with "too many
 * registers required to return value (big vector length)". */
#pragma omp declare simd uniform(A, B ) linear( i : 1 ) simdlen ( 64)

double temp ( float *A, float *B , int i ) {

   return B[i] * foo(A[i]) ;

}

#else

/* Demo case: simdlen(32) — also larger than the hardware vector width,
 * but both compilers accept it (the compiler may emit a vector variant
 * that spans several registers / unrolled lanes). */
#pragma omp declare simd uniform(A, B ) linear( i : 1 ) simdlen (32)

double temp ( float *A, float *B , int i ) {

   return B[i] * foo(A[i]) ;

}

#endif

 

/* Driver: applies temp() across SIZE elements.
 * BUGFIX: the arrays total ~4 MB (2 × 1 MB float + 2 MB double), which
 * overflows a typical 1–8 MB stack; `static` moves them to static storage
 * and zero-initializes them, so temp() no longer reads indeterminate
 * values either. The signature and observable behavior are unchanged. */
int main ( ) {

  static float A[SIZE] , B [SIZE] ;

  static double C [SIZE] ;

  for ( int i = 0 ; i < SIZE ; i ++){

    C [i] = temp (A, B , i ) ;

  }

  return 0;

}

 

mikeitexpert
New Contributor II
1,888 Views

Thanks for the example ... though I am not sure how my case is justified because there is no error and at least based on reports the loop in my code is vectorized just fine. I am not sure it if is just a sth showing up in the report or there is some sort of an abstraction layer between hardware and compiler generated code to virtually simulate higher vector sizes ... I am a software guy with couple of years of embedded programming experience (though not an expert in hardware standards, abstractions, etc)

 

I hope @jimdempseyatthecove can clarify further for us.

 

Regards

0 Kudos
Viet_H_Intel
Moderator
1,801 Views

Hi Mike,


I don't have any more inputs on this topic and hope others can give us some feedbacks.


Thanks,


0 Kudos
Viet_H_Intel
Moderator
1,747 Views

Hi Mike,


Is it OK to close this thread from our end?

Thanks,


0 Kudos
Viet_H_Intel
Moderator
1,711 Views

Hi Mike,


Thanks for the kudos. We will no longer respond to this thread. If you require additional assistance from Intel, please start a new thread. Any further interaction in this thread will be considered community only.


0 Kudos
Bernard
Valued Contributor I
1,699 Views

>>>When I check the optimization reports generated during compiling and linking, I get to see simdlen=32 in which really odd because my laptop CPU supports only AVX2 which has only 256 bits per SIMD Registers. If simd len is 32 after optimization this means that each simd register is 1024 bit long which sounds odd.>>>

 

Architecturally, the *PRF is probably 256 bits long (per single register), divided into two 128-bit lanes. There are no 1024-bit software-visible registers on current Intel processors, nor on AMD processors. The large PRF had a greater presence in the era of the vector processors (you may look up an ALPHA extension, i.e. the Tarantula VP).

 

Now in regards to "simdlen=32" I suppose, that it might have been used mainly for loop unrolling purpose (e.g. 4x unrolling).

 

*Physical Register File

mikeitexpert
New Contributor II
1,678 Views

Thanks for the valuable comments on Tarantula extensions. I am curious how comfortable they are to work with. Is there any good compiler for them? How well are they supported by intel community (or perhaps other popular vendors)?

 

I doubt it has anything to do with unrolling cuz they are reported separately as part of vectorization report. (see below)

 

 

LOOP BEGIN at matrix.h(1645,3) inlined into myapp.h(732,9)
   remark #25084: Preprocess Loopnests: Moving Out Store    [ matrix.h(1645,32) ]
   remark #15389: vectorization support: reference this->data[i] has unaligned access   [ matrix.h(1646,4) ]
   remark #15389: vectorization support: reference this->data[i] has unaligned access   [ matrix.h(1646,4) ]
   remark #15381: vectorization support: unaligned access used inside loop body
   remark #15305: vectorization support: vector length 8
   remark #15399: vectorization support: unroll factor set to 2
   remark #15309: vectorization support: normalized vectorization overhead 1.067
   remark #15355: vectorization support: nrm is double type reduction   [ matrix.h(1646,4) ]
   remark #15301: SIMD LOOP WAS VECTORIZED
   remark #15442: entire loop may be executed in remainder
   remark #15450: unmasked unaligned unit stride loads: 1 
   remark #15475: --- begin vector cost summary ---
   remark #15476: scalar cost: 7 
   remark #15477: vector cost: 1.870 
   remark #15478: estimated potential speedup: 3.200 
   remark #15488: --- end vector cost summary ---
   remark #25456: Number of Array Refs Scalar Replaced In Loop: 1
LOOP END

 

 

0 Kudos
Bernard
Valued Contributor I
1,662 Views

@mikeitexpert wrote:

Thanks for the valuable comments on Tarantula extensions. I am curious how comfortable they are to work with. Is there any good compiler for them? How well are they supported by intel community (or perhaps other popular vendors)?

 

I doubt it has anything to do with unrolling cuz they are reported separately as part of vectorization report. (see below)

 

 

LOOP BEGIN at matrix.h(1645,3) inlined into myapp.h(732,9)
   remark #25084: Preprocess Loopnests: Moving Out Store    [ matrix.h(1645,32) ]
   remark #15389: vectorization support: reference this->data[i] has unaligned access   [ matrix.h(1646,4) ]
   remark #15389: vectorization support: reference this->data[i] has unaligned access   [ matrix.h(1646,4) ]
   remark #15381: vectorization support: unaligned access used inside loop body
   remark #15305: vectorization support: vector length 8
   remark #15399: vectorization support: unroll factor set to 2
   remark #15309: vectorization support: normalized vectorization overhead 1.067
   remark #15355: vectorization support: nrm is double type reduction   [ matrix.h(1646,4) ]
   remark #15301: SIMD LOOP WAS VECTORIZED
   remark #15442: entire loop may be executed in remainder
   remark #15450: unmasked unaligned unit stride loads: 1 
   remark #15475: --- begin vector cost summary ---
   remark #15476: scalar cost: 7 
   remark #15477: vector cost: 1.870 
   remark #15478: estimated potential speedup: 3.200 
   remark #15488: --- end vector cost summary ---
   remark #25456: Number of Array Refs Scalar Replaced In Loop: 1
LOOP END

 

 



@mikeitexpert wrote:

Thanks for the valuable comments on Tarantula extensions. I am curious how comfortable they are to work with. Is there any good compiler for them? How well are they supported by intel community (or perhaps other popular vendors)?

 

I doubt it has anything to do with unrolling cuz they are reported separately as part of vectorization report. (see below)

 

 

LOOP BEGIN at matrix.h(1645,3) inlined into myapp.h(732,9)
   remark #25084: Preprocess Loopnests: Moving Out Store    [ matrix.h(1645,32) ]
   remark #15389: vectorization support: reference this->data[i] has unaligned access   [ matrix.h(1646,4) ]
   remark #15389: vectorization support: reference this->data[i] has unaligned access   [ matrix.h(1646,4) ]
   remark #15381: vectorization support: unaligned access used inside loop body
   remark #15305: vectorization support: vector length 8
   remark #15399: vectorization support: unroll factor set to 2
   remark #15309: vectorization support: normalized vectorization overhead 1.067
   remark #15355: vectorization support: nrm is double type reduction   [ matrix.h(1646,4) ]
   remark #15301: SIMD LOOP WAS VECTORIZED
   remark #15442: entire loop may be executed in remainder
   remark #15450: unmasked unaligned unit stride loads: 1 
   remark #15475: --- begin vector cost summary ---
   remark #15476: scalar cost: 7 
   remark #15477: vector cost: 1.870 
   remark #15478: estimated potential speedup: 3.200 
   remark #15488: --- end vector cost summary ---
   remark #25456: Number of Array Refs Scalar Replaced In Loop: 1
LOOP END

 

 



@mikeitexpert wrote:

Thanks for the valuable comments on Tarantula extensions. I am curious how comfortable they are to work with. Is there any good compiler for them? How well are they supported by intel community (or perhaps other popular vendors)?

 

I doubt it has anything to do with unrolling cuz they are reported separately as part of vectorization report. (see below)

 

 

LOOP BEGIN at matrix.h(1645,3) inlined into myapp.h(732,9)
   remark #25084: Preprocess Loopnests: Moving Out Store    [ matrix.h(1645,32) ]
   remark #15389: vectorization support: reference this->data[i] has unaligned access   [ matrix.h(1646,4) ]
   remark #15389: vectorization support: reference this->data[i] has unaligned access   [ matrix.h(1646,4) ]
   remark #15381: vectorization support: unaligned access used inside loop body
   remark #15305: vectorization support: vector length 8
   remark #15399: vectorization support: unroll factor set to 2
   remark #15309: vectorization support: normalized vectorization overhead 1.067
   remark #15355: vectorization support: nrm is double type reduction   [ matrix.h(1646,4) ]
   remark #15301: SIMD LOOP WAS VECTORIZED
   remark #15442: entire loop may be executed in remainder
   remark #15450: unmasked unaligned unit stride loads: 1 
   remark #15475: --- begin vector cost summary ---
   remark #15476: scalar cost: 7 
   remark #15477: vector cost: 1.870 
   remark #15478: estimated potential speedup: 3.200 
   remark #15488: --- end vector cost summary ---
   remark #25456: Number of Array Refs Scalar Replaced In Loop: 1
LOOP END

 

 


Tarantula Vector Extension probably were not implemented into a working silicon (it is an academic work). These extensions are not supported by the Intel community. The closest Intel product (to those extensions) was/is Xeon Phi. Bear in mind that Tarantula operated on large 8KiB vector register file and was a single core unit.


The vectorization report states: "2x unroll factor". In regards to "simdlen" integer argument (a large ones) one possibility is to look at the OpenMP implementation and search for keyword "simdlen", maybe source code will have some clues. 

 

 

0 Kudos
Reply