Intel® C++ Compiler
Community support and assistance for creating C++ code that runs on platforms based on Intel® processors.

Corruption with the compiler optimization option

DLake1
New Contributor I
1,069 Views

Why does optimization crush the code so much that it changes its outcome, and how can I stop it?

0 Kudos
35 Replies
DLake1
New Contributor I
336 Views

I managed to get 21GB/s read and 19.3GB/s write with 2 arrays; I think that must be as fast as a single thread can go:

#include <memory.h>
#include <ia32intrin.h>
unsigned long long* data0;
unsigned long long* data1;
bool Create(){
    data0=new unsigned long long[67108864];
    data1=new unsigned long long[67108864];
    return true;
}
bool Destroy(){
    delete[] data0;
    delete[] data1;
    return true;
}
bool Write(unsigned long long dval){
    #pragma simd
    for(long i=0;i<67108864;++i){data1[i]=data0[i]=dval;}
    return true;
}
//volatile unsigned long long errxor;
bool Read(unsigned long long dval, unsigned long long errxor){
    #pragma simd
    for(long i=0;i<67108864;++i){errxor=errxor^data1[i]^data0[i];}
    return errxor!=dval;
}

Write disassembly:

vmovdqu     xmmword ptr [ebx+eax*8],xmm0
vmovntdq    xmmword ptr [ecx+eax*8],xmm0
add         eax,2  
cmp         eax,edx  
jb          Write+50h (0FDA10A0h)  

Read disassembly:

vpxor       xmm0,xmm0,xmmword ptr [ebx+eax*8]  
vpxor       xmm1,xmm0,xmmword ptr [edx+eax*8]  
vpxor       xmm2,xmm1,xmmword ptr [ebx+eax*8+10h]
vpxor       xmm3,xmm2,xmmword ptr [edx+eax*8+10h]
vpxor       xmm4,xmm3,xmmword ptr [ebx+eax*8+20h]
vpxor       xmm5,xmm4,xmmword ptr [edx+eax*8+20h]
vpxor       xmm6,xmm5,xmmword ptr [ebx+eax*8+30h]
vpxor       xmm0,xmm6,xmmword ptr [edx+eax*8+30h]
add         eax,8  
cmp         eax,ecx  
jb          Read+47h (0FDA1127h)  

0 Kudos
DLake1
New Contributor I
336 Views

Bandwidth analysis attached. Qparallel is off and loop unrolling is left to the compiler; this is the code used:

#include <memory.h>
#include <ia32intrin.h>
unsigned long long* data0;
unsigned long long* data1;
bool Create(){
    data0=new unsigned long long[67108864];
    data1=new unsigned long long[67108864];
    return true;
}
bool Destroy(){
    delete[] data0;
    delete[] data1;
    return true;
}
bool Write(unsigned long long dval){
#pragma simd
//#pragma parallel
    for(long i=0;i<67108864;++i){data1[i]=data0[i]=dval;}
    return true;
}
bool Read(unsigned long long dval, unsigned long long errxor){
#pragma simd
//#pragma parallel
    for(long i=0;i<67108864;++i){errxor=errxor^data1[i]^data0[i];}
    return errxor!=dval;
}

I still can't make the error checking work in the Read method, please help.
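One hedged guess at the failure: #pragma simd asserts that the loop has no cross-iteration dependences, but errxor carries one, so the vectorized xor chain can come out wrong. Declaring errxor as a simd reduction tells the compiler about the dependence; a minimal sketch, using the same data0/data1 arrays:

bool Read(unsigned long long dval, unsigned long long errxor){
    // reduction(^:errxor) makes each lane xor privately, then combines lanes
    #pragma simd reduction(^:errxor)
    for(long i=0;i<67108864;++i){errxor=errxor^data1[i]^data0[i];}
    return errxor!=dval;
}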

0 Kudos
DLake1
New Contributor I
336 Views

Check this out: I maxed out the Write speed at about 55GB/s by adding my own AVX intrinsics and enabling Qparallel:

bool Write(double dval){
    // data0 is assumed to be declared as double* in this version
    __m256d ymm0=_mm256_set1_pd(dval);
#pragma parallel
    for(long i=0;i<134217728;i+=4){
        _mm256_store_pd(data0+i, ymm0);
    }
    return true;
}

and here's those big juicy ymms!

inc         ebx
vmovntpd    ymmword ptr [edi+edx],ymm0
vmovntpd    ymmword ptr [edi+edx+20h],ymm0
add         edi,40h
cmp         ebx,esi
jb          Write+17Bh (0F2C11BBh)
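For reference, vmovntpd is a non-temporal (streaming) store that bypasses the cache; here the compiler chose it on its own. To request streaming stores explicitly rather than relying on that heuristic, _mm256_stream_pd can stand in for _mm256_store_pd; a sketch, assuming data0 is a 32-byte-aligned double*:

#include <immintrin.h>
bool Write(double dval){
    __m256d ymm0=_mm256_set1_pd(dval);
#pragma parallel
    for(long i=0;i<134217728;i+=4){
        _mm256_stream_pd(data0+i, ymm0); // non-temporal store, maps to vmovntpd
    }
    return true;
}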

0 Kudos
Bernard
Valued Contributor I
336 Views

It seems that you finally got a real improvement in the memory speed of your test case.

 

0 Kudos
DLake1
New Contributor I
336 Views

I'm still having trouble with the Read method and error checking; how do I make this parallelizable (if that's even a word):

bool Read(double dval, double errxor){
    // data0 is assumed to be declared as double* in this version
    double errxord[4]={errxor, errxor, errxor, errxor};
    __m256d ymm0=_mm256_loadu_pd(errxord);
    for(long i=0;i<134217728;i+=4){
        ymm0=_mm256_xor_pd(ymm0, _mm256_loadu_pd(data0+i));
    }
    return ymm0.m256d_f64[0]!=dval||ymm0.m256d_f64[1]!=dval||
           ymm0.m256d_f64[2]!=dval||ymm0.m256d_f64[3]!=dval;
}

0 Kudos
TimP
Honored Contributor III
336 Views

OpenMP provides an xor reduction; if you don't like OpenMP, you will have to implement the analogous scheme yourself. Each thread does a private reduction, and the results are combined at the end, probably in tree fashion if a significant number of threads is used.
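A minimal sketch of that xor reduction, applied to the Read loop from the earlier posts (data0/data1 as declared there):

#include <omp.h>
bool Read(unsigned long long dval, unsigned long long errxor){
    // each thread xors into a private copy; OpenMP combines them at the end
    #pragma omp parallel for reduction(^:errxor)
    for(long i=0;i<67108864;++i){
        errxor^=data1[i]^data0[i];
    }
    return errxor!=dval;
}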

0 Kudos
Bernard
Valued Contributor I
336 Views

You could parallelize the for loop by giving each thread its own share of the loop index: for example, with n threads, threadID 0 takes i = 0, n, 2n, ..., threadID 1 takes i = 1, n+1, 2n+1, ..., and so on up to the array length. The hardest part will be thread synchronisation.
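A minimal sketch of that striding, assuming std::thread (any thread API would do); joining the threads is the only synchronisation needed if the partial results are combined after the join:

#include <thread>
#include <vector>
unsigned long long StridedXor(const unsigned long long* data, long len, int nthreads){
    std::vector<unsigned long long> partial(nthreads, 0);
    std::vector<std::thread> pool;
    for(int t=0;t<nthreads;++t)
        pool.emplace_back([&, t]{
            unsigned long long acc=0;
            for(long i=t;i<len;i+=nthreads) acc^=data[i]; // private reduction per thread
            partial[t]=acc;
        });
    for(auto& th : pool) th.join();
    unsigned long long result=0;
    for(auto p : partial) result^=p; // combine the partials
    return result;
}

Contiguous per-thread chunks would be friendlier to the hardware prefetcher than interleaved indices, but the sketch mirrors the striding described above.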

0 Kudos
DLake1
New Contributor I
336 Views

I've dumped the error checking; now I just want the fastest way to read RAM, how do I do that?

This is what I've tried:

for(long i=0;i<134217728;i+=4){
    volatile __m256i ymm0=_mm256_set_epi64x(data0[i], data0[i+1], data0[i+2], data0[i+3]);
}

for(long i=0;i<134217728;++i){
    volatile __int64 ptr=data0[i];
}

Both of them read into and then out of the register, and I think that's what's slowing it down; why can't I just read memory without doing anything else?
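The volatile locals force every loaded value to be written back to the stack, so each iteration pays for a store as well as the load. One way to keep the loads live without any stores inside the loop is to fold them into a register accumulator and consume it once at the end; a sketch, assuming data0 points to 2^27 doubles aligned on 32 bytes:

#include <immintrin.h>
double ReadOnly(const double* data0){
    __m256d acc=_mm256_setzero_pd();
    for(long i=0;i<134217728;i+=4){
        acc=_mm256_xor_pd(acc, _mm256_load_pd(data0+i)); // loads stay in registers
    }
    double sink[4];
    _mm256_storeu_pd(sink, acc); // single store, outside the loop
    return sink[0]+sink[1]+sink[2]+sink[3]; // consume so the loop isn't dead code
}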

0 Kudos
DLake1
New Contributor I
336 Views

Hey look, I got the assembly from the memory read test loop in AIDA64; how do they do that?

sub         rsi,200h  
movntdqa    xmm0,xmmword ptr [rsi+170h]
movntdqa    xmm1,xmmword ptr [rsi+160h]
movntdqa    xmm2,xmmword ptr [rsi+150h]
movntdqa    xmm3,xmmword ptr [rsi+140h]
movntdqa    xmm4,xmmword ptr [rsi+130h]
movntdqa    xmm5,xmmword ptr [rsi+120h]
movntdqa    xmm6,xmmword ptr [rsi+110h]
movntdqa    xmm7,xmmword ptr [rsi+100h]
movntdqa    xmm8,xmmword ptr [rsi+0F0h]
movntdqa    xmm9,xmmword ptr [rsi+0E0h]
movntdqa    xmm10,xmmword ptr [rsi+0D0h]
movntdqa    xmm11,xmmword ptr [rsi+0C0h]
movntdqa    xmm12,xmmword ptr [rsi+0B0h]
movntdqa    xmm13,xmmword ptr [rsi+0A0h]
movntdqa    xmm14,xmmword ptr [rsi+90h]
movntdqa    xmm15,xmmword ptr [rsi+80h]
movntdqa    xmm0,xmmword ptr [rsi+70h]  
movntdqa    xmm1,xmmword ptr [rsi+60h]  
movntdqa    xmm2,xmmword ptr [rsi+50h]  
movntdqa    xmm3,xmmword ptr [rsi+40h]  
movntdqa    xmm4,xmmword ptr [rsi+30h]  
movntdqa    xmm5,xmmword ptr [rsi+20h]  
movntdqa    xmm6,xmmword ptr [rsi+10h]  
movntdqa    xmm7,xmmword ptr [rsi]  
movntdqa    xmm8,xmmword ptr [rsi-10h]  
movntdqa    xmm9,xmmword ptr [rsi-20h]  
movntdqa    xmm10,xmmword ptr [rsi-30h]
movntdqa    xmm11,xmmword ptr [rsi-40h]
movntdqa    xmm12,xmmword ptr [rsi-50h]
movntdqa    xmm13,xmmword ptr [rsi-60h]
movntdqa    xmm14,xmmword ptr [rsi-70h]
movntdqa    xmm15,xmmword ptr [rsi-80h]
sub         rsp,200h  
jne         0000000000490180  
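The movntdqa instructions map to the SSE4.1 intrinsic _mm_stream_load_si128, so a plain C++ version of a loop like that is possible without inline assembly. Note that on ordinary write-back memory movntdqa behaves like a regular 16-byte load; it only acts differently on write-combining memory. A sketch (buf and len are hypothetical names; buf must be 16-byte aligned):

#include <smmintrin.h>
#include <cstddef>
__m128i ReadNT(const __m128i* buf, size_t len){ // len in 16-byte units
    __m128i acc=_mm_setzero_si128();
    for(size_t i=0;i<len;++i){
        acc=_mm_xor_si128(acc, _mm_stream_load_si128(const_cast<__m128i*>(buf+i)));
    }
    return acc; // consume so the loads are not optimized away
}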

 

0 Kudos
Bernard
Valued Contributor I
336 Views

I do not understand what you want to achieve. Do you mean not using registers for the memory transfer?

0 Kudos
DLake1
New Contributor I
336 Views

As I said in post #3, I just want to know how to benchmark RAM. It seems I will have to use inline assembly; can you tell me how to read and write RAM using inline AVX assembly, without the optimizer interfering or doing any sort of processing that would slow it down?

0 Kudos
Bernard
Valued Contributor I
336 Views

Code inside an _asm{} block is not optimised. The compiler will probably add xmmword ptr or ymmword ptr to your code. You can write a set of simple SSE and AVX inline-assembly routines that only read and write memory. Allocate the memory with _mm_malloc() aligned on 64 bytes, allocating in blocks of the page size. For measuring execution speed, my favourite option is the rdtsc intrinsic.
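A minimal harness along those lines; BUF_BYTES and the kernel call are placeholders, and __rdtsc() is assumed to come from <intrin.h> on Windows:

#include <immintrin.h>
#include <intrin.h>
#include <cstddef>
int main(){
    const size_t BUF_BYTES=1ull<<30;                 // 1GB, a multiple of the 4KB page size
    double* buf=(double*)_mm_malloc(BUF_BYTES, 64);  // 64-byte aligned for SSE/AVX
    unsigned long long t0=__rdtsc();
    // ... run the read or write kernel over buf here ...
    unsigned long long cycles=__rdtsc()-t0;
    _mm_free(buf);
    return (int)cycles; // consume the measurement
}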

0 Kudos
Bernard
Valued Contributor I
336 Views

Btw, you could get a better answer on the ISA forum by asking Dr. McCalpin. He is the author of the STREAM benchmark.

0 Kudos
DLake1
New Contributor I
336 Views

What do you mean by "allocating memory in blocks of page size"?

Post edited because I just realized my mistake: I shouldn't have used "#pragma vector always".

0 Kudos
Bernard
Valued Contributor I
336 Views

Allocating memory in multiples of the page size (4KB).
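For example, a hypothetical helper that rounds a request up to the next 4KB multiple before allocating:

#include <immintrin.h>
#include <cstddef>
double* AllocPages(size_t bytes){
    const size_t PAGE=4096;
    size_t rounded=(bytes+PAGE-1)&~(PAGE-1); // round up to a multiple of 4096
    return (double*)_mm_malloc(rounded, 64); // 64-byte aligned as suggested above
}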

0 Kudos