/*
 * losingHigh4bytes.c
 *
 * Minimal demonstration of a branchless "pick one of two source addresses"
 * step, and of the pitfall that originally made it crash: building a 64-bit
 * all-ones mask from a 32-bit flag.
 *
 * Build-time switches:
 *   _N_GP  - perform the 32-byte copy with the general-purpose library routine.
 *   _N_YMM - perform the 32-byte copy with one 256-bit AVX load/store pair.
 */
#define _N_GP
//#define _N_YMM

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h> // uint64_t needed
#include <string.h>

#ifdef _N_YMM
#include <emmintrin.h> // SSE2 intrinsics
#include <smmintrin.h> // SSE4.1 intrinsics
#include <immintrin.h> // AVX intrinsics
#endif

#ifdef _N_YMM
/*
 * Copies 32 bytes from SOURCE to TARGET using one unaligned 256-bit load
 * followed by one unaligned 256-bit store. Because all 32 bytes are loaded
 * before any byte is stored, overlapping SOURCE/TARGET ranges are handled.
 */
void SlowCopy256bit (const char *SOURCE, char *TARGET) {
    _mm256_storeu_si256((__m256i *)(TARGET), _mm256_loadu_si256((const __m256i *)(SOURCE)));
}
#endif

#ifndef NULL
#ifdef __cplusplus
#define NULL 0
#else
#define NULL ((void*)0)
#endif
#endif

/*
 * Allocates a 512-byte source buffer and a (4 GiB + 512)-byte target buffer,
 * decodes one 32-bit token from the source, selects a copy source without
 * branching, copies 32 bytes to the start of the target buffer and prints the
 * target address together with the masked target-relative term.
 *
 * argc/argv are unused.
 *
 * Returns 0 on success, 1 if either allocation is refused.
 */
int main( int argc, char *argv[] ) {
    unsigned char *retLOCAL;
    unsigned char *srcLOCAL;
    unsigned int DWORDtrio;
    // Flag MUST be 64-bit. With 'unsigned int Flag;' the expression 'Flag - 1'
    // is evaluated in 32-bit unsigned arithmetic, so for Flag==0 it yields
    // 0xFFFFFFFF, which is then zero-extended to 0x00000000FFFFFFFF when stored
    // into the 64-bit 'FlagMASKnegated'. That is ordinary C conversion behaviour
    // (not a compiler defect) and it wipes the high 4 bytes of the selected
    // address - the cause of the crash recorded at the end of this file.
    uint64_t Flag;
    uint64_t FlagMASK;        // 0xFFFFFFFFFFFFFFFF when Flag==1, otherwise 0
    uint64_t FlagMASKnegated; // 0xFFFFFFFFFFFFFFFF when Flag==0, otherwise 0
    uint64_t RetPart;         // ((address of retLOCAL) - (DWORDtrio>>4) - 1) & FlagMASKnegated
    uint64_t SourceAddress;   // branchlessly selected copy source

    (void)argc;
    (void)argv;

    // The source buffer is zero-filled so that the token read below is
    // deterministic; the original code read it uninitialized, which made the
    // offset (DWORDtrio>>4) arbitrary and able to point far before retLOCAL.
    srcLOCAL = (unsigned char*)calloc(512, 1);
    retLOCAL = (unsigned char*)malloc((1LL<<32)+512);
    if( srcLOCAL == NULL ) { printf("Needed memory allocation denied!\n"); free(retLOCAL); return(1); }
    if( retLOCAL == NULL ) { printf("Needed memory allocation denied!\n"); free(srcLOCAL); return(1); }

    // Fetch the 32-bit token through memcpy instead of a type-punned pointer load.
    memcpy(&DWORDtrio, srcLOCAL, sizeof DWORDtrio);
    // The two lowest bits hold (number of token bytes - 1); keep only those low-order bytes.
    DWORDtrio = DWORDtrio&( 0xFFFFFFFFu >> ((3-(DWORDtrio & 0x03))<<3) );

    Flag = ((DWORDtrio & 0x0F) == 0x0C); // In here Flag=0|1
    FlagMASKnegated = Flag - 1;          // -1|0, computed in 64 bits
    FlagMASK = ~FlagMASKnegated;

    // Flag==1 selects srcLOCAL+1; Flag==0 selects retLOCAL-(DWORDtrio>>4).
    // The arithmetic is done on integers so that no pointer below the start of
    // the allocation is ever formed.
    RetPart = ((uint64_t)(uintptr_t)retLOCAL - (DWORDtrio>>4) - 1) & FlagMASKnegated;
    SourceAddress = 1 + ((uint64_t)(uintptr_t)srcLOCAL & FlagMASK) + RetPart;

#ifdef _N_YMM
    SlowCopy256bit( (const char *)(uintptr_t)SourceAddress, (char *)retLOCAL );
#endif
#ifdef _N_GP
    // memmove rather than memcpy: for small offsets the source range overlaps the target range.
    memmove(retLOCAL, (const void *)(uintptr_t)SourceAddress, 16*2);
#endif

    // %p requires a 'void *' argument.
    printf("retLOCAL = %p\n", (void *)retLOCAL);
    printf("((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) = %p\n", (void *)(uintptr_t)RetPart);
    // 00000001 3F 9C 00 3B   (presumably the bytes of a sample printed value - kept from the original notes)

    free(retLOCAL);
    free(srcLOCAL);
    return(0);
}

// The transcripts below were recorded with the original program, which read the
// source buffer uninitialized; the exact addresses and offset therefore differ
// from what this version prints.

// Output when non-workaround is in effect (i.e. 'unsigned int Flag;' is active):
/*
D:\_KAZE\Instructions_per_tick_during_branchless_decompression_32-threaded>icl /O3 losingHigh4bytes.c
Intel(R) C++ Intel(R) 64 Compiler XE for applications running on Intel(R) 64, Version 15.0.0.108 Build 20140726
Copyright (C) 1985-2014 Intel Corporation. All rights reserved.

losingHigh4bytes.c
Microsoft (R) Incremental Linker Version 10.00.30319.01
Copyright (C) Microsoft Corporation. All rights reserved.

-out:losingHigh4bytes.exe
losingHigh4bytes.obj

D:\_KAZE\Instructions_per_tick_during_branchless_decompression_32-threaded>losingHigh4bytes.exe
!!! CRASH !!!

D:\_KAZE\Instructions_per_tick_during_branchless_decompression_32-threaded>
*/

// Output when workaround is in effect (i.e. 'uint64_t Flag;' is active):
/*
D:\_KAZE\Instructions_per_tick_during_branchless_decompression_32-threaded>icl /O3 losingHigh4bytes.c
Intel(R) C++ Intel(R) 64 Compiler XE for applications running on Intel(R) 64, Version 15.0.0.108 Build 20140726
Copyright (C) 1985-2014 Intel Corporation. All rights reserved.

losingHigh4bytes.c
Microsoft (R) Incremental Linker Version 10.00.30319.01
Copyright (C) Microsoft Corporation. All rights reserved.

-out:losingHigh4bytes.exe
losingHigh4bytes.obj

D:\_KAZE\Instructions_per_tick_during_branchless_decompression_32-threaded>losingHigh4bytes.exe
retLOCAL = 000000013F920040
((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) = 000000013F92003B

D:\_KAZE\Instructions_per_tick_during_branchless_decompression_32-threaded>
*/