- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
I am creating a simple matrix multiplication procedure, operating on the Intel Xeon Phi architecture.
After many attempts with autovectorization, trying to get better performance, I had to use Intel Intrinsics.
Until now, the matrix size was given by a #define in the source code, but when I try to give it at run time, I have a huge performance degradation.
The source code is the following:
#include <stdio.h> #include <stdlib.h> #include <time.h> #include <math.h> #include <stddef.h> #include <chrono> #include <ctime> #include <mmintrin.h> #include <xmmintrin.h> // SSE #include <pmmintrin.h> // SSE2 #include <emmintrin.h> // SSE3 #include <immintrin.h> #include <zmmintrin.h> #define ALIGNMENT 64 #ifndef SIZE #define SIZE 960 #endif #define vZero(c) {(c) = _mm512_setzero_pd();} #define start_time() \ auto start = std::chrono::high_resolution_clock::now(); /** Shows the elapsed time. See start_time for usage*/ #define elapsed_time(STRING) \ auto elapsed = std::chrono::high_resolution_clock::now() - start; \ long long microseconds = std::chrono::duration_cast<std::chrono::microseconds>(elapsed).count(); \ printf(#STRING":%lld\n", microseconds); void recTranspose(double *__restrict__ a, double *__restrict__ aT, const int n, const int k, const int lda, const int ldat){ if (n*k <= 128) { for(int i = 0; i < n; i++) { for(int j = 0; j < k; j++) { aT[j*ldat+i] = a[i*lda+j]; } } //printf("Reached _|_"); return; } if(k > n) { recTranspose(a, aT, n, (k+1)/2, lda, ldat); recTranspose(&a[(k+1)/2], &aT[(k+1)/2*ldat], n, k-((k+1)/2), lda, ldat); } else { recTranspose(a, aT, (n+1)/2, k, lda, ldat); recTranspose(&a[(n+1)/2*lda], &aT[(n+1)/2], n- (n+1)/2, k, lda, ldat); } } /** Calculates 8 cols and 30 rows of c.*/ inline void eightbythirty(double *__restrict__ a, double *__restrict__ b, double * __restrict__ c, const int size) { __m512d c0, c1, c2, c3, c4, c5, c6, c7, c8, c9; __m512d c10, c11, c12, c13, c14, c15, c16, c17, c18, c19; __m512d c20, c21, c22, c23, c24, c25, c26, c27, c28, c29; vZero(c0); vZero(c1); vZero(c2); vZero(c3); vZero(c4); vZero(c5); vZero(c6); vZero(c7); vZero(c8); vZero(c9); vZero(c10); vZero(c11); vZero(c12); vZero(c13); vZero(c14); vZero(c15); vZero(c16); vZero(c17); vZero(c18); vZero(c19); vZero(c20); vZero(c21); vZero(c22); vZero(c23); vZero(c24); vZero(c25); vZero(c26); vZero(c27); vZero(c28); vZero(c29); __assume_aligned(a, ALIGNMENT); 
__assume_aligned(b, ALIGNMENT); __assume_aligned(c, ALIGNMENT); __assume(size%16==0); for(int i = 0; i < size; i++) { const __m512d bv = _mm512_load_pd(b+i*size); c0 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+0, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c0); c1 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+1, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c1); c2 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+2, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c2); c3 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+3, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c3); c4 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+4, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c4); c5 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+5, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c5); c6 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+6, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c6); c7 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+7, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c7); c8 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+8, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c8); c9 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+9, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c9); c10 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+10, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0),bv, c10); c11 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+11, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0),bv, c11); c12 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+12, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c12); c13 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+13, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c13); c14 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+14, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c14); c15 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+15, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c15); c16 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+16, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c16); c17 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+17, 
_MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c17); c18 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+18, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c18); c19 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+19, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c19); c20 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+20, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c20); c21 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+21, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c21); c22 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+22, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c22); c23 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+23, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c23); c24 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+24, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c24); c25 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+25, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c25); c26 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+26, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c26); c27 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+27, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c27); c28 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+28, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c28); c29 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+29, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c29); } _mm512_storenr_pd(c+0*size, c0); _mm512_storenr_pd(c+1*size, c1); _mm512_storenr_pd(c+2*size, c2); _mm512_storenr_pd(c+3*size, c3); _mm512_storenr_pd(c+4*size, c4); _mm512_storenr_pd(c+5*size, c5); _mm512_storenr_pd(c+6*size, c6); _mm512_storenr_pd(c+7*size, c7); _mm512_storenr_pd(c+8*size, c8); _mm512_storenr_pd(c+9*size, c9); _mm512_storenr_pd(c+10*size, c10); _mm512_storenr_pd(c+11*size, c11); _mm512_storenr_pd(c+12*size, c12); _mm512_storenr_pd(c+13*size, c13); _mm512_storenr_pd(c+14*size, c14); _mm512_storenr_pd(c+15*size, c15); _mm512_storenr_pd(c+16*size, c16); _mm512_storenr_pd(c+17*size, c17); _mm512_storenr_pd(c+18*size, c18); 
_mm512_storenr_pd(c+19*size, c19); _mm512_storenr_pd(c+20*size, c20); _mm512_storenr_pd(c+21*size, c21); _mm512_storenr_pd(c+22*size, c22); _mm512_storenr_pd(c+23*size, c23); _mm512_storenr_pd(c+24*size, c24); _mm512_storenr_pd(c+25*size, c25); _mm512_storenr_pd(c+26*size, c26); _mm512_storenr_pd(c+27*size, c27); _mm512_storenr_pd(c+28*size, c28); _mm512_storenr_pd(c+29*size, c29); } int main(int argc, const char ** argv) { #ifdef SIZES const int size = SIZE; #else const int size = atoi(argv[1]); #endif void* p = malloc((sizeof(double)*5*size*size) + ALIGNMENT-1); double *__restrict__ a = (double*)(((size_t)p + ALIGNMENT-1) / ALIGNMENT * ALIGNMENT); double *__restrict__ aT = (double*) a+size*size; double *__restrict__ b = aT+size*size; double *__restrict__ c = b+size*size; double *__restrict__ d = c+size*size; srand(time(NULL)); for(int i = 0; i < size; i++) { for(int j = 0; j < size; j++) { a[i*size+j] = (double) (rand()%20); } for(int j2=0; j2<size; j2++){ c[i*size+j2] = 0.0; } } for(int i = 0; i < size; i++) { for(int j = 0; j < size; j++) { b[i*size+j] = (double) (rand()%20); } } start_time(); recTranspose(a, aT, size, size, size, size); for(int i = 0; i < size; i+=30) { for(int j = 0; j < size; j+=8) { eightbythirty(&aT, &b, &c[i*size+j], size); } } elapsed_time(); double gflops = 2.0*size*size*size*1.0e-03/(microseconds); printf("Gflops: %f\n", gflops); for(int i = 0; i < size; i++) { for(int j = 0; j < size; j++) { double s = 0; for(int u = 0; u < size; u++) { s += a[i*size+u] * b[u*size+j]; } d[i*size+j] = s; } } int error = 0; for(int i = 0; i < size; i++) { for(int j = 0; j < size; j++) { if(abs(c[i*size+j] - d[i*size+j]) > 1) { printf("Error at %d %d , %f instead of %f\n", i, j, c[i*size+j], d[i*size+j]); error++; if(error > 16) return 0; } } } printf("OK\n"); }
So for example, having size 960 (for now it works only with sizes multiples of 30*8):
- if I compile with compile time given size: icc -mmic -O3 -restrict -std=c++11 -DSIZES -DSIZE=960 mmul.cpp -o mmul.o
Elapsed time: 0.460745s
Gflops: 3.840458 - if I compile with runtime given size: icc -mmic -O3 -restrict -std=c++11 mmul.cpp -o mmul.o
Elapsed time: 2.204564s
Gflops: 0.802640
I'm thinking it could be a prefetching issue with icc that can't recognize the memory access pattern. Looking at the generated asm source, the number of vprefetch instructions is much higher in the "compile time" version.
Funny fact: the check for the correct result of the multiplication (the two for loops at the end of the code, rows 178-197) is much slower in the compile-time version!
Any thoughts? I tried the #pragma loop_count but it seems it's useless, also doing manual intrinsic prefetching doesn't seem to be very effective.
Thanks in advance for any answer.
Regards,
Luca
Link Copied
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
I would check to see if register (zmm) pressure exceeds that of the number available.
You have defined 30 c's and your inner loop uses 1 for bv totaling 31. There are 32 available. Each _mm512_extload_pd will require one more temp register, meaning you reach the maximum number of zmm registers on your first _mm512_extload_pd. Though this "might" not make a difference on KNC architecture, it will make a difference on the future KNL architecture. And the compiler optimization may require a few spare registers. In looking at the disassembly of the eightbythirty you will be able to ascertain if some of your c's reverted to memory (stack) variables.
Jim Dempsey
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Thanks for the answer.
I checked and I think the number of available zmm registers is not exceeded.
Here's the asm of the version compiled without defines:
..B1.46: # Preds ..B1.48 ..B1.45 Latency 141 incl %r9d #170.3 c1 movl 152(%rsp), %r12d #171.43 c1 lea (,%r9,8), %r10d #171.4 c5 movq 144(%rsp), %r11 #171.27 c5 movslq %r10d, %r10 #171.27 c9 lea (%r12,%r9,8), %r15d #171.43 c9 lea -64(%r11,%r10,8), %rcx #171.27 c13 movslq %r15d, %r15 #171.34 c13 movq 136(%rsp), %r11 #171.34 c17 movl %r9d, 192(%rsp) # c17 lea -64(%r11,%r15,8), %r12 #171.34 c21 xorl %r15d, %r15d #171.4 c21 vpxorq %zmm0, %zmm0, %zmm0 #171.4 c25 xorl %r11d, %r11d # c25 vpxorq %zmm1, %zmm1, %zmm1 #171.4 c29 movq 160(%rsp), %r8 # c29 vpxorq %zmm2, %zmm2, %zmm2 #171.4 c33 vpxorq %zmm3, %zmm3, %zmm3 #171.4 c37 vpxorq %zmm4, %zmm4, %zmm4 #171.4 c41 vpxorq %zmm5, %zmm5, %zmm5 #171.4 c45 vpxorq %zmm6, %zmm6, %zmm6 #171.4 c49 vpxorq %zmm7, %zmm7, %zmm7 #171.4 c53 vpxorq %zmm8, %zmm8, %zmm8 #171.4 c57 vpxorq %zmm9, %zmm9, %zmm9 #171.4 c61 vpxorq %zmm10, %zmm10, %zmm10 #171.4 c65 vpxorq %zmm11, %zmm11, %zmm11 #171.4 c69 vpxorq %zmm12, %zmm12, %zmm12 #171.4 c73 vpxorq %zmm13, %zmm13, %zmm13 #171.4 c77 vpxorq %zmm14, %zmm14, %zmm14 #171.4 c81 vpxorq %zmm15, %zmm15, %zmm15 #171.4 c85 vpxorq %zmm16, %zmm16, %zmm16 #171.4 c89 vpxorq %zmm17, %zmm17, %zmm17 #171.4 c93 vpxorq %zmm18, %zmm18, %zmm18 #171.4 c97 vpxorq %zmm19, %zmm19, %zmm19 #171.4 c101 vpxorq %zmm20, %zmm20, %zmm20 #171.4 c105 vpxorq %zmm21, %zmm21, %zmm21 #171.4 c109 vpxorq %zmm22, %zmm22, %zmm22 #171.4 c113 vpxorq %zmm23, %zmm23, %zmm23 #171.4 c117 vpxorq %zmm24, %zmm24, %zmm24 #171.4 c121 vpxorq %zmm25, %zmm25, %zmm25 #171.4 c125 vpxorq %zmm26, %zmm26, %zmm26 #171.4 c129 vpxorq %zmm27, %zmm27, %zmm27 #171.4 c133 vpxorq %zmm28, %zmm28, %zmm28 #171.4 c137 vpxorq %zmm29, %zmm29, %zmm29 #171.4 c141
And this is the version compiled with -DSIZES -DSIZE=960
..B1.21: # Preds ..B1.21 ..B1.20 Latency 205 vmovapd (%rax,%r9,8), %zmm31 #171.4 c1 incl %r10d #171.4 c1 vprefetch1 61440(%rax,%r9,8) #171.4 c5 vfmadd231pd (%rbx,%r9,8){1to8}, %zmm31, %zmm30 #171.4 c9 vprefetch0 15360(%rax,%r9,8) #171.4 c13 vfmadd231pd 8(%rbx,%r9,8){1to8}, %zmm31, %zmm29 #171.4 c17 vprefetch1 61440(%rbx,%r9,8) #171.4 c21 vfmadd231pd 16(%rbx,%r9,8){1to8}, %zmm31, %zmm28 #171.4 c25 vprefetch0 15360(%rbx,%r9,8) #171.4 c29 vfmadd231pd 24(%rbx,%r9,8){1to8}, %zmm31, %zmm27 #171.4 c33 vprefetch1 61448(%r8) #171.4 c37 vfmadd231pd 32(%rbx,%r9,8){1to8}, %zmm31, %zmm26 #171.4 c41 vprefetch0 15368(%r8) #171.4 c45 vfmadd231pd 40(%rbx,%r9,8){1to8}, %zmm31, %zmm25 #171.4 c49 vprefetch1 61480(%r8) #171.4 c53 vfmadd231pd 48(%rbx,%r9,8){1to8}, %zmm31, %zmm24 #171.4 c57 vprefetch0 15400(%r8) #171.4 c61 vfmadd231pd 56(%rbx,%r9,8){1to8}, %zmm31, %zmm23 #171.4 c65 vprefetch1 61512(%r8) #171.4 c69 vfmadd231pd 64(%rbx,%r9,8){1to8}, %zmm31, %zmm22 #171.4 c73 vprefetch0 15432(%r8) #171.4 c77 vfmadd231pd 72(%rbx,%r9,8){1to8}, %zmm31, %zmm21 #171.4 c81 vprefetch1 61544(%r8) #171.4 c85 vfmadd231pd 80(%rbx,%r9,8){1to8}, %zmm31, %zmm20 #171.4 c89 vprefetch0 15464(%r8) #171.4 c93 vfmadd231pd 88(%rbx,%r9,8){1to8}, %zmm31, %zmm19 #171.4 c97 vprefetch1 61576(%r8) #171.4 c101 vfmadd231pd 96(%rbx,%r9,8){1to8}, %zmm31, %zmm18 #171.4 c105 vprefetch0 15496(%r8) #171.4 c109 vfmadd231pd 104(%rbx,%r9,8){1to8}, %zmm31, %zmm17 #171.4 c113 vprefetch1 61608(%r8) #171.4 c117 vfmadd231pd 112(%rbx,%r9,8){1to8}, %zmm31, %zmm16 #171.4 c121 vprefetch0 15528(%r8) #171.4 c125 vfmadd231pd 120(%rbx,%r9,8){1to8}, %zmm31, %zmm15 #171.4 c129 vprefetch1 61640(%r8) #171.4 c133 vfmadd231pd 128(%rbx,%r9,8){1to8}, %zmm31, %zmm14 #171.4 c137 vprefetch0 15560(%r8) #171.4 c141 vfmadd231pd 136(%rbx,%r9,8){1to8}, %zmm31, %zmm13 #171.4 c145 vprefetch1 61672(%r8) #171.4 c149 vfmadd231pd 144(%rbx,%r9,8){1to8}, %zmm31, %zmm12 #171.4 c153 vprefetch0 15592(%r8) #171.4 c157 vfmadd231pd 152(%rbx,%r9,8){1to8}, %zmm31, %zmm11 
#171.4 c161 addq $7680, %r8 #171.4 c161 vfmadd231pd 160(%rbx,%r9,8){1to8}, %zmm31, %zmm10 #171.4 c165 vfmadd231pd 168(%rbx,%r9,8){1to8}, %zmm31, %zmm9 #171.4 c169 vfmadd231pd 176(%rbx,%r9,8){1to8}, %zmm31, %zmm8 #171.4 c173 vfmadd231pd 184(%rbx,%r9,8){1to8}, %zmm31, %zmm7 #171.4 c177 vfmadd231pd 192(%rbx,%r9,8){1to8}, %zmm31, %zmm6 #171.4 c181 vfmadd231pd 200(%rbx,%r9,8){1to8}, %zmm31, %zmm5 #171.4 c185 vfmadd231pd 208(%rbx,%r9,8){1to8}, %zmm31, %zmm4 #171.4 c189 vfmadd231pd 216(%rbx,%r9,8){1to8}, %zmm31, %zmm3 #171.4 c193 vfmadd231pd 224(%rbx,%r9,8){1to8}, %zmm31, %zmm2 #171.4 c197 vfmadd231pd 232(%rbx,%r9,8){1to8}, %zmm31, %zmm1 #171.4 c201 addq $960, %r9 #171.4 c201 cmpl $960, %r10d #171.4 c205 jb ..B1.21 # Prob 82% #171.4 c205
Also, I compiled with -no-opt-prefetch, and the performance collapses even in the "good" case (4.310070 s, 0.410544 GFLOPS), so it seems it is all about prefetching.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
The one without the defines is not showing the compute section of the code, rather it is showing the zeroing section of the c's. Please present the compute loop (the equivalent to the ..B1.21: ... through jb ..B1.21, though the label may differ).
Jim Dempsey
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Sorry, my mistake.
..B1.47: # Preds ..B1.46 ..B1.47 Latency 129 movslq %r11d, %r11 #171.4 c1 incl %r15d #171.4 c1 vmovapd (%rcx,%r11,8), %zmm30 #171.4 c5 lea (%r11,%r14,2), %r9d #171.4 c5 vfmadd231pd (%r8,%r11,8){1to8}, %zmm30, %zmm0 #171.4 c9 movslq %r9d, %r9 #171.4 c9 vfmadd231pd 8(%r8,%r11,8){1to8}, %zmm30, %zmm1 #171.4 c13 vprefetch1 (%rcx,%r9,8) #171.4 c13 vfmadd231pd 16(%r8,%r11,8){1to8}, %zmm30, %zmm2 #171.4 c17 vprefetch1 (%r8,%r9,8) #171.4 c17 vfmadd231pd 24(%r8,%r11,8){1to8}, %zmm30, %zmm3 #171.4 c21 vfmadd231pd 32(%r8,%r11,8){1to8}, %zmm30, %zmm4 #171.4 c25 vfmadd231pd 40(%r8,%r11,8){1to8}, %zmm30, %zmm5 #171.4 c29 vfmadd231pd 48(%r8,%r11,8){1to8}, %zmm30, %zmm6 #171.4 c33 vfmadd231pd 56(%r8,%r11,8){1to8}, %zmm30, %zmm7 #171.4 c37 vfmadd231pd 64(%r8,%r11,8){1to8}, %zmm30, %zmm8 #171.4 c41 vfmadd231pd 72(%r8,%r11,8){1to8}, %zmm30, %zmm9 #171.4 c45 vfmadd231pd 80(%r8,%r11,8){1to8}, %zmm30, %zmm10 #171.4 c49 vfmadd231pd 88(%r8,%r11,8){1to8}, %zmm30, %zmm11 #171.4 c53 vfmadd231pd 96(%r8,%r11,8){1to8}, %zmm30, %zmm12 #171.4 c57 vfmadd231pd 104(%r8,%r11,8){1to8}, %zmm30, %zmm13 #171.4 c61 vfmadd231pd 112(%r8,%r11,8){1to8}, %zmm30, %zmm14 #171.4 c65 vfmadd231pd 120(%r8,%r11,8){1to8}, %zmm30, %zmm15 #171.4 c69 vfmadd231pd 128(%r8,%r11,8){1to8}, %zmm30, %zmm16 #171.4 c73 vfmadd231pd 136(%r8,%r11,8){1to8}, %zmm30, %zmm17 #171.4 c77 vfmadd231pd 144(%r8,%r11,8){1to8}, %zmm30, %zmm18 #171.4 c81 vfmadd231pd 152(%r8,%r11,8){1to8}, %zmm30, %zmm19 #171.4 c85 vfmadd231pd 160(%r8,%r11,8){1to8}, %zmm30, %zmm20 #171.4 c89 vfmadd231pd 168(%r8,%r11,8){1to8}, %zmm30, %zmm21 #171.4 c93 vfmadd231pd 176(%r8,%r11,8){1to8}, %zmm30, %zmm22 #171.4 c97 vfmadd231pd 184(%r8,%r11,8){1to8}, %zmm30, %zmm23 #171.4 c101 vfmadd231pd 192(%r8,%r11,8){1to8}, %zmm30, %zmm24 #171.4 c105 vfmadd231pd 200(%r8,%r11,8){1to8}, %zmm30, %zmm25 #171.4 c109 vfmadd231pd 208(%r8,%r11,8){1to8}, %zmm30, %zmm26 #171.4 c113 vfmadd231pd 216(%r8,%r11,8){1to8}, %zmm30, %zmm27 #171.4 c117 vfmadd231pd 224(%r8,%r11,8){1to8}, %zmm30, 
%zmm28 #171.4 c121 vfmadd231pd 232(%r8,%r11,8){1to8}, %zmm30, %zmm29 #171.4 c125 addl %r14d, %r11d #171.4 c125 cmpl %r14d, %r15d #171.4 c129 jb ..B1.47 # Prob 82% #171.4 c129
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
In the above code the vprefetch0's are missing as compared to the code using the #define'd size. This is what you were stating as an assumption (but now can prove).
Assuming the compilation options (and #pragmas) were the same, then this would indicate a deficiency in the auto prefetch insertion by the compiler.
It may be best if you can package this as a reproducer and send it on to Intel for analysis. Intel is always eager to have a good (and simple) reproducer that exposes inefficiency in their optimization.
Jim Dempsey
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
I can submit this to Development for you Lucas for some additional investigation. Are you using the latest 15.0 compiler?
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
jimdempseyatthecove wrote:
In the above code the vprefetch0's are missing as compared to the code using the #define'd size. This is what you were stating as an assumption (but now can prove).
Assuming the compilation options (and #pragmas) were the same, then this would indicate a deficiency in the auto prefetch insertion by the compiler.
It may be best if you can package this as a reproducer and send it on to Intel for analysis. Intel is always eager to have a good (and simple) reproducer that exposes inefficiency in their optimization.
Jim Dempsey
Yes it was exactly the same source code with identical compilation flags and pragmas.
Kevin Davis wrote:
I can submit this to Development for you Lucas for some additional investigation. Are you using the latest 15.0 compiler?
Yes, I'm using the latest version.
If you would be so kind to submit it I can only say thank you!
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Note that there is another thread addressing this same topic on Stack Exchange with a different set of experts.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Thank you Luca A. I submitted this to Development for further analysis (see internal tracking id below). I was unable to convince the compiler to increase the use of prefetches via any compiler option or directive. I will keep you posted on information I receive from Development.
(Internal tracking id: DPD200362905)
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Kevin Davis (Intel) wrote:
Thank you Luca A. I submitted this to Development for further analysis (see internal tracking id below). I was unable to convince the compiler to increase the use of prefetches via any compiler option or directive. I will keep you posted on information I receive from Development.
(Internal tracking id: DPD200362905)
How can I track it?
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
You can't for the most part. Once closed the ids appear in the fixes list published with our releases. I will keep you updated as Development updates the ticket internally and you're welcome to ping for status in this thread at any time

- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page