Hi all,
I just noticed a potential performance bug in the DGEMM implementation of MKL (16.0.1)
when using a single thread. I merely want to make someone at Intel aware of it, in case it is of interest.
Strangely, DGEMM performs better with beta=1 than with beta=0 in certain
situations. Here is an example:
Intel(R) Xeon(R) CPU E5-2650:
m=72, n=373248, k=72, beta=0.00 : 14.25 GF
m=72, n=373248, k=72, beta=1.00 : 18.36 GF
Intel(R) Xeon(R) CPU E5-2650:
m=72, n=373248, k=72, beta=0.00 : 19.25 GF
m=72, n=373248, k=72, beta=1.00 : 28.34 GF
As you can see, the performance difference is significant. It is
actually so significant that it pays off to set C to zero explicitly
before calling MKL and then use the more efficient beta=1
code path instead.
Here is a quick test driver:
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
/*
 * Prototype for the Fortran-ABI BLAS routine DGEMM (here provided by MKL).
 * Per the BLAS reference it computes C := alpha*op(A)*op(B) + beta*C, where
 * op(X) is X or its transpose as selected by transa/transb. Every argument,
 * scalars included, is passed by address as the Fortran calling convention
 * requires.
 */
extern "C" int dgemm_(char *transa,
                      char *transb,
                      int *m,
                      int *n,
                      int *k,
                      double *alpha,
                      double *a,
                      int *lda,
                      double *b,
                      int *ldb,
                      double *beta,
                      double *c,
                      int *ldc);
/*
 * Streams through two large float buffers so that previously cached data is
 * displaced before a timed region.
 *
 * trash1  buffer that is read and updated in place (nTotal elements)
 * trash2  buffer that is only read (nTotal elements)
 * nTotal  number of elements to touch in each buffer
 *
 * Each element is updated as trash1[i] += 0.99 * trash2[i]. The element
 * subscripts are required: without them the statement is pointer arithmetic
 * on the parameters themselves and does not compile.
 */
void trashCache(float* trash1, float* trash2, int nTotal){
    for(int i = 0; i < nTotal; i ++)
        trash1[i] += 0.99 * trash2[i];
}
/* Allocates `count` doubles on a 64-byte boundary; returns NULL on failure. */
static double *allocAlignedDoubles(size_t count)
{
    void *raw = NULL;
    if (posix_memalign(&raw, 64, sizeof(double) * count) != 0)
        return NULL;
    return (double *) raw;
}

/*
 * Benchmark driver: times dgemm_ (A transposed, B not transposed) for
 * m=72, n=72^3, k=72 with the beta given on the command line, and prints the
 * best rate out of three runs in GFLOP/s.
 *
 * Usage: <program> <beta>
 * Returns 0 on success; exits with -1 on a missing/invalid argument and
 * EXIT_FAILURE if a buffer cannot be allocated.
 */
int main(int argc, char ** argv)
{
    if(argc < 2 ){
        printf("Usage: <beta>\n");
        exit(-1);
    }

    /* strtod (unlike atof) lets us reject a non-numeric argument. */
    char *betaEnd = NULL;
    double beta = strtod(argv[1], &betaEnd);
    if (betaEnd == argv[1] || *betaEnd != '\0') {
        printf("Usage: <beta>\n");
        exit(-1);
    }

    int m = 72;
    int n = 72*72*72;
    int k = 72;
    double flops = 2.E-9 * m*n*k;
    double alpha = 1;

    const size_t sizeA = (size_t) m * k;
    const size_t sizeB = (size_t) n * k;
    const size_t sizeC = (size_t) m * n;

    int nTotal = 1024*1024*100;
    float *trash1 = (float*) malloc(sizeof(float)*nTotal);
    float *trash2 = (float*) malloc(sizeof(float)*nTotal);
    double *A = allocAlignedDoubles(sizeA);
    double *B = allocAlignedDoubles(sizeB);
    double *C = allocAlignedDoubles(sizeC);

    if (!trash1 || !trash2 || !A || !B || !C) {
        fprintf(stderr, "Allocation failed\n");
        free(A);
        free(B);
        free(C);
        free(trash1);
        free(trash2);
        return EXIT_FAILURE;
    }

    /*
     * Give every buffer defined contents before it is read. Besides avoiding
     * reads of indeterminate values, this keeps stray NaNs/denormals out of
     * the timed kernel and touches all pages ahead of the measurement.
     */
    for (int i = 0; i < nTotal; i++) {
        trash1[i] = 0.0f;
        trash2[i] = 1.0f;
    }
    for (size_t i = 0; i < sizeA; i++)
        A[i] = 1.0 + 0.001 * (double) (i % 7);
    for (size_t i = 0; i < sizeB; i++)
        B[i] = 1.0 - 0.001 * (double) (i % 5);
    for (size_t i = 0; i < sizeC; i++)
        C[i] = 0.0;

    /* Writable copies: a string literal may not bind to char* in C++. */
    char transa[] = "T";
    char transb[] = "N";

    double minTime = 1e100;
    for (int i=0; i<3; i++){
        trashCache(trash1, trash2, nTotal);
        double t = omp_get_wtime();
        /*
         * With transa = "T" the stored A is k-by-m, so its leading dimension
         * is k (the original passed m, which only worked because m == k).
         */
        dgemm_(transa, transb, &m, &n, &k, &alpha, A, &k, B, &k, &beta, C, &m);
        t = omp_get_wtime() - t;
        minTime = (minTime < t) ? minTime : t;
    }
    printf("m=%d, n=%d, k=%d, beta=%.2f : %.2lf GF\n", m,n,k,beta,flops/minTime);

    free(A);
    free(B);
    free(C);
    free(trash1);
    free(trash2);
    return 0;
}
Best, Paul
链接已复制