- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
The support folks for MKL on the premier site say these are small for MKL to handle efficiently. Does the IPP small matrix library work better for matrices in this size range? What is a typical "small" matrix by this package's standards?
Link Copied
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
IPP was optimized for a typical size of matrices like 3x3, 6x6 and up to 20x20.
Regards,
Vladimir
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
I think IPP should be faster in cases where it was especially optimized (small sizes); the same should be true for MKL for big sizes.
Vladimir
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Coincidentally, I experimented with something very similar just a couple of weeks ago, decided to post, but nuked my post at the last minute. Admittedly, I was only evaluating an eval version of IPP 4.2, but I expect nothing major to have changed w.r.t. the few functions I am using below.
My benchmark initially consisted of creating 100,000 matrices in a loop, inverting them, multiplying the inverse with the original, and printing them out (to /dev/null). I originally tried using 100x100 matrices, but the speed with IPP was unbearably slow compared to almost everything else, including home-grown code; I then realized that IPP is for small matrices after all, and so I decided to use 6x6 matrices so I could use the 6x6 library functions. Here is the code (Linux):
------- IPP benchmark code -------
#include <stdio.h>
#include <stdlib.h>
#include "ipp.h"
main() {
int dim = 6;
srand(1);
for (int n = 0; n < 1e5; n++) {
if (n%10000==0) fprintf(stderr, "iter: %d
", n);
Ipp32f mat[dim*dim];
Ipp32f inv[dim*dim];
Ipp32f mul[dim*dim];
int srcStride1 = dim * sizeof(Ipp32f);
int dstStride1 = dim * sizeof(Ipp32f);
for (int i = 0; i < dim; i++)
for (int j = 0; j < dim; j++)
mat[i*dim+j] = (double) rand()/RAND_MAX;
(void) ippmInvert_m_32f_6x6((const Ipp32f*) mat, srcStride1, inv, dstStride1);
(void) ippmMul_mm_32f_6x6((const Ipp32f*) mat, srcStride1,
(const Ipp32f*) inv, srcStride1, mul, dstStride1);
printf("Orig matrix
"); printMat(mat, dim, dim);
printf("Inverted matrix
"); printMat(inv, dim, dim);
printf("Product matrix
"); printMat(mul, dim, dim);
}
}
// --------------------------------------------------
I compile this with:
g++ -O3 -L /opt/intel/ipp41/ia32_itanium/sharedlib ippmat.cc -o ippmat -I/opt/intel/ipp41/ia32_itanium/include -lippm
And I run it (on my Linux Fedora 7 PC, with a Pentium 4) with:
$ time ippmat >/dev/null
6.733u 0.012s 0:06.75 99.8% 0+0k 0+0io 0pf+0w
$ uname -va
Linux myhost 2.6.22.4-65.fc7 #1 SMP Tue Aug 21 22:36:56 EDT 2007 i686 i686 i386 GNU/Linux
$ cat /proc/cpuinfo
...
vendor_id : GenuineIntel
cpu family : 15
model : 4
model name : Intel Pentium 4 CPU 3.20GHz
stepping : 3
cpu MHz : 2800.000
cache size : 2048 KB
...
// ----------- Home-grown non-IPP code -------------
#include
#include
#include
/* Dense matrix stored as an array of row pointers, one heap allocation per
   row; created by mat_create() and released by mat_destroy(). */
struct Matrix {
int nRows;      /* number of rows */
int nCols;      /* number of columns */
double **data;  /* data[row][col]: nRows pointers, each to nCols doubles */
};
/*
 * Allocate an nRows x nCols matrix, zero-initialized.
 * Caller owns the result and must release it with mat_destroy().
 *
 * BUG FIX: the forum rendering dropped the [i] subscript, so the loop
 * overwrote A->data itself on every iteration (leaking every previous
 * row and the pointer array); each row must go into A->data[i].
 */
Matrix *mat_create(int nRows, int nCols) {
    Matrix *A = (Matrix *) calloc(1, sizeof(Matrix));
    A->nRows = nRows;
    A->nCols = nCols;
    A->data = (double**) calloc(nRows, sizeof(double*));
    for (int i = 0; i < nRows; i++)
        A->data[i] = (double*) calloc(nCols, sizeof(double));
    return A;
}
/*
 * Free a matrix created by mat_create() and null out the caller's pointer.
 * Safe to call with NULL or an already-destroyed (*A == NULL) handle.
 *
 * BUG FIX: the forum rendering dropped the [i] subscript, making the loop
 * free p->data itself nRows times (double free / UB); each row pointer
 * p->data[i] must be freed, then the row-pointer array, then the struct.
 */
void mat_destroy(Matrix** A) {
    if (!A || !*A) return;
    Matrix *p = *A;
    for (int i = 0; i < p->nRows; i++)
        free(p->data[i]);   /* each row */
    free(p->data);          /* the row-pointer array */
    free(p);
    *A = 0; // To prevent double frees
}
/* Exchange rows r1 and r2 in O(1) by swapping their row pointers. */
inline void mat_rowSwap(Matrix *m, int r1, int r2){
    double *saved = m->data[r1];
    m->data[r1] = m->data[r2];
    m->data[r2] = saved;
}
/* Scale every entry of the given row by val. */
inline void mat_rowMul(Matrix *m, int row, double val){
    double *r = m->data[row];
    for (int c = 0, n = m->nCols; c < n; c++)
        r[c] *= val;
}
/* Elementary row operation: row rTo += val * row rFrom. */
inline void mat_rowMulAdd(Matrix *m, int rFrom, int rTo, double val) {
    const double *src = m->data[rFrom];
    double *dst = m->data[rTo];
    for (int c = 0, n = m->nCols; c < n; c++)
        dst[c] += val * src[c];
}
/*
 * Reduce m in place to reduced row echelon form (Gauss-Jordan
 * elimination with partial pivoting).
 *
 * BUG FIX: absval was declared uninitialized and read in the inner
 * while condition — undefined behavior on the first pass — and it was
 * never reset between pivots, so the pivot search could only run for
 * the first pivot. It must restart at 0.0 for every pivot row.
 */
void mat_rref(Matrix *m) {
    int pivCol = 0, pivRow = 0;
    while (pivRow < m->nRows) {
        double absval = 0.0;   /* reset the pivot search for each pivot row */
        int best = pivRow;     /* row holding the largest |entry| in pivCol */
        while (absval == 0.0 && pivCol < m->nCols) {
            absval = fabs(m->data[pivRow][pivCol]);
            best = pivRow;
            for (int row = pivRow+1; row < m->nRows; row++) {
                if (fabs(m->data[row][pivCol]) > absval) {
                    absval = fabs(m->data[row][pivCol]);
                    best = row;
                }
            }
            if (absval == 0) pivCol++;   /* column all zeros below pivRow: skip */
        }
        if (pivCol >= m->nCols || pivRow >= m->nRows) return;
        if (pivRow != best) mat_rowSwap(m, best, pivRow);    // Move to top
        mat_rowMul(m, pivRow, 1.0/m->data[pivRow][pivCol]);  // Rescale pivot to 1
        for (int row = 0; row < m->nRows; row++) {           // Eliminate column
            if (row == pivRow) continue;
            mat_rowMulAdd(m, pivRow, row, -m->data[row][pivCol]);
        }
        pivRow++; // Step down the diagonal
        pivCol++;
    }
}
/*
 * Invert m via Gauss-Jordan: build the augmented matrix [m | I],
 * row-reduce it, and read the inverse out of the right half.
 * Caller owns the returned matrix (release with mat_destroy).
 * Assumes m is square and nonsingular — TODO confirm with original author.
 *
 * NOTE(review): the forum renderer stripped the [i][j] subscripts in
 * this function; they are reconstructed here from the obvious intent.
 */
Matrix *mat_invert(const Matrix *m) { // invert using RREF
    Matrix *tmp = mat_create(m->nRows, m->nCols*2);
    Matrix *inv = mat_create(m->nRows, m->nCols);
    for (int i = 0; i < m->nRows; i++) {
        tmp->data[i][i + m->nCols] = 1;       /* identity block on the right */
        for (int j = 0; j < m->nCols; j++)
            tmp->data[i][j] = m->data[i][j];  /* copy of m on the left */
    }
    mat_rref(tmp);
    for (int i = 0; i < m->nRows; i++)
        for (int j = 0; j < m->nCols; j++)
            inv->data[i][j] = tmp->data[i][j + m->nCols];
    mat_destroy(&tmp);
    return inv;
}
/*
 * Return the product A*B as a newly allocated matrix (caller frees with
 * mat_destroy). Assumes A->nCols == B->nRows — TODO confirm; no check here.
 *
 * NOTE(review): the forum renderer stripped the subscripts from the
 * accumulation line; restored to the standard triple-loop product.
 */
Matrix *mat_mul(const Matrix *A, const Matrix *B) {
    Matrix *P = mat_create(A->nRows, B->nCols);   /* zero-initialized */
    for (int j = 0; j < B->nCols; j++)
        for (int i = 0; i < A->nRows; i++)
            for (int k = 0; k < A->nCols; k++)
                P->data[i][j] += A->data[i][k] * B->data[k][j];
    return P;
}//--------------------------------------------------------------------------------------
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Apologies, the rendering of the previous post seems to be completely messed up! I'll investigate and edit...
&
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Anyway, the assumption that nothing has changed in IPP 5.2 since IPP 4.1 is not correct. There were two years of hard work on optimizing IPP and developing new functions, especially in the ippMX domain, between those two versions. Please try the latest version of IPP.
Vladimir
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Ok Thanks. We'll try it out with the new library this weekend.
&
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
$ time 4.1.ippmat >/dev/null
iter: 100000
...
iter: 900000
46.685u 0.084s 0:46.86 99.7% 0+0k 0+0io 0pf+0w
$ time 5.2.ippmat > /dev/null
iter: 100000
...
iter: 900000
49.631u 0.100s 0:49.80 99.8% 0+0k 0+0io 0pf+0w
$ time cvmat > /dev/null
iter: 100000
...
iter: 900000
52.738u 0.111s 0:53.09 99.5% 0+0k 2832+0io 15pf+0w
$ time mymat > /dev/null
iter: 100000
...
iter: 900000
52.988u 0.089s 0:53.15 99.8% 0+0k 0+0io 0pf+0w
// ---------- IPP 5.2 Matrix benchmark code -------------------
#include
#include
#include "ipp.h"
/*
 * Print an h x w row-major matrix to stdout, one row per line, with a
 * blank line after the matrix.
 *
 * NOTE(review): the forum renderer stripped the \n escape sequences,
 * leaving bare printf(" ") calls; restored to newlines here (the timing
 * transcripts show one row per line).
 */
void printMat(Ipp64f mat[], int w, int h) {
    for (int i = 0; i < h; i++) {
        for (int j = 0; j < w; j++)
            printf("%3.2f ", mat[i*w+j]);
        printf("\n");
    }
    printf("\n");
}
/*
 * IPP 5.2 benchmark: invert 1,000,000 random 6x6 matrices with the
 * general ippmInvert_m_64f / ippmMul_mm_64f API (which takes strides and
 * a scratch buffer, unlike the fixed-size 6x6 entry points of IPP 4.x)
 * and multiply each inverse by the original. Redirect stdout to
 * /dev/null when timing.
 *
 * NOTE(review): restored the \n stripped by the forum renderer, gave
 * main an explicit int return type, and dropped the unused IppStatus
 * local; the IPP return statuses are still unchecked, as in the post.
 */
int main() {
    int dim = 6;
    int widthHeight = 6;
    srand(1);
    // Profile block
    for (int n = 0; n < 1000000; n++) {
        if (n % 100000 == 0) fprintf(stderr, "iter: %d\n", n);
        Ipp64f mat[dim*dim];
        int srcStride1 = dim * sizeof(Ipp64f);   /* bytes between rows */
        int dstStride1 = dim * sizeof(Ipp64f);
        int srcStride2 = sizeof(Ipp64f);         /* bytes between elements */
        int dstStride2 = sizeof(Ipp64f);
        Ipp64f pBuffer[dim*dim+dim];             /* scratch for ippmInvert_m_64f */
        for (int i = 0; i < dim; i++)
            for (int j = 0; j < dim; j++)
                mat[i*dim+j] = (double) rand()/RAND_MAX;   /* uniform in [0,1] */
        Ipp64f inv[dim*dim];
        ippmInvert_m_64f((const Ipp64f*) mat, srcStride1, srcStride2,
                         pBuffer, inv, dstStride1, dstStride2, widthHeight);
        Ipp64f mul[dim*dim];
        ippmMul_mm_64f((const Ipp64f*) mat, srcStride1, srcStride2, dim, dim,
                       (const Ipp64f*) inv, srcStride1, srcStride2, dim, dim,
                       mul, dstStride1, dstStride2);
        printf("Inverted matrix\n"); printMat(inv, dim, dim);
        printf("Product matrix\n");  printMat(mul, dim, dim);
    }
    // Profile block
    return 0;
}
//------------------------------------------------
As before, we compiled with:
g++ -O3 5.2.ippmat.cc -o 5.2.ippmat -I/opt/intel/ipp/5.2/ia32/include -lippm -L/opt/intel/ipp/5.2/ia32/sharedlib -lippcore
Thanks.
&
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi,
there is comment from our expert:
Such performance results seem to be true.
The new API has led to some performance regression for operation with one small matrix
because of the additional overhead on parameters check and branching.
The main distinctive feature of IPP small matrices domain is matrix arrays processing.
The optimization has been focused on operations with large amount of small matrices (sizes 3x3, 4x4, 5x5, 6x6).
Our advice is to use the matrix-array functions such as ippmInvert_ma and ippmMul_mama,
+ use dynamic linking in order to see advantage from parallelization.
Regards,
Vladimir
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
&

- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page