Intel® Integrated Performance Primitives
Deliberate problems developing high-performance vision, signal, security, and storage applications.

What am I doing wrong?

manimal
Beginner
533 Views
Hello all! I have some code that subtracts and multiplies some images, and the hand-coded calculation is faster than the IPP one. The code is listed below. I am using VS 2005 (I think the code should work in linux, but I haven't tried it), and the output I get is:
Time for calculation (800 frames): 1.689512 (s)
Time for one frame: 0.002112 (s)
Frequency: 473.509565
============================
ippit7-5.2.dll 5.2 5.2.108.410
============================
Time for IPP calculation (800 frames): 1.929452 (s)
Time for one frame: 0.002412 (s)
IPP Frequency: 414.625492
maxError: 1
Thank you for any help you can give.

Eric

#ifdef WIN32

#include

#else

#include

#include

#endif

#include

#include

#include

#include

#define YDIM 1280

#define XDIM 150

#define LINES 80

#define MAX_USHORT 65535

#define TIMING 10

#define MAX(a,b) (((a)<(b))?(b):(a))

#define MIN(a,b) (((a)>(b))?(b):(a))

double randomu();

double GetTimeOfDay(void);

/*
 * Prints the name and version fields of the IPP image library, as reported by
 * ippiGetLibVersion(), between two separator lines.
 *
 * Each line ends with a newline: the posted listing had lost the newline
 * escapes, while the program output quoted alongside it shows three lines.
 */
void libinfo(void) {
    const IppLibraryVersion *lib = ippiGetLibVersion();

    printf("============================\n");
    printf("%s %s %d.%d.%d.%d\n", lib->Name, lib->Version,
           lib->major, lib->minor, lib->majorBuild, lib->build);
    printf("============================\n");
}

int main(int argc, char *argv[]) {

int ii,jj;

unsigned short *outData1,*outData2,*data,*subtractM;

static int tmpVal;

float *multiplyM;

double start,stop;

Ipp16u *tmpInt,*tmpSub;

Ipp32f *tmpFlt,*tmpMult;

int intStep,fltStep,subStep,multStep;

IppiSize roi={XDIM,YDIM};

int maxError=0;

data=malloc(YDIM*LINES*XDIM*sizeof(unsigned short));

outData1=malloc(YDIM*LINES*XDIM*sizeof(unsigned short));

outData2=malloc(YDIM*LINES*XDIM*sizeof(unsigned short));

subtractM=malloc(YDIM*XDIM*sizeof(unsigned short));

multiplyM=malloc(YDIM*XDIM*sizeof(float));

//Generate random data

for(ii=0;ii

data[ii]=(unsigned short)(10000.0 + (2.0*randomu()-1.0)*2500.0);

for(ii=0;ii

subtractM[ii]=(unsigned short)(1000.0 + (2.0*randomu()-1.0)*200.0);

for(ii=0;ii

multiplyM[ii]=(float)(1.0 + (2.0*randomu()-1.0)*0.2);

/////////////////////////////////////////////////////

// Do the calculation

/////////////////////////////////////////////////////

start=GetTimeOfDay();

for(ii=0;ii

for(jj=0;jj

// Subtract and multiply

tmpVal=(int)((data[(ii%LINES)*YDIM*XDIM+jj]-subtractM[jj])*multiplyM[jj]);//+0.5;

// Data checks to make sure the data is still in range.

if ((tmpVal < 0) || (tmpVal > MAX_USHORT)) {

const int a=tmpVal;

tmpVal=MAX(0,a);

tmpVal=MIN(MAX_USHORT,a);

}

// Implicit conversion to short

outData1[(ii%LINES)*YDIM*XDIM+jj]=tmpVal;

}

}

stop=GetTimeOfDay();

/////////////////////////////////////////////////////

// End calculation

/////////////////////////////////////////////////////

printf("Time for calculation (%i frames): %f (s) ",TIMING*LINES, stop-start);

printf("Time for one frame: %f (s) ",(stop-start)/(TIMING*LINES));

printf("Frequency: %f ",(LINES*TIMING)/(stop-start));

/////////////////////////////////////////////////////

// Do th e calculation with IPP

/////////////////////////////////////////////////////

libinfo();

tmpInt=ippiMalloc_16s_C1(XDIM,YDIM,&intStep);

tmpFlt=ippiMalloc_32f_C1(XDIM,YDIM,&fltStep);

tmpSub=ippiMalloc_16u_C1(XDIM,YDIM,&subStep);

tmpMult=ippiMalloc_32f_C1(XDIM,YDIM,&multStep);

ippiCopy_16u_C1R(subtractM,XDIM*sizeof(short),tmpSub,subStep,roi);

ippiCopy_32f_C1R(multiplyM,XDIM*sizeof(float),tmpMult,multStep,roi);

start=GetTimeOfDay();

for(ii=0;ii

//Subtract

ippiSub_16u_C1RSfs(tmpSub,subStep,&data[(ii%LINES)*YDIM*XDIM],XDIM*sizeof(short),tmpInt,intStep,roi,0);

//Convert data to floats

ippiConvert_16s32f_C1R(tmpInt,intStep,tmpFlt,fltStep,roi);

//Multiply

ippiMul_32f_C1IR(tmpMult,multStep,tmpFlt,fltStep,roi);

//Threshold for converting back to shorts

ippiThreshold_GTVal_32f_C1IR(tmpFlt,fltStep,roi,MAX_USHORT-1,MAX_USHORT-1);

ippiThreshold_LTVal_32f_C1IR(tmpFlt,fltStep,roi,0,0);

//Convert back to shorts

ippiConvert_32f16u_C1R(tmpFlt,fltStep,&outData2[(ii%LINES)*YDIM*XDIM],

XDIM*sizeof(short),roi,ippRndZero);

}

stop=GetTimeOfDay();

ippiFree(tmpInt);

ippiFree(tmpMult);

ippiFree(tmpSub);

ippiFree(tmpFlt);

/////////////////////////////////////////////////////

// End calculation

/////////////////////////////////////////////////////

printf("Time for IPP calculation (%i frames): %f (s) ",TIMING*LINES, stop-start);

printf("Time for one frame: %f (s) ",(stop-start)/(TIMING*LINES));

printf("IPP Frequency: t %f ",(LINES*TIMING)/(stop-start));

for(jj=0;jj

if (abs(outData1[jj]-outData2[jj]) > maxError)

maxError=abs(outData1[jj]-outData2[jj]);

}

printf("maxError: %i ",maxError);

free(subtractM);

free(multiplyM);

free(data);

free(outData1);

free(outData2);

return 0;

}

/* Returns the next rand() value scaled by RAND_MAX, i.e. a value in [0, 1]. */
double randomu() {
    const double sample = (double)rand();
    const double scale = (double)RAND_MAX;

    return sample / scale;
}

/*
 * Returns a timestamp in seconds as a double.
 *
 * WIN32 builds divide the performance-counter reading by the counter
 * frequency (queried once and cached). Other builds return gettimeofday()'s
 * seconds plus microseconds.
 */
double GetTimeOfDay(void) {
#ifdef WIN32
    static LARGE_INTEGER ticksPerSecond;
    LARGE_INTEGER now;

    /* Static storage starts at zero, so this runs on the first call only. */
    if (ticksPerSecond.QuadPart == 0) {
        QueryPerformanceFrequency(&ticksPerSecond);
    }
    QueryPerformanceCounter(&now);

    return (double)now.QuadPart / ticksPerSecond.QuadPart;
#else
    struct timeval tv;
    double wholeSeconds;
    double fraction;

    gettimeofday(&tv, NULL);
    wholeSeconds = (double)tv.tv_sec;
    fraction = (double)tv.tv_usec / 1000000.0;

    return (double)(wholeSeconds + fraction);
#endif
}

0 Kudos
8 Replies
manimal
Beginner
533 Views
Wow, that formatting is really bad. Sorry about that.

Eric
0 Kudos
manimal
Beginner
533 Views
Here's some more timing info, this time using 64-bit Linux. With GCC:
Time for calculation (800 frames): 0.776799 (s)
Time for one frame: 0.000971 (s)
Frequency: 1029.867492
============================
libippim7.so.5.2 5.2 5.2.108.410
============================
Time for IPP calculation (800 frames): 2.083939 (s)
Time for one frame: 0.002605 (s)
IPP Frequency: 383.888382
maxError: 0

With Intel's compiler:
Time for calculation (800 frames): 1.013748 (s)
Time for one frame: 0.001267 (s)
Frequency: 789.150809
============================
libippim7.so.5.2 5.2 5.2.108.410
============================
Time for IPP calculation (800 frames): 2.038634 (s)
Time for one frame: 0.002548 (s)
IPP Frequency: 392.419664
maxError: 0

The only notable compiler option I am using is -O2.
0 Kudos
Vladimir_Dudnik
Employee
533 Views

Hello,

An IPP expert recommends changing the code in the following way (to optimize memory access):

#ifdef WIN32

#include

#else

#include

#include

#endif

#include

#include

#include

#include

#define YDIM 1280

#define XDIM 150

#define LINES 80

#define MAX_USHORT 65535

#define TIMING 10

#define MAX(a,b) (((a)<(b))?(b):(a))

#define MIN(a,b) (((a)>(b))?(b):(a))

double randomu();

double GetTimeOfDay(void);

/*
 * Prints the IPP image library's name and version fields, taken from
 * ippiGetLibVersion(), framed by two separator lines.
 *
 * The newline escapes missing from the posted format strings are restored so
 * the output matches the three-line form quoted in the thread.
 */
void libinfo(void) {
    const IppLibraryVersion *version = ippiGetLibVersion();

    printf("============================\n");
    printf("%s %s %d.%d.%d.%d\n", version->Name, version->Version,
           version->major, version->minor, version->majorBuild, version->build);
    printf("============================\n");
}

int main(int argc, char *argv[]) {

int ii,jj, i;

unsigned short *outData1,*outData2,*data,*subtractM;

static int tmpVal;

float *multiplyM;

double start,stop;

Ipp16u *tmpInt,*tmpSub;

Ipp32f *tmpFlt,*tmpMult;

int intStep,fltStep,subStep,multStep;

IppiSize roi={XDIM,YDIM};

int maxError=0;

data=malloc(YDIM*LINES*XDIM*sizeof(unsigned short));

outData1=malloc(YDIM*LINES*XDIM*sizeof(unsigned short));

outData2=malloc(YDIM*LINES*XDIM*sizeof(unsigned short));

subtractM=malloc(YDIM*XDIM*sizeof(unsigned short));

multiplyM=malloc(YDIM*XDIM*sizeof(float));

//Generate random data

for(ii=0;ii

data[ii]=(unsigned short)(10000.0 + (2.0*randomu()-1.0)*2500.0);

for(ii=0;ii

subtractM[ii]=(unsigned short)(1000.0 + (2.0*randomu()-1.0)*200.0);

for(ii=0;ii

multiplyM[ii]=(float)(1.0 + (2.0*randomu()-1.0)*0.2);

/////////////////////////////////////////////////////

// Do the calculation

/////////////////////////////////////////////////////

start=GetTimeOfDay();

for(ii=0;ii

for(jj=0;jj

// Subtract and multiply

tmpVal=(int)((data[(ii%LINES)*YDIM*XDIM+jj]-subtractM[jj])*multiplyM[jj]);//+0.5;

// Data checks to make sure the data is still in range.

if ((tmpVal < 0) || (tmpVal > MAX_USHORT)) {

const int a=tmpVal;

tmpVal=MAX(0,a);

tmpVal=MIN(MAX_USHORT,a);

}

// Implicit conversion to short

outData1[(ii%LINES)*YDIM*XDIM+jj]=tmpVal;

}

}

stop=GetTimeOfDay();

/////////////////////////////////////////////////////

// End calculation

/////////////////////////////////////////////////////

printf("Time for calculation (%i frames): %f (s) ",TIMING*LINES, stop-start);

printf("Time for one frame: %f (s) ",(stop-start)/(TIMING*LINES));

printf("Frequency: %f ",(LINES*TIMING)/(stop-start));

/////////////////////////////////////////////////////

// Do the calculation with IPP

/////////////////////////////////////////////////////

libinfo();

tmpInt=ippiMalloc_16u_C1(XDIM,YDIM,&intStep);

tmpFlt=ippiMalloc_32f_C1(XDIM,YDIM,&fltStep);

tmpSub=ippiMalloc_16u_C1(XDIM,YDIM,&subStep);

tmpMult=ippiMalloc_32f_C1(XDIM,YDIM,&multStep);

ippiCopy_16u_C1R(subtractM,XDIM*sizeof(short),tmpSub,subStep,roi);

ippiCopy_32f_C1R(multiplyM,XDIM*sizeof(float),tmpMult,multStep,roi);

start=GetTimeOfDay();

for(ii=0;ii

for( i = 0; i < YDIM; i++ ){

Ipp16u *tmpin = &data[(ii%LINES)*YDIM*XDIM] + i * XDIM;

Ipp16u *tmpint = (Ipp16u*)((Ipp8u*)tmpInt + intStep * i );

Ipp16u *tmpins = (Ipp16u*)((Ipp8u*)tmpSub + subStep * i );

Ipp32f *tmpflt = tmpFlt;//(Ipp32f*)((Ipp8u*)tmpFlt + fltStep * i );

Ipp32f *tmpmul = (Ipp32f*)((Ipp8u*)tmpMult + multStep * i );

Ipp16u *tmpout = &outData2[(ii%LINES)*YDIM*XDIM] + i * XDIM;

ippsSub_16u_Sfs( tmpins, tmpin, tmpint, XDIM, 0 );

ippsConvert_16u32f( tmpint, tmpflt, XDIM );

ippsMul_32f_I( tmpmul, tmpflt, XDIM );

ippsConvert_32f16u_Sfs( tmpflt, tmpout, XDIM, ippRndZero, 0 );

}

}

stop=GetTimeOfDay();

ippiFree(tmpInt);

ippiFree(tmpMult);

ippiFree(tmpSub);

ippiFree(tmpFlt);

/////////////////////////////////////////////////////

// End calculation

/////////////////////////////////////////////////////

printf("Time for IPP calculation (%i frames): %f (s) ",TIMING*LINES, stop-start);

printf("Time for one frame: %f (s) ",(stop-start)/(TIMING*LINES));

printf("IPP Frequency: %f ",(LINES*TIMING)/(stop-start));

for(jj=0;jj

if (abs(outData1[jj]-outData2[jj]) > maxError)< p="">

maxError=abs(outData1[jj]-outData2[jj]);

}

printf("maxError: %i ",maxError);

free(subtractM);

free(multiplyM);

free(data);

free(outData1);

free(outData2);

return 0;

}

/* Draws one value from rand() and scales it by RAND_MAX, giving a double in [0, 1]. */
double randomu() {
    double ratio = (double)rand();

    ratio /= (double)RAND_MAX;
    return ratio;
}

/*
 * Current time in seconds, as a double.
 *
 * With WIN32 defined the value is the performance counter divided by its
 * frequency; the frequency is fetched on the first call and kept in a static.
 * Otherwise the value is built from gettimeofday()'s tv_sec and tv_usec.
 */
double GetTimeOfDay(void) {
#ifdef WIN32
    static LARGE_INTEGER counterFreq;
    LARGE_INTEGER counter;

    if (counterFreq.QuadPart == 0) {
        QueryPerformanceFrequency(&counterFreq);
    }

    QueryPerformanceCounter(&counter);
    return (double)counter.QuadPart / counterFreq.QuadPart;
#else
    struct timeval stamp;
    double result;

    gettimeofday(&stamp, NULL);

    /* Microseconds contribute the fractional part. */
    result = (double)stamp.tv_sec;
    result = result + (double)stamp.tv_usec / 1000000.0;
    return result;
#endif
}

Regards,
Vladimir

0 Kudos
manimal
Beginner
533 Views
Holy Cow! That's much better! Here's the timing now:
Time for calculation (800 frames): 1.738825 (s)
Time for one frame: 0.002174 (s)
Frequency: 460.080812
============================
ippit7-5.2.dll 5.2 5.2.108.410
============================
Time for IPP calculation (800 frames): 0.699127 (s)
Time for one frame: 0.000874 (s)
IPP Frequency: 1144.284661
maxError: 1
Thanks Vladimir, and let your engineer know I am grateful!
0 Kudos
manimal
Beginner
533 Views
Okay, I'm trying to figure out where the speed improvements are coming from. First, I've tried to replace my original code with the signal processing calls that Vladimir suggested but without all the clever pointers:
 // Whole-frame variant: scratch buffers come from plain malloc and each ipps
 // call processes all XDIM*YDIM samples of a frame at once.
 tmpInt=malloc(XDIM*YDIM*sizeof(short));
tmpFlt=malloc(XDIM*YDIM*sizeof(float));

start=GetTimeOfDay();

// NOTE(review): the loop header below was truncated when the post was
// rendered; presumably it read `ii<TIMING*LINES;ii++){` -- confirm.
for(ii=0;ii
// tmpInt receives the subtraction result for the frame in slot ii%LINES.
ippsSub_16u_Sfs(subtractM,&data[(ii%LINES)*YDIM*XDIM],tmpInt,
XDIM*YDIM, 0 );
ippsConvert_16u32f( tmpInt, tmpFlt, XDIM*YDIM );
// In-place multiply of the float scratch buffer by multiplyM.
ippsMul_32f_I( multiplyM, tmpFlt, XDIM*YDIM );
ippsConvert_32f16u_Sfs( tmpFlt, &outData2[(ii%LINES)*YDIM*XDIM],
XDIM*YDIM, ippRndZero, 0 );
}
This does pretty well too, but not as well as Vladimir's:
Time for calculation (800 frames):      1.700834 (s)
Time for one frame: 0.002126 (s)
Frequency: 470.357378
============================
ippit7-5.2.dll 5.2 5.2.108.410
============================
Time for IPP calculation (800 frames): 0.815084 (s)
Time for one frame: 0.001019 (s)
IPP Frequency: 981.493513
maxError: 1
So, I think the lesson is, use the signal processing calls if you can.
0 Kudos
manimal
Beginner
533 Views
Vladimir's code performs the calculation in rows. So the next thing is to try the calculation in rows, but without all the clever pointer arithmetic:
 // Row-wise variant: one set of ipps calls per image row (XDIM samples), with
 // every row address computed inline in the argument lists.
 // NOTE(review): the outer loop header was truncated when the post was
 // rendered; presumably `for(ii=0;ii<TIMING*LINES;ii++)` -- confirm.
 for(ii=0;ii for( i = 0; i < YDIM; i++ ){
ippsSub_16u_Sfs(&subtractM[i*XDIM],&data[(ii%LINES)*YDIM*XDIM+i*XDIM],&tmpInt[i*XDIM],
XDIM, 0 );
// Here the float scratch is indexed per row (row i of a full-frame tmpFlt).
ippsConvert_16u32f( &tmpInt[i*XDIM], &tmpFlt[i*XDIM], XDIM );
ippsMul_32f_I( &multiplyM[i*XDIM], &tmpFlt[i*XDIM], XDIM );
ippsConvert_32f16u_Sfs( &tmpFlt[i*XDIM], &outData2[(ii%LINES)*YDIM*XDIM+i*XDIM],
XDIM, ippRndZero, 0 );
}
This is actually slower:
Time for calculation (800 frames): 1.694970 (s)
Time for one frame: 0.002119 (s)
Frequency: 471.984726
============================
ippit7-5.2.dll 5.2 5.2.108.410
============================
Time for IPP calculation (800 frames): 1.077069 (s)
Time for one frame: 0.001346 (s)
IPP Frequency: 742.756803
maxError: 1




0 Kudos
manimal
Beginner
533 Views
Now, try the same thing as above, but with clever pointers (though not as clever as Vladimir's):
// Row-wise variant with the row addresses computed once into local pointers
// before the ipps calls.
// NOTE(review): the outer loop header was truncated when the post was
// rendered; presumably `for(ii=0;ii<TIMING*LINES;ii++)` -- confirm.
for(ii=0;ii for( i = 0; i < YDIM; i++ ){
unsigned short *tmpin = &data[(ii%LINES)*YDIM*XDIM + i * XDIM];
unsigned short *tmpint = &tmpInt[XDIM * i];
unsigned short *tmpins = &subtractM[XDIM * i];
// Not offset by row: every row reuses the start of tmpFlt as float scratch.
float *tmpflt = tmpFlt;
float *tmpmul = &multiplyM[XDIM * i];
unsigned short *tmpout = &outData2[(ii%LINES)*YDIM*XDIM + i * XDIM];

ippsSub_16u_Sfs( tmpins, tmpin, tmpint, XDIM, 0 );
ippsConvert_16u32f( tmpint, tmpflt, XDIM );
ippsMul_32f_I( tmpmul, tmpflt, XDIM );
ippsConvert_32f16u_Sfs( tmpflt, tmpout, XDIM, ippRndZero, 0 );

}
This code is just about as fast as Vladimir's code:
Time for calculation (800 frames):      1.692036 (s)
Time for one frame: 0.002115 (s)
Frequency: 472.803299
============================
ippit7-5.2.dll 5.2 5.2.108.410
============================
Time for IPP calculation (800 frames): 0.696450 (s)
Time for one frame: 0.000871 (s)
IPP Frequency: 1148.682410
maxError: 1
So try not to do pointer arithmetic in IPP functions.


0 Kudos
manimal
Beginner
533 Views
Since tmpFlt and tmpInt are now only used to store a single row, allocate just one row for each and skip the pointer arithmetic on them entirely:
// Single-row scratch variant: tmpInt and tmpFlt hold just one row (XDIM
// samples) each and are reused for every row of every frame.
tmpInt=malloc(XDIM*sizeof(short));
tmpFlt=malloc(XDIM*sizeof(float));

start=GetTimeOfDay();

// NOTE(review): the outer loop header was truncated when the post was
// rendered; presumably `for(ii=0;ii<TIMING*LINES;ii++)` -- confirm.
for(ii=0;ii for( i = 0; i < YDIM; i++ ){
// Row i of the input frame, the subtract image, the multiply image and the output frame.
unsigned short *tmpin = &data[(ii%LINES)*YDIM*XDIM + i * XDIM];
unsigned short *tmpins = &subtractM[XDIM * i ];
float *tmpmul = &multiplyM[XDIM* i ];
unsigned short *tmpout = &outData2[(ii%LINES)*YDIM*XDIM + i * XDIM];

ippsSub_16u_Sfs( tmpins, tmpin, tmpInt, XDIM, 0 );
ippsConvert_16u32f( tmpInt, tmpFlt, XDIM );
ippsMul_32f_I( tmpmul, tmpFlt, XDIM );
ippsConvert_32f16u_Sfs( tmpFlt, tmpout, XDIM, ippRndZero, 0 );

}
This is pretty fast now:
Time for calculation (800 frames): 1.702078 (s)
Time for one frame: 0.002128 (s)
Frequency: 470.013800
============================
ippit7-5.2.dll 5.2 5.2.108.410
============================
Time for IPP calculation (800 frames): 0.499302 (s)
Time for one frame: 0.000624 (s)
IPP Frequency: 1602.236225
maxError: 1
I think this is where I am going to quit. Here's my summary:
  1. Use the IPPS library instead of IPPI wherever you can.
  2. Do calculation with small chunks of data (rows in this case), rather than all at once. (I guess? Maybe? I'm not sure about this one.)
  3. Do pointer math, but not in IPP functions (and don't do extra pointer math).
0 Kudos
Reply