What am I doing wrong?

manimal · ‎07-27-2007

Hello all! I have some code that subtracts and multiplies some images, and the hand-coded calculation is faster than the IPP one. The code is listed below. I am using VS 2005 (I think the code should work in linux, but I haven't tried it), and the output I get is:

Time for calculation (800 frames): 1.689512 (s)
Time for one frame: 0.002112 (s)
Frequency: 473.509565
============================
ippit7-5.2.dll 5.2 5.2.108.410
============================
Time for IPP calculation (800 frames): 1.929452 (s)
Time for one frame: 0.002412 (s)
IPP Frequency: 414.625492
maxError: 1

Thank you for any help you can give.

Eric

#ifdef WIN32
#include 
#else#include 
#include #endif
#include 
#include 
#include 
#include 
/* Frame dimensions: main() builds its ROI as {XDIM, YDIM} and treats each
 * frame as YDIM*XDIM elements. */
#define YDIM 1280
#define XDIM 150
/* Number of frames held in the data/output buffers (frames are indexed
 * modulo LINES). */
#define LINES 80
/* Upper bound used when clamping results back to unsigned short. */
#define MAX_USHORT 65535
/* Repeat factor: the reported frame count is TIMING*LINES. */
#define TIMING 10
/* Function-like macros: an argument may be evaluated twice, so do not pass
 * expressions with side effects. */
#define MAX(a,b) (((a)<(b))?(b):(a))
#define MIN(a,b) (((a)>(b))?(b):(a))
/* Pseudo-random double built from rand()/RAND_MAX. */
double randomu();
/* Current time in seconds as a double; used to time the benchmark loops. */
double GetTimeOfDay(void);
void libinfo(void) {
const IppLibraryVersion* lib = ippiGetLibVersion();
printf("============================
");
printf("%s %s %d.%d.%d.%d
", lib->Name, lib->Version,
lib->major, lib->minor, lib->majorBuild, lib->build);
printf("============================
");}
int main(int argc, char *argv[]) {
int ii,jj;
unsigned short *outData1,*outData2,*data,*subtractM;
static int tmpVal;
float *multiplyM;
double start,stop;
Ipp16u *tmpInt,*tmpSub;
Ipp32f *tmpFlt,*tmpMult;
int intStep,fltStep,subStep,multStep;
IppiSize roi={XDIM,YDIM};
int maxError=0;
data=malloc(YDIM*LINES*XDIM*sizeof(unsigned short));
outData1=malloc(YDIM*LINES*XDIM*sizeof(unsigned short));
outData2=malloc(YDIM*LINES*XDIM*sizeof(unsigned short));
subtractM=malloc(YDIM*XDIM*sizeof(unsigned short));
multiplyM=malloc(YDIM*XDIM*sizeof(float));
//Generate random data
for(ii=0;ii
data[ii]=(unsigned short)(10000.0 + (2.0*randomu()-1.0)*2500.0);
for(ii=0;ii
subtractM[ii]=(unsigned short)(1000.0 + (2.0*randomu()-1.0)*200.0);
for(ii=0;ii
multiplyM[ii]=(float)(1.0 + (2.0*randomu()-1.0)*0.2);
/////////////////////////////////////////////////////
// Do the calculation
/////////////////////////////////////////////////////
start=GetTimeOfDay();
for(ii=0;ii
for(jj=0;jj
// Subtract and multiply
tmpVal=(int)((data[(ii%LINES)*YDIM*XDIM+jj]-subtractM[jj])*multiplyM[jj]);//+0.5;
// Data checks to make sure the data is still in range.
if ((tmpVal < 0) || (tmpVal > MAX_USHORT)) {
const int a=tmpVal;
tmpVal=MAX(0,a);
tmpVal=MIN(MAX_USHORT,a);
}
// Implicit conversion to short
outData1[(ii%LINES)*YDIM*XDIM+jj]=tmpVal;
}
}
stop=GetTimeOfDay();
/////////////////////////////////////////////////////
// End calculation
/////////////////////////////////////////////////////
printf("Time for calculation (%i frames):	%f (s)
",TIMING*LINES, stop-start);
printf("Time for one frame:			%f (s)
",(stop-start)/(TIMING*LINES));
printf("Frequency:				%f
",(LINES*TIMING)/(stop-start));
/////////////////////////////////////////////////////
// Do th
e calculation with IPP
/////////////////////////////////////////////////////
libinfo();
tmpInt=ippiMalloc_16s_C1(XDIM,YDIM,&intStep);
tmpFlt=ippiMalloc_32f_C1(XDIM,YDIM,&fltStep);
tmpSub=ippiMalloc_16u_C1(XDIM,YDIM,&subStep);
tmpMult=ippiMalloc_32f_C1(XDIM,YDIM,&multStep);
ippiCopy_16u_C1R(subtractM,XDIM*sizeof(short),tmpSub,subStep,roi);
ippiCopy_32f_C1R(multiplyM,XDIM*sizeof(float),tmpMult,multStep,roi);
start=GetTimeOfDay();
for(ii=0;ii
//Subtract
ippiSub_16u_C1RSfs(tmpSub,subStep,&data[(ii%LINES)*YDIM*XDIM],XDIM*sizeof(short),tmpInt,intStep,roi,0);
//Convert data to floats
ippiConvert_16s32f_C1R(tmpInt,intStep,tmpFlt,fltStep,roi);
//Multiply
ippiMul_32f_C1IR(tmpMult,multStep,tmpFlt,fltStep,roi);
//Threshold for converting back to shorts
ippiThreshold_GTVal_32f_C1IR(tmpFlt,fltStep,roi,MAX_USHORT-1,MAX_USHORT-1);
ippiThreshold_LTVal_32f_C1IR(tmpFlt,fltStep,roi,0,0);
//Convert back to shorts
ippiConvert_32f16u_C1R(tmpFlt,fltStep,&outData2[(ii%LINES)*YDIM*XDIM],
XDIM*sizeof(short),roi,ippRndZero);
}
stop=GetTimeOfDay();
ippiFree(tmpInt);
ippiFree(tmpMult);
ippiFree(tmpSub);
ippiFree(tmpFlt);
/////////////////////////////////////////////////////
// End calculation
/////////////////////////////////////////////////////
printf("Time for IPP calculation (%i frames):	%f (s)
",TIMING*LINES, stop-start);
printf("Time for one frame:			%f (s)
",(stop-start)/(TIMING*LINES));
printf("IPP Frequency:	
t		%f
",(LINES*TIMING)/(stop-start));
for(jj=0;jj
if (abs(outData1[jj]-outData2[jj]) > maxError)
maxError=abs(outData1[jj]-outData2[jj]);
}
printf("maxError: %i
",maxError);
free(subtractM);
free(multiplyM);
free(data);
free(outData1);
free(outData2);
return 0;
}
/* Return rand() scaled by RAND_MAX, i.e. a pseudo-random value in [0, 1]. */
double randomu() {
    const double sample = (double)rand();
    const double scale = (double)RAND_MAX;
    return sample / scale;
}
/*
 * Return the current time in seconds as a double.
 *
 * With WIN32 defined the value is the performance counter divided by its
 * frequency; otherwise it is gettimeofday() seconds plus microseconds.
 * Callers in this file only use the difference between two calls.
 */
double GetTimeOfDay(void) {
#ifdef WIN32
    /* The counter frequency is queried on the first call and then reused. */
    static LARGE_INTEGER freq;
    LARGE_INTEGER time;
    if(freq.QuadPart==0) QueryPerformanceFrequency(&freq);
    QueryPerformanceCounter(&time);
    return (double)time.QuadPart/freq.QuadPart;
#else
    struct timeval currTime;
    gettimeofday(&currTime,NULL);
    return((double)((double)currTime.tv_sec+
                    (double)currTime.tv_usec/1000000.0));
#endif
}

manimal · ‎07-27-2007

Wow, that formatting is really bad. Sorry about that.

Eric

manimal · ‎07-30-2007

Here's some more timing info, using 64-bit Linux. With GCC:

Time for calculation (800 frames): 0.776799 (s)
Time for one frame: 0.000971 (s)
Frequency: 1029.867492
============================
libippim7.so.5.2 5.2 5.2.108.410
============================
Time for IPP calculation (800 frames): 2.083939 (s)
Time for one frame: 0.002605 (s)
IPP Frequency: 383.888382
maxError: 0

With Intel's compiler:

Time for calculation (800 frames): 1.013748 (s)
Time for one frame: 0.001267 (s)
Frequency: 789.150809
============================
libippim7.so.5.2 5.2 5.2.108.410
============================
Time for IPP calculation (800 frames): 2.038634 (s)
Time for one frame: 0.002548 (s)
IPP Frequency: 392.419664
maxError: 0

The only notable compiler option I am using is -O2.

Vladimir_Dudnik · ‎08-01-2007

Hello,

An IPP expert recommends changing the code in the following way (to optimize memory access):

#ifdef WIN32
#include 
#else
#include 
#include 
#endif
#include 
#include 
#include 
#include 
/* Frame dimensions: main() builds its ROI as {XDIM, YDIM} and treats each
 * frame as YDIM rows of XDIM elements. */
#define YDIM 1280
#define XDIM 150
/* Number of frames held in the data/output buffers (frames are indexed
 * modulo LINES). */
#define LINES 80
/* Upper bound used when clamping results back to unsigned short. */
#define MAX_USHORT 65535
/* Repeat factor: the reported frame count is TIMING*LINES. */
#define TIMING 10
/* Function-like macros: an argument may be evaluated twice, so do not pass
 * expressions with side effects. */
#define MAX(a,b) (((a)<(b))?(b):(a))
#define MIN(a,b) (((a)>(b))?(b):(a))
/* Pseudo-random double built from rand()/RAND_MAX. */
double randomu();
/* Current time in seconds as a double; used to time the benchmark loops. */
double GetTimeOfDay(void);
void libinfo(void) {
const IppLibraryVersion* lib = ippiGetLibVersion();
printf("============================
");
printf("%s %s %d.%d.%d.%d
", lib->Name, lib->Version,
lib->major, lib->minor, lib->majorBuild, lib->build);
printf("============================
");}
int main(int argc, char *argv[]) {
int ii,jj, i;
unsigned short *outData1,*outData2,*data,*subtractM;
static int tmpVal;
float *multiplyM;
double start,stop;
Ipp16u *tmpInt,*tmpSub;
Ipp32f *tmpFlt,*tmpMult;
int intStep,fltStep,subStep,multStep;
IppiSize roi={XDIM,YDIM};
int maxError=0;
data=malloc(YDIM*LINES*XDIM*sizeof(unsigned short));
outData1=malloc(YDIM*LINES*XDIM*sizeof(unsigned short));
outData2=malloc(YDIM*LINES*XDIM*sizeof(unsigned short));
subtractM=malloc(YDIM*XDIM*sizeof(unsigned short));
multiplyM=malloc(YDIM*XDIM*sizeof(float));
//Generate random data
for(ii=0;ii
 data[ii]=(unsigned short)(10000.0 + (2.0*randomu()-1.0)*2500.0);
for(ii=0;ii
 subtractM[ii]=(unsigned short)(1000.0 + (2.0*randomu()-1.0)*200.0);
for(ii=0;ii
 multiplyM[ii]=(float)(1.0 + (2.0*randomu()-1.0)*0.2);
/////////////////////////////////////////////////////
// Do the calculation
/////////////////////////////////////////////////////
start=GetTimeOfDay();
for(ii=0;ii
 for(jj=0;jj
// Subtract and multiply
 tmpVal=(int)((data[(ii%LINES)*YDIM*XDIM+jj]-subtractM[jj])*multiplyM[jj]);//+0.5;
// Data checks to make sure the data is still in range.
 if ((tmpVal < 0) || (tmpVal > MAX_USHORT)) {
 const int a=tmpVal;
 tmpVal=MAX(0,a);
 tmpVal=MIN(MAX_USHORT,a);
 }
// Implicit conversion to short
 outData1[(ii%LINES)*YDIM*XDIM+jj]=tmpVal;
 }
}
stop=GetTimeOfDay();
/////////////////////////////////////////////////////
// End calculation
/////////////////////////////////////////////////////
printf("Time for calculation (%i frames):	%f (s)
",TIMING*LINES, stop-start);
printf("Time for one frame:			%f (s)
",(stop-start)/(TIMING*LINES));
printf("Frequency:				%f
",(LINES*TIMING)/(stop-start));
/////////////////////////////////////////////////////
// Do the calculation with IPP
/////////////////////////////////////////////////////
libinfo();
tmpInt=ippiMalloc_16u_C1(XDIM,YDIM,&intStep);
tmpFlt=ippiMalloc_32f_C1(XDIM,YDIM,&fltStep);
tmpSub=ippiMalloc_16u_C1(XDIM,YDIM,&subStep);
tmpMult=ippiMalloc_32f_C1(XDIM,YDIM,&multStep);
ippiCopy_16u_C1R(subtractM,XDIM*sizeof(short),tmpSub,subStep,roi);
ippiCopy_32f_C1R(multiplyM,XDIM*sizeof(float),tmpMult,multStep,roi);
start=GetTimeOfDay();
for(ii=0;ii
 for( i = 0; i < YDIM; i++ ){
 Ipp16u *tmpin = &data[(ii%LINES)*YDIM*XDIM] + i * XDIM;
 Ipp16u *tmpint = (Ipp16u*)((Ipp8u*)tmpInt + intStep * i );
 Ipp16u *tmpins = (Ipp16u*)((Ipp8u*)tmpSub + subStep * i );
 Ipp32f *tmpflt = tmpFlt;//(Ipp32f*)((Ipp8u*)tmpFlt + fltStep * i );
 Ipp32f *tmpmul = (Ipp32f*)((Ipp8u*)tmpMult + multStep * i );
 Ipp16u *tmpout = &outData2[(ii%LINES)*YDIM*XDIM] + i * XDIM;
 ippsSub_16u_Sfs( tmpins, tmpin, tmpint, XDIM, 0 );
 ippsConvert_16u32f( tmpint, tmpflt, XDIM );
 ippsMul_32f_I( tmpmul, tmpflt, XDIM );
 ippsConvert_32f16u_Sfs( tmpflt, tmpout, XDIM, ippRndZero, 0 );
 }
}
stop=GetTimeOfDay();
ippiFree(tmpInt);
ippiFree(tmpMult);
ippiFree(tmpSub);
ippiFree(tmpFlt);
/////////////////////////////////////////////////////
// End calculation
/////////////////////////////////////////////////////
printf("Time for IPP calculation (%i frames):	%f (s)
",TIMING*LINES, stop-start);
printf("Time for one frame:			%f (s)
",(stop-start)/(TIMING*LINES));
printf("IPP Frequency:				%f
",(LINES*TIMING)/(stop-start));
for(jj=0;jj
if (abs(outData1[jj]-outData2[jj]) > maxError)< p="">
maxError=abs(outData1[jj]-outData2[jj]);
}
printf("maxError: %i
",maxError);
free(subtractM);
free(multiplyM);
free(data);
free(outData1);
free(outData2);
return 0;
}
/* Pseudo-random double in [0, 1]: the next rand() value divided by RAND_MAX. */
double randomu() {
    const double upper = (double)RAND_MAX;
    const double draw = (double)rand();
    return draw / upper;
}
/*
 * Current time in seconds, as a double.
 *
 * WIN32 builds divide the performance counter by its frequency (queried once
 * and kept in a static); other builds combine the seconds and microseconds
 * reported by gettimeofday().
 */
double GetTimeOfDay(void) {
#ifdef WIN32
    static LARGE_INTEGER ticksPerSecond;
    LARGE_INTEGER now;

    if (ticksPerSecond.QuadPart == 0) {
        QueryPerformanceFrequency(&ticksPerSecond);
    }
    QueryPerformanceCounter(&now);
    return (double)now.QuadPart / ticksPerSecond.QuadPart;
#else
    struct timeval tv;
    double wholeSeconds;
    double fraction;

    gettimeofday(&tv, NULL);
    wholeSeconds = (double)tv.tv_sec;
    fraction = (double)tv.tv_usec / 1000000.0;
    return wholeSeconds + fraction;
#endif
}

Regards,
Vladimir

manimal · ‎08-02-2007

Holy Cow! That's much better! Here's the timing now:

Time for calculation (800 frames): 1.738825 (s)
Time for one frame: 0.002174 (s)
Frequency: 460.080812
============================
ippit7-5.2.dll 5.2 5.2.108.410
============================
Time for IPP calculation (800 frames): 0.699127 (s)
Time for one frame: 0.000874 (s)
IPP Frequency: 1144.284661
maxError: 1

Thanks Vladimir, and let your engineer know I am grateful!

manimal · ‎08-02-2007

Okay, I'm trying to figure out where the speed improvements are coming from. First, I've tried to replace my original code with the signal processing calls that Vladimir suggested but without all the clever pointers:

 tmpInt=malloc(XDIM*YDIM*sizeof(short));
 tmpFlt=malloc(XDIM*YDIM*sizeof(float));

 start=GetTimeOfDay();

 for(ii=0;ii
 ippsSub_16u_Sfs(subtractM,&data[(ii%LINES)*YDIM*XDIM],tmpInt, 
 XDIM*YDIM, 0 );
 ippsConvert_16u32f( tmpInt, tmpFlt, XDIM*YDIM );
 ippsMul_32f_I( multiplyM, tmpFlt, XDIM*YDIM );
 ippsConvert_32f16u_Sfs( tmpFlt, &outData2[(ii%LINES)*YDIM*XDIM],
 XDIM*YDIM, ippRndZero, 0 );
 }

This does pretty well too, but not as well as Vladimir's:

Time for calculation (800 frames):      1.700834 (s)
Time for one frame:                     0.002126 (s)
Frequency:                              470.357378
============================
ippit7-5.2.dll 5.2 5.2.108.410
============================
Time for IPP calculation (800 frames):  0.815084 (s)
Time for one frame:                     0.001019 (s)
IPP Frequency:                          981.493513
maxError: 1

So, I think the lesson is, use the signal processing calls if you can.

manimal · ‎08-02-2007

Vladimir's code performs the calculation in rows. So the next thing is to try the calculation in rows, but without all the clever pointer arithmetic:

 /* NOTE(review): outer loop bound reconstructed (the paste was cut off at
    '<'); the timing output reports TIMING*LINES frames. */
 for(ii=0;ii<LINES*TIMING;ii++)
 for( i = 0; i < YDIM; i++ ){
  /* Row i of every buffer, with the row offset computed inside each call. */
  ippsSub_16u_Sfs(&subtractM[i*XDIM],&data[(ii%LINES)*YDIM*XDIM+i*XDIM],&tmpInt[i*XDIM],
   XDIM, 0 );
  ippsConvert_16u32f( &tmpInt[i*XDIM], &tmpFlt[i*XDIM], XDIM );
  ippsMul_32f_I( &multiplyM[i*XDIM], &tmpFlt[i*XDIM], XDIM );
  ippsConvert_32f16u_Sfs( &tmpFlt[i*XDIM], &outData2[(ii%LINES)*YDIM*XDIM+i*XDIM],
   XDIM, ippRndZero, 0 );
 }

This is actually slower:

Time for calculation (800 frames): 1.694970 (s)
Time for one frame: 0.002119 (s)
Frequency: 471.984726
============================
ippit7-5.2.dll 5.2 5.2.108.410
============================
Time for IPP calculation (800 frames): 1.077069 (s)
Time for one frame: 0.001346 (s)
IPP Frequency: 742.756803
maxError: 1

manimal · ‎08-02-2007

Now, try the same thing as above, but with clever pointers (though not as clever as Vladimir's):

/* NOTE(review): outer loop bound reconstructed (the paste was cut off at
   '<'); the timing output reports TIMING*LINES frames. */
for(ii=0;ii<LINES*TIMING;ii++)
for( i = 0; i < YDIM; i++ ){
 /* Row pointers are computed once per row and then handed to the calls. */
 unsigned short *tmpin = &data[(ii%LINES)*YDIM*XDIM + i * XDIM];
 unsigned short *tmpint = &tmpInt[XDIM * i];
 unsigned short *tmpins = &subtractM[XDIM * i];
 /* The float scratch row is the same for every i (start of tmpFlt). */
 float *tmpflt = tmpFlt;
 float *tmpmul = &multiplyM[XDIM * i];
 unsigned short *tmpout = &outData2[(ii%LINES)*YDIM*XDIM + i * XDIM];

 ippsSub_16u_Sfs( tmpins, tmpin, tmpint, XDIM, 0 );
 ippsConvert_16u32f( tmpint, tmpflt, XDIM );
 ippsMul_32f_I( tmpmul, tmpflt, XDIM );
 ippsConvert_32f16u_Sfs( tmpflt, tmpout, XDIM, ippRndZero, 0 );

}

This code is just about as fast as Vladimir's code:

Time for calculation (800 frames):      1.692036 (s)
Time for one frame:                     0.002115 (s)
Frequency:                              472.803299
============================
ippit7-5.2.dll 5.2 5.2.108.410
============================
Time for IPP calculation (800 frames):  0.696450 (s)
Time for one frame:                     0.000871 (s)
IPP Frequency:                          1148.682410
maxError: 1

So try not to do pointer arithmetic in IPP functions.

manimal · ‎08-02-2007

Since tmpFlt and tmpInt are just being used to store rows now, only allocate a row and skip the pointer arithmetic entirely:

/* Scratch buffers hold a single row (XDIM elements) and are reused. */
tmpInt=malloc(XDIM*sizeof(short));
tmpFlt=malloc(XDIM*sizeof(float));

start=GetTimeOfDay();

/* NOTE(review): outer loop bound reconstructed (the paste was cut off at
   '<'); the timing output reports TIMING*LINES frames. */
for(ii=0;ii<LINES*TIMING;ii++)
for( i = 0; i < YDIM; i++ ){
 unsigned short *tmpin = &data[(ii%LINES)*YDIM*XDIM + i * XDIM];
 unsigned short *tmpins = &subtractM[XDIM * i ];
 float *tmpmul = &multiplyM[XDIM* i ];
 unsigned short *tmpout = &outData2[(ii%LINES)*YDIM*XDIM + i * XDIM];

 /* Intermediate results always go through the same two scratch rows. */
 ippsSub_16u_Sfs( tmpins, tmpin, tmpInt, XDIM, 0 );
 ippsConvert_16u32f( tmpInt, tmpFlt, XDIM );
 ippsMul_32f_I( tmpmul, tmpFlt, XDIM );
 ippsConvert_32f16u_Sfs( tmpFlt, tmpout, XDIM, ippRndZero, 0 );

}

This is pretty fast now:

Time for calculation (800 frames): 1.702078 (s)
Time for one frame: 0.002128 (s)
Frequency: 470.013800
============================
ippit7-5.2.dll 5.2 5.2.108.410
============================
Time for IPP calculation (800 frames): 0.499302 (s)
Time for one frame: 0.000624 (s)
IPP Frequency: 1602.236225
maxError: 1

I think this is where I am going to quit. Here's my summary:

Use the IPPS library instead of IPPI wherever you can.
Do calculation with small chunks of data (rows in this case), rather than all at once. (I guess? Maybe? I'm not sure about this one.)
Do pointer math, but not in IPP functions (and don't do extra pointer math).