The Intel Compiler parallelizing automatically.

Gaiger_Chen · ‎07-20-2010

Hi,

I have written some beginner code to practice SSE2:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <xmmintrin.h>
#include <emmintrin.h>

typedef float Type;

/*
 * SSEfunction - for each of the first n elements compute
 *     pResult[i] = sqrt(pArray1[i]^2 + pArray2[i]^2) + 0.5
 * using SSE single-precision intrinsics.
 *
 * pArray1, pArray2: input arrays holding at least n elements.
 * pResult:          output array holding at least n elements.
 * n:                number of elements; nothing is written when n <= 0.
 *
 * The buffers do not have to be 16-byte aligned: unaligned loads/stores are
 * used, because memory from malloc/calloc is not guaranteed to meet the
 * alignment that a direct __m128 dereference requires.
 * Elements left over when n is not a multiple of the vector width are
 * processed one at a time with the scalar (_ss) forms of the same operations,
 * so every one of the n outputs is written.
 *
 * The intrinsics operate on float, so Type must remain float.
 *
 * Always returns 1.
 */
int SSEfunction(Type *pArray1, Type *pArray2, Type *pResult, int n)
{
    /* Number of Type elements held by one SSE register (4 for float). */
    const int lanes = (int)(sizeof(__m128) / sizeof(Type));
    const int nVec = n / lanes;
    const __m128 half = _mm_set_ps1(0.5f); /* 0.5 in all four lanes */
    int i;

    /* Vector part: four elements per iteration. */
    for (i = 0; i < nVec; i++) {
        const int off = i * lanes;
        const __m128 a = _mm_loadu_ps(pArray1 + off);
        const __m128 b = _mm_loadu_ps(pArray2 + off);
        const __m128 sumSq = _mm_add_ps(_mm_mul_ps(a, a), _mm_mul_ps(b, b));

        _mm_storeu_ps(pResult + off, _mm_add_ps(_mm_sqrt_ps(sumSq), half));
    }

    /* Scalar tail: the remaining n % lanes elements. */
    for (i = nVec * lanes; i < n; i++) {
        const __m128 a = _mm_load_ss(pArray1 + i);
        const __m128 b = _mm_load_ss(pArray2 + i);
        const __m128 sumSq = _mm_add_ss(_mm_mul_ss(a, a), _mm_mul_ss(b, b));

        _mm_store_ss(pResult + i, _mm_add_ss(_mm_sqrt_ss(sumSq), half));
    }

    return 1;
}/*SSEfunction*/

/*
 * Cfunction1 - straightforward scalar reference implementation.
 *
 * For each of the first n elements stores
 *     pResult[i] = (Type)sqrt(pArray1[i]^2 + pArray2[i]^2) + 0.5f
 *
 * pArray1, pArray2: input arrays holding at least n elements.
 * pResult:          output array holding at least n elements.
 * n:                number of elements; nothing is written when n <= 0.
 *
 * Always returns 1.
 */
int Cfunction1(Type *pArray1, Type *pArray2, Type *pResult, int n)
{
    int idx;

    for (idx = 0; idx < n; idx++) {
        const Type a = pArray1[idx];
        const Type b = pArray2[idx];
        const Type sumSq = a * a + b * b;

        pResult[idx] = (Type)sqrt(sumSq) + 0.5f;
    }

    return 1;
}/*Cfunction1*/

/*
 * Cfunction2 - scalar implementation manually unrolled four times.
 *
 * For each of the first n elements stores
 *     pResult[i] = sqrt(pArray1[i]^2 + pArray2[i]^2) + 0.5f
 *
 * pArray1, pArray2: input arrays holding at least n elements.
 * pResult:          output array holding at least n elements.
 * n:                number of elements; nothing is written when n <= 0.
 *
 * The main loop handles four consecutive elements per iteration; a short
 * remainder loop then handles the last n % 4 elements, so all n outputs are
 * written even when n is not a multiple of four.
 *
 * Always returns 1.
 */
int Cfunction2(Type *pArray1, Type *pArray2, Type *pResult, int n)
{
    /* Single source of truth for the unroll factor (loop count AND stride). */
    enum { UNROLL = 4 };
    const int nUnrolled = (n / UNROLL) * UNROLL;
    int i;

    for (i = 0; i < nUnrolled; i += UNROLL) {
        pResult[i]     = sqrt(pArray1[i]     * pArray1[i]     + pArray2[i]     * pArray2[i])     + 0.5f;
        pResult[i + 1] = sqrt(pArray1[i + 1] * pArray1[i + 1] + pArray2[i + 1] * pArray2[i + 1]) + 0.5f;
        pResult[i + 2] = sqrt(pArray1[i + 2] * pArray1[i + 2] + pArray2[i + 2] * pArray2[i + 2]) + 0.5f;
        pResult[i + 3] = sqrt(pArray1[i + 3] * pArray1[i + 3] + pArray2[i + 3] * pArray2[i + 3]) + 0.5f;
    }

    /* Remainder: at most UNROLL - 1 trailing elements. */
    for (; i < n; i++) {
        pResult[i] = sqrt(pArray1[i] * pArray1[i] + pArray2[i] * pArray2[i]) + 0.5f;
    }

    return 1;
}/*Cfunction2*/

/*
 * Benchmark driver.
 *
 * Allocates three arrays of n elements, fills the two inputs with constant
 * values, then times one call of Cfunction2 and one call of SSEfunction with
 * clock() and prints the elapsed processor time in seconds.
 *
 * Returns 0 on success, 1 if the arrays could not be allocated.
 */
int main(void)
{
    Type *Src1, *Src2, *Result;
    clock_t begin, end; /* clock() returns clock_t, not time_t */
    int i, n;

    n = 100000000;

    /* The casts are kept so the file also builds with a C++ compiler. */
    Src1 = (Type*)calloc(n, sizeof(Type));
    Src2 = (Type*)calloc(n, sizeof(Type));
    Result = (Type*)calloc(n, sizeof(Type));

    /* Three buffers of n floats are large; do not run on a failed allocation. */
    if (Src1 == NULL || Src2 == NULL || Result == NULL) {
        fprintf(stderr, "out of memory\n");
        free(Src1); free(Src2); free(Result);
        return 1;
    }

    /*
     * NOTE(review): calloc only guarantees alignment suitable for standard
     * types. If SSEfunction accesses the buffers through aligned __m128
     * loads/stores, a 16-byte-aligned allocator is required instead.
     */

    for (i = 0; i < n; i++) {
        Src1[i] = 1.0f;
        Src2[i] = 3.0f;
    }/*for i*/

    begin = clock();
    Cfunction2(Src1, Src2, Result, n);
    end = clock();
    printf("C time = %6.4f\n", (end - begin) / (double)CLOCKS_PER_SEC);

    begin = clock();
    SSEfunction(Src1, Src2, Result, n);
    end = clock();
    printf("SSE time = %6.4f\n", (end - begin) / (double)CLOCKS_PER_SEC);

    free(Src1); free(Src2); free(Result);
    return 0;
}/*main*/

On my computer:

Intel dual-core E5200, 2.5 GHz
Windows XP SP3

the run time is :

SSEfunction : 0.38 sec for both VC8 and ICC 10.1

CFunction1 :
1.546 sec for VC8
0.969 sec for ICC.

CFunction2:
1.456 sec for VC8
0.959 sec for ICC.

Whatever optimization options (target CPU, O2/O3) I turn on or off, the performance is the same.

Obviously, ICC is better than VC8.

I have a question: I thought CFunction2 would be easier for the compiler to parallelize with SSE instructions, but with ICC the performance of CFunction2 and CFunction1 is the same.
Moreover, there is still a gap of about 152% (0.959 sec vs. 0.38 sec) between them and SSEfunction.

How should I modify the C code so that its performance gets close to that of SSEfunction?

This know-how would be very useful to me.

Thanks a lot.

Brandon_H_Intel · ‎07-20-2010

Hi Gaiger,

Try the latest 11.1 compiler - I get better performance than I do with 10.1 ( from .73/.65 seconds with 10.1 down to .36 and .29 seconds with 11.1).

Gaiger_Chen · ‎07-21-2010

Hi:

I would like to know which test case you ran. Could you post your test source code?

If it is my test case, I would like to know which function takes 0.73 sec and which takes 0.65 sec with ICC 10.1.

thank you.

Brandon_H_Intel · ‎07-23-2010

It's your test case: CFunction1 and CFunction2, respectively.

emmanuel_attia · ‎07-26-2010

Maybe you should try offsetting your pointers with a column index instead of using moving pointers.

Jurczak__Paul · ‎08-01-2010

Hi Brandon,

What was the time for SSEfunction? The ratio between SSE and C functions would be a good measure of auto-vectorizer performance, which is the topic Gaiger brought up.

Thank you,
Paul.