- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

I have written some beginner code to practice SSE2:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <xmmintrin.h>
#include <emmintrin.h>

typedef float Type;

/*
 * SSEfunction - SSE implementation of
 *     pResult[i] = sqrt(pArray1[i]^2 + pArray2[i]^2) + 0.5
 * for every i in [0, n).
 *
 * pArray1, pArray2: input arrays holding at least n floats each.
 * pResult:          output array with room for at least n floats.
 * n:                number of elements; n <= 0 writes nothing.
 *
 * Returns 1 unconditionally.
 *
 * Unaligned load/store intrinsics are used, so the arrays do not need to be
 * 16-byte aligned (calloc/malloc results and pointers into the middle of an
 * array are fine). Elements left over when n is not a multiple of four are
 * handled one at a time with the scalar (_ss) forms of the same instructions,
 * which keeps the rounding identical to the packed path.
 */
int SSEfunction(Type *pArray1, Type *pArray2, Type *pResult, int n)
{
    const int lanes = 4;                    /* floats per __m128 register */
    const __m128 half = _mm_set1_ps(0.5f);  /* 0.5 in every lane */
    int i = 0;

    /* Packed path: four elements per iteration.  "n - i >= lanes" cannot
     * overflow, unlike "i + lanes <= n" for n close to INT_MAX. */
    for (; n - i >= lanes; i += lanes)
    {
        const __m128 a = _mm_loadu_ps(pArray1 + i);
        const __m128 b = _mm_loadu_ps(pArray2 + i);
        const __m128 sumSquares = _mm_add_ps(_mm_mul_ps(a, a), _mm_mul_ps(b, b));

        _mm_storeu_ps(pResult + i, _mm_add_ps(_mm_sqrt_ps(sumSquares), half));
    }

    /* Tail: the remaining n % 4 elements, lowest lane only. */
    for (; i < n; i++)
    {
        const __m128 a = _mm_load_ss(pArray1 + i);
        const __m128 b = _mm_load_ss(pArray2 + i);
        const __m128 sumSquares = _mm_add_ss(_mm_mul_ss(a, a), _mm_mul_ss(b, b));

        _mm_store_ss(pResult + i, _mm_add_ss(_mm_sqrt_ss(sumSquares), half));
    }

    return 1;
}/*SSEfunction*/

int Cfunction1(Type *pArray1, Type *pArray2, Type *pResult, int n)

{

int i;

int nSize = n;

Type* pSource1 = pArray1;

Type* pSource2 = pArray2;

Type* pDest = pResult;

Type temp;

temp = 0;

for ( i = 0; i < nSize; i++ )

{

temp = (*pSource1) * (*pSource1) + (*pSource2)* (*pSource2) ;

*pDest = (Type)sqrt(temp) + 0.5f;

pSource1++;

pSource2++;

pDest++;

}/*for */

return 1;

}/*Cfunction1*/

int Cfunction2(Type *pArray1, Type *pArray2, Type *pResult, int n)

{

int i;

int nSize = n;

Type* pSource1 = pArray1;

Type* pSource2 = pArray2;

Type* pDest = pResult;

int nparallel = 16/sizeof(Type);

Type *temp0, *temp1, *temp2, *temp3;

Type *temp4, *temp5, *temp6, *temp7;

Type *temp9, *temp10, *temp11, *temp12;

temp0 = pSource1;

temp1 = pSource1 + 1;

temp2 = pSource1 + 2;

temp3 = pSource1 + 3;

temp4 = pSource2;

temp5 = pSource2 + 1;

temp6 = pSource2 + 2;

temp7 = pSource2 + 3;

temp9 = pResult;

temp10 = pResult + 1;

temp11 = pResult + 2;

temp12 = pResult + 3;

int nLoop = n/nparallel;

for ( i = 0; i < nLoop; i++ )

{

*temp9 = sqrt( (*temp0)*(*temp0) + (*temp4)*(*temp4) ) +0.5f;

*temp10 = sqrt( (*temp1)*(*temp1) + (*temp5)*(*temp5) ) +0.5f;

*temp11 = sqrt( (*temp2)*(*temp2) + (*temp6)*(*temp6) ) +0.5f;

*temp12 = sqrt( (*temp3)*(*temp3) + (*temp7)*(*temp7) ) +0.5f;

temp0 += 4;

temp1 += 4;

temp2 += 4;

temp3 += 4;

temp4 += 4;

temp5 += 4;

temp6 += 4;

temp7 += 4;

temp9 += 4;

temp10 += 4;

temp11 += 4;

temp12 += 4;

}/*for */

return 1;

}/*Cfunction2*/

/*
 * Benchmark driver.
 *
 * Allocates three arrays of n elements, fills the two inputs with constant
 * values (1.0 and 3.0), then times one call to Cfunction2 and one call to
 * SSEfunction with clock() and prints the elapsed CPU time of each in
 * seconds.
 *
 * Returns 0 on success and 1 if any allocation fails.
 */
int main(void)
{
    const int n = 100000000;
    Type *Src1 = (Type*)calloc(n, sizeof(Type));
    Type *Src2 = (Type*)calloc(n, sizeof(Type));
    Type *Result = (Type*)calloc(n, sizeof(Type));
    clock_t begin, end;   /* clock() returns clock_t, not time_t */
    int i;

    /* Three arrays of 100 million elements: allocation can realistically fail. */
    if (Src1 == NULL || Src2 == NULL || Result == NULL)
    {
        fprintf(stderr, "out of memory\n");
        free(Src1); free(Src2); free(Result);
        return 1;
    }

    for (i = 0; i < n; i++)
    {
        Src1[i] = 1.0f;
        Src2[i] = 3.0f;
        //printf("i = %d, Src1 = %f, Src2 = %f \n",i,Src1[i], Src2[i]);
    }/*for i*/

    begin = clock();
    Cfunction2(Src1, Src2, Result, n);
    end = clock();
    printf("C time = %6.4f\n", (end - begin) / (float)CLOCKS_PER_SEC);

    begin = clock();
    SSEfunction(Src1, Src2, Result, n);
    end = clock();
    printf("SSE time = %6.4f\n", (end - begin) / (float)CLOCKS_PER_SEC);

    free(Src1); free(Src2); free(Result);

    return 0;
}/*main*/

In my computer :

Intel dual-core E5200, 2.5 GHz

windows XP SP3 :

the run time is :

SSEfunction : 0.38 sec for both VC8 and ICC 10.1

CFunction1 :

1.546 sec for VC8

0.969 sec for ICC.

CFunction2:

1.456 sec for VC8

0.959 sec for ICC.

whatever optimization selection (specify CPU, O2/O3 ) I turn on/off, the performance is the same.

Obviously, the ICC is better than VC8.

I have a question: I thought CFunction2 would be easier for the compiler to vectorize with SSE instructions, but the performance of CFunction2 and CFunction1 is the same with ICC.

But there is still about 152% ( 0.959 sec vs 0.38 sec) gap between SSEfuncion.

How should I modify the C code so that its performance gets close to the SSEfunction result?

The know-how is very useful for me.....

Thanks a lot.

}/*for i*/

time_t begin, end;

begin = clock();

Cfunction2(Src1, Src2, Result, n);

end = clock();

printf("C time = %6.4f\\n",(end- begin)/float(CLOCKS_PER_SEC) );

begin = clock();

SSEfunction(Src1, Src2, Result, n);

end = clock();

printf("SSE time = %6.4f\\n",(end- begin)/float(CLOCKS_PER_SEC) );

free(Src1); free(Src2); free(Result);

return 1;

}/*main*/

In my computer :

Intel dual-core E5200, 2.5 GHz

windows XP SP3 :

the run time is :

SSEfunction : 0.38 sec for both VC8 and ICC 10.1

CFunction1 :

1.546 sec for VC8

0.969 sec for ICC.

CFunction2:

1.456 sec for VC8

0.959 sec for ICC.

whatever optimization selection (specify CPU, O2/O3 ) I turn on/off, the performance is the same.

Obviously, the ICC is better than VC8.

I have a question: I thought CFunction2 would be easier for the compiler to vectorize with SSE instructions, but the performance of CFunction2 and CFunction1 is the same with ICC.

But there is still about 152% ( 0.959 sec vs 0.38 sec) gap between SSEfuncion.

How should I modify the C code so that its performance gets close to the SSEfunction result?

The know-how is very useful for me.....

Thanks a lot.

Link Copied

5 Replies

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

Try the latest 11.1 compiler - I get better performance than I do with 10.1 ( from .73/.65 seconds with 10.1 down to .36 and .29 seconds with 11.1).

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

I would like to know which test case you ran. Could you post your test source code?

If it is my test case, I would like to know which function takes 0.73 sec and which takes 0.65 sec with ICC 10.1.

thank you.

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

It's your test case, CFunction1 and CFunction 2 respectively.

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

Maybe you should try offsetting your pointers with a column index instead of using moving pointers.

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content

What was the time for SSEfunction? The ratio between SSE and C functions would be a good measure of auto-vectorizer performance, which is the topic Gaiger brought up.

Thank you,

Paul.

Topic Options

- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page