- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi,
I have write a newbie code for practice SSE2:
#include
#include
#include
#include
#include
#include
typedef float Type;
int SSEfunction(Type *pArray1, Type *pArray2, Type *pResult, int n)
{
int i;
int nLoop ;
int nparallel;
nparallel = 16/sizeof(Type);
nLoop = n/nparallel;
__m128 m1, m2, m3, m4;
__m128 *pSrc1, *pSrc2, *pDest;
pSrc1 = (__m128*) pArray1; pSrc2 = (__m128*) pArray2; pDest = (__m128*) pResult;
__m128 m0_5 = _mm_set_ps1(0.5f); // m0_5[0, 1, 2, 3] = 0.5
for (i = 0; i < nLoop; i++ )
{
m1 = _mm_mul_ps(*pSrc1, *pSrc1); // m1 = *pSrc1 * *pSrc1
m2 = _mm_mul_ps(*pSrc2, *pSrc2); // m2 = *pSrc2 * *pSrc2
m3 = _mm_add_ps(m1, m2); // m3 = m1 + m2
m4 = _mm_sqrt_ps(m3); // m4 = sqrt(m3)
*pDest = _mm_add_ps(m4, m0_5); // *pDest = m4 + 0.5
pSrc1++;
pSrc2++;
pDest++;
}
return 1;
}/*SSEfunction*/
int Cfunction1(Type *pArray1, Type *pArray2, Type *pResult, int n)
{
int i;
int nSize = n;
Type* pSource1 = pArray1;
Type* pSource2 = pArray2;
Type* pDest = pResult;
Type temp;
temp = 0;
for ( i = 0; i < nSize; i++ )
{
temp = (*pSource1) * (*pSource1) + (*pSource2)* (*pSource2) ;
*pDest = (Type)sqrt(temp) + 0.5f;
pSource1++;
pSource2++;
pDest++;
}/*for */
return 1;
}/*Cfunction1*/
int Cfunction2(Type *pArray1, Type *pArray2, Type *pResult, int n)
{
int i;
int nSize = n;
Type* pSource1 = pArray1;
Type* pSource2 = pArray2;
Type* pDest = pResult;
int nparallel = 16/sizeof(Type);
Type *temp0, *temp1, *temp2, *temp3;
Type *temp4, *temp5, *temp6, *temp7;
Type *temp9, *temp10, *temp11, *temp12;
temp0 = pSource1;
temp1 = pSource1 + 1;
temp2 = pSource1 + 2;
temp3 = pSource1 + 3;
temp4 = pSource2;
temp5 = pSource2 + 1;
temp6 = pSource2 + 2;
temp7 = pSource2 + 3;
temp9 = pResult;
temp10 = pResult + 1;
temp11 = pResult + 2;
temp12 = pResult + 3;
int nLoop = n/nparallel;
for ( i = 0; i < nLoop; i++ )
{
*temp9 = sqrt( (*temp0)*(*temp0) + (*temp4)*(*temp4) ) +0.5f;
*temp10 = sqrt( (*temp1)*(*temp1) + (*temp5)*(*temp5) ) +0.5f;
*temp11 = sqrt( (*temp2)*(*temp2) + (*temp6)*(*temp6) ) +0.5f;
*temp12 = sqrt( (*temp3)*(*temp3) + (*temp7)*(*temp7) ) +0.5f;
temp0 += 4;
temp1 += 4;
temp2 += 4;
temp3 += 4;
temp4 += 4;
temp5 += 4;
temp6 += 4;
temp7 += 4;
temp9 += 4;
temp10 += 4;
temp11 += 4;
temp12 += 4;
}/*for */
return 1;
}/*Cfunction2*/
int main(void)
{
Type *Src1, *Src2, *Result;
int i, n;
n = 100000000;
Src1 = (Type*)calloc(n, sizeof(Type));
Src2 = (Type*)calloc(n, sizeof(Type));
Result = (Type*)calloc(n, sizeof(Type));
for(i = 0;i Src1 = 1.0;
Src2 = 3.0;
//printf("i = %d, Src1 = %f, Src2 = %f \\n",i,Src1, Src2);
}/*for i*/
time_t begin, end;
begin = clock();
Cfunction2(Src1, Src2, Result, n);
end = clock();
printf("C time = %6.4f\\n",(end- begin)/float(CLOCKS_PER_SEC) );
begin = clock();
SSEfunction(Src1, Src2, Result, n);
end = clock();
printf("SSE time = %6.4f\\n",(end- begin)/float(CLOCKS_PER_SEC) );
free(Src1); free(Src2); free(Result);
return 1;
}/*main*/
In my computer :
intel duel core E5200 2.5GHz
windows XP SP3 :
the run time is :
SSEfunction : 0.38 sec for both VC8 and ICC 10.1
CFunction1 :
1.546 sec for VC8
0.969 sec for ICC.
CFunction2:
1.456 sec for VC8
0.959 sec for ICC.
whatever optimization selection (specify CPU, O2/O3 ) I turn on/off, the performance is the same.
Obviously, the ICC is better than VC8.
I get a question, that is : I thank the CFunction2 is more parallelized for compiler to use SSE instruction, but the proformance of CFunction2 and CFunction1 is the same for ICC.
But there is still about 152% ( 0.959 sec vs 0.38 sec) gap between SSEfuncion.
How should I do to modify the C code to let the performance get close to the SSEfunction result?
The know-how is very useful for me.....
Thank you lots.
I have write a newbie code for practice SSE2:
#include
#include
#include
#include
#include
#include
typedef float Type;
int SSEfunction(Type *pArray1, Type *pArray2, Type *pResult, int n)
{
int i;
int nLoop ;
int nparallel;
nparallel = 16/sizeof(Type);
nLoop = n/nparallel;
__m128 m1, m2, m3, m4;
__m128 *pSrc1, *pSrc2, *pDest;
pSrc1 = (__m128*) pArray1; pSrc2 = (__m128*) pArray2; pDest = (__m128*) pResult;
__m128 m0_5 = _mm_set_ps1(0.5f); // m0_5[0, 1, 2, 3] = 0.5
for (i = 0; i < nLoop; i++ )
{
m1 = _mm_mul_ps(*pSrc1, *pSrc1); // m1 = *pSrc1 * *pSrc1
m2 = _mm_mul_ps(*pSrc2, *pSrc2); // m2 = *pSrc2 * *pSrc2
m3 = _mm_add_ps(m1, m2); // m3 = m1 + m2
m4 = _mm_sqrt_ps(m3); // m4 = sqrt(m3)
*pDest = _mm_add_ps(m4, m0_5); // *pDest = m4 + 0.5
pSrc1++;
pSrc2++;
pDest++;
}
return 1;
}/*SSEfunction*/
int Cfunction1(Type *pArray1, Type *pArray2, Type *pResult, int n)
{
int i;
int nSize = n;
Type* pSource1 = pArray1;
Type* pSource2 = pArray2;
Type* pDest = pResult;
Type temp;
temp = 0;
for ( i = 0; i < nSize; i++ )
{
temp = (*pSource1) * (*pSource1) + (*pSource2)* (*pSource2) ;
*pDest = (Type)sqrt(temp) + 0.5f;
pSource1++;
pSource2++;
pDest++;
}/*for */
return 1;
}/*Cfunction1*/
int Cfunction2(Type *pArray1, Type *pArray2, Type *pResult, int n)
{
int i;
int nSize = n;
Type* pSource1 = pArray1;
Type* pSource2 = pArray2;
Type* pDest = pResult;
int nparallel = 16/sizeof(Type);
Type *temp0, *temp1, *temp2, *temp3;
Type *temp4, *temp5, *temp6, *temp7;
Type *temp9, *temp10, *temp11, *temp12;
temp0 = pSource1;
temp1 = pSource1 + 1;
temp2 = pSource1 + 2;
temp3 = pSource1 + 3;
temp4 = pSource2;
temp5 = pSource2 + 1;
temp6 = pSource2 + 2;
temp7 = pSource2 + 3;
temp9 = pResult;
temp10 = pResult + 1;
temp11 = pResult + 2;
temp12 = pResult + 3;
int nLoop = n/nparallel;
for ( i = 0; i < nLoop; i++ )
{
*temp9 = sqrt( (*temp0)*(*temp0) + (*temp4)*(*temp4) ) +0.5f;
*temp10 = sqrt( (*temp1)*(*temp1) + (*temp5)*(*temp5) ) +0.5f;
*temp11 = sqrt( (*temp2)*(*temp2) + (*temp6)*(*temp6) ) +0.5f;
*temp12 = sqrt( (*temp3)*(*temp3) + (*temp7)*(*temp7) ) +0.5f;
temp0 += 4;
temp1 += 4;
temp2 += 4;
temp3 += 4;
temp4 += 4;
temp5 += 4;
temp6 += 4;
temp7 += 4;
temp9 += 4;
temp10 += 4;
temp11 += 4;
temp12 += 4;
}/*for */
return 1;
}/*Cfunction2*/
int main(void)
{
Type *Src1, *Src2, *Result;
int i, n;
n = 100000000;
Src1 = (Type*)calloc(n, sizeof(Type));
Src2 = (Type*)calloc(n, sizeof(Type));
Result = (Type*)calloc(n, sizeof(Type));
for(i = 0;i
Src2 = 3.0;
//printf("i = %d, Src1 = %f, Src2 = %f \\n",i,Src1, Src2);
}/*for i*/
time_t begin, end;
begin = clock();
Cfunction2(Src1, Src2, Result, n);
end = clock();
printf("C time = %6.4f\\n",(end- begin)/float(CLOCKS_PER_SEC) );
begin = clock();
SSEfunction(Src1, Src2, Result, n);
end = clock();
printf("SSE time = %6.4f\\n",(end- begin)/float(CLOCKS_PER_SEC) );
free(Src1); free(Src2); free(Result);
return 1;
}/*main*/
In my computer :
intel duel core E5200 2.5GHz
windows XP SP3 :
the run time is :
SSEfunction : 0.38 sec for both VC8 and ICC 10.1
CFunction1 :
1.546 sec for VC8
0.969 sec for ICC.
CFunction2:
1.456 sec for VC8
0.959 sec for ICC.
whatever optimization selection (specify CPU, O2/O3 ) I turn on/off, the performance is the same.
Obviously, the ICC is better than VC8.
I get a question, that is : I thank the CFunction2 is more parallelized for compiler to use SSE instruction, but the proformance of CFunction2 and CFunction1 is the same for ICC.
But there is still about 152% ( 0.959 sec vs 0.38 sec) gap between SSEfuncion.
How should I do to modify the C code to let the performance get close to the SSEfunction result?
The know-how is very useful for me.....
Thank you lots.
Link Copied
5 Replies
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi Gaiger,
Try the latest 11.1 compiler - I get better performance than I do with 10.1 ( from .73/.65 seconds with 10.1 down to .36 and .29 seconds with 11.1).
Try the latest 11.1 compiler - I get better performance than I do with 10.1 ( from .73/.65 seconds with 10.1 down to .36 and .29 seconds with 11.1).
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi:
I would like to know what case you run. could you post your testing source code ?
If that is my test case, I would like to know, in ICC 10.1, what function costs 0.73 sec., and which costs 0.63 sec ?
thank you.
I would like to know what case you run. could you post your testing source code ?
If that is my test case, I would like to know, in ICC 10.1, what function costs 0.73 sec., and which costs 0.63 sec ?
thank you.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
It's your test case, CFunction1 and CFunction 2 respectively.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Maybe you should try offseting your pointers with column index instead of having moving pointers.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi Brandon,
What was the time for SSEfunction? The ratio between SSE and C functions would be a good measure of auto-vectorizer performance, which is the topic Gaiger brought up.
Thank you,
Paul.
What was the time for SSEfunction? The ratio between SSE and C functions would be a good measure of auto-vectorizer performance, which is the topic Gaiger brought up.
Thank you,
Paul.

Reply
Topic Options
- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page