- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi, I wrote a very simple C program a few days ago and tried to optimize it with SSE, but I found that the SSE version is slower than the plain C version. This program is very important for my work, so I would be very grateful if you could help me make the code faster.
code:
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <conio.h>
/* Three-component float vector (indices 0..2). The Microsoft-specific
   _declspec(align(16)) attribute requests 16-byte alignment; the array
   itself still holds only three floats (12 bytes), not four. */
typedef _declspec(align(16)) float vec3_t[3];
/*
 * Normalize a 3-component vector in place using SSE (MSVC x86 inline assembly).
 *
 * vec: pointer to three floats; overwritten with vec / |vec|.
 *
 * Only the 12 bytes that belong to the vector are read and written: the
 * load is split into a 64-bit part (x, y) and a 32-bit part (z), and the
 * store is split the same way. A single 16-byte movups would also read and
 * overwrite the 4 bytes that follow vec[2].
 *
 * No zero-length check is performed; a zero vector yields 0/0 in every lane.
 */
static inline void vec_normalize_sse(vec3_t vec)
{
    _asm {
        mov     esi, vec
        movlps  xmm0, qword ptr [esi]       ; low half  = (x, y)
        movss   xmm4, dword ptr [esi + 8]   ; xmm4 = (z, 0, 0, 0)
        movlhps xmm0, xmm4                  ; xmm0 = (x, y, z, 0)
        movaps  xmm1, xmm0
        mulps   xmm1, xmm1                  ; xmm1 = (xx, yy, zz, 0)
        movaps  xmm2, xmm1
        shufps  xmm2, xmm1, 0xe1            ; xmm2 = (yy, xx, zz, 0)
        movaps  xmm3, xmm1
        shufps  xmm3, xmm1, 0xc6            ; xmm3 = (zz, yy, xx, 0)
        addps   xmm1, xmm2
        addps   xmm1, xmm3                  ; lane 0 = xx + yy + zz
        shufps  xmm1, xmm1, 0x00            ; broadcast lane 0 to all lanes
        sqrtps  xmm1, xmm1                  ; every lane = length
        divps   xmm0, xmm1                  ; xmm0 = (x, y, z, 0) / length
        movlps  qword ptr [esi], xmm0       ; store x, y
        movhlps xmm4, xmm0                  ; low half of xmm4 = (z, 0)
        movss   dword ptr [esi + 8], xmm4   ; store z
    }
}
/*
 * Normalize a 3-component vector in place using plain C.
 *
 * vec: pointer to three floats; each component is multiplied by 1 / |vec|.
 *
 * Declared static inline: a plain C99/C11 `inline` definition provides no
 * external definition, so a non-inlined call would fail to link.
 *
 * No zero-length check is performed; for a zero vector the reciprocal is a
 * division by zero and the components become non-finite under IEEE-754
 * arithmetic.
 */
static inline void vec_normalize_c(vec3_t vec)
{
    float len_sq;
    float inv_len;

    len_sq = vec[0]*vec[0] + vec[1]*vec[1] + vec[2]*vec[2];
    /* sqrtf keeps the computation in single precision instead of
       converting to double and back. */
    inv_len = 1.0f / sqrtf(len_sq);

    vec[0] *= inv_len;
    vec[1] *= inv_len;
    vec[2] *= inv_len;
}
/*
 * Benchmark driver: runs each normalization routine `count` times on a
 * vector that is nudged by 0.1 per component before every call, then prints
 * the elapsed clock() ticks and the final vector for each routine.
 * Waits for a key press (getch) before returning 0.
 */
int main(void)
{
    int i, count;
    clock_t s, e;   /* clock() returns clock_t, not int */
    vec3_t vec;

    count = 1000000;

    /* SSE version */
    vec[0] = 1.0f;
    vec[1] = 2.0f;
    vec[2] = 3.0f;
    s = clock();
    for (i = 0; i < count; i++) {
        vec[0] += 0.1f;
        vec[1] += 0.1f;
        vec[2] += 0.1f;
        vec_normalize_sse(vec);
    }
    e = clock();
    /* The cast keeps the argument type in step with the %d conversion. */
    printf("sse = %d, %f, %f, %f\n", (int)(e - s), vec[0], vec[1], vec[2]);

    /* Plain C version, restarted from the same initial vector */
    vec[0] = 1.0f;
    vec[1] = 2.0f;
    vec[2] = 3.0f;
    s = clock();
    for (i = 0; i < count; i++) {
        vec[0] += 0.1f;
        vec[1] += 0.1f;
        vec[2] += 0.1f;
        vec_normalize_c(vec);
    }
    e = clock();
    printf("c = %d, %f, %f, %f\n", (int)(e - s), vec[0], vec[1], vec[2]);

    getch();
    return 0;
}
A bit more about my work: I work at https://writer4sale.com/, and this program helps us write various articles and other content more easily.
- Tags:
- Intel® Advanced Vector Extensions (Intel® AVX)
- Intel® Streaming SIMD Extensions
- Parallel Computing
Link Copied
0 Replies
Reply
Topic Options
- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page