I want to understand why SSE is slower than ANSI C

Yavorski__Nick · ‎07-19-2018

Hi, I wrote a very simple C program some days ago. I tried to optimize it with SSE, but I found that the SSE version is slower than the plain C version. This program is very important for my work; if you can help me make the code faster, I will be very happy.

code:

#include <conio.h>
#include <math.h>
#include <stdio.h>
#include <time.h>

/*
 * Three-component float vector (indices 0, 1, 2 are used as x, y, z by the
 * normalize routines in this file).  `_declspec(align(16))` is the MSVC
 * attribute requesting 16-byte alignment.  The array itself holds only
 * 3 floats (12 bytes), so a full 16-byte vector load or store through a
 * pointer to it would touch 4 bytes past the end of the array.
 */
typedef _declspec(align(16)) float vec3_t[3];

/*
 * Normalize a 3-component vector in place using SSE (MSVC x86 inline asm).
 *
 * vec: pointer to three floats (x, y, z); overwritten with the unit vector.
 *
 * Only the three floats that belong to the vector are read and written:
 * they are loaded as 8 + 4 bytes and stored back the same way, so nothing
 * past the end of the 12-byte array is touched and the unused fourth lane
 * is a well-defined 0.0f instead of whatever follows the array in memory.
 * The partial loads/stores used here have no alignment requirement.
 *
 * A zero-length input is not special-cased: the division is 0/0 and the
 * components become NaN.
 */
static inline void vec_normalize_sse(vec3_t vec)
{
    _asm {
        mov     esi, vec            ; esi = address of the three floats

        movlps  xmm0, [esi]         ; low half of xmm0 = (x, y)
        movss   xmm1, [esi + 8]     ; xmm1 = (z, 0, 0, 0)
        movlhps xmm0, xmm1          ; xmm0 = (x, y, z, 0)

        movaps  xmm1, xmm0
        mulps   xmm1, xmm1          ; xmm1 = (x*x, y*y, z*z, 0)

        movaps  xmm2, xmm1
        shufps  xmm2, xmm1, 0xe1    ; xmm2 = (y*y, x*x, z*z, 0)
        movaps  xmm3, xmm1
        shufps  xmm3, xmm1, 0xc6    ; xmm3 = (z*z, y*y, x*x, 0)

        addps   xmm1, xmm2
        addps   xmm1, xmm3          ; lane 0 = x*x + y*y + z*z
        shufps  xmm1, xmm1, 0x00    ; broadcast lane 0 to all lanes

        sqrtps  xmm1, xmm1          ; every lane = vector length
        divps   xmm0, xmm1          ; xmm0 = (x/len, y/len, z/len, 0/len)

        movlps  [esi], xmm0         ; store x and y
        movhlps xmm0, xmm0          ; move lane 2 (z) down into lane 0
        movss   [esi + 8], xmm0     ; store z
    }
}

/*
 * Normalize a 3-component vector in place using plain C.
 *
 * vec: pointer to three floats (x, y, z); each component is multiplied by
 *      1/length, where length = sqrt(x*x + y*y + z*z).
 *
 * Declared `static inline`: in C99/C11 a plain `inline` definition does not
 * provide an external definition, so a call the compiler chooses not to
 * inline would fail to link.
 *
 * A zero-length input is not special-cased: 1/0 is +inf and the components
 * become NaN (0 * inf).
 */
static inline void vec_normalize_c(vec3_t vec)
{
    float inv_len;

    /* sqrtf keeps the computation in single precision (no double round trip). */
    inv_len = 1.0f / sqrtf(vec[0] * vec[0] + vec[1] * vec[1] + vec[2] * vec[2]);

    vec[0] *= inv_len;
    vec[1] *= inv_len;
    vec[2] *= inv_len;
}

/*
 * Benchmark driver.
 *
 * Runs the SSE and the plain-C normalize routines `count` times each on the
 * same starting vector (1, 2, 3), nudging every component by 0.1 before each
 * call, and prints the elapsed clock() ticks together with the final vector
 * so the two results can be compared.  Waits for a key press before exiting.
 */
int main(void)
{
    const int count = 1000000;
    clock_t s, e;       /* clock() returns clock_t, not int */
    vec3_t vec;
    int i;

    /* --- SSE version --- */
    vec[0] = 1.0f;
    vec[1] = 2.0f;
    vec[2] = 3.0f;

    s = clock();
    for (i = 0; i < count; i++) {
        vec[0] += 0.1f;
        vec[1] += 0.1f;
        vec[2] += 0.1f;
        vec_normalize_sse(vec);
    }
    e = clock();

    /* Elapsed time is in clock() ticks; cast so the argument matches %d. */
    printf("sse = %d, %f, %f, %f\n", (int)(e - s), vec[0], vec[1], vec[2]);

    /* --- plain C version --- */
    vec[0] = 1.0f;
    vec[1] = 2.0f;
    vec[2] = 3.0f;

    s = clock();
    for (i = 0; i < count; i++) {
        vec[0] += 0.1f;
        vec[1] += 0.1f;
        vec[2] += 0.1f;
        vec_normalize_c(vec);
    }
    e = clock();

    printf("c = %d, %f, %f, %f\n", (int)(e - s), vec[0], vec[1], vec[2]);

    /* Keep the console window open until a key is pressed. */
    getch();

    return 0;
}

A bit more about my work: I work at https://writer4sale.com/, and this program helps us write different articles and so on more easily.