Hi Sergey!
I completely redesigned the vectorized fastsin() function. I reduced the code size by more than 70% (140 lines of code removed) and improved the inline SSE Horner scheme evaluation. Inside the _asm block I removed the even-power multiplication of the xmm register, so the total count of instructions executed per
polynomial term is 3 (one mov, one mul, one add). As you can see from the code below, array-like initialization of the structures was used.
Here is an improved version:
inline struct SinVector *fastsinVec4D(struct Test1 *test1ptr1){
if(test1ptr1 == NULL){
return NULL;
}else if(test1ptr1->c1 >= HALF_PI_FLT || test1ptr1->c2 >= HALF_PI_FLT || test1ptr1->c3 >= HALF_PI_FLT || test1ptr1->c4 >=HALF_PI_FLT)
{
return NULL;
}else if(test1ptr1->c1 <= NEG_HALF_PI_FLT || test1ptr1->c2 <= NEG_HALF_PI_FLT || test1ptr1->c3 <= NEG_HALF_PI_FLT || test1ptr1->c4 <= NEG_HALF_PI_FLT)
{
return NULL;
}else{
SinVector sinvec1 = {-0.1666666,-0.1666666,-0.1666666,-0.1666666},*sinvec1ptr;
sinvec1ptr = &sinvec1;
SinVector sinvec2 = {0.0083333,0.0083333,0.0083333,0.0083333},*sinvec2ptr;
sinvec2ptr = &sinvec2;
SinVector sinvec3 = {-1.9841269e-4,-1.9841269e-4,-1.9841269e-4,-1.9841269e-4},*sinvec3ptr;
sinvec3ptr = &sinvec3;
SinVector sinvec4 = {2.7557319e-6,2.7557319e-6,2.7557319e-6,2.7557319e-6},*sinvec4ptr;
sinvec4ptr = &sinvec4;
SinVector sinvec5 = {-2.5052108e-8,-2.5052108e-8,-2.5052108e-8,-2.5052108e-8},*sinvec5ptr;
sinvec5ptr = &sinvec5;
SinVector sinvec6 = { 1.6059043e-10, 1.6059043e-10, 1.6059043e-10, 1.6059043e-10},*sinvec6ptr;
sinvec6ptr = &sinvec6;
SinVector sinvec7 = {-7.6471637e-13,-7.6471637e-13,-7.6471637e-13,-7.6471637e-13},*sinvec7ptr;
sinvec7ptr = &sinvec7;
SinVector sinvec8 = {2.8114572e-15,2.8114572e-15,2.8114572e-15,2.8114572e-15},*sinvec8ptr;
sinvec8ptr = &sinvec8;
SinVector sinvec9 = {-8.2206352e-18,-8.2206352e-18,-8.2206352e-18,-8.2206352e-18},*sinvec9ptr;
sinvec9ptr = &sinvec9;
SinVector sinvec10 = {1.9572941e-20,1.9572941e-20,1.9572941e-20,1.9572941e-20},*sinvec10ptr;
sinvec10ptr = &sinvec10;
SinVector sinvec11 = {-3.8681701e-23,-3.8681701e-23,-3.8681701e-23,-3.8681701e-23},*sinvec11ptr;
sinvec11ptr = &sinvec11;
SinVector result = {0.0f,0.0f,0.0f,0.0f},*resultptr;
resultptr = &result;
_asm{
xorps xmm0,xmm0
xorps xmm1,xmm1
xorps xmm6,xmm6
xorps xmm7,xmm7
xorps xmm5,xmm5
movups xmm0,test1 //arg x,y,z,w
movups xmm7,xmm0 // copy of arg xmm7 accumulator
mulps xmm0,test1 //x^2
mulps xmm0,test1 //x^3
movups xmm1,sinvec1
mulps xmm0,xmm1
addps xmm7,xmm0
movups xmm1,sinvec2
mulps xmm0,xmm1
addps xmm7,xmm0
movups xmm1,sinvec3
mulps xmm0,xmm1
addps xmm7,xmm0
movups xmm1,sinvec4
mulps xmm0,xmm1
addps xmm7,xmm0
movups xmm1,sinvec5
mulps xmm0,xmm1
addps xmm7,xmm0
movups xmm1,sinvec6
mulps xmm0,xmm1
addps xmm7,xmm0
movups xmm1,sinvec7
mulps xmm0,xmm1
addps xmm7,xmm0
movups xmm1,sinvec8
mulps xmm0,xmm1
addps xmm7,xmm0
movups xmm1,sinvec9
mulps xmm0,xmm1
addps xmm7,xmm0
movups xmm1,sinvec10
mulps xmm0,xmm1
addps xmm7,xmm0
movups xmm1,sinvec11
mulps xmm0,xmm1
addps xmm7,xmm0
movups result,xmm7
}
return resultptr;
}
}