Analyzers
Talk to fellow users of Intel Analyzer tools (Intel VTune™ Profiler, Intel Advisor)

SSE optimazation problem

miva_cz
Beginner
739 Views
Hi, I have a code, in c written as :
------------------
float value = 0.5f;
for (int j = 0; j < count; j++)
{
int ind = input .index;
diff_table [ind].pos [0] += input .pos[0]*value;
diff_table [ind].pos [1] += input .pos [1]*value;
diff_table [ind].pos [2] += input .pos [2]*value;
}
------------------
and I've done a SSE version (diff_table is 16 bytes aligned and each item heas four float, thus 16 bytes):
------------------
_asm {
mov esi, input
mov edi, diff_table
mov ecx, count

movss xmm1, value
shufps xmm1, xmm1, 0

_loop:
mov eax, dword ptr [esi + 12]
shl eax, 4
movaps xmm0, xmmword ptr [esi]
mulps xmm0, xmm1
addps xmm0, xmmword ptr [edi + eax]
movaps xmmword ptr [edi + eax], xmm0

add esi, 16
dec ecx
jnz _loop
}
------------------
The problem is, that the SSE version is much slower than C version, I tried to change the four paralel SSE instruction to 12 non paralel versions of SSE instruction:
------------------
...
movss xmm0, dword ptr [esi]
mulss xmm0, xmm1
addss xmm0, dword ptr [edi + eax]
movss dword ptr [edi + eax], xmm0

movss xmm0, dword ptr [esi + 4]
mulss xmm0, xmm1
addss xmm0, dword ptr [edi + eax + 4]
movss dword ptr [edi + eax + 4], xmm0

movss xmm0, dword ptr [esi + 8]
mulss xmm0, xmm1
addss xmm0, dword ptr [edi + eax + 8]
movss dword ptr [edi + eax + 8], xmm0
...
------------------
which is three times faster than version in C... why the paralel version is the lowest ??? have no idea ;-(
thanks for answer, michal
0 Kudos
4 Replies
David_A_Intel1
Employee
739 Views

Hi michal:

We're going to need more information. For example, what processor are you running this on? What compiler are you building with, and what options are you specifying when building the application?

Thanks.

0 Kudos
p4top
Beginner
739 Views
Your data structure is something wrong.



Do you notice that the excepts always occur? Just because input.index is int data, not float data--> excepts occurs when addps/mulps be executed.



It is the real reason of the poor performence!



If you want to use mulps/addps, please choose a reasonable data structure.







I have revised your code by using andps to mask the input.index to zero.



Now, let's roll: SSE parallel is faster than SSE scalar!







int w[4], *p;







w[0]=0xffffffff;



w[1]=0xffffffff;



w[2]=0xffffffff;



w[3]=0x00; //For masking input.index to 0



p=w;











_asm {



mov esi,p;



movups xmm5,[esi]







mov esi, input



mov edi, diff_table



mov ecx, count







movss xmm1, value



shufps xmm1, xmm1, 0



.align 16



_loop:





mov edx, [esi+16] //speculative load

mov eax, dword ptr [esi + 12]



shl eax, 4



movaps xmm0, xmmword ptr [esi]



andps xmm0, xmm5 //mask input.index



mulps xmm0, xmm1



addps xmm0, xmmword ptr [edi + eax]



movaps xmmword ptr [edi + eax], xmm0



dec ecx



jz _exitloop



//unroll and use more XMM regs to improve the performence.



mov eax, dword ptr [esi + 28]



shl eax, 4



movaps xmm6, xmmword ptr [esi+16]



andps xmm6, xmm5 //mask input.index



mulps xmm6, xmm1



addps xmm6, xmmword ptr [edi + eax]



movaps xmmword ptr [edi + eax], xmm6







add esi, 32



dec ecx







jnz _loop



_exitloop:



}

Message Edited by p4top on 04-04-200605:22 AM

Message Edited by p4top on 04-04-200605:39 AM

0 Kudos
p4top
Beginner
739 Views
Your data structure is something wrong.

Do you notice that the excepts always occur? Just because input.index is int data, not float data--> excepts occurs when addps/mulps be executed.



It is the real reason of the poor performence!

If you want to use mulps/addps, please choose a reasonable data structure.



I have revised your code by using andps to mask the input.index to zero.

Now, let's roll: SSE parallel is faster than SSE scalar!





int w[4], *p;



w[0]=0xffffffff;

w[1]=0xffffffff;

w[2]=0xffffffff;

w[3]=0x00; //For masking input.index to 0

p=w;



_asm {

mov esi,p;

movups xmm5,[esi]



mov esi, input

mov edi, diff_table

mov ecx, count



movss xmm1, value

shufps xmm1, xmm1, 0



.align 16

_loop:

mov edx, [esi+128] //seculative load for improving the performance

/*it can be deleted if you find it may bring some risks or not improve the performance.*/

mov eax, dword ptr [esi + 12]

shl eax, 4

movaps xmm0, xmmword ptr [esi]

andps xmm0, xmm5 //mask input.index

mulps xmm0, xmm1

addps xmm0, xmmword ptr [edi + eax]

movaps xmmword ptr [edi + eax], xmm0

sub ecx,1

jz _exitloop



//unroll and use more XMM regs to improve the performence.

mov eax, dword ptr [esi + 28]

shl eax, 4

movaps xmm6, xmmword ptr [esi+16]

andps xmm6, xmm5 //mask input.index

mulps xmm6, xmm1

addps xmm6, xmmword ptr [edi + eax]

movaps xmmword ptr [edi + eax], xmm6



add esi, 32

sub ecx,1

jnz _loop



_exitloop:



}

Message Edited by p4top on 04-04-200606:22 AM

0 Kudos
p4top
Beginner
739 Views
You can try the below SSE parallel code (For count=1000, it is about 2-3 times or more fast than your SSE scalar code.)

int w[4]={0xffffffff,0xffffffff,0xffffffff,0};

_asm {
movups xmm5,w

mov esi, input
mov edi, diff_table
mov ecx, count

movss xmm1, value
shufps xmm1, xmm1, 0

push ebp

mov edx,3
and edx,ecx
je _loop0

mov eax, dword ptr [esi + 12]
shl eax, 4
movaps xmm0, xmmword ptr [esi]
andps xmm0, xmm5
mulps xmm0, xmm1
addps xmm0, xmmword ptr [edi + eax]
movaps xmmword ptr [edi + eax], xmm0
sub ecx,1
jz _exitloop
sub edx,1
jz _loop0

mov eax, dword ptr [esi + 12+16]
shl eax, 4
movaps xmm0, xmmword ptr [esi+16]
andps xmm0, xmm5
mulps xmm0, xmm1
addps xmm0, xmmword ptr [edi + eax]
movaps xmmword ptr [edi + eax], xmm0
sub ecx,1
jz _exitloop
sub edx,1
jz _loop0

mov eax, dword ptr [esi + 12+16]
shl eax, 4
movaps xmm0, xmmword ptr [esi+16]
andps xmm0, xmm5
mulps xmm0, xmm1
addps xmm0, xmmword ptr [edi + eax]
movaps xmmword ptr [edi + eax], xmm0
sub ecx,1
jz _exitloop
_loop0:
sub ecx,4
jz _xlp
.align 16
_loop:
MOV edx, dword ptr [esi + 128]

mov eax, dword ptr [esi + 12]
shl eax, 4

mov ebx, dword ptr [esi + 12+16]
shl ebx, 4

mov ebp, dword ptr [esi + 12+32]
shl ebp, 4

mov edx, dword ptr [esi + 12+48]
shl edx, 4

movaps xmm0, xmmword ptr [esi]
movaps xmm2, xmmword ptr [esi+16]
movaps xmm3, xmmword ptr [esi+32]
movaps xmm4, xmmword ptr [esi+48]

andps xmm0, xmm5
andps xmm2, xmm5
andps xmm3, xmm5
andps xmm4, xmm5

mulps xmm0, xmm1
mulps xmm2, xmm1
mulps xmm3, xmm1
mulps xmm4, xmm1

addps xmm0, xmmword ptr [edi + eax]
addps xmm2, xmmword ptr [edi + ebx]
addps xmm3, xmmword ptr [edi + ebp]
addps xmm4, xmmword ptr [edi + edx]

movaps xmmword ptr [edi + eax], xmm0
movaps xmmword ptr [edi + ebx], xmm2
movaps xmmword ptr [edi + ebp], xmm3
movaps xmmword ptr [edi + edx], xmm4

add esi, 64
sub ecx,4
jnz _loop

_xlp:
mov eax, dword ptr [esi + 12]
shl eax, 4

mov ebx, dword ptr [esi + 12+16]
shl ebx, 4

mov ebp, dword ptr [esi + 12+32]
shl ebp, 4

mov edx, dword ptr [esi + 12+48]
shl edx, 4

movaps xmm0, xmmword ptr [esi]
movaps xmm2, xmmword ptr [esi+16]
movaps xmm3, xmmword ptr [esi+32]
movaps xmm4, xmmword ptr [esi+48]

andps xmm0, xmm5
andps xmm2, xmm5
andps xmm3, xmm5
andps xmm4, xmm5

mulps xmm0, xmm1
mulps xmm2, xmm1
mulps xmm3, xmm1
mulps xmm4, xmm1

addps xmm0, xmmword ptr [edi + eax]
addps xmm2, xmmword ptr [edi + ebx]
addps xmm3, xmmword ptr [edi + ebp]
addps xmm4, xmmword ptr [edi + edx]

movaps xmmword ptr [edi + eax], xmm0
movaps xmmword ptr [edi + ebx], xmm2
movaps xmmword ptr [edi + ebp], xmm3
movaps xmmword ptr [edi + edx], xmm4

_exitloop:
pop ebp
}
0 Kudos
Reply