Hi, I have some code written in C:
------------------
float value = 0.5f;
for (int j = 0; j < count; j++)
{
    int ind = input[j].index;
    diff_table[ind].pos[0] += input[j].pos[0] * value;
    diff_table[ind].pos[1] += input[j].pos[1] * value;
    diff_table[ind].pos[2] += input[j].pos[2] * value;
}
------------------
and I've written an SSE version (diff_table is 16-byte aligned and each item has four floats, thus 16 bytes):
------------------
_asm {
mov esi, input
mov edi, diff_table
mov ecx, count
movss xmm1, value
shufps xmm1, xmm1, 0 //broadcast value into all four lanes
_loop:
mov eax, dword ptr [esi + 12] //load input[j].index
shl eax, 4 //*16 = byte offset into diff_table
movaps xmm0, xmmword ptr [esi] //pos[0..2] plus the raw index bits in lane 3
mulps xmm0, xmm1
addps xmm0, xmmword ptr [edi + eax]
movaps xmmword ptr [edi + eax], xmm0
add esi, 16
dec ecx
jnz _loop
}
------------------
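For reference, the element layout the assembly assumes (pos at byte offsets 0/4/8, the integer index at offset 12, 16 bytes per element) would look something like this; the type and field names are only illustrative:
------------------
/* illustrative declarations matching the 16-byte layout used above */
typedef struct {
    float pos[3];   /* offsets 0, 4, 8 */
    int   index;    /* offset 12, read into eax and scaled by 16 */
} InputItem;        /* 16 bytes, so "add esi, 16" steps one element */

typedef struct {
    float pos[4];   /* four floats = 16 bytes, 16-byte aligned */
} DiffItem;

InputItem *input;       /* walked sequentially via esi */
DiffItem  *diff_table;  /* indexed via edi + (index << 4) */
------------------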
The problem is that the SSE version is much slower than the C version. I tried replacing the four packed (parallel) SSE instructions with twelve scalar (non-parallel) SSE instructions:
------------------
...
movss xmm0, dword ptr [esi]
mulss xmm0, xmm1
addss xmm0, dword ptr [edi + eax]
movss dword ptr [edi + eax], xmm0
movss xmm0, dword ptr [esi + 4]
mulss xmm0, xmm1
addss xmm0, dword ptr [edi + eax + 4]
movss dword ptr [edi + eax + 4], xmm0
movss xmm0, dword ptr [esi + 8]
mulss xmm0, xmm1
addss xmm0, dword ptr [edi + eax + 8]
movss dword ptr [edi + eax + 8], xmm0
...
------------------
which is three times faster than the C version... so why is the parallel version the slowest??? I have no idea ;-(
Thanks for any answer, michal
4 Replies
Hi michal:
We're going to need more information. For example, what processor are you running this on? What compiler are you building with, and what options are you specifying when building the application?
Thanks.
There is something wrong with your data structure.
Do you notice that exceptions keep occurring? It is because input.index is integer data, not float data, so exceptions occur when addps/mulps are executed.
That is the real reason for the poor performance!
If you want to use mulps/addps, please choose a reasonable data structure.
I have revised your code, using andps to mask input.index to zero.
Now, let's roll: SSE parallel is faster than SSE scalar!
int w[4], *p;
w[0]=0xffffffff;
w[1]=0xffffffff;
w[2]=0xffffffff;
w[3]=0x00; //For masking input.index to 0
p=w;
_asm {
mov esi,p;
movups xmm5,[esi]
mov esi, input
mov edi, diff_table
mov ecx, count
movss xmm1, value
shufps xmm1, xmm1, 0
.align 16
_loop:
mov edx, [esi+16] //speculative load
mov eax, dword ptr [esi + 12]
shl eax, 4
movaps xmm0, xmmword ptr [esi]
andps xmm0, xmm5 //mask input.index
mulps xmm0, xmm1
addps xmm0, xmmword ptr [edi + eax]
movaps xmmword ptr [edi + eax], xmm0
dec ecx
jz _exitloop
//unroll and use more XMM regs to improve the performance.
mov eax, dword ptr [esi + 28]
shl eax, 4
movaps xmm6, xmmword ptr [esi+16]
andps xmm6, xmm5 //mask input.index
mulps xmm6, xmm1
addps xmm6, xmmword ptr [edi + eax]
movaps xmmword ptr [edi + eax], xmm6
add esi, 32
dec ecx
jnz _loop
_exitloop:
}
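On the exception point above: a typical small integer index, reinterpreted as a float bit pattern, is a denormal (its exponent bits are all zero), and denormal operands are exactly what make packed mulps/addps take the slow path; the scalar version never touches that fourth lane, which is why it stays fast. A quick illustrative check in plain C:
------------------
#include <stdio.h>
#include <string.h>

int main(void)
{
    int index = 7;           /* a typical small diff_table index */
    float as_float;
    unsigned int bits;

    memcpy(&as_float, &index, sizeof as_float);  /* same bits, viewed as a float */
    memcpy(&bits, &index, sizeof bits);

    /* any nonzero index below 0x00800000 has a zero exponent field,
       i.e. it is a denormal when interpreted as a float */
    printf("index %d viewed as float = %g, exponent field = %u\n",
           index, as_float, (bits >> 23) & 0xff);
    return 0;
}
------------------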
Message Edited by p4top on 04-04-2006 05:22 AM
Message Edited by p4top on 04-04-2006 05:39 AM
There is something wrong with your data structure.
Do you notice that exceptions keep occurring? It is because input.index is integer data, not float data, so exceptions occur when addps/mulps are executed.
That is the real reason for the poor performance!
If you want to use mulps/addps, please choose a reasonable data structure.
I have revised your code, using andps to mask input.index to zero.
Now, let's roll: SSE parallel is faster than SSE scalar!
int w[4], *p;
w[0]=0xffffffff;
w[1]=0xffffffff;
w[2]=0xffffffff;
w[3]=0x00; //For masking input.index to 0
p=w;
_asm {
mov esi,p;
movups xmm5,[esi]
mov esi, input
mov edi, diff_table
mov ecx, count
movss xmm1, value
shufps xmm1, xmm1, 0
.align 16
_loop:
mov edx, [esi+128] //speculative load to improve performance
/*it can be deleted if you find it brings some risk or does not improve performance.*/
mov eax, dword ptr [esi + 12]
shl eax, 4
movaps xmm0, xmmword ptr [esi]
andps xmm0, xmm5 //mask input.index
mulps xmm0, xmm1
addps xmm0, xmmword ptr [edi + eax]
movaps xmmword ptr [edi + eax], xmm0
sub ecx,1
jz _exitloop
//unroll and use more XMM regs to improve the performance.
mov eax, dword ptr [esi + 28]
shl eax, 4
movaps xmm6, xmmword ptr [esi+16]
andps xmm6, xmm5 //mask input.index
mulps xmm6, xmm1
addps xmm6, xmmword ptr [edi + eax]
movaps xmmword ptr [edi + eax], xmm6
add esi, 32
sub ecx,1
jnz _loop
_exitloop:
}
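For comparison, the same masking idea can be written with SSE intrinsics instead of inline assembly. This is only a sketch; it assumes the element layout shown in the first post and a compiler that provides <xmmintrin.h>:
------------------
#include <xmmintrin.h>   /* SSE intrinsics */

/* illustrative types matching the 16-byte layout discussed above */
typedef struct { float pos[3]; int index; } InputItem;
typedef struct { float pos[4]; } DiffItem;

void accumulate(DiffItem *diff_table, const InputItem *input, int count, float value)
{
    /* mask that keeps lanes 0..2 and zeroes lane 3 (the integer index) */
    static const unsigned int mask_bits[4] = { 0xffffffffu, 0xffffffffu, 0xffffffffu, 0u };
    const __m128 mask  = _mm_loadu_ps((const float *)mask_bits);
    const __m128 scale = _mm_set1_ps(value);

    for (int j = 0; j < count; j++)
    {
        int ind = input[j].index;
        __m128 v = _mm_load_ps((const float *)&input[j]);  /* pos[0..2] + index bits */
        v = _mm_and_ps(v, mask);                            /* clear the index lane */
        v = _mm_mul_ps(v, scale);
        __m128 d = _mm_load_ps(diff_table[ind].pos);
        _mm_store_ps(diff_table[ind].pos, _mm_add_ps(d, v));
    }
}
------------------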
Message Edited by p4top on 04-04-2006 06:22 AM
You can try the SSE parallel code below (for count = 1000 it is about 2-3 times or more faster than your SSE scalar code):
int w[4]={0xffffffff,0xffffffff,0xffffffff,0}; //mask: keep pos[0..2], zero the index lane
_asm {
movups xmm5,w
mov esi, input
mov edi, diff_table
mov ecx, count
movss xmm1, value
shufps xmm1, xmm1, 0
push ebp //ebp is used as a scratch register below
mov edx,3
and edx,ecx //edx = count % 4: handle these leftover elements one at a time first
je _loop0
mov eax, dword ptr [esi + 12]
shl eax, 4
movaps xmm0, xmmword ptr [esi]
andps xmm0, xmm5
mulps xmm0, xmm1
addps xmm0, xmmword ptr [edi + eax]
movaps xmmword ptr [edi + eax], xmm0
add esi, 16 //advance past the element just processed
sub ecx,1
jz _exitloop
sub edx,1
jz _loop0
mov eax, dword ptr [esi + 12]
shl eax, 4
movaps xmm0, xmmword ptr [esi]
andps xmm0, xmm5
mulps xmm0, xmm1
addps xmm0, xmmword ptr [edi + eax]
movaps xmmword ptr [edi + eax], xmm0
add esi, 16
sub ecx,1
jz _exitloop
sub edx,1
jz _loop0
mov eax, dword ptr [esi + 12]
shl eax, 4
movaps xmm0, xmmword ptr [esi]
andps xmm0, xmm5
mulps xmm0, xmm1
addps xmm0, xmmword ptr [edi + eax]
movaps xmmword ptr [edi + eax], xmm0
add esi, 16
sub ecx,1
jz _exitloop
_loop0:
sub ecx,4
jz _xlp
.align 16
_loop:
mov edx, dword ptr [esi + 128] //speculative load: touch data well ahead of the current element
mov eax, dword ptr [esi + 12]
shl eax, 4
mov ebx, dword ptr [esi + 12+16]
shl ebx, 4
mov ebp, dword ptr [esi + 12+32]
shl ebp, 4
mov edx, dword ptr [esi + 12+48]
shl edx, 4
movaps xmm0, xmmword ptr [esi]
movaps xmm2, xmmword ptr [esi+16]
movaps xmm3, xmmword ptr [esi+32]
movaps xmm4, xmmword ptr [esi+48]
andps xmm0, xmm5
andps xmm2, xmm5
andps xmm3, xmm5
andps xmm4, xmm5
mulps xmm0, xmm1
mulps xmm2, xmm1
mulps xmm3, xmm1
mulps xmm4, xmm1
addps xmm0, xmmword ptr [edi + eax]
addps xmm2, xmmword ptr [edi + ebx]
addps xmm3, xmmword ptr [edi + ebp]
addps xmm4, xmmword ptr [edi + edx]
movaps xmmword ptr [edi + eax], xmm0
movaps xmmword ptr [edi + ebx], xmm2
movaps xmmword ptr [edi + ebp], xmm3
movaps xmmword ptr [edi + edx], xmm4
add esi, 64
sub ecx,4
jnz _loop
_xlp:
mov eax, dword ptr [esi + 12]
shl eax, 4
mov ebx, dword ptr [esi + 12+16]
shl ebx, 4
mov ebp, dword ptr [esi + 12+32]
shl ebp, 4
mov edx, dword ptr [esi + 12+48]
shl edx, 4
movaps xmm0, xmmword ptr [esi]
movaps xmm2, xmmword ptr [esi+16]
movaps xmm3, xmmword ptr [esi+32]
movaps xmm4, xmmword ptr [esi+48]
andps xmm0, xmm5
andps xmm2, xmm5
andps xmm3, xmm5
andps xmm4, xmm5
mulps xmm0, xmm1
mulps xmm2, xmm1
mulps xmm3, xmm1
mulps xmm4, xmm1
addps xmm0, xmmword ptr [edi + eax]
addps xmm2, xmmword ptr [edi + ebx]
addps xmm3, xmmword ptr [edi + ebp]
addps xmm4, xmmword ptr [edi + edx]
movaps xmmword ptr [edi + eax], xmm0
movaps xmmword ptr [edi + ebx], xmm2
movaps xmmword ptr [edi + ebp], xmm3
movaps xmmword ptr [edi + edx], xmm4
_exitloop:
pop ebp
}
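If you want to sanity-check any of the assembly versions against the plain C loop, a rough harness along these lines can be used. The names and sizes are illustrative; _aligned_malloc is the MSVC way to get 16-byte-aligned storage, and the input array gets a little slack at the end because the unrolled loop speculatively reads [esi + 128]:
------------------
#include <malloc.h>    /* _aligned_malloc / _aligned_free on MSVC */
#include <math.h>
#include <stdio.h>
#include <string.h>

typedef struct { float pos[3]; int index; } InputItem;   /* assumed layout */
typedef struct { float pos[4]; } DiffItem;

int main(void)
{
    const int count = 1000, table_size = 256;
    float value = 0.5f;
    int i, j;

    /* a few elements of slack after the real data, because the unrolled loop
       touches [esi + 128] ahead of the element it is processing */
    InputItem *input = (InputItem *)_aligned_malloc((count + 8) * sizeof(InputItem), 16);
    DiffItem *ref = (DiffItem *)_aligned_malloc(table_size * sizeof(DiffItem), 16);
    DiffItem *diff_table = (DiffItem *)_aligned_malloc(table_size * sizeof(DiffItem), 16);

    for (i = 0; i < count; i++) {          /* arbitrary test data */
        input[i].pos[0] = (float)i;
        input[i].pos[1] = (float)(i * 2);
        input[i].pos[2] = (float)(i * 3);
        input[i].index  = i % table_size;
    }
    memset(ref, 0, table_size * sizeof(DiffItem));
    memset(diff_table, 0, table_size * sizeof(DiffItem));

    for (j = 0; j < count; j++) {          /* reference: the original C loop */
        int ind = input[j].index;
        ref[ind].pos[0] += input[j].pos[0] * value;
        ref[ind].pos[1] += input[j].pos[1] * value;
        ref[ind].pos[2] += input[j].pos[2] * value;
    }

    /* ... run one of the _asm versions above against diff_table here ... */

    for (i = 0; i < table_size; i++)
        for (j = 0; j < 3; j++)
            if (fabsf(ref[i].pos[j] - diff_table[i].pos[j]) > 1e-3f)
                printf("mismatch at entry %d component %d\n", i, j);

    _aligned_free(input);
    _aligned_free(ref);
    _aligned_free(diff_table);
    return 0;
}
------------------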