Intel® ISA Extensions
Use hardware-based isolation and memory encryption to provide more code protection in your solutions.

Low rate on sse2 code

maa1
Beginner
299 Views

Hi!
Why does this scalar SSE2 code (all data in L1 cache) execute on Core 2 at a rate of only 1.49 flop/cycle?

# ----------------------------------------------------------------------
# L10: unrolled scalar SSE2 multiply-accumulate loop
# Syntax: AT&T / GAS, 32-bit x86.
#
# Register roles (as used inside the loop):
#   %esi              pointer to the shared multiplier stream; four
#                     doubles (offsets 0, 8, 16, 24) are consumed per
#                     iteration, then the pointer advances by 32 bytes
#   %ebx %ecx %edx %eax
#                     four independent input pointers; each is read at
#                     offsets 0, 448, 896, 1344 and then advanced by
#                     1792 bytes (448 = 56 * 8, so this looks like a row
#                     stride of 56 doubles -- TODO confirm with caller)
#   %xmm3 %xmm2 %xmm1 %xmm0
#                     running sums for the %ebx, %ecx, %edx, %eax
#                     streams respectively
#   %xmm5             current multiplier loaded from the %esi stream
#   %xmm4             scratch: loaded operand, then the product
#   %edi              loop counter, +4 per iteration, loop exits when
#                     it equals 56 (14 trips if it enters as 0 --
#                     initial value is not visible here)
#
# Per iteration: 16 mulsd + 16 addsd (32 flops), 20 movsd loads.
# NOTE(review): Core 2 is documented as sustaining one load per cycle;
# if that holds, 20 loads cap this loop at 32/20 = 1.6 flop/cycle no
# matter how the arithmetic is scheduled -- verify against the
# microarchitecture documentation before restructuring.
# ----------------------------------------------------------------------
L10:
        # --- step 0: multiplier at 0(%esi), operands at offset 0 ---
        movsd   (%esi), %xmm5
        movsd   (%ebx), %xmm4
        addl    $4, %edi                # counter bumped early; compared at loop end
        mulsd   %xmm5, %xmm4
        addsd   %xmm4, %xmm3
        movsd   (%ecx), %xmm4
        mulsd   %xmm5, %xmm4
        addsd   %xmm4, %xmm2
        movsd   (%edx), %xmm4
        mulsd   %xmm5, %xmm4
        addsd   %xmm4, %xmm1
        movsd   (%eax), %xmm4
        mulsd   %xmm5, %xmm4
        movsd   8(%esi), %xmm5          # next multiplier fetched before the last add
        addsd   %xmm4, %xmm0

        # --- step 1: multiplier from 8(%esi), operands at offset 448 ---
        movsd   448(%ebx), %xmm4
        mulsd   %xmm5, %xmm4
        addsd   %xmm4, %xmm3
        movsd   448(%ecx), %xmm4
        mulsd   %xmm5, %xmm4
        addsd   %xmm4, %xmm2
        movsd   448(%edx), %xmm4
        mulsd   %xmm5, %xmm4
        addsd   %xmm4, %xmm1
        movsd   448(%eax), %xmm4
        mulsd   %xmm5, %xmm4
        movsd   16(%esi), %xmm5
        addsd   %xmm4, %xmm0

        # --- step 2: multiplier from 16(%esi), operands at offset 896 ---
        movsd   896(%ebx), %xmm4
        mulsd   %xmm5, %xmm4
        addsd   %xmm4, %xmm3
        movsd   896(%ecx), %xmm4
        mulsd   %xmm5, %xmm4
        addsd   %xmm4, %xmm2
        movsd   896(%edx), %xmm4
        mulsd   %xmm5, %xmm4
        addsd   %xmm4, %xmm1
        movsd   896(%eax), %xmm4
        mulsd   %xmm5, %xmm4
        movsd   24(%esi), %xmm5
        addsd   %xmm4, %xmm0

        # --- step 3: multiplier from 24(%esi), operands at offset 1344;
        #     each pointer is advanced right after its last use ---
        addl    $32, %esi               # %xmm5 already holds the value from 24(%esi)
        movsd   1344(%ebx), %xmm4
        addl    $1792, %ebx
        mulsd   %xmm5, %xmm4
        addsd   %xmm4, %xmm3
        movsd   1344(%ecx), %xmm4
        addl    $1792, %ecx
        mulsd   %xmm5, %xmm4
        addsd   %xmm4, %xmm2
        movsd   1344(%edx), %xmm4
        addl    $1792, %edx
        mulsd   %xmm5, %xmm4
        addsd   %xmm4, %xmm1
        movsd   1344(%eax), %xmm4
        addl    $1792, %eax
        mulsd   %xmm5, %xmm4
        cmpl    $56, %edi               # sets flags for the jne below
        addsd   %xmm4, %xmm0            # SSE add leaves EFLAGS untouched

        jne     L10

How can I rewrite this code to get close to the theoretical peak rate (2.0 flop/cycle)?

0 Kudos
0 Replies
Reply