- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi!
Why does this scalar SSE2 code (all data in L1 cache) execute at a rate of only 1.49 flop/cycle on Core 2?
# ----------------------------------------------------------------------
# Unrolled scalar double-precision multiply-accumulate loop
# (GAS / AT&T syntax, 32-bit x86, SSE2).
#
# Each iteration reads four consecutive doubles through %esi (offsets
# 0, 8, 16, 24) and, for each of them, one double from each of four
# streams addressed by %ebx, %ecx, %edx and %eax (offsets 0, 448, 896,
# 1344).  The products are summed into four independent accumulators.
# That is 16 mulsd + 16 addsd = 32 flops per iteration.
#
# Register roles inside the loop:
#   %esi              pointer to the shared multiplier, advanced by 32 bytes
#   %ebx/%ecx/%edx/%eax
#                     stream pointers, each advanced by 1792 bytes (4 * 448)
#   %edi              loop counter, +4 per iteration, loop ends at 56
#   %xmm5             current multiplier loaded through %esi
#   %xmm4             scratch: loaded stream element, then the product
#   %xmm3/%xmm2/%xmm1/%xmm0
#                     running sums for the %ebx/%ecx/%edx/%eax streams
#
# NOTE(review): the initial values of %edi, the pointers and the
# accumulators are set outside this fragment; presumably %edi starts at 0
# (14 iterations) and the accumulators start at zero -- confirm.
# ----------------------------------------------------------------------
L10:
        # --- step 0: multiplier at 0(%esi), stream offset 0 ---
        movsd   (%esi), %xmm5
        movsd   (%ebx), %xmm4
        addl    $4, %edi                # counter bumped early; tested by cmpl below
        mulsd   %xmm5, %xmm4
        addsd   %xmm4, %xmm3
        movsd   (%ecx), %xmm4
        mulsd   %xmm5, %xmm4
        addsd   %xmm4, %xmm2
        movsd   (%edx), %xmm4
        mulsd   %xmm5, %xmm4
        addsd   %xmm4, %xmm1
        movsd   (%eax), %xmm4
        mulsd   %xmm5, %xmm4
        movsd   8(%esi), %xmm5          # next multiplier loaded before the last add
        addsd   %xmm4, %xmm0

        # --- step 1: multiplier at 8(%esi), stream offset 448 ---
        movsd   448(%ebx), %xmm4
        mulsd   %xmm5, %xmm4
        addsd   %xmm4, %xmm3
        movsd   448(%ecx), %xmm4
        mulsd   %xmm5, %xmm4
        addsd   %xmm4, %xmm2
        movsd   448(%edx), %xmm4
        mulsd   %xmm5, %xmm4
        addsd   %xmm4, %xmm1
        movsd   448(%eax), %xmm4
        mulsd   %xmm5, %xmm4
        movsd   16(%esi), %xmm5
        addsd   %xmm4, %xmm0

        # --- step 2: multiplier at 16(%esi), stream offset 896 ---
        movsd   896(%ebx), %xmm4
        mulsd   %xmm5, %xmm4
        addsd   %xmm4, %xmm3
        movsd   896(%ecx), %xmm4
        mulsd   %xmm5, %xmm4
        addsd   %xmm4, %xmm2
        movsd   896(%edx), %xmm4
        mulsd   %xmm5, %xmm4
        addsd   %xmm4, %xmm1
        movsd   896(%eax), %xmm4
        mulsd   %xmm5, %xmm4
        movsd   24(%esi), %xmm5
        addsd   %xmm4, %xmm0

        # --- step 3: multiplier at 24(%esi), stream offset 1344 ---
        # Each pointer is advanced right after its last use in this
        # iteration, so the updates are interleaved with the arithmetic.
        addl    $32, %esi               # 4 doubles consumed through %esi
        movsd   1344(%ebx), %xmm4
        addl    $1792, %ebx
        mulsd   %xmm5, %xmm4
        addsd   %xmm4, %xmm3
        movsd   1344(%ecx), %xmm4
        addl    $1792, %ecx
        mulsd   %xmm5, %xmm4
        addsd   %xmm4, %xmm2
        movsd   1344(%edx), %xmm4
        addl    $1792, %edx
        mulsd   %xmm5, %xmm4
        addsd   %xmm4, %xmm1
        movsd   1344(%eax), %xmm4
        addl    $1792, %eax
        mulsd   %xmm5, %xmm4
        cmpl    $56, %edi               # flags survive the addsd below (SSE op, no EFLAGS write)
        addsd   %xmm4, %xmm0
        jne     L10
How can this code be rewritten to achieve a rate close to the theoretical peak (2.0 flop/cycle)?
Link Copied
- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page