[cpp]static const unsigned long long C64 = 0x1234567844441111ULL;
__asm__ volatile ("movq %0, %%mm0" :: "m"(C64));   /* gcc extended-asm syntax */
__asm movq mm0, C64;                                /* MSVC/icc inline-asm syntax */
[/cpp]
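For context, the listings below appear to be what compilers generate for a small function built around one of these statements. A minimal self-contained sketch of such a function (the name set_mm0 simply mirrors the _set_mm0 label in the output, and the #if guards are illustrative; assumes an x86/x86-64 target with MMX):

[cpp]static const unsigned long long C64 = 0x1234567844441111ULL;

/* Illustrative wrapper: load the 64-bit constant into mm0. */
static void set_mm0(void)
{
#if defined(__GNUC__)
    /* gcc extended asm: memory operand, compiler picks the addressing mode */
    __asm__ volatile ("movq %0, %%mm0" :: "m"(C64));
#elif defined(_MSC_VER)
    /* MSVC-style inline asm (32-bit compilers only) */
    __asm movq mm0, C64;
#endif
}
[/cpp]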
[cpp]PUBLIC _set_mm0
    movq mm0, QWORD PTR [_C64]
    ret
[/cpp]
[bash]PUBLIC _set_mm0
    sub  esp, 8
    mov  eax, 1145311505           ; 0x44441111, low dword
    mov  edx, 305419896            ; 0x12345678, high dword
    mov  DWORD PTR [esp], eax
    mov  DWORD PTR [4+esp], edx
    movq mm0, QWORD PTR [esp]
    add  esp, 8
    ret
[/bash]
[bash]movq C64(%rip), %mm0
[/bash]
which, if such asm statements are used heavily in your code, may make your objects shorter, because the RIP-relative offsets to your constants in memory are probably smaller than full 64-bit addresses. However, this need not be the fastest option.
[bash]movq    $0x1234567844441111, %rax      #4.0
movq    %rax, -8(%rsp)                  #4.0
movq    -8(%rsp), %mm0                  #4.0
[/bash]
which seems to indicate that the optimizer thinks it is more efficient to turn your constants into immediates, which are first loaded into local stack memory and then into the MMX register. Since this stack memory is likely to be in the cache, these three instructions may run a bit faster than loading from the .DATA segment, particularly if the constant there is not in the cache.
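If you want to reproduce that behaviour explicitly rather than rely on the optimizer, one hedged sketch is to force the constant through a local stack temporary (the function name set_mm0_via_stack is illustrative, and the assumption here is that a volatile local is enough to keep gcc from promoting the value to static storage):

[cpp]/* Sketch: materialize the constant as an immediate, spill it to a stack slot,
   then load it into mm0 -- roughly the three-instruction sequence above. */
static void set_mm0_via_stack(void)
{
    volatile unsigned long long tmp = 0x1234567844441111ULL;
    __asm__ volatile ("movq %0, %%mm0" :: "m"(tmp));
}
[/cpp]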
[bash]movq    $0x123456789abcdef0, %rax
movq    %rax, %mm0
[/bash]
or through memory as above. I don't know how to coax this direct form out of the compiler without coding the two corresponding asm statements myself.
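For what it's worth, a hedged sketch of what the gcc-syntax half of such a hand-written pair might look like on x86-64: an "r" input operand makes the compiler materialize the immediate in a general-purpose register, and the asm then moves it straight into mm0, which should typically compile to the two instructions shown above (the name set_mm0_direct is illustrative, and no claim is made that any compiler emits this on its own):

[cpp]/* Sketch: immediate -> general register -> mm0, no memory traffic (x86-64). */
static void set_mm0_direct(void)
{
    __asm__ volatile ("movq %0, %%mm0" :: "r"(0x123456789abcdef0ULL));
}
[/cpp]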