Community
cancel
Showing results for 
Search instead for 
Did you mean: 
mustache_marc
Beginner
185 Views

ssse3+ shuffle instruction

Hi all,
I'm trying to shuffle bits within an xmm register, but I'm having trouble finding the right document and working out from the Intel documentation how to use the instruction. Let me first explain the situation.
I have one 128-bit xmm register holding four 32-bit values: a0a1...a31b0b1...b31c0c1...c31d0d1...d31. I want to shuffle these bits in the following way: a0b0c0d0a1b1c1d1...a31b31c31d31.
After some suggestions I tried the pshufb instruction with the mask 0x0F0001020304050607080090A0B0C0D0E, but the result just looks like 0x60606060...6060. The information on how to build such a mask seems to be absent, or not well documented. Could someone give me a hint?
Regards,
-Marc
0 Kudos
2 Replies
Matthias_Kretz
New Contributor I
185 Views

Oh this useless editor!!! Just deleted all my text once again.. :(

pshufb shuffles bytes not bits. You'll have to do it yourself. Take a look at the bit twiddling hacks.
denbianh
Beginner
185 Views

1. pshufb reordering by byte order

pshufb does not meet your requirement, because it reorders bytes, not bits. However, it does work for the byte-level case A3A2A1A0, B3B2B1B0, C3C2C1C0, D3D2D1D0 => A3B3C3D3, A2B2C2D2, A1B1C1D1, A0B0C0D0 (A, B, C, D: 1 byte each) using the following shuffle mask, listed from low to high byte (

BYTE_REORDER_MASK:

db 0x00, 0x04, 0x08, 0x0c, 0x01, 0x05, 0x09, 0x0d, 0x02, 0x06, 0x0a, 0x0e, 0x03, 0x07, 0x0b, 0x0f

)

2. For your actual requirement: shuffling by bit order

Basic idea: repeat the two instructions below 32 times, collecting the 4 bits that land in ecx on each pass:

movmskps ecx, xmm0 ; ecx[3:0] = top bit (bit 31) of each of the four dwords in xmm0

pslld xmm0, 01h ; shift each dword left by one so its next-lower bit becomes the top bit

//////// For the detailed implementation of items 1 and 2, refer to the code section below ///////////


; reorder_simd.asm
BITS 32

;***********************************************************************
; Macros and other preprocessor constants
;***********************************************************************

;-----------------------------------------------------------------------
; GLOBAL_ASM name
;   Exports `name` as a global symbol. When PREFIX is defined, `name` is
;   first aliased to `_name`, so both the export below and every later
;   use of `name` in this file resolve to the underscore-prefixed symbol.
;-----------------------------------------------------------------------
%macro GLOBAL_ASM 1
  %ifdef PREFIX
    %define %1 _%1
  %endif
    global %1
%endmacro

;***********************************************************************
; Local Data (Read Only)
;***********************************************************************

SECTION .rodata align=16

;***********************************************************************
; pshufb control mask used by reorder_dqword_byte_ssse3.
; Entry i is the index of the source byte copied to result byte i
; (entries listed from low byte to high byte).
;***********************************************************************
ALIGN 16
BYTE_REORDER_MASK:
        db 0x00, 0x04, 0x08, 0x0c       ; result bytes  0..3  <- byte 0 of each dword
        db 0x01, 0x05, 0x09, 0x0d       ; result bytes  4..7  <- byte 1 of each dword
        db 0x02, 0x06, 0x0a, 0x0e       ; result bytes  8..11 <- byte 2 of each dword
        db 0x03, 0x07, 0x0b, 0x0f       ; result bytes 12..15 <- byte 3 of each dword

;***********************************************************************
; Code
;***********************************************************************

SECTION .text

GLOBAL_ASM reorder_dqword_byte_ssse3
;-------------------------------------------------------------------------
; void reorder_dqword_byte_ssse3( void *v );
;
; Transposes, in place, the 4x4 byte matrix formed by the four dwords at v.
;   A, B, C, D: 1 byte each, written from high byte to low byte:
;     A3 A2 A1 A0, B3 B2 B1 B0, C3 C2 C1 C0, D3 D2 D1 D0
;  => A3 B3 C3 D3, A2 B2 C2 D2, A1 B1 C1 D1, A0 B0 C0 D0
;
; In:    [esp+4] = v; read and written with movdqa, so v must be
;        16-byte aligned
; Out:   the 16 bytes at v hold the reordered value
; Clobb: eax, xmm0, xmm1
;-------------------------------------------------------------------------
reorder_dqword_byte_ssse3:
        movdqa  xmm1, [BYTE_REORDER_MASK]   ; xmm1 = shuffle control bytes
        mov     eax, [esp + 4]              ; eax  = v
        movdqa  xmm0, [eax]                 ; xmm0 = *v
        pshufb  xmm0, xmm1                  ; result byte i = source byte mask[i]
        movdqa  [eax], xmm0                 ; *v   = reordered bytes
        ret

GLOBAL_ASM reorder_dqword_bit_sse2
;=========================================================================
; void reorder_dqword_bit_sse2( void *v );
;
; Interleaves, in place, the bits of the four dwords stored at v.
;   a, b, c, d: 1 bit each, written from high bit to low bit:
;     a31 .. a2 a1 a0, b31 .. b2 b1 b0, c31 .. c2 c1 c0, d31 .. d2 d1 d0
;  => a31 b31 c31 d31, .., a2 b2 c2 d2, a1 b1 c1 d1, a0 b0 c0 d0
;
; In:    [esp+4] = v; read and written with movdqa, so v must be
;        16-byte aligned
; Out:   the 16 bytes at v hold the interleaved value
; Clobb: eax, ecx, xmm0, xmm1, flags (ebx is saved and restored)
; Stack: 16 bytes of scratch, accessed with movdqu (no alignment needed)
;=========================================================================

;-------------------------------------------------------------------------
; reorder_byte_sse2 dst, src, tmp
;   dst : 32-bit register that receives eight 4-bit groups
;   src : xmm register; every dword in it is left shifted by 7 on exit
;   tmp : 32-bit scratch register
;
; Each movmskps gathers the current top bit of the four dwords of src
; into a 4-bit group. The first group lands in dst bits 31..28, the
; following ones in 27..24, 23..20, ... and the eighth in bits 3..0.
;-------------------------------------------------------------------------
%macro reorder_byte_sse2 3 ; dst(ebx), src(xmm0), tmp(ecx)
        movmskps %3, %2                 ; tmp[3:0] = bits [127][95][63][31]
        mov     %1, %3
        shl     %1, 28                  ; first group -> dst bits 31..28
  %assign REORDER_SHIFT 24
  %rep 7
        pslld   %2, 1                   ; expose the next-lower bit of each dword
        movmskps %3, %2
    %if REORDER_SHIFT > 0
        shl     %3, REORDER_SHIFT       ; the last group stays in bits 3..0
    %endif
        or      %1, %3
    %assign REORDER_SHIFT REORDER_SHIFT - 4
  %endrep
  %undef REORDER_SHIFT
%endmacro ; end of reorder_byte_sse2

reorder_dqword_bit_sse2:
        push    ebx
%define STACK_SIZE 4
%define EXTRA_STACK_SIZE 16

        sub     esp, EXTRA_STACK_SIZE   ; tmp buffer 16 bytes

        ; v sits above the scratch buffer, the saved ebx and the return address
        mov     eax, [esp + STACK_SIZE + EXTRA_STACK_SIZE + 4]
        movdqa  xmm0, [eax]

        ; Four passes of eight bit positions each. The first pass handles
        ; source bits 31..24 and fills the highest result dword; the
        ; following passes fill the lower dwords in descending order.
%assign REORDER_OFFSET EXTRA_STACK_SIZE - 4
%rep 4
        reorder_byte_sse2 ebx, xmm0, ecx ; dst(ebx), src(xmm0), tmp(ecx)
        mov     [esp + REORDER_OFFSET], ebx
  %if REORDER_OFFSET > 0
        pslld   xmm0, 1                 ; macro shifted by 7; finish the 8-bit step
  %endif
  %assign REORDER_OFFSET REORDER_OFFSET - 4
%endrep
%undef REORDER_OFFSET

        movdqu  xmm1, [esp+0]           ; scratch buffer is not known to be aligned
        movdqa  [eax], xmm1

        add     esp, EXTRA_STACK_SIZE
%undef EXTRA_STACK_SIZE
%undef STACK_SIZE
        pop     ebx
        ret
Reply