Intel® ISA Extensions
Use hardware-based isolation and memory encryption to provide more code protection in your solutions.

ssse3+ shuffle instruction

mustache_marc
Beginner
1,079 Views
Hi all,
I'm trying to shuffle bits within an xmm register, and I'm having trouble finding the right document and working out from the Intel documentation how to use the instruction. Let me first explain the situation.
I have one 128-bit xmm register holding four 32-bit values: a0a1...a31 b0b1...b31 c0c1...c31 d0d1...d31. I want to interleave these bits in the following way: a0b0c0d0 a1b1c1d1 ... a31b31c31d31.
After some suggestions I tried the pshufb instruction with the mask 0x0F0001020304050607080090A0B0C0D0E, but the result is only something that looks like 0x60606060...6060. Information on how to build such a mask seems to be absent, or not well documented. Could someone give me a hint?
Regards,
-Marc
0 Kudos
2 Replies
Matthias_Kretz
New Contributor I
1,079 Views
Oh this useless editor!!! Just deleted all my text once again.. :(

pshufb shuffles bytes not bits. You'll have to do it yourself. Take a look at the bit twiddling hacks.
0 Kudos
denbianh
Beginner
1,079 Views

1. pshufb reordering by byte order

pshufb does not meet your requirement, because it reorders whole bytes rather than bits. It does, however, work for the byte-level case A3A2A1A0, B3B2B1B0, C3C2C1C0, D3D2D1D0 => A3B3C3D3, A2B2C2D2, A1B1C1D1, A0B0C0D0 (A, B, C, D: 1 byte each), using the following shuffle mask (bytes listed from low to high):

BYTE_REORDER_MASK:

db 0x00, 0x04, 0x08, 0x0c, 0x01, 0x05, 0x09, 0x0d, 0x02, 0x06, 0x0a, 0x0e, 0x03, 0x07, 0x0b, 0x0f

2. As your requirement, shuffling by bit order

Basic idea: repeat the following two instructions 32 times, collecting the 4-bit result in ecx after each movmskps:

movmskps ecx, xmm0

pslld xmm0, 01h

//////// For the complete implementation of items 1 and 2, see the code section below ///////////


; reorder_simd.asm
BITS 32

;***********************************************************************
; Macros and other preprocessor constants
;***********************************************************************

; GLOBAL_ASM <symbol>
;   Exports <symbol> with the `global` directive.
;   If PREFIX is defined, the exported name is _<symbol>, and <symbol> is
;   redefined to expand to _<symbol>, so that later uses of the plain name
;   (including the label that defines it) carry the leading underscore.
%macro GLOBAL_ASM 1
  %ifndef PREFIX
    global %1
  %else
    global _%1
    %define %1 _%1
  %endif
%endmacro

;***********************************************************************
; Local Data (Read Only)
;***********************************************************************

SECTION .rodata align=16

;***********************************************************************
; pshufb control mask used by reorder_dqword_byte_ssse3.
; Entry i selects the source byte that is copied to result byte i, so the
; four rows below are the four result dwords, lowest dword first.
; Kept 16-byte aligned because it is loaded with movdqa.
;***********************************************************************
ALIGN 16
BYTE_REORDER_MASK:
        db 0x00, 0x04, 0x08, 0x0c       ; result bytes  0..3  <- byte 0 of each source dword
        db 0x01, 0x05, 0x09, 0x0d       ; result bytes  4..7  <- byte 1 of each source dword
        db 0x02, 0x06, 0x0a, 0x0e       ; result bytes  8..11 <- byte 2 of each source dword
        db 0x03, 0x07, 0x0b, 0x0f       ; result bytes 12..15 <- byte 3 of each source dword

;***********************************************************************
; Code
;***********************************************************************

SECTION .text

GLOBAL_ASM reorder_dqword_byte_ssse3
;-------------------------------------------------------------------------
; void reorder_dqword_byte_ssse3( void *v );
;
; Transposes, in place, the 4x4 byte matrix held in the 16 bytes at v.
; With A..D naming the four dwords (A highest) and 3..0 their bytes
; (3 highest), written from high byte to low byte:
;
;     A3 A2 A1 A0, B3 B2 B1 B0, C3 C2 C1 C0, D3 D2 D1 D0
;  => A3 B3 C3 D3, A2 B2 C2 D2, A1 B1 C1 D1, A0 B0 C0 D0
;
; In:      [esp+4] = v; accessed with movdqa, so v must be 16-byte aligned
; Out:     the 16 bytes at v are overwritten with the shuffled value
; Clobbers: eax, xmm0, xmm1
; Requires: SSSE3 (pshufb)
;-------------------------------------------------------------------------
reorder_dqword_byte_ssse3:
        movdqa  xmm1, [BYTE_REORDER_MASK]       ; xmm1 = byte selectors
        mov     eax, [esp+4]                    ; eax  = v
        movdqa  xmm0, [eax]                     ; xmm0 = *v
        pshufb  xmm0, xmm1                      ; result byte i = source byte mask[i]
        movdqa  [eax], xmm0                     ; *v   = shuffled value
        ret

GLOBAL_ASM reorder_dqword_bit_sse2
;=========================================================================
; void reorder_dqword_bit_sse2( void *v );
; a, b, c, d: 1 bit
; High <--> Low bits
; a31 .. a2 a1 a0, b31 .. b2 b1 b0, c31 .. c2 c1 c0, d31 .. d2 d1 d0
;=>a31 b31 c31 d31, .., a2 b2 c2 d2, a1 b1 c1 d1, a0 b0 c0 d0
;=========================================================================
;-------------------------------------------------------------------------
; reorder_byte_sse2 dst, src, tmp
;   %1 dst : 32-bit general register that receives the packed result
;   %2 src : xmm register holding four dwords; modified (see below)
;   %3 tmp : 32-bit general register used as scratch
;
; Gathers the top 8 bits of each of the four dwords of src into dst.
; movmskps yields a 4-bit value made of the sign bits of the four dwords
; (highest dword in bit 3); src is then shifted left by one so the next
; lower bit becomes the sign bit.  The eight 4-bit values are packed from
; the most significant nibble of dst downwards:
;
;     dst[31:28] = bit 31 of the dwords, dst[27:24] = bit 30, ...,
;     dst[3:0]   = bit 24 of the dwords
;
; On exit every dword of src has been shifted left by 7 in total, and
; tmp and the flags are clobbered.
;-------------------------------------------------------------------------
%macro reorder_byte_sse2 3 ; dst(ebx), src(xmm0), tmp(ecx)
        movmskps %3, %2                 ; current sign bits -> tmp[3:0]
        mov     %1, %3
        shl     %1, 28                  ; first nibble goes to dst[31:28]

%assign REORDER_BYTE_SHIFT 24
%rep 6                                  ; nibbles at bit offsets 24, 20, 16, 12, 8, 4
        pslld   %2, 1                   ; expose the next lower bit as the sign bit
        movmskps %3, %2
        shl     %3, REORDER_BYTE_SHIFT
        or      %1, %3
%assign REORDER_BYTE_SHIFT REORDER_BYTE_SHIFT-4
%endrep
%undef REORDER_BYTE_SHIFT

        pslld   %2, 1
        movmskps %3, %2                 ; last nibble needs no shift: dst[3:0]
        or      %1, %3
%endmacro ; end of reorder_byte_sse2
;-------------------------------------------------------------------------
; reorder_dqword_bit_sse2 -- implementation
;
; In:      first stack argument = v; loaded and stored with movdqa, so v
;          must be 16-byte aligned
; Out:     the 16 bytes at v are replaced by the bit-interleaved value
; Clobbers: eax, ecx, xmm0, xmm1, flags (ebx is saved and restored)
; Stack:   4 bytes for the saved ebx plus a 16-byte scratch buffer; the
;          buffer is read back with movdqu, so no esp alignment is assumed
;-------------------------------------------------------------------------
reorder_dqword_bit_sse2:
        push    ebx                             ; ebx is the accumulator; preserve the caller's value
%define STACK_SIZE       4                      ; bytes occupied by the saved ebx
%define EXTRA_STACK_SIZE 16                     ; bytes of scratch for the assembled result

        sub     esp, EXTRA_STACK_SIZE           ; tmp buffer 16 bytes

        ; Scratch buffer, saved ebx and the return address (4 bytes) now lie
        ; between esp and the argument.
        mov     eax, [esp + STACK_SIZE + EXTRA_STACK_SIZE + 4]
        movdqa  xmm0, [eax]                     ; xmm0 = *v

        ; Each macro expansion packs the current top 8 bits of all four dwords
        ; into ebx and leaves xmm0 shifted left by 7; the extra pslld between
        ; expansions moves the next source byte into position.
        reorder_byte_sse2 ebx, xmm0, ecx        ; dst(ebx), src(xmm0), tmp(ecx)
        mov     [esp + 12], ebx                 ; source bits 31..24 -> highest result dword

        pslld   xmm0, 01h
        reorder_byte_sse2 ebx, xmm0, ecx
        mov     [esp + 8], ebx                  ; source bits 23..16

        pslld   xmm0, 01h
        reorder_byte_sse2 ebx, xmm0, ecx
        mov     [esp + 4], ebx                  ; source bits 15..8

        pslld   xmm0, 01h
        reorder_byte_sse2 ebx, xmm0, ecx
        mov     [esp + 0], ebx                  ; source bits 7..0 -> lowest result dword

        movdqu  xmm1, [esp + 0]                 ; unaligned load of the scratch buffer
        movdqa  [eax], xmm1                     ; *v = result

        add     esp, EXTRA_STACK_SIZE
%undef EXTRA_STACK_SIZE
%undef STACK_SIZE
        pop     ebx
        ret
0 Kudos
Reply