Solved: Histogram examples using AVX-512 CD in Dec 2017 Optimization Ref Manual are wrong?

Nelson__Trent · ‎03-01-2018

The December 2017 Optimization Reference Manual has two sections describing how to use the new AVX-512 conflict detection instructions for histogram calculation.

There are numerous issues with section 15.16.1 and the example code in 15-18. In particular, the code in the conflict_loop segment just doesn't work as described.

    vmovaps zmm4, all_1 // {1, 1, …, 1}
    vmovaps zmm5, all_negative_1
    vmovaps zmm6, all_31
    vmovaps zmm7, all_bins_minus_1
    mov ebx, num_inputs
    mov r10, pInput
    mov r15, pHistogram
histogram_loop:
    vpandd zmm3, [r10+rcx*4], zmm7
    vpconflictd zmm0, zmm3
    kxnorw k1, k1, k1
    vmovaps zmm2, zmm4
    vpxord zmm1, zmm1, zmm1
    vpgatherdd zmm1{k1}, [r15+zmm3*4]
    vptestmd k1, zmm0, zmm0
    kortestw k1, k1
    je update
    vplzcntd zmm0, zmm0
    vpsubd zmm0, zmm6, zmm0
conflict_loop:
    vpermd zmm8{k1}{z}, zmm2, zmm0
    vpermd zmm0{k1}, zmm0, zmm0
    vpaddd zmm2{k1}, zmm2, zmm8
    vpcmpd k1, 4, zmm5, zmm0
    kortestw k1, k1
    jne conflict_loop
update:
    vpaddd zmm0, zmm2, zmm1
    kxnorw k1, k1, k1
    addq rcx, 16
    vpscatterdd [r15+zmm3*4]{k1}, zmm0
    cmpl ebx, ecx
    jb histogram_loop

Section 17.2.3 has another example of using AVX-512 CD intrinsics. This one is also incorrect, but less so. The two `vpsubd zmm1, zmm1, zmm5` are a mistake -- only one `vpsubd zmm1, zmm1, zmm5` should be necessary. Additionally, no indication is given as to the value in the following addresses: [rip+0x185c], [rip+0x1884], [rip+0x18ba]. The loop termination logic in Resolve_conflicts seems redundant, too.

Top:
    vmovups zmm4, [rsp+rdx*4+0x40]
    vpxord zmm1, zmm1, zmm1
    kmovw k2, k1
    vpconflictd zmm2, zmm4
    vpgatherdd zmm1{k2}, [rax+zmm4*4]
    vptestmd k0, zmm2, [rip+0x185c]
    kmovw ecx, k0
    vpaddd zmm3, zmm1, zmm0
    test ecx, ecx
    jz <No_conflicts>
    vmovups zmm1, [rip+0x1884]
    vptestmd k0, zmm2, [rip+0x18ba]
    vplzcntd zmm5, zmm2
    xor bl, bl
    kmovw ecx, k0
    vpsubd zmm1, zmm1, zmm5
    vpsubd zmm1, zmm1, zmm5

Resolve_conflicts:
    vpbroadcastd zmm5, ecx
    kmovw k2, ecx
    vpermd zmm3{k2}, zmm1, zmm3
    vpaddd zmm3{k2}, zmm3, zmm0
    vptestmd k0{k2}, zmm5, zmm2
    kmovw esi, k0
    and ecx, esi
    jz <No_conflicts>
    add bl, 0x1
    cmp bl, 0x10
    jb <Resolve_conflicts>

No_conflicts:
    kmovw k2, k1
    vpscatterdd [rax+zmm4*4]{k2}, zmm3
    add edx, 0x10
    cmp edx, 0x400
    jb <Top>

I've managed to massage the second example into something that appears to work, as follows:

include ksamd64.inc

OP_EQ   equ 0
OP_NEQ  equ 4

;
; Define constant variables.
;

ZMM_ALIGN equ 64
YMM_ALIGN equ 32
XMM_ALIGN equ 16

_DATA$00 SEGMENT PAGE 'DATA'

        align   ZMM_ALIGN
        public  AllOnes
AllOnes         dd  16  dup (1)

        align   ZMM_ALIGN
        public  AllNegativeOnes
AllNegativeOnes dd  16  dup (-1)

        align   ZMM_ALIGN
        public  AllBinsMinusOne
AllBinsMinusOne dd  16  dup (254)

        align   ZMM_ALIGN
        public  AllThirtyOne
AllThirtyOne    dd  16  dup (31)

        align   ZMM_ALIGN
Input1544    dd   5,  3, 3,  1,  8,  2, 50, 1,  0,  7,  6,  4,  9, 3, 10,  3
Permute1544  dd  -1, -1, 1, -1, -1, -1, -1, 3, -1, -1, -1, -1, -1, 2, -1, 13
Conflict1544 dd   0,  0, 2,  0,  0,  0,  0, 8,  0,  0,  0,  0,  0, 6,  0, 8198
Counts1544   dd   1,  1, 2,  1,  1,  1,  1, 2,  1,  1,  1,  1,  1, 3,  1,  4

        public  Input1544
        public  Permute1544
        public  Conflict1544
        public  Counts1544

Input1544v2  dd   5,  3, 3,  1,  8,  2, 50, 1,  0,  7,  6,  4,  9, 3, 10,  3
             dd   5,  3, 3,  1,  8,  2, 50, 1,  0,  7,  6,  4,  9, 3, 10,  3

_DATA$00 ends

        NESTED_ENTRY Histo1710, _TEXT$00

;
; Begin prologue.  Allocate stack space and save non-volatile registers.
;

        alloc_stack LOCALS_SIZE

        save_reg    rbp, Locals.SavedRbp        ; Save non-volatile rbp.
        save_reg    rbx, Locals.SavedRbx        ; Save non-volatile rbx.
        save_reg    rdi, Locals.SavedRdi        ; Save non-volatile rdi.
        save_reg    rsi, Locals.SavedRsi        ; Save non-volatile rsi.
        save_reg    r12, Locals.SavedR12        ; Save non-volatile r12.
        save_reg    r13, Locals.SavedR13        ; Save non-volatile r13.
        save_reg    r14, Locals.SavedR14        ; Save non-volatile r14.
        save_reg    r15, Locals.SavedR15        ; Save non-volatile r15.

        save_xmm128 xmm6, Locals.SavedXmm6      ; Save non-volatile xmm6.
        save_xmm128 xmm7, Locals.SavedXmm7      ; Save non-volatile xmm7.
        save_xmm128 xmm8, Locals.SavedXmm8      ; Save non-volatile xmm8.
        save_xmm128 xmm9, Locals.SavedXmm9      ; Save non-volatile xmm9.
        save_xmm128 xmm10, Locals.SavedXmm10    ; Save non-volatile xmm10.
        save_xmm128 xmm11, Locals.SavedXmm11    ; Save non-volatile xmm11.
        save_xmm128 xmm12, Locals.SavedXmm12    ; Save non-volatile xmm12.
        save_xmm128 xmm13, Locals.SavedXmm13    ; Save non-volatile xmm13.
        save_xmm128 xmm14, Locals.SavedXmm14    ; Save non-volatile xmm14.
        save_xmm128 xmm15, Locals.SavedXmm15    ; Save non-volatile xmm15.

        END_PROLOGUE

        mov     Locals.HomeRcx[rsp], rcx                ; Home rcx.
        mov     Locals.HomeRdx[rsp], rdx                ; Home rdx.
        mov     Locals.HomeR8[rsp], r8                  ; Home r8.
        mov     Locals.HomeR9[rsp], r9                  ; Home r9.

        vmovntdqa       zmm28, zmmword ptr [AllOnes]
        vmovntdqa       zmm29, zmmword ptr [AllNegativeOnes]
        vmovntdqa       zmm30, zmmword ptr [AllBinsMinusOne]
        vmovntdqa       zmm31, zmmword ptr [AllThirtyOne]

        mov     rax, rdx
        xor     rdx, rdx

        lea     r10, Input1544v2

Top:
        ;vmovups zmm4, [rsp+rdx*4+0x40]
        vmovntdqa   zmm4, zmmword ptr [r10]
        add     r10, 40h

        ;vmovups zmm4, [rsp+rdx*4+0x40]
        vpxord zmm1, zmm1, zmm1

;
; kmovw k2, k1
;
;   What's k1?!  Assume it's all 1s for now given that it's fed into vpgatherdd.
;

        kxnorw k1, k1, k1

        kmovw k2, k1

        vpconflictd zmm2, zmm4

        vpgatherdd zmm1{k2}, [rax+zmm4*4]

;
; vptestmd k0, zmm2, [rip+0x185c]
;
;   What's [rip+0x185c]?  Guess -1 as it's being used to compare the vpconflictd
;   result, then determining if there are conflicts.
;
        ;vptestmd k0, zmm2, [rip+0x185c]
        vptestmd k0, zmm2, zmm29 ; Test against AllNegativeOnes

        kmovw ecx, k0

;
; vpaddd zmm3, zmm1, zmm0
;
;   What's zmm0?  Assume all 1s, so use zmm28.
;

        vmovaps zmm0, zmm28

        ;vpaddd zmm3, zmm1, zmm0
        vpaddd zmm3, zmm1, zmm0

        test ecx, ecx

        jz No_conflicts

;
; vmovups zmm1, [rip+0x1884]
;
;   What's [rip+0x1884]?
;
;   Try:
;       - AllThirtyOne (31).
;       ;- AllOnes (zmm28)
;

        ;vmovups zmm1, [rip+0x1884]
        vmovaps zmm1, zmm31

;
; vptestmd k0, zmm2, [rip+0x18ba]
;
;   What's [rip+0x18ba]?
;
;   Try:
;
;       - AllNegativeOnes
;

        ;vptestmd k0, zmm2, [rip+0x18ba]
        vptestmd k0, zmm2, zmm29

        vplzcntd zmm5, zmm2

        xor bl, bl

        kmovw ecx, k0

;
; XXX: why two vpsubds here?
;

        vpsubd zmm1, zmm1, zmm5
        ;vpsubd zmm1, zmm1, zmm5

Resolve_conflicts:
        vpbroadcastd zmm5, ecx
        kmovw k2, ecx
        ; The vpermd doesn't appear to have any effect.
        ;vpermd zmm3{k2}, zmm1, zmm3
        vpaddd zmm3{k2}, zmm3, zmm0
        vptestmd k0{k2}, zmm5, zmm2
        kmovw esi, k0
        and ecx, esi
        jz No_conflicts
        add bl, 1h
        cmp bl, 10h
        jb Resolve_conflicts

No_conflicts:
        kmovw k2, k1
        vpscatterdd [rax+zmm4*4]{k2}, zmm3
        add edx, 10h
        cmp edx, 20h
        jb Top


;
; Indicate success.
;

        mov rax, 1

;
; Restore non-volatile registers.
;

Th199:
        mov             rbp,   Locals.SavedRbp[rsp]
        mov             rbx,   Locals.SavedRbx[rsp]
        mov             rdi,   Locals.SavedRdi[rsp]
        mov             rsi,   Locals.SavedRsi[rsp]
        mov             r12,   Locals.SavedR12[rsp]
        mov             r13,   Locals.SavedR13[rsp]
        mov             r14,   Locals.SavedR14[rsp]
        mov             r15,   Locals.SavedR15[rsp]

        movdqa          xmm6,  Locals.SavedXmm6[rsp]
        movdqa          xmm7,  Locals.SavedXmm7[rsp]
        movdqa          xmm8,  Locals.SavedXmm8[rsp]
        movdqa          xmm9,  Locals.SavedXmm9[rsp]
        movdqa          xmm10, Locals.SavedXmm10[rsp]
        movdqa          xmm11, Locals.SavedXmm11[rsp]
        movdqa          xmm12, Locals.SavedXmm12[rsp]
        movdqa          xmm13, Locals.SavedXmm13[rsp]
        movdqa          xmm14, Locals.SavedXmm14[rsp]
        movdqa          xmm15, Locals.SavedXmm15[rsp]

;
; Begin epilogue.  Deallocate stack space and return.
;

        add     rsp, LOCALS_SIZE
        ret


        NESTED_END Histo1710, _TEXT$00

I'm confused as to the purpose of the vpermd instructions in both examples. Even in the latter example, which actually works, I can remove vpermd and it has no effect on the histogram calculation. I believe the mask update logic takes care of the "conflict permutation" referred to in section 15.

Can someone review both sections and provide some insight?

Christophe_H_Intel1 · ‎03-02-2018

Thank you for pointing out the issues here.

Example 15-18 does indeed have a few typos, which we’ll fix.

We should initialize rcx to 0 at the top of the code, e.g. “xor rcx, rcx”
The vpandd instruction has the memory operand in an illegal place. Rather than “vpandd zmm3, [r10+rcx*4], zmm7” the instruction should be “vpandd zmm3, zmm7, [r10+rcx*4]”
The vector comparison inside conflict_loop similarly has the operands in the wrong order. The immediate “4” is meant to indicate a not-equals test, and should be at the end. Alternatively, we could just make this instruction “vpcmpned k1, zmm5, zmm0” to keep it simple.
Continuing the theme of operand order mistakes, the comparison at the end of the code example is backwards. It should be “cmpl ecx, ebx” rather than the opposite.

I’m not sure if any/all of these contributed to your confusion about the algorithm in the example.

In particular, the permutes in conflict_loop are key to combining information from vector elements with the same index. In the example, zmm2 holds the values that we will be adding to memory; the first permute “lines up” partial results from two vector elements with the same index so that the vpadd will create a new partial/full result. The second permute is key. It updates the permute indices for the first permute, so that we will combine the next set(s) of partial results in the next iteration, rather than repeating the same task over and over.

Perhaps an example will help illustrate what’s going on. Let’s imagine we only have four elements in a vector rather than 16 (i.e., 128 bits instead of 512 bits), to keep things manageable. For this example, the input values for conflict_loop are zmm3 = (5, 5, 5, 5). The rightmost element in the set is element zero of the vector (i.e., the one closest to the LSB).

Starting values:
- zmm0 = (2, 1, 0, -1)
- zmm2 = (1, 1, 1, 1)
- k1 = (1, 1, 1, 0)
After the first iteration:
- zmm8 = (1, 1, 1, 0)
- zmm0 = (1, 0, -1, -1)
- zmm2 = (2, 2, 2, 1)
- k1 = (1, 1, 0, 0)
After the second iteration:
- zmm8 = (2, 1, 0, 0)
- zmm0 = (-1, -1, -1, -1)
- zmm2 = (4, 3, 2, 1)
- k1 = (0, 0, 0, 0)

Without the permutes, we would not get this progression. In particular, notice that in the first iteration, we’re effectively combining the first two values from zmm2 into the second element, and the last two values into the last element (the outputs for the first and third elements aren’t very interesting). Then, in the second iteration, we combine the two “interesting” results – the second value of zmm2 is added into the last value (via zmm8). This way, we sum up all four values that start off in zmm2, when we first enter the conflict_loop, in only two iterations.

I’ll also note that histogram is a pretty simple computation, where we always want to add “1” to each element referenced. But the code here will also work for more complex computations where we’ve pre-computed arbitrary values to add to each index. (For that, we’d need to load these values into zmm2 at the top of the histogram_loop instead of copying over zmm4.)

Regarding 17.2.3, as you point out, the example there also appears to have some typos (and missing details). More importantly, this is an example of an older technique for leveraging vconflict. The code in 15-18 is a more recently developed technique, and should be faster on Knights Landing as well as Skylake. So I recommend sticking to 15-18.

View solution in original post

Nelson__Trent · ‎03-01-2018

For what it's worth, here's my final version. I was able to omit the vplzcnt, vpsubd and vpermd elements from the algorithm without any side-effect to the histogram accuracy.

        title "Histogram AVX Routines"
        option nokeyword:<Length>

;++
;
; Copyright (c) 2018 Trent Nelson <trent@trent.me>
;
; Module Name:
;
;   Histogram.asm
;
; Abstract:
;
;   This module implements various routines for constructing histograms.
;
;--

include ksamd64.inc

;
; Define constants.
;

OP_EQ   equ 0
OP_NEQ  equ 4

;
; Define constant variables.
;

ZMM_ALIGN equ 64
YMM_ALIGN equ 32
XMM_ALIGN equ 16

_DATA$00 SEGMENT PAGE 'DATA'

        align   ZMM_ALIGN
QuickLazy       db      "The quick brown fox jumps over the lazy dog.  "

        align   ZMM_ALIGN
        public  Test1
Test1           db      "ABACAEEFGIHIJJJKLMNDOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!!"

        align   ZMM_ALIGN
        public  Shuffle1
Shuffle1        db  64  dup (02h, 00h, 00h, 00h, 60 dup (00h))

        align   ZMM_ALIGN
        public  AllOnes
AllOnes         dd  16  dup (1)

        align   ZMM_ALIGN
        public  AllNegativeOnes
AllNegativeOnes dd  16  dup (-1)

        align   ZMM_ALIGN
        public  AllBinsMinusOne
AllBinsMinusOne dd  16  dup (254)

        align   ZMM_ALIGN
        public  AllThirtyOne
AllThirtyOne    dd  16  dup (31)

_DATA$00 ends

;
; Define the LONG_STRING structure used to encapsulate string information.
;

LONG_STRING struct
    Length  dd  ?
    Hash    dd  ?
    Buffer  dq  ?
LONG_STRING ends
PLONG_STRING typedef ptr LONG_STRING

;
; Define the CHARACTER_HISTOGRAM structure used to encapsulate the histogram.
;

CHARACTER_HISTOGRAM struct
    Count   dd 256 dup (?)
CHARACTER_HISTOGRAM ends
PCHARACTER_HISTOGRAM typedef ptr CHARACTER_HISTOGRAM

CHARACTER_HISTOGRAM_V4 struct
    Histogram1 CHARACTER_HISTOGRAM { }
    Histogram2 CHARACTER_HISTOGRAM { }
    Histogram3 CHARACTER_HISTOGRAM { }
    Histogram4 CHARACTER_HISTOGRAM { }
CHARACTER_HISTOGRAM_V4 ends
PCHARACTER_HISTOGRAM_V4 typedef ptr CHARACTER_HISTOGRAM_V4

;
; Define a Locals structure used for referencing our register homing space from
; rsp.
;

Locals struct
    Temp dq ?

    ;
    ; Define non-volatile register storage.
    ;

    union
        FirstNvRegister     dq      ?
        SavedRbp            dq      ?
    ends

    SavedRbx                dq      ?
    SavedRdi                dq      ?
    SavedRsi                dq      ?
    SavedR12                dq      ?
    SavedR13                dq      ?
    SavedR14                dq      ?

    SavedXmm6               XMMWORD { }
    SavedXmm7               XMMWORD { }
    SavedXmm8               XMMWORD { }
    SavedXmm9               XMMWORD { }
    SavedXmm10              XMMWORD { }
    SavedXmm11              XMMWORD { }
    SavedXmm12              XMMWORD { }
    SavedXmm13              XMMWORD { }
    SavedXmm14              XMMWORD { }
    SavedXmm15              XMMWORD { }

    ;
    ; Stash R15 after the return address to ensure the XMM register space
    ; is aligned on a 16 byte boundary, as we use movdqa (i.e. aligned move)
    ; which will fault if we're only 8 byte aligned.
    ;

    SavedR15                dq      ?

    ReturnAddress           dq  ?
    HomeRcx                 dq  ?
    HomeRdx                 dq  ?
    HomeR8                  dq  ?
    HomeR9                  dq  ?
Locals ends

;
; Exclude the return address onward from the frame calculation size.
;

LOCALS_SIZE  equ ((sizeof Locals) + (Locals.ReturnAddress - (sizeof Locals)))

;
; Define helper typedefs that are nicer to work with in assembly than their long
; uppercase name counterparts.
;

String typedef LONG_STRING
Histogram typedef CHARACTER_HISTOGRAM_V4

;++
;
; BOOLEAN
; CreateHistogramAvx2AlignedAsm(
;     _In_ PCLONG_STRING String,
;     _Inout_updates_bytes_(sizeof(*Histogram))
;         PCHARACTER_HISTOGRAM_V4 Histogram
;     );
;
; Routine Description:
;
;   This routine creates a histogram for a given string.
;
; Arguments:
;
;   String (rcx) - Supplies a pointer to a LONG_STRING structure that contains
;       the string for which a histogram is to be created.
;
;   Histogram (rdx) - Supplies an address that receives the histogram for the
;       given input string.
;
; Return Value:
;
;   TRUE on success, FALSE on failure.
;
;--

        LEAF_ENTRY CreateHistogramAvx2AlignedAsm, _TEXT$00

;
; Clear return value (Success = FALSE).
;

        xor     rax, rax                                ; Clear rax.

;
; Validate parameters.
;

        test    rcx, rcx                                ; Is rcx NULL?
        jz      Cha99                                   ; Yes, abort.
        test    rdx, rdx                                ; Is rdx NULL?
        jz      Cha99                                   ; Yes, abort.

;
; Verify the string is at least 64 bytes long.
;
        mov     r9, 64                                  ; Initialize r9 to 64.
        cmp     String.Length[rcx], r9d                 ; Compare Length to 64.
        jl      Cha99                                   ; String is too short.

;
; Ensure the incoming string and histogram buffers are aligned to 32-byte
; boundaries.
;

        mov     r9, 31                                  ; Initialize r9 to 31.
        test    String.Buffer[rcx], r9                  ; Is string aligned?
        jnz     Cha99                                   ; No, abort.

;
; Initialize loop variables.
;
;   rcx - Counter.
;
;   rdx - Base string buffer.
;
;   r8  - Base address of first histogram buffer.
;
;   r9  - Base address of second histogram buffer.
;
;   r10 - Byte counter.
;
;   r11 - Byte counter.
;

        mov     r8, rdx                             ; Load 1st histo buffer.
        lea     r9, Histogram.Histogram2[r8]        ; Load 2nd histo buffer.
        mov     rdx, String.Buffer[rcx]             ; Load string buffer.
        mov     ecx, String.Length[rcx]             ; Load string length.
        shr     ecx, 5                              ; Divide by 32 to get loop
                                                    ; iterations.

;
; Top of the histogram loop.
;

        align 16

;
; Load the first 32 bytes into ymm0.  Duplicate the lower 16 bytes in xmm0 and
; xmm1, then extract the high 16 bytes into xmm2 and xmm3.
;

Cha50:  vmovntdqa       ymm0, ymmword ptr [rdx]     ; Load 32 bytes into ymm0.
        vextracti128    xmm2, ymm0, 1               ; Copy 16-31 bytes to xmm2.
        vmovdqa         xmm1, xmm0                  ; Duplicate xmm0 into xmm1.
        vmovdqa         xmm3, xmm2                  ; Duplicate xmm2 into xmm3.
        add             rdx, 32                     ; Increment pointer.

;
; Process bytes 0-15, registers xmm0 and xmm1.
;

        vpextrb     r10, xmm0, 0                        ; Extract byte 0.
        add         dword ptr [r8 + r10 * 4], 1         ; Update count.

        vpextrb     r11, xmm1, 1                        ; Extract byte 1.
        add         dword ptr [r9 + r11 * 4], 1         ; Update count.

        vpextrb     r10, xmm0, 2                        ; Extract byte 2.
        add         dword ptr [r8 + r10 * 4], 1         ; Update count.

        vpextrb     r11, xmm1, 3                        ; Extract byte 3.
        add         dword ptr [r9 + r11 * 4], 1         ; Update count.

        vpextrb     r10, xmm0, 4                        ; Extract byte 4.
        add         dword ptr [r8 + r10 * 4], 1         ; Update count.

        vpextrb     r11, xmm1, 5                        ; Extract byte 5.
        add         dword ptr [r9 + r11 * 4], 1         ; Update count.

        vpextrb     r10, xmm0, 6                        ; Extract byte 6.
        add         dword ptr [r8 + r10 * 4], 1         ; Update count.

        vpextrb     r11, xmm1, 7                        ; Extract byte 7.
        add         dword ptr [r9 + r11 * 4], 1         ; Update count.

        vpextrb     r10, xmm0, 8                        ; Extract byte 8.
        add         dword ptr [r8 + r10 * 4], 1         ; Update count.

        vpextrb     r11, xmm1, 9                        ; Extract byte 9.
        add         dword ptr [r9 + r11 * 4], 1         ; Update count.

        vpextrb     r10, xmm0, 10                       ; Extract byte 10.
        add         dword ptr [r8 + r10 * 4], 1         ; Update count.

        vpextrb     r11, xmm1, 11                       ; Extract byte 11.
        add         dword ptr [r9 + r11 * 4], 1         ; Update count.

        vpextrb     r10, xmm0, 12                       ; Extract byte 12.
        add         dword ptr [r8 + r10 * 4], 1         ; Update count.

        vpextrb     r11, xmm1, 13                       ; Extract byte 13.
        add         dword ptr [r9 + r11 * 4], 1         ; Update count.

        vpextrb     r10, xmm0, 14                       ; Extract byte 14.
        add         dword ptr [r8 + r10 * 4], 1         ; Update count.

        vpextrb     r11, xmm1, 15                       ; Extract byte 15.
        add         dword ptr [r9 + r11 * 4], 1         ; Update count.

;
; Continue processing the second set of 16 bytes (16-31) via xmm2 and xmm3.
;

        vpextrb     r10, xmm2, 0                        ; Extract byte 16.
        add         dword ptr [r8 + r10 * 4], 1         ; Update count.

        vpextrb     r11, xmm3, 1                        ; Extract byte 17.
        add         dword ptr [r9 + r11 * 4], 1         ; Update count.

        vpextrb     r10, xmm2, 2                        ; Extract byte 18.
        add         dword ptr [r8 + r10 * 4], 1         ; Update count.

        vpextrb     r11, xmm3, 3                        ; Extract byte 19.
        add         dword ptr [r9 + r11 * 4], 1         ; Update count.

        vpextrb     r10, xmm2, 4                        ; Extract byte 20.
        add         dword ptr [r8 + r10 * 4], 1         ; Update count.

        vpextrb     r11, xmm3, 5                        ; Extract byte 21.
        add         dword ptr [r9 + r11 * 4], 1         ; Update count.

        vpextrb     r10, xmm2, 6                        ; Extract byte 22.
        add         dword ptr [r8 + r10 * 4], 1         ; Update count.

        vpextrb     r11, xmm3, 7                        ; Extract byte 23.
        add         dword ptr [r9 + r11 * 4], 1         ; Update count.

        vpextrb     r10, xmm2, 8                        ; Extract byte 24.
        add         dword ptr [r8 + r10 * 4], 1         ; Update count.

        vpextrb     r11, xmm3, 9                        ; Extract byte 25.
        add         dword ptr [r9 + r11 * 4], 1         ; Update count.

        vpextrb     r10, xmm2, 10                       ; Extract byte 26.
        add         dword ptr [r8 + r10 * 4], 1         ; Update count.

        vpextrb     r11, xmm3, 11                       ; Extract byte 27.
        add         dword ptr [r9 + r11 * 4], 1         ; Update count.

        vpextrb     r10, xmm2, 12                       ; Extract byte 28.
        add         dword ptr [r8 + r10 * 4], 1         ; Update count.

        vpextrb     r11, xmm3, 13                       ; Extract byte 29.
        add         dword ptr [r9 + r11 * 4], 1         ; Update count.

        vpextrb     r10, xmm2, 14                       ; Extract byte 30.
        add         dword ptr [r8 + r10 * 4], 1         ; Update count.

        vpextrb     r11, xmm3, 15                       ; Extract byte 31.
        add         dword ptr [r9 + r11 * 4], 1         ; Update count.

;
; End of loop.  Update loop counter and determine if we're finished.
;

        sub         ecx, 1                              ; Decrement counter.
        jnz         Cha50                               ; Continue loop if != 0.

;
; We've finished creating the histogram.  Merge the two histograms 64 bytes at
; a time using YMM registers.
;

        mov         ecx, 16                             ; Initialize counter.

        align 16

Cha75:  vmovntdqa   ymm0, ymmword ptr [r8+rax]      ; Load 1st histo  0-31.
        vmovntdqa   ymm1, ymmword ptr [r8+rax+20h]  ; Load 1st histo 32-63.
        vmovntdqa   ymm2, ymmword ptr [r9+rax]      ; Load 2nd histo  0-31.
        vmovntdqa   ymm3, ymmword ptr [r9+rax+20h]  ; Load 2nd histo 32-63.

        vpaddd      ymm4, ymm0, ymm2                ; Add  0-31 counts.
        vpaddd      ymm5, ymm1, ymm3                ; Add 32-63 counts.

        vmovntdq    ymmword ptr [r8+rax], ymm4      ; Save counts for  0-31.
        vmovntdq    ymmword ptr [r8+rax+20h], ymm5  ; Save counts for 32-63.

        add         rax, 40h                        ; Advance to next 64 bytes.
        sub         ecx, 1                          ; Decrement loop counter.
        jnz         short Cha75                     ; Continue if != 0.

;
; Indicate success and return.
;

Cha98:  mov rax, 1

Cha99:  ret

        LEAF_END CreateHistogramAvx2AlignedAsm, _TEXT$00

;++
;
; BOOLEAN
; CreateHistogramAvx512AlignedAsm(
;     _In_ PCLONG_STRING String,
;     _Inout_updates_bytes_(sizeof(*Histogram))
;         PCHARACTER_HISTOGRAM_V4 Histogram
;     );
;
; Routine Description:
;
;   This routine creates a histogram for a given string.
;
; Arguments:
;
;   String (rcx) - Supplies a pointer to a LONG_STRING structure that contains
;       the string for which a histogram is to be created.
;
;   Histogram (rdx) - Supplies an address that receives the histogram for the
;       given input string.
;
; Return Value:
;
;   TRUE on success, FALSE on failure.
;
;--

        NESTED_ENTRY CreateHistogramAvx512AlignedAsm, _TEXT$00

;
; Begin prologue.  Allocate stack space and save non-volatile registers.
;

        alloc_stack LOCALS_SIZE

        save_reg    rbp, Locals.SavedRbp        ; Save non-volatile rbp.
        save_reg    rbx, Locals.SavedRbx        ; Save non-volatile rbx.
        save_reg    rdi, Locals.SavedRdi        ; Save non-volatile rdi.
        save_reg    rsi, Locals.SavedRsi        ; Save non-volatile rsi.
        save_reg    r12, Locals.SavedR12        ; Save non-volatile r12.
        save_reg    r13, Locals.SavedR13        ; Save non-volatile r13.
        save_reg    r14, Locals.SavedR14        ; Save non-volatile r14.
        save_reg    r15, Locals.SavedR15        ; Save non-volatile r15.

        save_xmm128 xmm6, Locals.SavedXmm6      ; Save non-volatile xmm6.
        save_xmm128 xmm7, Locals.SavedXmm7      ; Save non-volatile xmm7.
        save_xmm128 xmm8, Locals.SavedXmm8      ; Save non-volatile xmm8.
        save_xmm128 xmm9, Locals.SavedXmm9      ; Save non-volatile xmm9.
        save_xmm128 xmm10, Locals.SavedXmm10    ; Save non-volatile xmm10.
        save_xmm128 xmm11, Locals.SavedXmm11    ; Save non-volatile xmm11.
        save_xmm128 xmm12, Locals.SavedXmm12    ; Save non-volatile xmm12.
        save_xmm128 xmm13, Locals.SavedXmm13    ; Save non-volatile xmm13.
        save_xmm128 xmm14, Locals.SavedXmm14    ; Save non-volatile xmm14.
        save_xmm128 xmm15, Locals.SavedXmm15    ; Save non-volatile xmm15.

        END_PROLOGUE


;
; Clear return value (Success = FALSE).
;

        xor     rax, rax                                ; Clear rax.

;
; Validate parameters.
;

        test    rcx, rcx                                ; Is rcx NULL?
        jz      Chb99                                   ; Yes, abort.
        test    rdx, rdx                                ; Is rdx NULL?
        jz      Chb99                                   ; Yes, abort.

;
; Verify the string is at least 64 bytes long.
;
        mov     r9, 64                                  ; Initialize r9 to 64.
        cmp     String.Length[rcx], r9d                 ; Compare Length to 64.
        jl      Chb99                                   ; String is too short.

;
; Ensure the incoming string and histogram buffers are aligned to 32-byte
; boundaries.
;

        mov     r9, 31                                  ; Initialize r9 to 31.
        test    String.Buffer[rcx], r9                  ; Is string aligned?
        jnz     Chb99                                   ; No, abort.

;
; Initialize loop variables.
;
;   rcx - Counter.
;
;   rdx - Base string buffer.
;
;   r8  - Base address of first histogram buffer.
;
;   r9  - Base address of second histogram buffer.
;
;   r10 - Base address of third histogram buffer.
;
;   r11 - Base address of fourth histogram buffer.
;

        mov     r8,  rdx                            ; Load 1st histo buffer.
        lea     r9,  Histogram.Histogram2[r8]       ; Load 2nd histo buffer.
        lea     r10, Histogram.Histogram3[r8]       ; Load 3rd histo buffer.
        lea     r11, Histogram.Histogram4[r8]       ; Load 4th histo buffer.
        mov     rdx, String.Buffer[rcx]             ; Load string buffer.
        mov     ecx, String.Length[rcx]             ; Load string length.
        shr     ecx, 6                              ; Divide by 64 to get loop
                                                    ; iterations.


        vmovntdqa       zmm28, zmmword ptr [AllOnes]
        vmovntdqa       zmm29, zmmword ptr [AllNegativeOnes]
        vmovntdqa       zmm30, zmmword ptr [AllBinsMinusOne]
        vmovntdqa       zmm31, zmmword ptr [AllThirtyOne]

        mov             rax, 1111111111111111h
        kmovq           k1, rax
        vpmovm2b        zmm1, k1

        kshiftlq        k2, k1, 1
        vpmovm2b        zmm2, k2

        kshiftlq        k3, k1, 2
        vpmovm2b        zmm3, k3

        kshiftlq        k4, k1, 3
        vpmovm2b        zmm4, k4

        align 16

Chb30:  vmovntdqa       zmm0, zmmword ptr [rdx]     ; Load 64 bytes into zmm0.

;
; Advance the source input pointer by 64 bytes.  Clear the k1 mask
; which we'll use later for the vpgatherds.
;

        add             rdx, 40h                    ; Advance 64 bytes.

;
; Extract byte values for each doubleword into separate registers.
;

        vpandd          zmm5, zmm1, zmm0
        vpandd          zmm6, zmm2, zmm0
        vpandd          zmm7, zmm3, zmm0
        vpandd          zmm8, zmm4, zmm0

;
; Toggle all bits in the writemasks.
;

        kxnorq          k1, k5, k5
        kxnorq          k2, k6, k6
        kxnorq          k3, k7, k7
        kxnorq          k4, k0, k0

;
; Shift the second to fourth registers such that the byte value is moved to the
; least significant portion of the doubleword element of the register.
;

        vpsrldq         zmm6, zmm6, 1
        vpsrldq         zmm7, zmm7, 2
        vpsrldq         zmm8, zmm8, 3

;
; Gather the counts for the byte locations.
;

        vpgatherdd      zmm20 {k1}, [r8+4*zmm5]     ; Gather 1st counts.
        vpgatherdd      zmm21 {k2}, [r9+4*zmm6]     ; Gather 2nd counts.
        vpgatherdd      zmm22 {k3}, [r10+4*zmm7]    ; Gather 3rd counts.
        vpgatherdd      zmm23 {k4}, [r11+4*zmm8]    ; Gather 4th counts.

;
; Determine if the characters loaded into each ZMM register conflict.
;

        vpconflictd     zmm10, zmm5
        vpconflictd     zmm11, zmm6
        vpconflictd     zmm12, zmm7
        vpconflictd     zmm13, zmm8

        vpord           zmm14, zmm10, zmm11
        vpord           zmm15, zmm12, zmm13
        vpord           zmm14, zmm14, zmm15

        vptestmd        k1, zmm14, zmm14            ; Any conflicts anywhere?
        kortestw        k1, k1
        jne             Chb40                       ; At least one conflict.


;
; No conflicts across all registers.  Proceed with adding 1 (zmm28) to all
; the counts and then scattering the results back into memory.
;

        align           16

Chb35:  vpaddd          zmm24, zmm20, zmm28         ; Add 1st counts.
        vpaddd          zmm25, zmm21, zmm28         ; Add 2nd counts.
        vpaddd          zmm26, zmm22, zmm28         ; Add 3rd counts.
        vpaddd          zmm27, zmm23, zmm28         ; Add 4th counts.

;
; Toggle all bits in the writemasks.
;

        kxnorq          k1, k5, k5
        kxnorq          k2, k6, k6
        kxnorq          k3, k7, k7
        kxnorq          k4, k0, k0

;
; Scatter the counts back into their respective locations.
;

        vpscatterdd     [r8+4*zmm5]  {k1}, zmm24    ; Save 1st counts.
        vpscatterdd     [r9+4*zmm6]  {k2}, zmm25    ; Save 2nd counts.
        vpscatterdd     [r10+4*zmm7] {k3}, zmm26    ; Save 3rd counts.
        vpscatterdd     [r11+4*zmm8] {k4}, zmm27    ; Save 4th counts.

;
; Decrement loop counter and jump back to the top of the processing loop if
; there are more elements.
;

Chb38:  sub             ecx, 1                          ; Decrement counter.
        jnz             Chb30                           ; Continue if != 0.

;
; We've finished processing the input string, jump to finalization logic.
;

        jmp             Chb85

;
; Conflict handling logic.  At least one of the zmm registers had a conflict.
;


;
; Test the first register (zmm5) to see if it has any conflicts (zmm10).
;

Chb40:  vptestmd        k1, zmm10, zmm10            ; Test 1st for conflicts.
        vmovaps         zmm16, zmm28                ; Copy AllOnes into zmm16.
        kmovw           ebx, k1                     ; Move mask into ebx.
        vpaddd          zmm16, zmm16, zmm20         ; Add partial counts.
        test            ebx, ebx                    ; Any conflicts?
        jz              short Chb47                 ; No conflicts.

;
; First register (zmm5) has a conflict (zmm10).
;

        align           16
Chb45:  vpbroadcastd    zmm18, ebx                  ; Bcast mask into zmm18.
        kmovw           k1, ebx                     ; Move mask into k1.
        vpaddd          zmm16 {k1}, zmm16, zmm28    ; Add masked counts.
        vptestmd        k0 {k1}, zmm18, zmm10       ; Test against conflict.
        kmovw           esi, k0                     ; Move new mask into esi.
        and             ebx, esi                    ; Mask off recent bit.
        jnz             short Chb45                 ; Continue loop if bits left

;
; Conflicts have been resolved for first register (zmm5), with the final count
; now living in zmm16.  Scatter back to memory.
;

Chb47:  kxnorw          k7, k3, k3                  ; Set all bits in writemask.
        vpscatterdd     [r8+4*zmm5] {k7}, zmm16     ; Save counts.

;
; Intentional follow-on.
;

;
; Test the second register (zmm6) to see if it has any conflicts (zmm11).
;

Chb50:  vptestmd        k1, zmm11, zmm11            ; Test 2nd for conflicts.
        vmovaps         zmm16, zmm28                ; Copy AllOnes into zmm16.
        kmovw           ebx, k1                     ; Move mask into ebx.
        vpaddd          zmm16, zmm16, zmm21         ; Add partial counts.
        test            ebx, ebx                    ; Any conflicts?
        jz              short Chb57                 ; No conflicts.

;
; Second register (zmm6) has a conflict (zmm11).
;

        align           16
Chb55:  vpbroadcastd    zmm18, ebx                  ; Bcast mask into zmm18.
        kmovw           k1, ebx                     ; Move mask into k1.
        vpaddd          zmm16 {k1}, zmm16, zmm28    ; Add masked counts.
        vptestmd        k0 {k1}, zmm18, zmm11       ; Test against conflict.
        kmovw           esi, k0                     ; Move new mask into esi.
        and             ebx, esi                    ; Mask off recent bit.
        jnz             short Chb55                 ; Continue loop if bits left

;
; Conflicts have been resolved for second register (zmm6), with the final count
; now living in zmm16.  Scatter back to memory.
;

Chb57:  kxnorw          k7, k3, k3                  ; Set all bits in writemask.
        vpscatterdd     [r9+4*zmm6] {k7}, zmm16     ; Save counts.

;
; Intentional follow-on.
;

;
; Test the third register (zmm7) to see if it has any conflicts (zmm16).
;

Chb60:  vptestmd        k1, zmm12, zmm12            ; Test 3rd for conflicts.
        vmovaps         zmm16, zmm28                ; Copy AllOnes into zmm16.
        kmovw           ebx, k1                     ; Move mask into ebx.
        vpaddd          zmm16, zmm16, zmm22         ; Add partial counts.
        test            ebx, ebx                    ; Any conflicts?
        jz              short Chb67                 ; No conflicts.

;
; Third register (zmm7) has a conflict (zmm16).
;

        align           16
Chb65:  vpbroadcastd    zmm18, ebx                  ; Bcast mask into zmm18.
        kmovw           k1, ebx                     ; Move mask into k1.
        vpaddd          zmm16 {k1}, zmm16, zmm28    ; Add masked counts.
        vptestmd        k0 {k1}, zmm18, zmm12       ; Test against conflict.
        kmovw           esi, k0                     ; Move new mask into esi.
        and             ebx, esi                    ; Mask off recent bit.
        jnz             short Chb65                 ; Continue loop if bits left

;
; Conflicts have been resolved for third register (zmm7), with the final count
; now living in zmm16.  Scatter back to memory.
;

Chb67:  kxnorw          k7, k3, k3                  ; Set all bits in writemask.
        vpscatterdd     [r10+4*zmm7] {k7}, zmm16    ; Save counts.

;
; Intentional follow-on.
;

;
; Test the second register (zmm8) to see if it has any conflicts (zmm13).
;

Chb70:  vptestmd        k1, zmm13, zmm13            ; Test 4th for conflicts.
        vmovaps         zmm16, zmm28                ; Copy AllOnes into zmm16.
        kmovw           ebx, k1                     ; Move mask into ebx.
        vpaddd          zmm16, zmm16, zmm23         ; Add partial counts.
        test            ebx, ebx                    ; Any conflicts?
        jz              short Chb77                 ; No conflicts.

;
; Fourth register (zmm8) has a conflict (zmm13).
;

        align           16
Chb75:  vpbroadcastd    zmm18, ebx                  ; Bcast mask into zmm18.
        kmovw           k1, ebx                     ; Move mask into k1.
        vpaddd          zmm16 {k1}, zmm16, zmm28    ; Add masked counts.
        vptestmd        k0 {k1}, zmm18, zmm13       ; Test against conflict.
        kmovw           esi, k0                     ; Move new mask into esi.
        and             ebx, esi                    ; Mask off recent bit.
        jnz             short Chb75                 ; Continue loop if bits left

;
; Conflicts have been resolved for second register (zmm8), with the final count
; now living in zmm16.  Scatter back to memory.
;

Chb77:  kxnorw          k7, k3, k3                  ; Set all bits in writemask.
        vpscatterdd     [r11+4*zmm8] {k7}, zmm16    ; Save counts.

;
; Check for loop termination.
;

        sub             ecx, 1                          ; Decrement counter.
        jnz             Chb30                           ; Continue if != 0.

;
; Merge four histograms into one, 128 bytes at a time (two zmm registers in
; flight per histogram).
;

Chb85:  mov         ecx, 8
        xor         rax, rax

        align       16

Chb90:  vmovdqa32       zmm20, zmmword ptr [r8+rax]     ; Load 1st histo  0-63.
        vmovdqa32       zmm21, zmmword ptr [r8+rax+40h] ; Load 1st histo 64-127.

        vmovdqa32       zmm22, zmmword ptr [r9+rax]     ; Load 2nd histo  0-63.
        vmovdqa32       zmm23, zmmword ptr [r9+rax+40h] ; Load 2nd histo 64-127.

        vmovdqa32       zmm24, zmmword ptr [r10+rax]    ; Load 3rd histo  0-63.
        vmovdqa32       zmm25, zmmword ptr [r10+rax+40h]; Load 3rd histo 64-127.

        vmovdqa32       zmm26, zmmword ptr [r11+rax]    ; Load 4th histo  0-63.
        vmovdqa32       zmm27, zmmword ptr [r11+rax+40h]; Load 4th histo 64-127.

        vpaddd          zmm10, zmm20, zmm22             ; Add 1-2 0-63 histo.
        vpaddd          zmm11, zmm24, zmm26             ; Add 3-4 0-63 histo.
        vpaddd          zmm12, zmm10, zmm11             ; Merge 0-63 histo.

        vpaddd          zmm13, zmm21, zmm23             ; Add 1-2 64-127 histo.
        vpaddd          zmm14, zmm25, zmm27             ; Add 3-4 64-127 histo.
        vpaddd          zmm15, zmm13, zmm14             ; Merge 64-127 histo.

        vmovdqa32       zmmword ptr [r8+rax], zmm12     ; Save  0-63  histo.
        vmovdqa32       zmmword ptr [r8+rax+40h], zmm15 ; Save 64-127 histo.

        add             rax, 80h                        ; Advance to next 128b.
        sub             ecx, 1                          ; Decrement loop cntr.
        jnz             Chb90                           ; Continue if != 0.

;
; Indicate success.
;

        mov     rax, 1

;
; Restore non-volatile registers.
;

Chb99:
        mov             rbp,   Locals.SavedRbp[rsp]
        mov             rbx,   Locals.SavedRbx[rsp]
        mov             rdi,   Locals.SavedRdi[rsp]
        mov             rsi,   Locals.SavedRsi[rsp]
        mov             r12,   Locals.SavedR12[rsp]
        mov             r13,   Locals.SavedR13[rsp]
        mov             r14,   Locals.SavedR14[rsp]
        mov             r15,   Locals.SavedR15[rsp]

        movdqa          xmm6,  Locals.SavedXmm6[rsp]
        movdqa          xmm7,  Locals.SavedXmm7[rsp]
        movdqa          xmm8,  Locals.SavedXmm8[rsp]
        movdqa          xmm9,  Locals.SavedXmm9[rsp]
        movdqa          xmm10, Locals.SavedXmm10[rsp]
        movdqa          xmm11, Locals.SavedXmm11[rsp]
        movdqa          xmm12, Locals.SavedXmm12[rsp]
        movdqa          xmm13, Locals.SavedXmm13[rsp]
        movdqa          xmm14, Locals.SavedXmm14[rsp]
        movdqa          xmm15, Locals.SavedXmm15[rsp]

;
; Begin epilogue.  Deallocate stack space and return.
;

        add     rsp, LOCALS_SIZE
        ret


        NESTED_END CreateHistogramAvx512AlignedAsm, _TEXT$00


; vim:set tw=80 ts=8 sw=4 sts=4 et syntax=masm fo=croql comments=\:;           :

end

Christophe_H_Intel1 · ‎03-02-2018

Thank you for pointing out the issues here.

Example 15-18 does indeed have a few typos, which we’ll fix.

We should initialize rcx to 0 at the top of the code, e.g. “xor rcx, rcx”
The vpandd instruction has the memory operand in an illegal place. Rather than “vpandd zmm3, [r10+rcx*4], zmm7” the instruction should be “vpandd zmm3, zmm7, [r10+rcx*4]”
The vector comparison inside conflict_loop similarly has the operands in the wrong order. The immediate “4” is meant to indicate a not-equals test, and should be at the end. Alternatively, we could just make this instruction “vpcmpned k1, zmm5, zmm0” to keep it simple.
Continuing the theme of operand order mistakes, the comparison at the end of the code example is backwards. It should be “cmpl ecx, ebx” rather than the opposite.

I’m not sure if any/all of these contributed to your confusion about the algorithm in the example.

In particular, the permutes in conflict_loop are key to combining information from vector elements with the same index. In the example, zmm2 holds the values that we will be adding to memory; the first permute “lines up” partial results from two vector elements with the same index so that the vpadd will create a new partial/full result. The second permute is key. It updates the permute indices for the first permute, so that we will combine the next set(s) of partial results in the next iteration, rather than repeating the same task over and over.

Perhaps an example will help illustrate what’s going on. Let’s imagine we only have four elements in a vector rather than 16 (i.e., 128 bits instead of 512 bits), to keep things manageable. For this example, the input values for conflict_loop are zmm3 = (5, 5, 5, 5). The rightmost element in the set is element zero of the vector (i.e., the one closest to the LSB).

Starting values:
- zmm0 = (2, 1, 0, -1)
- zmm2 = (1, 1, 1, 1)
- k1 = (1, 1, 1, 0)
After the first iteration:
- zmm8 = (1, 1, 1, 0)
- zmm0 = (1, 0, -1, -1)
- zmm2 = (2, 2, 2, 1)
- k1 = (1, 1, 0, 0)
After the second iteration:
- zmm8 = (2, 1, 0, 0)
- zmm0 = (-1, -1, -1, -1)
- zmm2 = (4, 3, 2, 1)
- k1 = (0, 0, 0, 0)

Without the permutes, we would not get this progression. In particular, notice that in the first iteration, we’re effectively combining the first two values from zmm2 into the second element, and the last two values into the last element (the outputs for the first and third elements aren’t very interesting). Then, in the second iteration, we combine the two “interesting” results – the second value of zmm2 is added into the last value (via zmm8). This way, we sum up all four values that start off in zmm2, when we first enter the conflict_loop, in only two iterations.

I’ll also note that histogram is a pretty simple computation, where we always want to add “1” to each element referenced. But the code here will also work for more complex computations where we’ve pre-computed arbitrary values to add to each index. (For that, we’d need to load these values into zmm2 at the top of the histogram_loop instead of copying over zmm4.)

Regarding 17.2.3, as you point out, the example there also appears to have some typos (and missing details). More importantly, this is an example of an older technique for leveraging vconflict. The code in 15-18 is a more recently developed technique, and should be faster on Knights Landing as well as Skylake. So I recommend sticking to 15-18.

Nelson__Trent · ‎03-02-2018

Chris, this is great information, much appreciated. I am relieved that most of the issues were due to typos. The comment about 17.2.3 being an older, slower technique than 15-18 is particularly enlightening! I was confused by two seemingly different algorithms being depicted to achieve the same goals.

I'll update my implementation tomorrow to match 15-18 and report back with some sample code links, hopefully this thread will help people trying to create histograms via AVX-512 in the future.

Nelson__Trent · ‎03-03-2018

Chris, I think there's still a flaw in the logic for 15-18. Everything appears to go awry after the very first vpermd zmm8. Here's a step-by-step through the assembly with relevant registers dumped and inline comments. I've used the exact values as they appear in 15-18 (see zmm3):

histogram_loop: 
        vmovntdqa       zmm3, zmmword ptr [r10]     ; Load 64 bytes into zmm0.
        vpconflictd     zmm0, zmm3
        kxnorw          k1, k1, k1
        
ZMM0 = 0000200600000000-0000000600000000-0000000000000000-0000000000000000-0000000800000000-0000000000000000-0000000000000002-0000000000000000 
ZMM3 = 000000030000000A-0000000300000009-0000000400000006-0000000700000000-0000000100000032-0000000200000008-0000000100000003-0000000300000005
K1 = 000000000000FFFF

        vmovaps         zmm2, zmm4
        vpxord          zmm1, zmm1, zmm1

ZMM1 = 0000000000000000-0000000000000000-0000000000000000-0000000000000000-0000000000000000-0000000000000000-0000000000000000-0000000000000000 
ZMM2 = 0000000100000001-0000000100000001-0000000100000001-0000000100000001-0000000100000001-0000000100000001-0000000100000001-0000000100000001 

        vpgatherdd      zmm1 {k1}, [r15+zmm3*4]
        vptestmd        k1, zmm0, zmm0
        
ZMM1 = 0000000000000000-0000000000000000-0000000000000000-0000000000000000-0000000000000000-0000000000000000-0000000000000000-0000000000000000 
K1 = 000000000000A084

        vplzcntd        zmm0, zmm0
        
ZMM0 = 0000001200000020-0000001D00000020-0000002000000020-0000002000000020-0000001C00000020-0000002000000020-000000200000001E-0000002000000020         
        
        vpsubd          zmm0, zmm6, zmm0
        
ZMM0 = 0000000DFFFFFFFF-00000002FFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-00000003FFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFF00000001-FFFFFFFFFFFFFFFF 
ZMM6 = 0000001F0000001F-0000001F0000001F-0000001F0000001F-0000001F0000001F-0000001F0000001F-0000001F0000001F-0000001F0000001F-0000001F0000001F

conflict_loop:

ZMM0 = 0000000DFFFFFFFF-00000002FFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-00000003FFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFF00000001-FFFFFFFFFFFFFFFF
ZMM2 = 0000000100000001-0000000100000001-0000000100000001-0000000100000001-0000000100000001-0000000100000001-0000000100000001-0000000100000001
  K1 = 000000000000A084 (0b1010000010000100)

;
; zmm0 looks good at this point, zmm2 is filled with 1s as we want, and the k1 mask is correct.
; However, everything falls apart after the first vpermd against zmm8:
;

        vpermd          zmm8 {k1} {z},  zmm2, zmm0

ZMM8 = FFFFFFFF00000000-FFFFFFFF00000000-0000000000000000-0000000000000000-FFFFFFFF00000000-0000000000000000-00000000FFFFFFFF-0000000000000000 

        vpermd          zmm0 {k1},      zmm0, zmm0

ZMM0 = 00000002FFFFFFFF-00000001FFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF 

        vpaddd          zmm2 {k1},      zmm2, zmm8

ZMM2 = 0000000000000001-0000000000000001-0000000100000001-0000000100000001-0000000000000001-0000000100000001-0000000100000000-0000000100000001 


ZMM5 = FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF 

        vpcmpd          k1, zmm5, zmm0, OP_NEQ ; vpcmpd k1, 4, zmm5, zmm0

K1 = 000000000000A000

;
; Iteration 2 of conflict loop.  Current state of registers:
;

ZMM0 = 00000002FFFFFFFF-00000001FFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF 
ZMM1 = 0000000000000000-0000000000000000-0000000000000000-0000000000000000-0000000000000000-0000000000000000-0000000000000000-0000000000000000 
ZMM2 = 0000000000000001-0000000000000001-0000000100000001-0000000100000001-0000000000000001-0000000100000001-0000000100000000-0000000100000001 
ZMM3 = 000000030000000A-0000000300000009-0000000400000006-0000000700000000-0000000100000032-0000000200000008-0000000100000003-0000000300000005 
ZMM4 = 0000000100000001-0000000100000001-0000000100000001-0000000100000001-0000000100000001-0000000100000001-0000000100000001-0000000100000001 
ZMM5 = FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF 
ZMM6 = 0000001F0000001F-0000001F0000001F-0000001F0000001F-0000001F0000001F-0000001F0000001F-0000001F0000001F-0000001F0000001F-0000001F0000001F 
ZMM7 = 000000FE000000FE-000000FE000000FE-000000FE000000FE-000000FE000000FE-000000FE000000FE-000000FE000000FE-000000FE000000FE-000000FE000000FE 
ZMM8 = FFFFFFFF00000000-FFFFFFFF00000000-0000000000000000-0000000000000000-FFFFFFFF00000000-0000000000000000-00000000FFFFFFFF-0000000000000000 
K1 = 000000000000A000

conflict_loop:

        vpermd          zmm8 {k1} {z},  zmm2, zmm0

ZMM8 = FFFFFFFF00000000-FFFFFFFF00000000-0000000000000000-0000000000000000-0000000000000000-0000000000000000-0000000000000000-0000000000000000 

        vpermd          zmm0 {k1},      zmm0, zmm0
        
ZMM0 = FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF 

        vpaddd          zmm2 {k1},      zmm2, zmm8
        
ZMM2 = FFFFFFFF00000001-FFFFFFFF00000001-0000000100000001-0000000100000001-0000000000000001-0000000100000001-0000000100000000-0000000100000001

        vpcmpd          k1, zmm5, zmm0, OP_NEQ ; vpcmpd k1, 4, zmm5, zmm0
        
K1 = 0000000000000000

        kortestw        k1, k1
        
ZR = 1        
        
        jne             short conflict_loop

;
; Loop falls through as k1 is zero.  Registers at this point:
;

ZMM0 = FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF 
ZMM1 = 0000000000000000-0000000000000000-0000000000000000-0000000000000000-0000000000000000-0000000000000000-0000000000000000-0000000000000000 
ZMM2 = FFFFFFFF00000001-FFFFFFFF00000001-0000000100000001-0000000100000001-0000000000000001-0000000100000001-0000000100000000-0000000100000001 
ZMM3 = 000000030000000A-0000000300000009-0000000400000006-0000000700000000-0000000100000032-0000000200000008-0000000100000003-0000000300000005 
ZMM4 = 0000000100000001-0000000100000001-0000000100000001-0000000100000001-0000000100000001-0000000100000001-0000000100000001-0000000100000001 
ZMM5 = FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFF 
ZMM6 = 0000001F0000001F-0000001F0000001F-0000001F0000001F-0000001F0000001F-0000001F0000001F-0000001F0000001F-0000001F0000001F-0000001F0000001F 
ZMM7 = 000000FE000000FE-000000FE000000FE-000000FE000000FE-000000FE000000FE-000000FE000000FE-000000FE000000FE-000000FE000000FE-000000FE000000FE 
ZMM8 = FFFFFFFF00000000-FFFFFFFF00000000-0000000000000000-0000000000000000-0000000000000000-0000000000000000-0000000000000000-0000000000000000 


update:
        vpaddd          zmm0, zmm2, zmm1

ZMM0 = FFFFFFFF00000001-FFFFFFFF00000001-0000000100000001-0000000100000001-0000000000000001-0000000100000001-0000000100000000-0000000100000001 

;
; So the final value it's attempting to store as the counts is:
;      (-1, 1,          -1, 1,            1, 1,           1, 1              0, 1            1, 1,             1, 0,             1, 1)
;
; It should be:
;
;      (4, 1,           3, 1,             1, 1,           1, 1,             2, 1,           1, 1,             1, 2              1, 1)

        kxnorw          k1, k1, k1
        add             rcx, 16
        vpscatterdd     [r15+zmm3*4] {k1}, zmm0
        cmp             ecx, ebx
        jb              histogram_loop

Nelson__Trent · ‎03-03-2018

Ah, I think you've got the argument order mixed up for the first vpermd with zmm8.

Instead of:

    vpermd zmm8 {k1} {z}, zmm2, zmm0

It should be:

    vpermd zmm8 {k1} {z}, zmm0, zmm2

With that change in place, the histogram counts are computed correctly in two iterations.

Christophe_H_Intel1 · ‎03-03-2018

You're absolutely right. Great catch!