; ======================================================================
; Header Section
;
; Syntax : NASM, Intel operand order, 64-bit, RIP-relative by default.
; Calls  : external C helpers are reached through the GOT
;          (call [rel sym wrt ..got]); arguments are passed in rdi/rsi.
; Exports: Main_Entry_fn, Complex_If_fn.
; ======================================================================
[BITS 64]
[default rel]

global Main_Entry_fn
global Complex_If_fn

extern sched_getcpu
extern fopen, fread, fwrite, fclose
extern malloc, calloc, realloc, free
extern posix_memalign
extern get_core_count_C, thread_create_in_C
extern Start_Clock, End_Clock, Print_Time
extern open_shm, get_shm_name, remap_shm, unlink_shm
extern fwrite_file_C

; The ANNOTATE_* markers are defined with empty bodies, so the marker
; lines that appear inside Complex_If_fn expand to nothing.
%define ANNOTATE_SITE_BEGIN(Complex_If)
%define ANNOTATE_ITERATION_TASK(for_taskA)
%define ANNOTATE_SITE_END(Complex_If)

; ----------------------------------------------------------------------
; Mutable data
; ----------------------------------------------------------------------
section .data align=64

Return_Pointer_Array:   dq 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
Clock_Stop:             dq 0

; list1: 20 doubles (160 bytes) followed by 4 qwords of zero padding.
list1:                  dq 10.364, 84.02, 29.127, 2.0, 87.35, 7.64, 14.2, 98.0, 54.65, 54.22, 8.23, 16.21, 129.7, 22.0, 87.35, 17.64, 214.32, 1598.4235, 54.65, 54.22
pad_list1:              times 4 dq 0

; list2: 11 doubles (88 bytes) followed by 5 qwords of zero padding.
list2:                  dq 10.364, 84.02, 29.127, 2.0, 87.35, 7.64, 14.2, 98.0, 54.65, 54.22, 1598.4235
pad_list2:              times 5 dq 0

list1_length_inbytes:   dq 160
list2_length_inbytes:   dq 88

input_array_ptr:        dq 0            ; rdi as received by Main_Entry_fn
data_master_ptr:        dq 0            ; rsi as received by Main_Entry_fn

Number_Of_Cores:        dq 0            ; core count multiplied by 8, e.g. 32
Number_Of_Cores_Seq:    dq 0            ; sequential core count, e.g. 4 cores
Number_Of_Cores_Open:   dq 0
Number_Of_Cores_Calc:   dq 0
stride:                 dq 0
AVX_Width:              dq 0
Start_Time_C:           dq 0
writer_core:            dq 0            ; set to 1 if we have a separate writer core
out_fname_ptr:          dq 0

shm_size:               dq 0
shm_fds_ptr:            dq 0
shm_buffers_ptr:        dq 0
shm_buffers_count:      dq 0
core_count:             dq 0

data_ptr:               dq 0
data_length:            dq 0

list1_length:           dq 20
list1_ctr:              dq 0
list2_length:           dq 11
list2_ctr:              dq 0

; Filled in by Complex_If_fn "for remap": each slot holds the rbp-relative
; frame offset of one collect buffer's length / pointer local.
stack_lengths:          dq 0, 0, 0, 0
stack_ptrs:             dq 0, 0, 0, 0

data_ptrs_for_fwrite:   times 4 dq 0
data_ctrs_for_fwrite:   times 4 dq 0
pad_data_section:       dq 0

; ----------------------------------------------------------------------
; Read-only data
; ----------------------------------------------------------------------
section .rodata align=64

shm_base_name:          db "shm_object_%d_%d",0x00
pad_const_section:      dq 0
; NOTE(review): a floating-point literal is emitted with db here; verify
; the assembled bytes are what is intended.
CURR_DTM:               db 1609800081.581296,0
out_fname_base:         db "/opt/Test_Output_Files/Complex_If_NASM_Test_%d_%d",0x00

; ----------------------------------------------------------------------
; Code
; ----------------------------------------------------------------------
section .text

;-----------------------------------------------------------------------
; Init_Cores_fn
;   Reached by `call` from Main_Entry_fn.  Records the core counts and
;   stride, pulls in the shared-memory setup include, calls
;   thread_create_in_C, then jumps to label_900, which performs the `ret`
;   on this routine's behalf.
;   Clobbers (visible code): rax, rbx, rdx, rdi, flags.
;   NOTE(review): no stack adjustment is made before the external calls
;   in this routine; check 16-byte alignment at each call site together
;   with the include files.
;-----------------------------------------------------------------------
Init_Cores_fn:
        xor     rbx,rbx                         ; before cores are created

get_cores:
        mov     rdi,1
        call    [rel get_core_count_C wrt ..got]
        ; The value left in rax by get_core_count_C is overwritten with 1
        ; here, so every count below is derived from a single core.
        mov     rax,1
        mov     [Number_Of_Cores_Seq],rax
        mov     [Number_Of_Cores_Open],rax
        mov     rbx,8
        mul     rbx                             ; rax = core count * 8
        mov     [Number_Of_Cores],rax

        ; Calculate stride based on number of cores:
        ; stride = AVX_Width * (Number_Of_Cores_Seq - writer_core)
        mov     rax,[AVX_Width]
        mov     rbx,[Number_Of_Cores_Seq]
        sub     rbx,[writer_core]               ; don't count the writer core
        mul     rbx
        mov     [stride],rax

        mov     rax,[Number_Of_Cores_Seq]
        sub     rax,1                           ; -1 for writer core
        mov     [Number_Of_Cores_Calc],rax

        ; _____
        ; Create POSIX shared memory
        mov     rbx,4                           ; presumably the buffer count consumed by the include - confirm
%include "/opt/P01_SH/_Include_Utilities/POSIX_Shared_Memory_Multi.asm"

        ; Create Threads
label_first:
        mov     rdi,[Number_Of_Cores_Seq]       ; see above
        call    [rel thread_create_in_C wrt ..got]      ; Create_Threads_in_C.c

label_after:
        jmp     label_900

;-----------------------------------------------------------------------
; Complex_If_fn  (global)
;   Walks the doubles at [data_ptr], data_length bytes long, and appends
;   values to four collect buffers whose pointers are read from the table
;   at [shm_buffers_ptr].  remap_collect is called with rbx = 0/8/16/24
;   when the matching buffer reaches its length limit.
;
; Frame (rbp = rsp at entry, 192 bytes reserved below it):
;   [rbp-8]                 (cpu - writer_core) * 8
;   [rbp-16/-24/-32]        collect1 ptr / ctr / length
;   [rbp-40/-48/-56]        collect2 ptr / ctr / length
;   [rbp-64/-72/-80]        collect3 ptr / ctr / length
;   [rbp-88/-96/-104]       collect4 ptr / ctr / length
;   [rbp-112]               "num found in list2" flag
;
; Register roles in the loop:
;   r15 = &list1, r14 = byte index into list1
;   r13 = &list2, r12 = byte index into list2
;   r11 = data_ptr, r10 = byte index into data, r9 = data_length
;   r8  = AVX_Width
;   xmm31 = current scalar, ymm0 = its broadcast, ymm1 = 4 doubles of list1
;
; NOTE(review): the visible code overwrites rbp, rbx and r12-r15 without
; saving them; confirm the caller / include files tolerate that.
; Requires AVX-512 (xmm31, k-mask compares, vpcompressq on ymm).
;-----------------------------------------------------------------------
Complex_If_fn:
        mov     rbp,rsp
        sub     rsp,192

        ; Get the core number.
        ; rsp is 8 mod 16 here (return address + 192), so pad by 8 to give
        ; the libc call a 16-byte aligned stack.
        sub     rsp,8
        call    [rel sched_getcpu wrt ..got]
        add     rsp,8
        sub     rax,[writer_core]
        mov     rbx,8
        mul     rbx                             ; multiply by 8
        mov     [rbp-8],rax

        ; Populate registers
        lea     rax,[list1]
        mov     r15,rax
        xor     r14,r14
        lea     rax,[list2]
        mov     r13,rax
        xor     r12,r12
        mov     r11,[data_ptr]
        xor     r10,r10
        mov     r9,[data_length]
        mov     r8,[AVX_Width]

        ; Get pointer offset
        xor     rax,rax
        xor     rcx,rcx                         ; rcx = 0, initial value for every ctr
        mov     rdi,[shm_buffers_ptr]
        mov     rax,[rbp-8]                     ; add core# offset
        shr     rax,3                           ; back to the plain core index
        mov     rbx,32                          ; 4 buffer pointers * 8 bytes per core
        ; Fix: the original multiplied by a freshly zeroed rdx, which made
        ; the offset 0 for every core and left the 32 in rbx unused.
        mul     rbx
        add     rdi,rax                         ; rdi -> this core's 4 buffer pointers

        push    r8
        push    r9
        lea     r8,[stack_lengths]              ; for remap
        lea     r9,[stack_ptrs]                 ; for remap

        mov     rbx,[rdi+0]                     ; collect1_ptr
        mov     [rbp-16],rbx
        mov     qword [r9+0],16                 ; frame offset of collect1_ptr
        mov     [rbp-24],rcx                    ; collect1_ctr
        mov     rbx,[shm_size]
        mov     [rbp-32],rbx                    ; collect1_length
        mov     qword [r8+0],32                 ; frame offset of collect1_length

        mov     rbx,[rdi+8]                     ; collect2_ptr
        mov     [rbp-40],rbx
        mov     qword [r9+8],40
        mov     [rbp-48],rcx                    ; collect2_ctr
        mov     rbx,[shm_size]
        mov     [rbp-56],rbx                    ; collect2_length
        mov     qword [r8+8],56

        mov     rbx,[rdi+16]                    ; collect3_ptr
        mov     [rbp-64],rbx
        mov     qword [r9+16],64
        mov     [rbp-72],rcx                    ; collect3_ctr
        mov     rbx,[shm_size]
        mov     [rbp-80],rbx                    ; collect3_length
        mov     qword [r8+16],80

        mov     rbx,[rdi+24]                    ; collect4_ptr
        mov     [rbp-88],rbx
        mov     qword [r9+24],88
        mov     [rbp-96],rcx                    ; collect4_ctr
        mov     rbx,[shm_size]
        mov     [rbp-104],rbx                   ; collect4_length
        mov     qword [r8+24],104

        pop     r9
        pop     r8

ANNOTATE_SITE_BEGIN(Complex_If)

        ; ---- outer loop: one double of the input data per iteration ----
label_401:

ANNOTATE_ITERATION_TASK(for_taskA)

        vmovsd  xmm31,[r11+r10]                 ; num = data[r10]

        ;______
label_8010:
label_801:
        cmp     r10,0                           ; scalar list scan only on the first element
        jne     label_8020

        ;______
label_12010:
        xor     r14,r14

        ; _____
        ; ---- scan list1 (160 bytes); xmm31 is reloaded with each element ----
label_1201:
        cmp     r14,160
        jge     label_8020
        vmovsd  xmm31,[r15+r14]
        add     r14,8

        ;______
label_16010:
        xor     r12,r12

label_1601:
        xor     rax,rax
        mov     [rbp-112],rax                   ; found flag = 0

        ; ---- compare xmm31 against every element of list2 (88 bytes) ----
List_Test_17:
        vmovsd  xmm0,[r13+r12]                  ; VEX form, consistent with the surrounding AVX code
        vucomisd xmm0,xmm31
        jne     next_18
        mov     qword [rbp-112],1               ; found flag = 1

        ;______
        mov     rax,[rbp-16]
        mov     rbx,[rbp-24]
        vmovsd  [rax+rbx],xmm31                 ; append to collect1
        ; REMAP
        add     rbx,8
        mov     [rbp-24],rbx
        cmp     rbx,[rbp-32]
        jl      next_18
        mov     rbx,0                           ; selects collect1 for remap_collect
        call    remap_collect

next_18:
        add     r12,8
        cmp     r12, 88
        jge     label_1602
        jmp     List_Test_17

        ;______
label_16020:
label_1602:
        cmp     qword[rbp-112],1                ; matched at least once -> next list1 element
        je      label_1201

        ;______
        mov     rax,[rbp-40]
        mov     rbx,[rbp-48]
        vmovsd  [rax+rbx],xmm31                 ; collect2.append(num)
        ; REMAP
        add     rbx,8
        mov     [rbp-48],rbx
        cmp     rbx,[rbp-56]
        jl      next_20
        mov     rbx,8                           ; selects collect2 for remap_collect
        call    remap_collect

next_20:
        jmp     label_1201

        ;______
        ; ---- vector part: compare the first 4 doubles of list1 with xmm31 ----
label_8020:
        xor     r14,r14
        vbroadcastsd ymm0,xmm31

label_802:
        vmovupd ymm1,[r15+r14]

        ;______
label_12020:
label_1202:
label_15010:
label_1501:
        VCMPPD  k1,ymm1,ymm0,14                 ; if list_num > num
        vpcompressq ymm2{k1}{z},ymm1            ; pack the selected lanes to the front
        kmovb   edx,k1
        popcnt  rax,rdx
        shl     rax,3
        mov     rdx,rax                         ; rdx = bytes selected (8 per lane)

        ;______
        mov     rax,[rbp-64]
        mov     rbx,[rbp-72]
        vmovupd [rax+rbx],ymm2                  ; collect3.append(list_num)
        ; REMAP
        add     rbx,rdx
        mov     [rbp-72],rbx
        mov     rax,[rbp-80]
        sub     rax,32                          ; keep room for one full 32-byte store
        cmp     rbx,rax
        jl      next_25
        mov     rbx,16                          ; selects collect3 for remap_collect
        call    remap_collect

next_25:
        ;______
label_15020:
label_1502:
        VCMPPD  k2,ymm1,ymm0,1                  ; if list_num < num
        vpcompressq ymm2{k2}{z},ymm1
        kmovb   edx,k2
        popcnt  rax,rdx
        shl     rax,3
        mov     rdx,rax                         ; rdx = bytes selected (8 per lane)

        ;______
        mov     rax,[rbp-88]
        mov     rbx,[rbp-96]
        vmovupd [rax+rbx],ymm2                  ; append to collect4
        ; REMAP
        add     rbx,rdx
        mov     [rbp-96],rbx
        mov     rax,[rbp-104]
        sub     rax,32                          ; keep room for one full 32-byte store
        cmp     rbx,rax
        jl      next_27
        mov     rbx,24                          ; selects collect4 for remap_collect
        call    remap_collect

next_27:
return_to_top:
        add     r10,8
        cmp     r10,r9
        jl      label_401

ANNOTATE_SITE_END(Complex_If)

        ; __________
label_899:
%include "/opt/P01_SH/_Include_Utilities/Label_899_Clock_Stop.asm"
        mov     r14,4                           ; presumably the output count consumed by the next include - confirm
%include "/opt/P01_SH/_Include_Utilities/Label_899_SglCore-MultiOutput.asm"
        add     rsp,192
        ret

;-----------------------------------------------------------------------
; label_900
;   Tail of Init_Cores_fn (reached via jmp).  Pulls in the shared-memory
;   delete include and returns the address of Return_Pointer_Array in
;   both rdi and rax.
;-----------------------------------------------------------------------
label_900:
        mov     rax,32
%include "/opt/P01_SH/_Include_Utilities/POSIX_Shared_Memory_Delete.asm"
        ; RIP-relative lea instead of an absolute 64-bit immediate, so the
        ; object stays position-independent like the rest of the file.
        lea     rdi,[Return_Pointer_Array]
        mov     rax,rdi
        ret

;-----------------------------------------------------------------------
; FreeMem_fn
;   Free the memory: calls free() with a 40-byte stack adjustment around
;   the call.
;   NOTE(review): the original comment below says the pointer arrives in
;   rcx, but nothing here moves it into rdi before the call; confirm
;   which register callers actually use.
;-----------------------------------------------------------------------
FreeMem_fn:
        ; The pointer is passed back in rcx (of course)
        sub     rsp,40
        call    [rel free wrt ..got]
        add     rsp,40
        ret

;-----------------------------------------------------------------------
; Main_Entry_fn  (global)
;   In : rdi -> stored in input_array_ptr
;        rsi -> stored in data_master_ptr
;   Work: records the Start_Clock value, decodes data_ptr / data_length /
;         shm_size from doubles found through rdi and rsi (truncating
;         conversion), then calls Init_Cores_fn.
;   Out : rax as left by the label_900 tail of Init_Cores_fn.
;   Preserves rdi, rbp, rbx via push/pop.
;-----------------------------------------------------------------------
Main_Entry_fn:
        push    rdi
        push    rsi
        ; Two pushes after the return address leave rsp at 8 mod 16; pad by
        ; 8 so the external call sees a 16-byte aligned stack.
        sub     rsp,8
        call    [rel Start_Clock wrt ..got]
        add     rsp,8
        mov     [Start_Time_C],rax
        pop     rsi
        pop     rdi

        push    rdi
        push    rbp
        push    rbx

        mov     [input_array_ptr],rdi
        mov     [data_master_ptr],rsi

        ; Assign input array ptrs: the first double at [rdi] is converted
        ; to an integer and used as the data pointer.
        lea     rdi,[input_array_ptr]
        mov     rbp,[rdi]
        movsd   xmm0,qword[rbp+0]
        cvttsd2si rax,xmm0
        mov     [data_ptr],rax

        ; Assign input data length(s)
        lea     rdi,[data_master_ptr]
        mov     rbp,[rdi]
        xor     rcx,rcx
        movsd   xmm0,qword[rbp+rcx]
        cvttsd2si rax,xmm0
        mov     [data_length],rax
        add     rcx,8                           ; NOTE(review): rcx is not read again below

        ; Assign output buffer length(s)
        ; NOTE(review): this reads offset 0 again, so shm_size receives the
        ; same value as data_length; confirm [rbp+rcx] was not intended.
        lea     rdi,[data_master_ptr]
        mov     rbp,[rdi]
        vmovsd  xmm0,qword[rbp]
        cvttsd2si rax,xmm0
        mov     [shm_size],rax

        ; __________
        call    Init_Cores_fn

        pop     rbx
        pop     rbp
        pop     rdi
        ret

;-----------------------------------------------------------------------
; remap_collect
;   Called from Complex_If_fn with rbx = 0, 8, 16 or 24 to pick the
;   collect buffer that has reached its limit.  The body comes entirely
;   from the three include files (register save, remap, register restore).
;-----------------------------------------------------------------------
remap_collect:
%include "/opt/P01_SH/_Include_Utilities/Registers_Push_NoAVX.asm"
%include "/opt/P01_SH/_Include_Utilities/Remap_Collect-MultiOutput.asm"
%include "/opt/P01_SH/_Include_Utilities/Registers_Pop_NoAVX.asm"
        ret