; Header Section
;-----------------------------------------------------------------------
; Syntax : NASM, Intel operand order, 64-bit, RIP-relative by default.
; ABI    : external routines are called through the GOT and are assumed
;          to follow the System V AMD64 calling convention
;          (NOTE(review): confirm against the C side of the project).
; Exports: Main_Entry_fn, Complex_Calc_YZ_fn
;
; Several code fragments are pulled in with %include and are not visible
; from this file; comments below describe only what this file itself does.
;-----------------------------------------------------------------------
[BITS 64]
[default rel]

global Main_Entry_fn
global Complex_Calc_YZ_fn

extern sched_getcpu
extern fopen, fread, fwrite, fclose
extern malloc, calloc, realloc, free
extern posix_memalign
extern get_core_count_C, thread_create_in_C
extern Start_Clock, End_Clock, Print_Time
extern open_shm, get_shm_name, remap_shm, unlink_shm
extern site_begin, iter_task, site_end ; Intel Advisor annotations

section .data align=64

Return_Pointer_Array: dq 0, 0, 0
;File_Create_Ptrs: dq 0
;Output_File_Ptrs: dq 0
;Output_File_Ctrs: dq 0
Clock_Stop: dq 0
input_array_ptr: dq 0
data_master_ptr: dq 0
Number_Of_Cores: dq 0 ; core count multiplied by 8 _e.g. 32
Number_Of_Cores_Seq: dq 0 ;sequential core count _e.g. 4 cores
Number_Of_Cores_Open: dq 0
Number_Of_Cores_Calc: dq 0
cores_to_use: dq 1
stride: dq 0
AVX_Width: dq 64
Start_Time_C: dq 0
writer_core: dq 0
out_fname_ptr: dq 0
shm_size: dq 0
shm_fd: dq 0
core_count: dq 0
remap_new_ptr: dq 0
remap_new_len: dq 0
n_ptr: dq 0
n_length: dq 0
collect_ptr: dq 0
collect_length: dq 0
stack_lengths: dq 0
stack_ptrs: dq 0
pad_data_section: times 3 dq 0

section .rodata align=64

shm_base_name: db "shm_object_%d_%d",0x00
; Each constant is stored twice so one 128-bit load fills both lanes.
const_0.1: times 2 dq 0.1
const_1.962: times 2 dq 1.962
const_0.5: times 2 dq 0.5
const_100.0: times 2 dq 100.0
const_0.01: times 2 dq 0.01
pad_const_section: times 6 dq 0
; NOTE(review): a floating-point operand to DB is unusual; reproduced
; unchanged because it is not referenced anywhere in this file.
CURR_DTM: db 1611953332.757706,0
out_fname_L: db "/opt/Test_Output_Files/Complex_Calc_YZ_NASM_Test",0x00
file_mode_create: db "wb+",0x00
file_mode_open: db "a",0x00
file_mode_open_read: db "r",0x00

; _________

section .text

;-----------------------------------------------------------------------
; CCYZ_CALL_EXTERN sym
;   Call an external routine through its GOT slot with rsp 16-byte
;   aligned at the call instruction.
;   Precondition: rsp % 16 == 8 where the macro is used.
;   Clobbers    : flags, plus whatever the callee clobbers.
;-----------------------------------------------------------------------
%macro CCYZ_CALL_EXTERN 1
        sub     rsp, 8
        call    [rel %1 wrt ..got]
        add     rsp, 8
%endmacro

;-----------------------------------------------------------------------
; CCYZ_CALC_PAIR byte_offset
;   Process the two doubles at [r13 + r12 + byte_offset] and store the
;   two results at [r15 + r14 + byte_offset].
;
;   Source-level computation being implemented:
;       v0 = a
;       g  = v0 * 1.962
;       t  = 0.1
;       h  = v0 * t - 0.5 * g * t @ 2
;       result = round(h,2)
;       collect.append(result)
;
;   In : xmm31 = 1.962, xmm30 = 0.1, xmm29 = 0.5, xmm28 = 100.0,
;        xmm27 = 0.01, xmm26 = 0.1 * 0.1 (both lanes each)
;   Clobbers: xmm0, xmm1, xmm16-xmm20
;-----------------------------------------------------------------------
%macro CCYZ_CALC_PAIR 1
        movupd  xmm0, [r13 + r12 + %1]          ; for a in n
        vmulpd  xmm20, xmm0, xmm31              ; g = v0 * 1.962
        vmulpd  xmm19, xmm0, xmm30              ; v0 * t
        vmulpd  xmm18, xmm29, xmm20             ; 0.5 * g
        vmulpd  xmm17, xmm26, xmm18             ; (0.5 * g) * t*t
        vsubpd  xmm16, xmm19, xmm17             ; h
        vmulpd  xmm1, xmm16, xmm28              ; h * 100.0
        roundpd xmm0, xmm1, 00B                 ; result = round(h,2): rounding mode 0
        vmulpd  xmm0, xmm0, xmm27               ; * 0.01
        vmovupd [r15 + r14 + %1], xmm0          ; collect.append(result)
%endmacro

;-----------------------------------------------------------------------
; Init_Cores_fn  (internal, reached by `call` from Main_Entry_fn)
;   Runs three included setup fragments, calls thread_create_in_C with
;   [Number_Of_Cores_Seq] in rdi, then continues at label_900, whose
;   final `ret` returns to Main_Entry_fn.
;
;   NOTE(review): rsp alignment at the thread_create_in_C call depends
;   on what the included fragments do to the stack; not verifiable here.
;-----------------------------------------------------------------------
Init_Cores_fn:

%include "/opt/P01_SH/_Include_Utilities/Output_Files.asm"

%include "/opt/P01_SH/_Include_Utilities/Init_Cores.asm"

%include "/opt/P01_SH/_Include_Utilities/POSIX_Shared_Memory_Sgl.asm"

; Create Threads

label_first:

        mov     rdi, [Number_Of_Cores_Seq]      ; see above
        call    [rel thread_create_in_C wrt ..got] ; Create_Threads_in_C.c

label_after:

        jmp     label_900

;____________

;-----------------------------------------------------------------------
; Complex_Calc_YZ_fn  (exported)
;   Walks the input buffer at [n_ptr] in steps of [AVX_Width] bytes,
;   applies CCYZ_CALC_PAIR to each 16-byte pair and writes the results to
;   the buffer at [collect_ptr]. On exit [collect_length] holds the
;   output byte offset reached (r14).
;
;   Presumably the routine started by thread_create_in_C (verify against
;   Create_Threads_in_C.c). Because it is exported, it now preserves the
;   callee-saved registers rbp, rbx and r12-r15, which the previous
;   version overwrote.
;
;   Register roles inside the loop:
;       r15 = output base          r14 = output byte offset
;       r13 = input base           r12 = input byte offset
;       r11 = [collect_length]     r10 = [n_length]
;       r9  = [AVX_Width]          r8  = single-step flag (always 0 here)
;
;   Stack frame (rbp-relative, 128 bytes of locals):
;       [rbp-8]  = (sched_getcpu() - [writer_core]) * 8
;       [rbp-24] = [n_length] rounded down to a multiple of 32
;       [rbp-56] / [rbp-64] = r15 / r11 saved around remap_collect
;
;   Stack parity: six pushes (48 bytes) plus 128 bytes of locals leave
;   rsp % 16 == 8, the same parity the previous version presented to the
;   included fragments and to remap_collect. External calls made directly
;   from this function go through CCYZ_CALL_EXTERN to get an aligned rsp.
;-----------------------------------------------------------------------
Complex_Calc_YZ_fn:

        push    rbp
        push    rbx
        push    r12
        push    r13
        push    r14
        push    r15
        mov     rbp, rsp
        sub     rsp, 128

; Get the core number
        CCYZ_CALL_EXTERN sched_getcpu
        sub     rax, [writer_core]
        mov     rbx, 8
        mul     rbx                             ; multiply by 8
        mov     [rbp-8], rax

; Populate the callee-saved loop registers (these survive the calls below)
        mov     r15, [collect_ptr]
        xor     r14, r14
        mov     r13, [n_ptr]
        xor     r12, r12

; Populate stack
        mov     rax, 65535
        kmovq   k7, rax                         ; k7 is not read anywhere in this file

;set the even_boundary for loop unrolling
        mov     rax, [n_length]
        mov     rbx, 32
        xor     edx, edx                        ; div uses rdx:rax; rdx still held the
                                                ; high half of the mul above
        div     rbx
        mul     rbx                             ; we multiply by the floor of the division
        mov     [rbp-24], rax

; _____

        CCYZ_CALL_EXTERN site_begin             ; Intel Advisor

label_401:

        CCYZ_CALL_EXTERN iter_task              ; Intel Advisor

; r8-r11 and all xmm registers are caller-saved, so they cannot be
; assumed to survive the external call above: load them here, once per
; iteration, instead of once before the loop.
        xor     r8, r8
        mov     r11, [collect_length]
        mov     r10, [n_length]
        mov     r9, [AVX_Width]

; Load constants into registers
        vmovupd xmm31, [const_1.962]
        vmovupd xmm30, [const_0.1]
        vmovupd xmm29, [const_0.5]
        vmovupd xmm28, [const_100.0]
        vmovupd xmm27, [const_0.01]
        vmulpd  xmm26, xmm30, xmm30             ; t @ 2

        CCYZ_CALC_PAIR 0

        bt      r8, 0
        jc      next_singlestep

;__________
        CCYZ_CALC_PAIR 16
;__________
        CCYZ_CALC_PAIR 32
;__________
        CCYZ_CALC_PAIR 48

        add     r14, r9

; check for remap based on length - 64
        mov     rax, [collect_length]
        sub     rax, 64
        cmp     r14, rax
        jl      NoRemap

;If this is last iteration, no remap
        add     rax, r9
        cmp     rax, r12
        jge     NoRemap

        jmp     do_remap

next_singlestep: ; this code is only reached on singlestep
        add     r14, 16
        add     r12, 16
        cmp     r12, r10
        jge     return_to_top
        ; otherwise falls through into do_remap

do_remap:
; NOTE(review): remap_collect returns new values in r15/r11, but the two
; loads after the call replace them with the values saved before it, and
; those saved values are what get written to collect_ptr/collect_length.
; Left exactly as found; confirm whether this is intended.
        mov     [rbp-56], r15
        mov     [rbp-64], r11
        call    remap_collect
        mov     r15, [rbp-56]
        mov     r11, [rbp-64]
        mov     [collect_ptr], r15
        mov     [collect_length], r11

NoRemap:

;__________

return_to_top:

        add     r12, r9
        cmp     r12, r10
        jl      label_401

; __________

label_899:

        CCYZ_CALL_EXTERN site_end               ; Intel Advisor

%include "/opt/P01_SH/_Include_Utilities/Label_899_Clock_Stop.asm"

        mov     [collect_length], r14           ; For file write

        add     rsp, 128
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     rbx
        pop     rbp
        ret

; __________

;-----------------------------------------------------------------------
; label_900  (tail of Init_Cores_fn, entered by `jmp`)
;   Runs the included output and shared-memory-delete fragments, then
;   returns the address of Return_Pointer_Array in both rdi and rax.
;-----------------------------------------------------------------------
label_900:

%include "/opt/P01_SH/_Include_Utilities/Label_900_SglCore-SglOutput.asm"

        mov     rax, 8

%include "/opt/P01_SH/_Include_Utilities/POSIX_Shared_Memory_Delete.asm"

        lea     rdi, [rel Return_Pointer_Array] ; RIP-relative instead of an absolute immediate
        mov     rax, rdi
        ret

; __________
; Main Entry

;-----------------------------------------------------------------------
; Main_Entry_fn  (exported)
;   In : rdi = pointer to a table whose first qword is stored in [n_ptr]
;        rsi = pointer to a table whose first qword is stored in
;              [n_length], [shm_size] and [collect_length]
;   Out: rax = whatever Init_Cores_fn leaves in rax (label_900 sets it to
;        the address of Return_Pointer_Array)
;   Records the value returned by Start_Clock in [Start_Time_C] and
;   preserves rbx and rbp for the caller.
;-----------------------------------------------------------------------
Main_Entry_fn:

        push    rdi
        push    rsi
        CCYZ_CALL_EXTERN Start_Clock            ; two pushes leave rsp % 16 == 8
        mov     [Start_Time_C], rax
        pop     rsi
        pop     rdi

        push    rdi
        push    rbp
        push    rbx

        mov     [input_array_ptr], rdi
        mov     [data_master_ptr], rsi

; Get input array ptr(s) and length(s)
        mov     rax, [rdi+0]
        mov     [n_ptr], rax
        mov     rax, [rsi+0]
        mov     [n_length], rax
        mov     [shm_size], rax
        mov     [collect_length], rax

; __________

        call    Init_Cores_fn

        pop     rbx
        pop     rbp
        pop     rdi
        ret

;-----------------------------------------------------------------------
; remap_collect  (internal)
;   In : r15 = current output pointer, r11 = current output length
;        (copied to rdi / rsi for the included remap fragment)
;   Out: r15 = [remap_new_ptr] (also stored to [collect_ptr]),
;        r11 = [remap_new_len]
;   Register save/restore is done by the included push/pop fragments.
;-----------------------------------------------------------------------
remap_collect:

%include "/opt/P01_SH/_Include_Utilities/Registers_Push_NoAVX.asm"

        mov     rdi, r15
        mov     rsi, r11

%include "/opt/P01_SH/_Include_Utilities/Remap_Collect-SglOutput.asm"

%include "/opt/P01_SH/_Include_Utilities/Registers_Pop_NoAVX.asm"

        mov     r15, [remap_new_ptr]
        mov     [collect_ptr], r15              ; to write in label_900
        mov     r11, [remap_new_len]

        ret