Hardware Events

Hardware Event Type  Hardware Event Count
ARITH.FPU_DIV_ACTIVE  Not changed, 0
BR_MISP_RETIRED.ALL_BRANCHES_PS  Not changed, 0
CPU_CLK_UNHALTED.REF_TSC  10,542,015,813 - 33,044,049,566 = -22,502,033,753
CPU_CLK_UNHALTED.THREAD  9,408,014,112 - 29,498,044,247 = -20,090,030,135
DSB2MITE_SWITCHES.COUNT  Not changed, 0
DTLB_LOAD_MISSES.DEMAND_LD_WALK_DURATION  Not changed, 0
ICACHE.MISSES  Not changed, 0
IDQ.MS_CYCLES  Not changed, 0
IDQ_UOPS_NOT_DELIVERED.CORE  14,000,021 - 56,000,084 = -42,000,063
INST_RETIRED.ANY  13,246,019,869 - 28,232,042,348 = -14,986,022,479
ITLB_MISSES.WALK_DURATION  Not changed, 0
L1D.REPLACEMENT  1,484,002,226 - 8,820,013,230 = -7,336,011,004
L2_LINES_IN.ALL  191,805,754 - 792,423,772 = -600,618,018
LD_BLOCKS.STORE_FORWARD  Not changed, 0
LD_BLOCKS_PARTIAL.ADDRESS_ALIAS  402,512,075 - 173,605,208 = 228,906,867
MACHINE_CLEARS.MASKMOV  Not changed, 0
MACHINE_CLEARS.MEMORY_ORDERING  Not changed, 0
MACHINE_CLEARS.SMC  Not changed, 0
MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM_PS  Not changed, 0
MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT_PS  Not changed, 0
MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM  Not changed, 0
MEM_LOAD_UOPS_RETIRED.LLC_HIT_PS  54,622,932 - 482,502,566 = -427,879,634
MEM_UOPS_RETIRED.ALL_STORES_PS  1,862,002,793 - 1,036,001,554 = 826,001,239
MEM_UOPS_RETIRED.SPLIT_LOADS_PS  Not changed, 0
MEM_UOPS_RETIRED.SPLIT_STORES_PS  Not changed, 0
OFFCORE_RESPONSE.ALL_DATA_RD.LLC_MISS.DRAM_0  Not changed, 0
TLB_ACCESS.LOAD_STLB_HIT  205,806,174 - 851,925,557 = -646,119,383
UOPS_ISSUED.ANY  14,630,021,945 - 38,878,058,317 = -24,248,036,372
UOPS_RETIRED.RETIRE_SLOTS  14,616,021,924 - 38,864,058,296 = -24,248,036,372

///////////////////////////////////////////////////////////////////////////////
// tensor_kernel()

Back-end Bound Pipeline Slots: 0.619
DIV Active: 0.000
Memory Latency
    LLC Miss: 0.000
    LLC Hit: 0.167
    DTLB Overhead: 0.182
    Contested Accesses: 0.000
    Data Sharing: 0.000
Memory Reissues
    Loads Blocked by Store Forwarding: 0.000
    Split Loads: 0.000
    Split Stores: 0.000
    4K Aliasing: 0.154

///////////////////////////////////////////////////////////////////////////////
// tensor_kernel_1()

Back-end Bound Pipeline Slots: 0.659
DIV Active: 0.000
Memory Latency
    LLC Miss: 0.000
    LLC Hit: 0.493
    DTLB Overhead: 0.198
    Contested Accesses: 0.000
    Data Sharing: 0.000
Memory Reissues
    Loads Blocked by Store Forwarding: 0.000
    Split Loads: 0.000
    Split Stores: 0.000
    4K Aliasing: 0.028

...A significant proportion of cycles is being spent on data fetches that miss in the L2 but hit in the LLC...