/*
 * Cross-core latency probe for one 64-byte cache line.
 *
 * For every ordered pair of distinct CPUs in [FIRST_CPU, LAST_CPU] the main
 * thread pins itself to one CPU and a worker thread to the other.  The main
 * thread periodically publishes a 64-byte record (sequence number + TSC
 * timestamp) with a single AVX-512 store; the worker spins on the record with
 * a single AVX-512 load and accumulates "TSC now - TSC at publication".
 *
 * NOTE(review): the numbers are only meaningful if the TSC is synchronized
 * across the CPUs involved -- confirm for the target machine.
 *
 * Build (GCC/Clang, x86-64): cc -O2 -mavx512f -pthread <file>.c
 */

/* cpu_set_t, CPU_ZERO/CPU_SET and pthread_setaffinity_np() are GNU extensions. */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <immintrin.h>
#include <inttypes.h>
#include <pthread.h>
#include <sched.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define CACHE_LINE_SIZE 64

enum {
    SAMPLE_COUNT        = 300,    /* samples the worker collects per CPU pair */
    PUBLISH_COUNT       = 350,    /* records main publishes per CPU pair (> SAMPLE_COUNT) */
    PUBLISH_INTERVAL_US = 100000, /* sleep between two publications */
    FIRST_CPU           = 2,      /* NOTE(review): assumes CPUs 2..17 exist -- adjust per host */
    LAST_CPU            = 17
};

/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 *
 * Uses the aligned AVX-512 load/store intrinsics, so both pointers must be
 * 64-byte aligned.
 */
static inline __attribute__((always_inline)) void
mov64(uint8_t *dst, const uint8_t *src)
{
    __m512i zmm0;

    zmm0 = _mm512_load_si512((const void *)src);
    _mm512_store_si512((void *)dst, zmm0);
}

#define likely(x)   __builtin_expect((x), 1)
#define unlikely(x) __builtin_expect((x), 0)

/**
 * Read the CPU time-stamp counter.
 *
 * @return the 64-bit TSC value assembled from the EAX (low) and EDX (high)
 *         halves produced by the `rdtsc` instruction.
 */
static inline uint64_t rdtsc(void)
{
    union {
        uint64_t tsc_64;
        __extension__ struct {
            uint32_t lo_32;
            uint32_t hi_32;
        };
    } tsc;

    __asm__ volatile("rdtsc" : "=a" (tsc.lo_32), "=d" (tsc.hi_32));
    return tsc.tsc_64;
}

/*
 * The record exchanged between the two threads.  It overlays a 512-bit vector
 * so that the whole record can be moved with one 64-byte load or store, and it
 * is aligned to a cache line as the aligned intrinsics require.
 */
union levels {
    __m512i zmm0;
    struct {
        uint32_t x1; /* sequence number (first field) */
        uint64_t x2; /* TSC value taken by main just before the store */
        uint64_t x3; /* x3..x7 carry a copy of the sequence number */
        uint32_t x4;
        uint32_t x5;
        uint32_t x6;
        uint32_t x7; /* last copy; compared with x1 to detect a torn read */
    };
} __attribute__((aligned(CACHE_LINE_SIZE)));

union levels g_shared;   /* record written by main, polled by the worker */
uint32_t g_main_cpu;     /* CPU the main thread is pinned to for the current pair */
uint32_t g_worker_cpu;   /* CPU the worker thread is pinned to for the current pair */

/*
 * Pin the calling thread to `cpu`.  Failure is reported on stderr but is not
 * fatal: the run continues unpinned, as it did before the result was checked.
 */
static void pin_current_thread(const char *who, uint32_t cpu)
{
    cpu_set_t cpuset;

    CPU_ZERO(&cpuset);
    CPU_SET(cpu, &cpuset);

    /* pthread_* functions return the error number instead of setting errno. */
    int rc = pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset);
    if (rc != 0) {
        fprintf(stderr, "%s: cannot pin to CPU %u: %s\n", who, (unsigned)cpu, strerror(rc));
    }
}

/**
 * Worker thread body: collect SAMPLE_COUNT latency samples and print a summary.
 *
 * Pins itself to g_worker_cpu, then polls g_shared until it has seen
 * SAMPLE_COUNT records with a sequence number larger than the previous one.
 * Terminates the whole process if a record's first and last sequence fields
 * disagree.
 *
 * @param param unused
 * @return always NULL
 */
void *worker_loop(void *param)
{
    (void)param;

    _mm_mfence();
    pin_current_thread("worker", g_worker_cpu);

    union levels lshared;
    /* Starting at 1 means the publications with sequence 0 and 1 are never sampled. */
    uint32_t old_x1 = 1;
    /* Start min at the largest value so the first sample always replaces it. */
    uint64_t min = UINT64_MAX, max = 0, sum = 0;
    int samples = 0;

    while (samples < SAMPLE_COUNT) {
        /* Compiler barrier: force a fresh load of g_shared on every iteration. */
        __asm__ ("" ::: "memory");
        lshared.zmm0 = _mm512_load_si512((const void *)&g_shared);

        if (unlikely(lshared.x1 <= old_x1)) {
            /* Nothing new yet: spin again immediately (no pause on this path). */
            continue;
        } else if (unlikely(lshared.x1 != lshared.x7)) {
            /* main writes the same value to x1 and x7, so this load mixed two records. */
            fprintf(stderr, "worker: torn 64-byte read (x1=%u x7=%u)\n",
                    (unsigned)lshared.x1, (unsigned)lshared.x7);
            exit(EXIT_FAILURE);
        } else {
            uint64_t val = rdtsc();
            uint64_t diff = val - lshared.x2;

            sum += diff;
            if (min > diff)
                min = diff;
            if (diff > max)
                max = diff;
            samples++;
        }

        old_x1 = lshared.x1;
        _mm_pause();
    }

    printf("(M=%u-W=%u) min=%" PRIu64 " max=%" PRIu64 " mean=%" PRIu64 "\n",
           (unsigned)g_main_cpu, (unsigned)g_worker_cpu, min, max,
           sum / (uint64_t)SAMPLE_COUNT);
    return NULL;
}

/**
 * Run the probe for every ordered pair of distinct CPUs in
 * [FIRST_CPU, LAST_CPU].
 *
 * @return EXIT_SUCCESS, or EXIT_FAILURE if a worker thread cannot be created.
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    for (int main_cpu = FIRST_CPU; main_cpu <= LAST_CPU; ++main_cpu) {
        for (int worker_cpu = FIRST_CPU; worker_cpu <= LAST_CPU; ++worker_cpu) {
            if (main_cpu == worker_cpu) {
                continue;
            }

            _mm_mfence();
            g_main_cpu = (uint32_t)main_cpu;
            g_worker_cpu = (uint32_t)worker_cpu;

            /* Reset the shared record before the new worker starts polling it. */
            memset(&g_shared, 0, sizeof(g_shared));
            pin_current_thread("main", g_main_cpu);

            pthread_t worker;
            int rc = pthread_create(&worker, NULL, worker_loop, NULL);
            if (rc != 0) {
                /* Without a thread there is nothing to join; joining would be undefined. */
                fprintf(stderr, "main: pthread_create failed: %s\n", strerror(rc));
                return EXIT_FAILURE;
            }

            uint32_t val = 0;
            union levels lshared;
            /* Zero padding and the unused tail so the 64-byte store reads no indeterminate bytes. */
            memset(&lshared, 0, sizeof(lshared));

            for (int i = 0; i < PUBLISH_COUNT; ++i) {
                lshared.x1 = val;
                lshared.x2 = rdtsc();
                lshared.x3 = val;
                lshared.x4 = val;
                lshared.x5 = val;
                lshared.x6 = val;
                lshared.x7 = val;
                _mm512_store_si512((void *)&g_shared, lshared.zmm0);
                /* Compiler barrier: keep the store ahead of the sleep below. */
                __asm__ ("" ::: "memory");
                usleep(PUBLISH_INTERVAL_US);
                val++;
                _mm_pause();
            }

            rc = pthread_join(worker, NULL);
            if (rc != 0) {
                fprintf(stderr, "main: pthread_join failed: %s\n", strerror(rc));
            }
        }
    }

    return EXIT_SUCCESS;
}