// gcc -DUSE_PREFETCH -fno-tree-vectorize -fno-inline -std=c99 -Wall -O3 -g loop.c -o loop
// gcc -fno-tree-vectorize -fno-inline -std=c99 -Wall -O3 -g loop.c -o loop
//
// For icc and gcc, -fno-tree-vectorize helps keep the loop as designed. clang ignores it.
// If times are extremely fast, check the assembly to see if the loop was vectorized.
//
// Micro-benchmark: fills a uint32_t array with 1s using several traversal
// orders (forward, backward, two interleaved streams) and reports the minimum
// observed cycles per element for each, with or without software prefetch.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <xmmintrin.h>

// number of elements in test array
#define ARRAY_LENGTH (1024 * 1024)

// report minimum cycles after this number of attempts
#define TIMING_REPEATS (100)

// PREFETCH(x) issues a prefetch for address x when USE_PREFETCH is defined
// (hint value 0, i.e. _MM_HINT_NTA) and expands to nothing otherwise.
#ifdef USE_PREFETCH
#undef PREFETCH
#define PREFETCH(x) _mm_prefetch((char *)(x), 0)
#else
#define PREFETCH(x)
#endif

// Read the time-stamp counter at the start of a timed region into `cycles`.
// CPUID is executed first so that earlier instructions cannot drift into the
// timed region.
#define RDTSC_START(cycles)                                             \
    do {                                                                \
        register unsigned cyc_high, cyc_low;                            \
        __asm volatile("cpuid\n\t"                                      \
                       "rdtsc\n\t"                                      \
                       "mov %%edx, %0\n\t"                              \
                       "mov %%eax, %1\n\t"                              \
                       : "=r" (cyc_high), "=r" (cyc_low)                \
                       :: "%rax", "%rbx", "%rcx", "%rdx");              \
        (cycles) = ((uint64_t)cyc_high << 32) | cyc_low;                \
    } while (0)

// Read the time-stamp counter at the end of a timed region into `cycles`.
// RDTSCP is followed by CPUID so that later instructions cannot drift into
// the timed region.
#define RDTSC_FINAL(cycles)                                             \
    do {                                                                \
        register unsigned cyc_high, cyc_low;                            \
        __asm volatile("rdtscp\n\t"                                     \
                       "mov %%edx, %0\n\t"                              \
                       "mov %%eax, %1\n\t"                              \
                       "cpuid\n\t"                                      \
                       : "=r" (cyc_high), "=r" (cyc_low)                \
                       :: "%rax", "%rbx", "%rcx", "%rdx");              \
        (cycles) = ((uint64_t)cyc_high << 32) | cyc_low;                \
    } while (0)

// Run test(...) TIMING_REPEATS times, keep the smallest cycle count seen, and
// print it divided by ARRAY_LENGTH as "cycles/element". The test name is
// printed (and flushed) before timing starts so a hang is attributable.
#define PRINT_TIMED_TEST(test, ...)                                     \
    do {                                                                \
        printf("%20s: ", #test);                                        \
        fflush(NULL);                                                   \
        uint64_t cycles_start, cycles_final, cycles_diff;               \
        uint64_t min_diff = (uint64_t) -1;                              \
        for (int i = 0; i < TIMING_REPEATS; i++) {                      \
            RDTSC_START(cycles_start);                                  \
            test(__VA_ARGS__);                                          \
            RDTSC_FINAL(cycles_final);                                  \
            cycles_diff = (cycles_final - cycles_start);                \
            if (cycles_diff < min_diff) min_diff = cycles_diff;         \
        }                                                               \
        float cycles_per_element = min_diff / (float) ARRAY_LENGTH;     \
        printf("%.2f cycles/element\n", cycles_per_element);            \
    } while (0)

// Write 1 to every element, walking from array[0] up to array[length - 1].
void loop_forward_single(size_t length, uint32_t *array)
{
    uint32_t *top = array;
    while (length--) {
        PREFETCH(top + 64);
        *top++ = 1;
    }
}

// Write 1 to every element using two ascending streams: one starting at
// array[0] and one starting at array[length / 2].
void loop_forward_double(size_t length, uint32_t *array)
{
    size_t half = length / 2;
    uint32_t *top = array;
    uint32_t *middle = array + half;
    while (half--) {
        PREFETCH(top + 32);
        *top++ = 1;
        PREFETCH(middle + 32);
        *middle++ = 1;
    }
    // With an odd length the two streams cover 2 * (length / 2) elements;
    // the upper stream now points at the one remaining (last) element.
    if (length % 2) {
        *middle = 1;
    }
}

// Write 1 to every element, walking from array[length - 1] down to array[0].
void loop_backward_single(size_t length, uint32_t *array)
{
    uint32_t *bottom = array + length - 1;
    while (length--) {
        PREFETCH(bottom - 64);
        *bottom-- = 1;
    }
}

// Write 1 to every element using two descending streams: one starting at
// array[length / 2 - 1] and one starting at array[length - 1].
void loop_backward_double(size_t length, uint32_t *array)
{
    size_t half = length / 2;
    // The lower stream must start at the last element of the lower half;
    // starting at array + half would write array[half] twice and never
    // reach array[0].
    uint32_t *middle = array + half - 1;
    uint32_t *bottom = array + length - 1;
    while (half--) {
        PREFETCH(middle - 32);
        *middle-- = 1;
        PREFETCH(bottom - 32);
        *bottom-- = 1;
    }
    // With an odd length, the upper stream stops on the one element that
    // neither stream has written yet.
    if (length % 2) {
        *bottom = 1;
    }
}

// Write 1 to every element using one ascending stream from array[0] and one
// descending stream from array[length - 1], meeting in the middle.
void loop_to_middle(size_t length, uint32_t *array)
{
    size_t half = length / 2;
    uint32_t *top = array;
    uint32_t *bottom = array + length - 1;
    while (half--) {
        PREFETCH(top + 32);
        *top++ = 1;
        PREFETCH(bottom - 32);
        *bottom-- = 1;
    }
    // With an odd length, both streams stop on the untouched middle element.
    if (length % 2) {
        *top = 1;
    }
}

// Allocate and zero the test array, announce the configuration, then time
// each loop variant over the same array. Returns EXIT_FAILURE if the array
// cannot be allocated.
int main(int argc, char **argv)
{
    (void) argc;
    (void) argv;

    size_t array_bytes = ARRAY_LENGTH * sizeof(uint32_t);
    uint32_t *array = malloc(array_bytes);
    if (array == NULL) {
        fprintf(stderr, "failed to allocate %zu bytes\n", array_bytes);
        return EXIT_FAILURE;
    }
    // Touch every page up front so the timed loops do not pay for first-use
    // page faults.
    memset(array, 0, array_bytes);

    printf("Testing with array length=%d", ARRAY_LENGTH);
#ifdef USE_PREFETCH
    printf(" with prefetch on\n");
#else
    printf(" without prefetch\n");
#endif

    PRINT_TIMED_TEST(loop_forward_single, ARRAY_LENGTH, array);
    PRINT_TIMED_TEST(loop_forward_double, ARRAY_LENGTH, array);
    PRINT_TIMED_TEST(loop_backward_single, ARRAY_LENGTH, array);
    PRINT_TIMED_TEST(loop_backward_double, ARRAY_LENGTH, array);
    PRINT_TIMED_TEST(loop_to_middle, ARRAY_LENGTH, array);

    free(array);
    return EXIT_SUCCESS;
}