- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hello, everyone!
I'm currently researching possibilities of graph algorithm implementations using vector instructions (working in KNL and using AVX-512) and need some help and advice. The first algorithm I tried to implement was Bellman-Ford for shortest paths computations. It's multicore version is relatively easy:
bool changes = true; #pragma omp parallel num_threads(omp_threads) { // do bellman-ford algorithm while (changes) { #pragma omp barrier changes = false; #pragma omp barrier #pragma omp for schedule(static) for (long long cur_edge = 0; cur_edge < edges_count; cur_edge++) { int src = src_ids[cur_edge]; int dst = dst_ids[cur_edge]; _TEdgeWeight weight = weights[cur_edge]; _TEdgeWeight src_distance = _distances[src]; _TEdgeWeight dst_distance = _distances[dst]; if (dst_distance > src_distance + weight) { _distances[dst] = src_distance + weight; changes = true; } } #pragma omp barrier } }
So we get data (source vertex id, destination vertex id and weight) about each graph edge from memory, and try to update distances array with data loaded (which is initially set to inf).
AVX version of this code is bit more complicated:
bool changes = true; __mmask16 *changes_mask = new __mmask16[omp_threads]; #pragma omp parallel num_threads(omp_threads) { while (changes) { #pragma omp barrier changes = false; int thread_id = omp_get_thread_num(); changes_mask[thread_id] = false_mask; #pragma omp barrier #pragma omp for schedule(static) for (int i = 0; i < (edges_count / 16); i++) { __m512i src_avx = _mm512_load_si512(&src_ids_ptrs); __m512i dst_avx = _mm512_load_si512(&dst_ids_ptrs); __m512 weight_avx = _mm512_load_ps(&aligned_weights[16*i]); __m512 src_distance_avx = _mm512_i32gather_ps(src_avx, aligned_distances, 4); __m512 dst_distance_avx = _mm512_i32gather_ps(dst_avx, aligned_distances, 4); __m512 new_distance_avx = _mm512_add_ps(src_distance_avx, weight_avx); __mmask16 cmp_result_mask = _mm512_cmp_ps_mask(new_distance_avx, dst_distance_avx, _MM_CMPINT_LT); changes_mask[thread_id] = _mm512_kor(changes_mask[thread_id], cmp_result_mask); _mm512_mask_i32scatter_ps(aligned_distances, cmp_result_mask, dst_avx, new_distance_avx, 4); } #pragma omp barrier CPUmask check_changes = { changes_mask[thread_id] }; if((check_changes.a[0] != 0)|| (check_changes.a[1] != 0)) changes = true; #pragma omp barrier } }
avx time : 119.692 sec
usual time: 99.2013 sec
avx time : 7.49615 sec
usual time: 2.62707 sec
- Tags:
- Intel® Advanced Vector Extensions (Intel® AVX)
- Intel® Streaming SIMD Extensions
- Parallel Computing
Link Copied
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Ilya,
Your first (simplified) code example has an error and inefficiencies. The following should be a better simplified example:
bool changes = true; #pragma omp parallel num_threads(omp_threads) { // do bellman-ford algorithm while (changes) { ! *** unnecessary barrier ! *** #pragma omp barrier changes = false; #pragma omp barrier ! (necessary) #pragma omp for schedule(static) for (long long cur_edge = 0; cur_edge < edges_count; cur_edge++) { int src = src_ids[cur_edge]; int dst = dst_ids[cur_edge]; _TEdgeWeight weight = weights[cur_edge]; _TEdgeWeight src_distance = _distances[src]; _TEdgeWeight dst_distance = _distances[dst]; if (dst_distance > src_distance + weight) { ! *** multiple threads may have reached the same conclusion #pragma omp critical { if (dst_distance > src_distance + weight) { _distances[dst] = src_distance + weight; changes = true; } } } } ! barrier is implicit on omp for ! *** #pragma omp barrier } }
The following *** untested *** intrinsic code might be better:
__mmask16 changes_mask = true_mask; // copy for all threads #pragma omp parallel num_threads(omp_threads) { int thread_id = omp_get_thread_num(); // inside parallel region, outside loop while (changes_mask) { changes_mask = false_mask; #pragma omp barrier #pragma omp for schedule(static) reduction(|:changes_mask) for (int i = 0; i < (edges_count / 16); i++) { __m512i src_avx = _mm512_load_si512(&src_ids_ptrs); __m512i dst_avx = _mm512_load_si512(&dst_ids_ptrs); __m512 weight_avx = _mm512_load_ps(&aligned_weights[16*i]); __m512 src_distance_avx = _mm512_i32gather_ps(src_avx, aligned_distances, 4); __m512 dst_distance_avx = _mm512_i32gather_ps(dst_avx, aligned_distances, 4); __m512 new_distance_avx = _mm512_add_ps(src_distance_avx, weight_avx); __mmask16 cmp_result_mask = _mm512_cmp_ps_mask(new_distance_avx, dst_distance_avx, _MM_CMPINT_LT); changes_mask = _mm512_kor(changes_mask, cmp_result_mask); _mm512_mask_i32scatter_ps(aligned_distances, cmp_result_mask, dst_avx, new_distance_avx, 4); } } }
Note, you may have issues with the type for changes_mask.
Also note: See if you can rearrange the data such that you can avoid use of gather and scatter.
Jim Dempsey
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Could you share your test data so that one could test it? It seems that performance decrease is due to data race.
You may consider to have at most 1 thread to update each element of the '_distance' array in each iteration......
- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page