Intel® MPI Library

Incorrect data distribution of subarrays with MPI_Scatterv

pavel_kratochvil
Beginner

Working on an MPI application in which I scatter tiles of ints from the root rank across all ranks using `MPI_Scatterv`, I encountered a bug in the data distribution in certain configurations (tile sizes and number of processes).

The program runs correctly with Open MPI and, as was confirmed here, also with MPICH. However, it produces incorrect results when launched with Intel MPI: the data distributed across the ranks is wrong, even though the counts and displacements used in `MPI_Scatterv` are verified to be correct.

The issue arises specifically when `np * 4 == global_edge_size`; other configurations with different decompositions and tile sizes work fine.

In the provided code, `global_edge_size == 32` produces incorrect results for `-np 8` (each tile is then 32 / 8 = 4 columns wide). The incorrect distribution becomes visible when ranks 5, 6, and 7 abort after comparing the received values with the reference ones.

Code in C++ can be found in the original SO submission here.

main.c

 

#include "mpi.h"
#include <stdio.h>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    const int global_edge_size = 32;

    // fill the initial values for distribution on root rank
    int values_for_distribution[32 * 32] = { 0 };
    if (rank == 0) {
        for (int i = 0; i < global_edge_size * global_edge_size; i++) {
            values_for_distribution[i] = i;
        }
    }

    // 1d decomposition
    // num_ranks in x and y direction
    const int dims[2] = {8, 1};
    const int n_dims = 1;

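    // column-strip decomposition: each rank owns tile_size_x columns
    // spanning the full height (tile_size_y rows) of the global domain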
    const int tile_size_x = global_edge_size / size;
    const int tile_size_y = global_edge_size;
    // local tile size with border around the tile
    const int border_size = 2;
    const int tile_size_with_halo_x = tile_size_x + (2 * border_size);
    const int tile_size_with_halo_y = tile_size_y + (2 * border_size);
    
    // initialized to MPI_DATATYPE_NULL so that non-root ranks pass a defined
    // (albeit unused) send type to MPI_Scatterv
    MPI_Datatype global_tile_type_float = MPI_DATATYPE_NULL;
    MPI_Datatype local_tile_type_float = MPI_DATATYPE_NULL;
    
    // only root needs the global tile type
    if (rank == 0) {
        const int domain_dims[2] = {global_edge_size, global_edge_size};
        const int tile_dims[2] = {tile_size_y, tile_size_x};
        const int start_arr[2] = {0, 0};

        // initial datatypes to derive the resized types from
        MPI_Datatype tile_org_type_float = MPI_DATATYPE_NULL;
        MPI_Type_create_subarray(2, domain_dims, tile_dims, start_arr, MPI_ORDER_C, MPI_INT, &tile_org_type_float);
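        // resize the subarray type to an extent of one element so that the
        // displacements passed to MPI_Scatterv are plain element offsets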
        MPI_Type_create_resized(tile_org_type_float, 0, 1 * sizeof(int), &global_tile_type_float);
        MPI_Type_commit(&global_tile_type_float);

        MPI_Type_free(&tile_org_type_float);
    }

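    // receive-side type: a tile-sized subarray placed at offset
    // (border_size, border_size) inside the halo-padded local buffer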
    int local_tile_with_halo_dims[2] = {tile_size_with_halo_y, tile_size_with_halo_x};
    int local_tile_dims[2] = {tile_size_y, tile_size_x};
    int start_arr[2] = {border_size, border_size};

    MPI_Type_create_subarray(2, local_tile_with_halo_dims, local_tile_dims, start_arr, MPI_ORDER_C, MPI_INT, &local_tile_type_float);
    MPI_Type_commit(&local_tile_type_float);

    // array of ones, each rank receives one tile
    int counts[8] = {1, 1, 1, 1, 1, 1, 1, 1};
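    // displacements are in units of the resized extent (one element):
    // rank k's tile starts at column k * tile_size_x = k * 4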
    int displacements[8] = {0, 4, 8, 12, 16, 20, 24, 28};
    // local buffer for receiving
    // (tile_size_y + (2 * border_size)) * (tile_size_x + (2 * border_size))
    int tile_temps[(32 + (2 * 2)) * (4 + (2 * 2))] = {0};

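    // root sends one resized global tile per rank; each rank receives it
    // into the interior of its halo-padded buffer via local_tile_type_float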
    MPI_Scatterv(values_for_distribution, counts, displacements, global_tile_type_float, tile_temps, 1, local_tile_type_float, 0, MPI_COMM_WORLD);

    // rebuild the reference values locally so that every rank can verify its tile
    for (int i = 0; i < global_edge_size * global_edge_size; i++) {
        values_for_distribution[i] = i;
    }

    // compare the interior of the halo-padded tile against the reference values
    for (int r = 0; r < tile_size_y; r++) {
        for (int c = 0; c < tile_size_x; c++) {
            int rank_value = tile_temps[(r + border_size) * tile_size_with_halo_x + c + border_size];

            int offset_in_row = rank * tile_size_x;
            int reference_value = values_for_distribution[r * global_edge_size + offset_in_row + c];

            if (rank_value != reference_value) MPI_Abort(MPI_COMM_WORLD, -1);
        }
    }

    // release the derived datatypes before finalizing
    if (rank == 0) MPI_Type_free(&global_tile_type_float);
    MPI_Type_free(&local_tile_type_float);

    MPI_Finalize();
    return 0;
}

 

CMakeLists.txt 

 

cmake_minimum_required(VERSION 3.20)

project(test LANGUAGES C)

set(CMAKE_C_STANDARD          11)
set(CMAKE_C_STANDARD_REQUIRED ON)

find_package(MPI REQUIRED COMPONENTS C)

add_executable(main ${CMAKE_CURRENT_SOURCE_DIR}/main.c)
target_link_libraries(main MPI::MPI_C)

 

Produced output:

Abort(-1) on node 5 (rank 5 in comm 0): application called MPI_Abort(MPI_COMM_WORLD, -1) - process 5
Abort(-1) on node 6 (rank 6 in comm 0): application called MPI_Abort(MPI_COMM_WORLD, -1) - process 6
Abort(-1) on node 7 (rank 7 in comm 0): application called MPI_Abort(MPI_COMM_WORLD, -1) - process 7

Commands used for compilation and execution

 

cmake -Bbuild -S.
cmake --build build --config Release
mpirun -np 8 ./main

 

mpirun --version 

 

$ mpirun --version
Intel(R) MPI Library for Linux* OS, Version 2021.10 Build 20230619 (id: c2e19c2f3e)
Copyright 2003-2023, Intel Corporation.

 

1 Reply
TobiasK
Moderator

@pavel_kratochvil


Thank you for reporting this. We are looking into the issue and will fix it in a future release.

In the meantime, please use:


export I_MPI_ADJUST_SCATTERV=0

or

export I_MPI_ADJUST_SCATTERV=1

or

export I_MPI_ADJUST_SCATTERV=2


depending on the overall performance impact for your application.
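For example, to apply the workaround to the reproducer above without changing any code, the variable can be exported before the run (which of the three values works and performs best is application-dependent; the value below is only an illustration):

export I_MPI_ADJUST_SCATTERV=1
mpirun -np 8 ./main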

