Intel® MPI Library
Get help with building, analyzing, optimizing, and scaling high-performance computing (HPC) applications.

OpenMP/MPI code for matrix multiplication

dusear4la
Beginner
Hi,
I have this matrix multiplication code that works fine with MPI, and I am trying to add OpenMP directives so it can use my quad-core resources. Can someone please tell me where and what to add to this code to make it a working hybrid OpenMP/MPI program?


#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <mpi.h>

typedef struct GridInfo {

int p; /* Total number of processes */

MPI_Comm comm; /* Communicator for entire grid */

MPI_Comm row_comm; /* Communicator for my row */

MPI_Comm col_comm; /* Communicator for my col */

int q; /* Order of grid */

int my_row; /* My row number */

int my_col; /* My column number */

int my_rank; /* My rank in the grid comm */

} GRID_INFO_T;

#define MAX 65536

#define RandVal rand()%10 // Value used to fill the matrix

typedef struct LocalMatrix {

int n_bar;

#define Order(A) ((A)->n_bar)

float entries[MAX];

#define Entry(A,i,j) (*(((A)->entries) + ((A)->n_bar)*(i) + (j))) // Address arithmetic

} LOCAL_MATRIX_T;

/* Function Declarations */

LOCAL_MATRIX_T* Local_matrix_allocate(int n_bar); // Memory for matrix

void Free_local_matrix(LOCAL_MATRIX_T** local_A); // Free memory

void Generate_matrix(char* title, LOCAL_MATRIX_T* local_A, // Fills the matrix
                     GRID_INFO_T* grid, int n);

void Print_matrix(char* title, LOCAL_MATRIX_T* local_A, // Prints the matrix on stdout
                  GRID_INFO_T* grid, int n);

void Set_to_zero(LOCAL_MATRIX_T* local_A); // Fills the matrix with zeroes

void Local_matrix_multiply(LOCAL_MATRIX_T* local_A, // Local matrix multiply
                           LOCAL_MATRIX_T* local_B, LOCAL_MATRIX_T* local_C);

void Build_matrix_type(LOCAL_MATRIX_T* local_A); // Creation of new MPI type for matrix

MPI_Datatype local_matrix_mpi_t; // New MPI type for matrix

double Sequential_time(int n);

LOCAL_MATRIX_T* temp_mat;

/*********************************************************/

int main(int argc, char* argv[]) {

int p; // Number of processes

int my_rank; // Rank of process

GRID_INFO_T grid; // Grid info of process

LOCAL_MATRIX_T* local_A;

LOCAL_MATRIX_T* local_B;

LOCAL_MATRIX_T* local_C;

int n; // Order of matrix

int n_bar; // Order of submatrix

double begin_time, end_time, interval_p, interval_s;

void Setup_grid(GRID_INFO_T* grid); // Setup the grid info of process

void Fox(int n, GRID_INFO_T* grid, LOCAL_MATRIX_T* local_A, // Fox's algorithm
         LOCAL_MATRIX_T* local_B, LOCAL_MATRIX_T* local_C);

MPI_Init(&argc, &argv);

MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

MPI_Comm_size(MPI_COMM_WORLD, &p);

Setup_grid(&grid); // Setup the grid info of process

if (my_rank == 0) {

printf("Enter the matrices order: ");

fflush(stdout);

scanf("%d", &n);

}

//Sending size to all processes

MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);

n_bar = n/grid.q; // Size of submatrix

local_A = Local_matrix_allocate(n_bar); // Memory for submatrix A in each process

Order(local_A) = n_bar;

Generate_matrix("\\nGenerate A", local_A, &grid, n); // Only P(0) do this

Print_matrix("Matrix A =", local_A, &grid, n); // Only P(0) do this

local_B = Local_matrix_allocate(n_bar); // Memory for submatrix B in each process

Order(local_B) = n_bar;

Generate_matrix("\\nGenerate B", local_B, &grid, n); // Only P(0) do this

Print_matrix("Matrix B =", local_B, &grid, n); // Only P(0) do this

Build_matrix_type(local_A); // Struct type for submatrix (MPI)

temp_mat = Local_matrix_allocate(n_bar); // Memory for Temp matrix

local_C = Local_matrix_allocate(n_bar); // Memory for result matrix C

Order(local_C) = n_bar; // Size of C

begin_time = MPI_Wtime();

Fox(n, &grid, local_A, local_B, local_C); // Block algorithm of Fox

end_time = MPI_Wtime();

Print_matrix("\\nThe product is:", local_C, &grid, n);

interval_p = end_time - begin_time;

if (my_rank == 0) {

/* Sequential matrix multiplication */

interval_s = Sequential_time(n);

/* Speed-up */

printf("\\nTp = %.10f", interval_p);

printf("\\nTs = %.10f", interval_s);

printf("\\nS = %.10f\\n\\n", interval_s/interval_p);

}

Free_local_matrix(&local_A);

Free_local_matrix(&local_B);

Free_local_matrix(&local_C);

return MPI_Finalize();

} /* main */

/*********************************************************/

void Setup_grid(

GRID_INFO_T* grid /* out */) {

int old_rank;

int dimensions[2];

int wrap_around[2];

int coordinates[2];

int free_coords[2];

/* Set up Global Grid Information */

MPI_Comm_size(MPI_COMM_WORLD, &(grid->p));

MPI_Comm_rank(MPI_COMM_WORLD, &old_rank);

/* We assume p is a perfect square */

grid->q = (int) sqrt((double) grid->p);

dimensions[0] = dimensions[1] = grid->q;

/* We want a circular shift in second dimension. */

/* Don't care about first */

wrap_around[0] = wrap_around[1] = 1;

MPI_Cart_create(MPI_COMM_WORLD, 2, dimensions, wrap_around, 1, &(grid->comm));

MPI_Comm_rank(grid->comm, &(grid->my_rank));

MPI_Cart_coords(grid->comm, grid->my_rank, 2, coordinates);

grid->my_row = coordinates[0];

grid->my_col = coordinates[1];

/* Set up row communicators */

free_coords[0] = 0;

free_coords[1] = 1;

MPI_Cart_sub(grid->comm, free_coords, &(grid->row_comm));

/* Set up column communicators */

free_coords[0] = 1;

free_coords[1] = 0;

MPI_Cart_sub(grid->comm, free_coords, &(grid->col_comm));

} /* Setup_grid */

/*********************************************************/

void Fox(

int n /* in */,

GRID_INFO_T* grid /* in */,

LOCAL_MATRIX_T* local_A /* in */,

LOCAL_MATRIX_T* local_B /* in */,

LOCAL_MATRIX_T* local_C /* out */) {

LOCAL_MATRIX_T* temp_A; /* Storage for the sub- */

/* matrix of A used during */

/* the current stage */

int stage;

int bcast_root;

int n_bar; /* n/sqrt(p) */

int source;

int dest;

MPI_Status status;

n_bar = n/grid->q;

Set_to_zero(local_C);

/* Calculate addresses for circular shift of B */

source = (grid->my_row + 1) % grid->q; // (l + 1) mod q

dest = (grid->my_row + grid->q - 1) % grid->q; // (l + q - 1) mod q

/* Set aside storage for the broadcast block of A */

temp_A = Local_matrix_allocate(n_bar);

for (stage = 0; stage < grid->q; stage++) {

bcast_root = (grid->my_row + stage) % grid->q; // (l + stage) mod q --> (n)

if (bcast_root == grid->my_col) { // if (n == k)

MPI_Bcast(local_A, 1, local_matrix_mpi_t,         // Broadcast of A[l,k] (sub-block)
          bcast_root, grid->row_comm);

Local_matrix_multiply(local_A, local_B, local_C); // C[l,k] += A[l,n] * B[n,k]

}

else {

MPI_Bcast(temp_A, 1, local_matrix_mpi_t,
          bcast_root, grid->row_comm);

Local_matrix_multiply(temp_A, local_B, local_C);

}

MPI_Sendrecv_replace(local_B, 1, local_matrix_mpi_t, // Sends and receives using a single buffer
                     dest, 0 /* sendtag */, source, 0 /* recvtag */, grid->col_comm, &status);

} /* for */

} /* Fox */

/*********************************************************/

LOCAL_MATRIX_T* Local_matrix_allocate(int local_order) {

LOCAL_MATRIX_T* temp;

temp = (LOCAL_MATRIX_T*) malloc(sizeof(LOCAL_MATRIX_T));

return temp;

} /* Local_matrix_allocate */

/*********************************************************/

void Free_local_matrix(

LOCAL_MATRIX_T** local_A_ptr /* in/out */) {

free(*local_A_ptr);

} /* Free_local_matrix */

/*********************************************************/

/* Generate and distribute the matrix:
 *   for each global row of the matrix,
 *     for each grid column,
 *       fill a block of n_bar floats on process 0
 *       and send it to the appropriate process.
 */

void Generate_matrix(

char* title /* in */,

LOCAL_MATRIX_T* local_A /* out */,

GRID_INFO_T* grid /* in */,

int n /* in */) {

int mat_row, mat_col;

int grid_row, grid_col;

int dest;

int coords[2];

float* temp;

MPI_Status status;

if (grid->my_rank == 0) {

//printf("%d\\n", grid->my_rank);

temp = (float*) malloc(Order(local_A)*sizeof(float));

printf("%s\\n", title);

fflush(stdout);

for (mat_row = 0; mat_row < n; mat_row++) {

grid_row = mat_row/Order(local_A);

coords[0] = grid_row;

for (grid_col = 0; grid_col < grid->q; grid_col++) {

coords[1] = grid_col;

MPI_Cart_rank(grid->comm, coords, &dest);

if (dest == 0) {

for (mat_col = 0; mat_col < Order(local_A); mat_col++)

//scanf("%f", (local_A->entries)+mat_row*Order(local_A)+mat_col);

*((local_A->entries)+mat_row*Order(local_A)+mat_col) = RandVal;

} else {

for(mat_col = 0; mat_col < Order(local_A); mat_col++)

//scanf("%f", temp + mat_col);

*(temp + mat_col) = RandVal;

MPI_Send(temp, Order(local_A), MPI_FLOAT, dest, 0, grid->comm);

}

}

}

free(temp);

} else {

for (mat_row = 0; mat_row < Order(local_A); mat_row++)

MPI_Recv(&Entry(local_A, mat_row, 0), Order(local_A),
         MPI_FLOAT, 0, 0, grid->comm, &status);

}

} /* Generate_matrix */

/*********************************************************/

void Print_matrix(

char* title /* in */,

LOCAL_MATRIX_T* local_A /* in */,

GRID_INFO_T* grid /* in */,

int n /* in */) {

int mat_row, mat_col;

int grid_row, grid_col;

int source;

int coords[2];

float* temp;

MPI_Status status;

if (grid->my_rank == 0) {

temp = (float*) malloc(Order(local_A)*sizeof(float));

printf("%s\\n", title);

for (mat_row = 0; mat_row < n; mat_row++) {

grid_row = mat_row/Order(local_A);

coords[0] = grid_row;

for (grid_col = 0; grid_col < grid->q; grid_col++) {

coords[1] = grid_col;

MPI_Cart_rank(grid->comm, coords, &source);

if (source == 0) {

for(mat_col = 0; mat_col < Order(local_A); mat_col++)

printf("%6.1f ", Entry(local_A, mat_row, mat_col));

} else {

MPI_Recv(temp, Order(local_A), MPI_FLOAT, source, 0, grid->comm, &status);

for(mat_col = 0; mat_col < Order(local_A); mat_col++)

printf("%6.1f ", temp[mat_col]);

}

}

printf("\\n");

}

free(temp);

} else {

for (mat_row = 0; mat_row < Order(local_A); mat_row++)

MPI_Send(&Entry(local_A, mat_row, 0), Order(local_A),
         MPI_FLOAT, 0, 0, grid->comm);

}

} /* Print_matrix */

/*********************************************************/

void Set_to_zero(

LOCAL_MATRIX_T* local_A /* out */) {

int i, j;

for (i = 0; i < Order(local_A); i++)

for (j = 0; j < Order(local_A); j++)

Entry(local_A,i,j) = 0.0;

} /* Set_to_zero */

/*********************************************************/

void Build_matrix_type(

LOCAL_MATRIX_T* local_A /* in */) {

MPI_Datatype temp_mpi_t;

int block_lengths[2];

MPI_Aint displacements[2];

MPI_Datatype typelist[2];

MPI_Aint start_address;

MPI_Aint address;

MPI_Type_contiguous(Order(local_A)*Order(local_A),
                    MPI_FLOAT, &temp_mpi_t); // array[0..n_bar-1, 0..n_bar-1] of float

block_lengths[0] = block_lengths[1] = 1;

typelist[0] = MPI_INT; //2 elements: 1) int, 2) array[...] of float

typelist[1] = temp_mpi_t;

MPI_Address(local_A, &start_address);       // address of the start of the submatrix

MPI_Address(&(local_A->n_bar), &address);   // address of the submatrix order

displacements[0] = address - start_address; // offset of n_bar from the start (sometimes non-zero)

MPI_Address(local_A->entries, &address);    // address of entries

displacements[1] = address - start_address; // offset of entries from the start

MPI_Type_struct(2, block_lengths, displacements, // new struct type
                typelist, &local_matrix_mpi_t);

MPI_Type_commit(&local_matrix_mpi_t); // commit the new type

} /* Build_matrix_type */

/*********************************************************/

void Local_matrix_multiply(

LOCAL_MATRIX_T* local_A /* in */,

LOCAL_MATRIX_T* local_B /* in */,

LOCAL_MATRIX_T* local_C /* out */) {

int i, j, k;

for (i = 0; i < Order(local_A); i++)

for (j = 0; j < Order(local_A); j++)

for (k = 0; k < Order(local_B); k++)

Entry(local_C,i,j) += Entry(local_A,i,k)*Entry(local_B,k,j);

} /* Local_matrix_multiply */

/*********************************************************/

double Sequential_time(int n) {

int i,j,k;

double begin_time, end_time;

float *A = (float *) malloc(n*n * sizeof(float));

float* B = (float *) malloc(n*n * sizeof(float));

float* C = (float *) malloc(n*n * sizeof(float));

for (i = 0; i < n*n; i++) {

A[i] = RandVal; B[i] = RandVal; C[i] = 0.0;

}

begin_time = MPI_Wtime();

for (i = 0; i < n; i++) {

for (j = 0; j < n; j++) {

for (k = 0; k < n; k++) {

C[i*n+j] += A[i*n+k]*B[k*n+j];

}

}

}

end_time = MPI_Wtime();

free(A); free(B); free(C);

return end_time - begin_time;

}

2 Replies
Dmitry_K_Intel2
Employee
You'd better ask in this forum.
I'm not sure that people who know OpenMP read this forum.

Regards!
Dmitry
TimP
Honored Contributor III
Parallel Studio might be useful for optimizing a single multi-threaded rank for a Windows target. Basically, you would like to thread the outermost loop. You might get a better combination of vector inner loop and parallel outer loop performance by borrowing some ideas from public sources, particularly if this is a class exercise. If not, you would likely get ahead faster by using one of the pre-optimized versions (MKL is already included with Intel compilers).
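
For the local block multiply in the code you posted, a minimal sketch of that idea might look like the following. It is only a sketch: it reuses the LOCAL_MATRIX_T type and the Order()/Entry() macros from your post, and it needs the compiler's OpenMP option (e.g. -fopenmp for GCC, or the equivalent Intel compiler switch) plus OMP_NUM_THREADS set to the number of cores you want each MPI rank to use.

#include <omp.h>

/* Hybrid variant of the posted Local_matrix_multiply: each OpenMP thread
   computes a disjoint set of rows of local_C, so no synchronization is
   needed inside the loop nest. */
void Local_matrix_multiply(LOCAL_MATRIX_T* local_A /* in */,
                           LOCAL_MATRIX_T* local_B /* in */,
                           LOCAL_MATRIX_T* local_C /* out */) {
    int i, j, k;

    #pragma omp parallel for private(j, k)
    for (i = 0; i < Order(local_A); i++)
        for (j = 0; j < Order(local_A); j++)
            for (k = 0; k < Order(local_B); k++)
                Entry(local_C,i,j) += Entry(local_A,i,k)*Entry(local_B,k,j);
}

Because no MPI calls are made from inside the parallel region, plain MPI_Init remains sufficient; if you later move MPI calls inside threads, request a suitable threading level with MPI_Init_thread (MPI_THREAD_FUNNELED or higher) instead.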