I have this matrix multiplication code that works fine with MPI, and I am trying to add OpenMP directives to it so it can use my quad-core resources. Can someone please tell me where and what to add to this code to make it a hybrid OpenMP/MPI program? Can someone help please!
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "mpi.h"
typedef struct GridInfo {
int p; /* Total number of processes */
MPI_Comm comm; /* Communicator for entire grid */
MPI_Comm row_comm; /* Communicator for my row */
MPI_Comm col_comm; /* Communicator for my col */
int q; /* Order of grid */
int my_row; /* My row number */
int my_col; /* My column number */
int my_rank; /* My rank in the grid comm */
} GRID_INFO_T;
#define MAX 65536
#define RandVal rand()%10 // Value used to fill the matrices
typedef struct LocalMatrix {
int n_bar;
#define Order(A) ((A)->n_bar)
float entries[MAX];
#define Entry(A,i,j) (*(((A)->entries) + ((A)->n_bar)*(i) + (j))) // Address arithmetic
} LOCAL_MATRIX_T;
/* Function Declarations */
LOCAL_MATRIX_T* Local_matrix_allocate(int n_bar); // Memory for matrix
void Free_local_matrix(LOCAL_MATRIX_T** local_A); // Free memory
void Generate_matrix(char* title, LOCAL_MATRIX_T* local_A, // Fills the matrix
GRID_INFO_T* grid, int n);
void Print_matrix(char* title, LOCAL_MATRIX_T* local_A, // Prints the matrix on stdout
GRID_INFO_T* grid, int n);
void Set_to_zero(LOCAL_MATRIX_T* local_A); // Fills the matrix with zeroes
void Local_matrix_multiply(LOCAL_MATRIX_T* local_A, // Local matrix multiply
LOCAL_MATRIX_T* local_B, LOCAL_MATRIX_T* local_C);
void Build_matrix_type(LOCAL_MATRIX_T* local_A); // Creation of new MPI type for matrix
MPI_Datatype local_matrix_mpi_t; // New MPI type for matrix
double Sequential_time(int n);
LOCAL_MATRIX_T* temp_mat;
/*********************************************************/
int main(int argc, char* argv[]) {
int p; // Number of processes
int my_rank; // Rank of process
GRID_INFO_T grid; // Grid info of process
LOCAL_MATRIX_T* local_A;
LOCAL_MATRIX_T* local_B;
LOCAL_MATRIX_T* local_C;
int n; // Order of matrix
int n_bar; // Order of submatrix
double begin_time, end_time, interval_p, interval_s;
void Setup_grid(GRID_INFO_T* grid); // Setup the grid info of process
void Fox(int n, GRID_INFO_T* grid, LOCAL_MATRIX_T* local_A, // Fox's algorithm
LOCAL_MATRIX_T* local_B, LOCAL_MATRIX_T* local_C);
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
MPI_Comm_size(MPI_COMM_WORLD, &p);
Setup_grid(&grid); // Setup the grid info of process
if (my_rank == 0) {
printf("Enter the matrices order: ");
fflush(stdout);
scanf("%d", &n);
}
//Sending size to all processes
MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
n_bar = n/grid.q; // Size of submatrix
local_A = Local_matrix_allocate(n_bar); // Memory for submatrix A in each process
Order(local_A) = n_bar;
Generate_matrix("\\nGenerate A", local_A, &grid, n); // Only P(0) do this
Print_matrix("Matrix A =", local_A, &grid, n); // Only P(0) do this
local_B = Local_matrix_allocate(n_bar); // Memory for submatrix B in each process
Order(local_B) = n_bar;
Generate_matrix("\\nGenerate B", local_B, &grid, n); // Only P(0) do this
Print_matrix("Matrix B =", local_B, &grid, n); // Only P(0) do this
Build_matrix_type(local_A); // Struct type for submatrix (MPI)
temp_mat = Local_matrix_allocate(n_bar); // Memory for Temp matrix
local_C = Local_matrix_allocate(n_bar); // Memory for result matrix C
Order(local_C) = n_bar; // Size of C
begin_time = MPI_Wtime();
Fox(n, &grid, local_A, local_B, local_C); // Fox's block algorithm
end_time = MPI_Wtime();
Print_matrix("\\nThe product is:", local_C, &grid, n);
interval_p = end_time - begin_time;
if (my_rank == 0) {
/* Sequential matrix multiplication */
interval_s = Sequential_time(n);
/* Speed-up */
printf("\\nTp = %.10f", interval_p);
printf("\\nTs = %.10f", interval_s);
printf("\\nS = %.10f\\n\\n", interval_s/interval_p);
}
Free_local_matrix(&local_A);
Free_local_matrix(&local_B);
Free_local_matrix(&local_C);
return MPI_Finalize();
} /* main */
/*********************************************************/
void Setup_grid(
GRID_INFO_T* grid /* out */) {
int old_rank;
int dimensions[2];
int wrap_around[2];
int coordinates[2];
int free_coords[2];
/* Set up Global Grid Information */
MPI_Comm_size(MPI_COMM_WORLD, &(grid->p));
MPI_Comm_rank(MPI_COMM_WORLD, &old_rank);
/* We assume p is a perfect square */
grid->q = (int) sqrt((double) grid->p);
dimensions[0] = dimensions[1] = grid->q;
/* We want a circular shift in second dimension. */
/* Don't care about first */
wrap_around[0] = wrap_around[1] = 1;
MPI_Cart_create(MPI_COMM_WORLD, 2, dimensions, wrap_around, 1, &(grid->comm));
MPI_Comm_rank(grid->comm, &(grid->my_rank));
MPI_Cart_coords(grid->comm, grid->my_rank, 2, coordinates);
grid->my_row = coordinates[0];
grid->my_col = coordinates[1];
/* Set up row communicators */
free_coords[0] = 0;
free_coords[1] = 1;
MPI_Cart_sub(grid->comm, free_coords, &(grid->row_comm));
/* Set up column communicators */
free_coords[0] = 1;
free_coords[1] = 0;
MPI_Cart_sub(grid->comm, free_coords, &(grid->col_comm));
} /* Setup_grid */
/*********************************************************/
void Fox(
int n /* in */,
GRID_INFO_T* grid /* in */,
LOCAL_MATRIX_T* local_A /* in */,
LOCAL_MATRIX_T* local_B /* in */,
LOCAL_MATRIX_T* local_C /* out */) {
LOCAL_MATRIX_T* temp_A; /* Storage for the sub- */
/* matrix of A used during */
/* the current stage */
int stage;
int bcast_root;
int n_bar; /* n/sqrt(p) */
int source;
int dest;
MPI_Status status;
n_bar = n/grid->q;
Set_to_zero(local_C);
/* Calculate addresses for circular shift of B */
source = (grid->my_row + 1) % grid->q; // (l + 1) mod q
dest = (grid->my_row + grid->q - 1) % grid->q; // (l + q - 1) mod q
/* Set aside storage for the broadcast block of A */
temp_A = Local_matrix_allocate(n_bar);
for (stage = 0; stage < grid->q; stage++) {
bcast_root = (grid->my_row + stage) % grid->q; // (l + stage) mod q --> (n)
if (bcast_root == grid->my_col) { // if (n == k)
MPI_Bcast(local_A, 1, local_matrix_mpi_t, // Broadcasting of A[l,k] (sub-block)
bcast_root, grid->row_comm);
Local_matrix_multiply(local_A, local_B, // C[l,k] += (A[l,n] * B[n,k])
local_C);
}
else {
MPI_Bcast(temp_A, 1, local_matrix_mpi_t,
bcast_root, grid->row_comm);
Local_matrix_multiply(temp_A, local_B,
local_C);
}
MPI_Sendrecv_replace(local_B, 1, local_matrix_mpi_t, //Sends and receives using a single buffer
dest, 0/* sendtag*/, source, 0/*recvtag*/, grid->col_comm, &status);
} /* for */
} /* Fox */
/*********************************************************/
LOCAL_MATRIX_T* Local_matrix_allocate(int local_order) {
LOCAL_MATRIX_T* temp;
temp = (LOCAL_MATRIX_T*) malloc(sizeof(LOCAL_MATRIX_T));
return temp;
} /* Local_matrix_allocate */
/*********************************************************/
void Free_local_matrix(
LOCAL_MATRIX_T** local_A_ptr /* in/out */) {
free(*local_A_ptr);
} /* Free_local_matrix */
/*********************************************************/
/* Read and distribute matrix:
* foreach global row of the matrix,
* foreach grid column
* read a block of n_bar floats on process 0
* and send them to the appropriate process.
*/
void Generate_matrix(
char* title /* in */,
LOCAL_MATRIX_T* local_A /* out */,
GRID_INFO_T* grid /* in */,
int n /* in */) {
int mat_row, mat_col;
int grid_row, grid_col;
int dest;
int coords[2];
float* temp;
MPI_Status status;
if (grid->my_rank == 0) {
//printf("%d\\n", grid->my_rank);
temp = (float*) malloc(Order(local_A)*sizeof(float));
printf("%s\\n", title);
fflush(stdout);
for (mat_row = 0; mat_row < n; mat_row++) {
grid_row = mat_row/Order(local_A);
coords[0] = grid_row;
for (grid_col = 0; grid_col < grid->q; grid_col++) {
coords[1] = grid_col;
MPI_Cart_rank(grid->comm, coords, &dest);
if (dest == 0) {
for (mat_col = 0; mat_col < Order(local_A); mat_col++)
//scanf("%f", (local_A->entries)+mat_row*Order(local_A)+mat_col);
*((local_A->entries)+mat_row*Order(local_A)+mat_col) = RandVal;
} else {
for(mat_col = 0; mat_col < Order(local_A); mat_col++)
//scanf("%f", temp + mat_col);
*(temp + mat_col) = RandVal;
MPI_Send(temp, Order(local_A), MPI_FLOAT, dest, 0,
grid->comm);
}
}
}
free(temp);
} else {
for (mat_row = 0; mat_row < Order(local_A); mat_row++)
MPI_Recv(&Entry(local_A, mat_row, 0), Order(local_A),
MPI_FLOAT, 0, 0, grid->comm, &status);
}
} /* Generate_matrix */
/*********************************************************/
void Print_matrix(
char* title /* in */,
LOCAL_MATRIX_T* local_A /* out */,
GRID_INFO_T* grid /* in */,
int n /* in */) {
int mat_row, mat_col;
int grid_row, grid_col;
int source;
int coords[2];
float* temp;
MPI_Status status;
if (grid->my_rank == 0) {
temp = (float*) malloc(Order(local_A)*sizeof(float));
printf("%s\\n", title);
for (mat_row = 0; mat_row < n; mat_row++) {
grid_row = mat_row/Order(local_A);
coords[0] = grid_row;
for (grid_col = 0; grid_col < grid->q; grid_col++) {
coords[1] = grid_col;
MPI_Cart_rank(grid->comm, coords, &source);
if (source == 0) {
for(mat_col = 0; mat_col < Order(local_A); mat_col++)
printf("%6.1f ", Entry(local_A, mat_row, mat_col));
} else {
MPI_Recv(temp, Order(local_A), MPI_FLOAT, source, 0,
grid->comm, &status);
for(mat_col = 0; mat_col < Order(local_A); mat_col++)
printf("%6.1f ", temp[mat_col]);
}
}
printf("\\n");
}
free(temp);
} else {
for (mat_row = 0; mat_row < Order(local_A); mat_row++)
MPI_Send(&Entry(local_A, mat_row, 0), Order(local_A),
MPI_FLOAT, 0, 0, grid->comm);
}
} /* Print_matrix */
/*********************************************************/
void Set_to_zero(
LOCAL_MATRIX_T* local_A /* out */) {
int i, j;
for (i = 0; i < Order(local_A); i++)
for (j = 0; j < Order(local_A); j++)
Entry(local_A,i,j) = 0.0;
} /* Set_to_zero */
/*********************************************************/
void Build_matrix_type(
LOCAL_MATRIX_T* local_A /* in */) {
MPI_Datatype temp_mpi_t;
int block_lengths[2];
MPI_Aint displacements[2];
MPI_Datatype typelist[2];
MPI_Aint start_address;
MPI_Aint address;
MPI_Type_contiguous(Order(local_A)*Order(local_A),
MPI_FLOAT, &temp_mpi_t); //array[0..n-1,0..n-1] of float
block_lengths[0] = block_lengths[1] = 1;
typelist[0] = MPI_INT; //2 elements: 1) int, 2) array[...] of float
typelist[1] = temp_mpi_t;
MPI_Address(local_A, &start_address); //address of submatrix - start
MPI_Address(&(local_A->n_bar), &address); //address of size of submatrix
displacements[0] = address - start_address; //offset of n_bar from the start (may be nonzero)
MPI_Address(local_A->entries, &address); //address of entries
displacements[1] = address - start_address; //offset of entries from the start
MPI_Type_struct(2, block_lengths, displacements, //new struct type
typelist, &local_matrix_mpi_t);
MPI_Type_commit(&local_matrix_mpi_t); //return type?
} /* Build_matrix_type */
/*********************************************************/
void Local_matrix_multiply(
LOCAL_MATRIX_T* local_A /* in */,
LOCAL_MATRIX_T* local_B /* in */,
LOCAL_MATRIX_T* local_C /* out */) {
int i, j, k;
for (i = 0; i < Order(local_A); i++)
for (j = 0; j < Order(local_A); j++)
for (k = 0; k < Order(local_B); k++)
Entry(local_C,i,j) += Entry(local_A,i,k)*Entry(local_B,k,j);
} /* Local_matrix_multiply */
/*********************************************************/
double Sequential_time(int n) {
int i,j,k;
double begin_time, end_time;
float *A = (float *) malloc(n*n * sizeof(float));
float* B = (float *) malloc(n*n * sizeof(float));
float* C = (float *) malloc(n*n * sizeof(float));
for (i = 0; i < n*n; i++) {
A[i] = RandVal; B[i] = RandVal; C[i] = 0.0;
}
begin_time = MPI_Wtime();
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++) {
for (k = 0; k < n; k++) {
C[i*n+j] += A[i*n+k]*B[k*n+j];
}
}
}
end_time = MPI_Wtime();
free(A); free(B); free(C);
return end_time - begin_time;
}
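For reference, here is a minimal sketch of the kind of change being asked about. It is not a tested drop-in: the thread count, compile flags, and scheduling are assumptions. The usual hybrid pattern is to initialize MPI with MPI_Init_thread so the library knows threads exist, keep all MPI calls outside parallel regions, and parallelize the outer loop of Local_matrix_multiply, which is the compute hotspot and makes no MPI calls.

#include <omp.h>   /* add alongside the existing includes */

/* In main(): replace MPI_Init with MPI_Init_thread and request funneled
   support, i.e. only the main thread makes MPI calls. */
int provided;
MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);

/* Parallelize the local block multiply; i is private by default,
   j and k must be declared private explicitly. */
void Local_matrix_multiply(
        LOCAL_MATRIX_T* local_A /* in */,
        LOCAL_MATRIX_T* local_B /* in */,
        LOCAL_MATRIX_T* local_C /* out */) {
    int i, j, k;
#pragma omp parallel for private(j, k)  /* each thread computes whole rows of C */
    for (i = 0; i < Order(local_A); i++)
        for (j = 0; j < Order(local_A); j++)
            for (k = 0; k < Order(local_B); k++)
                Entry(local_C,i,j) += Entry(local_A,i,k)*Entry(local_B,k,j);
} /* Local_matrix_multiply */

Compile with the OpenMP flag of your MPI compiler wrapper (e.g. -fopenmp for GCC-based wrappers, or the corresponding flag for your compiler) and set OMP_NUM_THREADS=4 per MPI rank on a quad-core node, typically running one rank per node or socket.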
I'm not sure that people who know OpenMP read this forum.
Regards!
Dmitry