#include "cuda.h"
/* NOTE(review): the six header names below were lost in the original file
 * (bare "#include" lines). Reconstructed from usage: printf -> stdio.h,
 * malloc/free -> stdlib.h, runtime API -> cuda_runtime.h. Confirm the rest
 * against the build system. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <cuda_runtime.h>
#include "CORE_CUDA_2_29.cu" //ATTACHED KERNEL CODE
extern "C"
{
//ROUTINES FOR MEMCOPY
void CUcopyArrayFromDevicefloat_(vec_space* host,vec_space* device,int numComb)
{
// Copy numComb vec_space elements from the GPU back to host memory.
const size_t nbytes = (size_t)numComb * sizeof(vec_space);
cudaMemcpy(host, device, nbytes, cudaMemcpyDeviceToHost); // GPU TO CPU MEMORY COPY
}
void CUcopyArrayToDevicefloatS_(vec_space* device,vec_space* host, int numComb)
{
// Copy numComb vec_space elements from host memory up to the GPU.
const size_t nbytes = (size_t)numComb * sizeof(vec_space);
cudaMemcpy(device, host, nbytes, cudaMemcpyHostToDevice); // CPU TO GPU MEMORY COPY
}
void CUcopyArrayToDevicefloat_(double* device,double* host, int numComb)
{
// Copy numComb doubles from host memory up to the GPU.
const size_t nbytes = (size_t)numComb * sizeof(double);
cudaMemcpy(device, host, nbytes, cudaMemcpyHostToDevice); // CPU TO GPU MEMORY COPY
}
void CUcopyArrayToDeviceint_(int* device,int* host, int numComb)
{
// Copy numComb ints from host memory up to the GPU.
const size_t nbytes = (size_t)numComb * sizeof(int);
cudaMemcpy(device, host, nbytes, cudaMemcpyHostToDevice); // CPU TO GPU MEMORY COPY
}
// ROUTINES FOR MALLOC
void CUallocaterealarrayssol_(double* arr[0], int numComb) //memory allocation on GPU
{
// Allocate numComb sol_space structures on the device and return the device
// pointer through arr[0].  NOTE(review): the out-pointer is typed double*
// although the buffer holds sol_space — callers cast it back later.
unsigned int memSize = sizeof(sol_space)*(numComb);
// fixed format string: the '\n' escapes were lost, and "sapce" was a typo
printf("\n memsize sol space Mb %u \n",memSize/(1024*1024));
cudaMalloc((void**)(&arr[0]), memSize);
}
void CUallocaterealarraysS_(double* arr[0], int numComb) //memory allocation on GPU
{
// Allocate numComb vec_space structures on the device and return the device
// pointer through arr[0] (out-pointer typed double*; callers cast back).
unsigned int memSize = sizeof(vec_space)*(numComb);
// fixed format string: the '\n' escapes were lost, and "sapce" was a typo
printf("\n memsize vec space Mb %u \n",memSize/(1024*1024));
cudaMalloc((void**)(&arr[0]), memSize);
}
void CUallocaterealarraysBS_(double* arr[0], int numComb) //memory allocation on GPU
{
// Allocate numComb BIGspace structures on the device and return the device
// pointer through arr[0] (out-pointer typed double*; callers cast back).
unsigned int memSize = sizeof(BIGspace)*(numComb);
// fixed format string: the '\n' escapes were lost in the original
printf("\n memsize BIGspace Mb %u \n",memSize/(1024*1024));
cudaMalloc((void**)(&arr[0]), memSize);
}
void CUallocaterealarraysB_(double* arr[0], int numComb) //memory allocation on GPU
{
// Allocate numComb Bspace structures on the device and return the device
// pointer through arr[0] (out-pointer typed double*; callers cast back).
unsigned int memSize = sizeof(Bspace)*(numComb);
// fixed format string: the '\n' escapes were lost in the original
printf("\n memsize Bspace Mb %u \n",memSize/(1024*1024));
cudaMalloc((void**)(&arr[0]), memSize);
}
void CUallocateintarrays_(int* arr[0], int numComb) //memory allocation on GPU for integer arrays
{
// Allocate numComb ints on the device; device pointer returned via arr[0].
// BUG FIX: the original computed sizeof(arr[0]) == sizeof(int*), sizing the
// buffer by pointer width (8 bytes on 64-bit) rather than by element type.
unsigned int memSize = sizeof(int)*(numComb);
cudaMalloc((void**)&arr[0], memSize);
}
// ROUTINES FOR DELETING
void intarrayss_(double* arr[0]) // FREE GPU MEMORY
{
// Release the device allocation whose pointer is stored in arr[0];
// cudaFree takes void*, so no cast is needed.
cudaFree(arr[0]);
}
void CUdeleterealarraysBS_(BIGspace* arr) // FREE GPU MEMORY
{
// Release a device BIGspace allocation (cudaFree accepts any pointer type).
cudaFree(arr);
}
void CUdeleterealarraysB_(Bspace* arr) // FREE GPU MEMORY
{
// Release a device Bspace allocation (cudaFree accepts any pointer type).
cudaFree(arr);
}
void CUdeleterealarraysS_(vec_space* arr) // FREE GPU MEMORY
{
// Release a device vec_space allocation (cudaFree accepts any pointer type).
cudaFree(arr);
}
void CUdeleterealarrayssol_(sol_space* arr) // FREE GPU MEMORY
{
// Release a device sol_space allocation (cudaFree accepts any pointer type).
cudaFree(arr);
}
void CUError_(const char *msg) // ERROR CHECKER FOR CUDA
{
// Report (and clear) the last CUDA runtime error, tagged with the caller's
// message.  getchar() pauses so the message is readable before continuing.
cudaError_t err = cudaGetLastError();
if(cudaSuccess !=err)
{
// fixed format string: the trailing '\n' escape was lost in the original
printf("Darn..! CUDA ERROR : %s , %s \n",msg, cudaGetErrorString(err));
getchar();
}
}
//void allocate_ARRAYS_GPU(vec_space cpu_space,double* ti, int* space_dim, int* space_len,double* stm)
void allocate_gpu(double* ti,double* ode_b,double* ode_a,double* adds[5],int* len) // 1st routine called from fortran
{
cudaSetDevice(1); // creating context
//set DEVICE tesla 1060
double* ff;
double* gpu_space;
double* tempvar;
double* ypass;
double* sol;
CUallocaterealarraysS_(&gpu_space,1);
CUError_("PROBLEM MEMORY ALLOCATION GPU_SPACE ");
CUallocaterealarraysBS_(&ff,13*258);
CUError_("PROBLEM MEMORY ALLOCATION ff ");
CUallocaterealarraysB_(&tempvar,258);
CUError_("PROBLEM MEMORY ALLOCATION tempvar ");
CUallocaterealarraysB_(&ypass,258);
CUError_("PROBLEM MEMORY ALLOCATION ypass ");
CUallocaterealarrayssol_(&sol,252);
CUError_("PROBLEM MEMORY ALLOCATION solspace ");
cudaMemcpyToSymbol("ti",ti,sizeof(double));
CUError_("PROBLEM WITH CONSTANT MEMORY ALLOCATION 'TI ");
cudaMemcpyToSymbol("b",ode_b,sizeof(double)*13);//set_CUDA_constant_memoryarrayb_("b",ode_b,13);
CUError_("PROBLEM WITH CONSTANT MEMORY ALLOCATION 'b' ");
cudaMemcpyToSymbol("a",ode_a,sizeof(double)*156);//set_CUDA_constant_memoryarraya_("a",ode_a,13*12);
CUError_("PROBLEM WITH CONSTANT MEMORY ALLOCATION 'a' ");
unsigned int mem= sizeof(int)*2+sizeof(double)*(13*12+1+13);
printf("n Total constant memory allocated = %0.9f (kb) n",(double)mem/1024);
*len = N*NumberMblocks/Block_sizeR;
// STORING GPU MEMORY POINTERS TO PASS THEM
adds[0] = gpu_space;
adds[1] = tempvar;
adds[2] = ypass;
adds[3] = ff;
adds[4] = sol;
// WARM UP ROUTINE FOR THE GPU
gpu_ford_phia<<>>((vec_space*)adds[0],(BIGspace*)adds[3],(Bspace*)adds[1]);
CUError_("GPU WARM UP FAILED ");
}
// 2ND ROUTINE CALLED FROM FORTRAN
void calulate_arrays_gpu(double* cpu_space,int* space_len,int* space_dim,double* adds[5],int* update)
{
// 2ND ROUTINE CALLED FROM FORTRAN.
// adds[0..4] holds the device pointers produced by allocate_gpu; this routine
// uploads the current cpu_space, then queues the integration-step kernels.
int SpaceSize = *space_len;
int upd = *update;
cudaMemcpy((vec_space*)adds[0],cpu_space,sizeof(vec_space),cudaMemcpyHostToDevice);
// I Know this blocks the ASYNC execution but only when this function is called again from FORTRAN
// THE code below should return without waitng for the kernels to finish
dim3 dimBlock(Block_size);
dim3 dimBlock2(Block_size2);
dim3 dimBlock3(Block_size3);
dim3 dimBlock4(Block_size4);
dim3 dimBlock5(Block_size5);
dim3 dimBlock6(Block_size6);
dim3 dimBlockR(Block_sizeR);
dim3 dimGrid((int)SpaceSize/Block_size);
dim3 dimGrid2((int)SpaceSize/Block_size2);
dim3 dimGrid3((int)SpaceSize/Block_size3);
dim3 dimGrid5((int)SpaceSize/Block_size5);
dim3 dimGrid4((int)SpaceSize/Block_size4);
// NOTE(review): dimGrid6 divides by Block_size5 while dimBlock6 uses
// Block_size6 — looks like a copy-paste typo; preserved as-is, confirm.
dim3 dimGrid6((int)SpaceSize/Block_size5);
dim3 dimGridR((int)SpaceSize/Block_sizeR);
// KERNEL CALLS THEY SHOULD NON-BLOCKING (WELL THEY ARE EXCEPT THE LAST ONE!!)
// NOTE(review): every launch configuration was garbled ("<<>>") in the
// source; reconstructed by pairing each kernel with its same-numbered
// dimGrid/dimBlock (7 launches, exactly 7 pairs declared) — TODO confirm.
gpu_ford_phia<<<dimGrid, dimBlock>>>((vec_space*)adds[0],(BIGspace*)adds[3],(Bspace*)adds[1]);
for(int i=0;i<12;i++)
{
// culprit routine 1 (minor culprit)
gpu_ford_phib<<<dimGrid2, dimBlock2>>>((vec_space*)adds[0],(BIGspace*)adds[3],(Bspace*)adds[2],(Bspace*)adds[1],i); // THIS IS ALSO A CULPRIT but its not as bad
gpu_mtranT_prod<<<dimGrid3, dimBlock3>>>((Bspace*)adds[2],(BIGspace*)adds[3],i);
gpu_T_prodmtran<<<dimGrid4, dimBlock4>>>((Bspace*)adds[2],(BIGspace*)adds[3],i);
gpu_mT_prod<<<dimGrid5, dimBlock5>>>((Bspace*)adds[2],(BIGspace*)adds[3],i);
gpu_ford_phic<<<dimGrid6, dimBlock6>>>((vec_space*)adds[0],(BIGspace*)adds[3],(Bspace*)adds[2],(Bspace*)adds[1],i);
}
//MAJOR CULPRIT ROUTINE.. in which each thread performs reduction on the follwing equation : (Matrix_i)*Tensor_(i+1)*Matrix_i + matrix_(i+1)*tensor_i + matrix_(i+1)*matrix_i!
// i = threadIdx.x and there are 256 threads which reduce the final result to thread 0.
//upd is 0 , 1 , 2 etc depending on the number of times this whole function is called from fortran
gpu_R<<<dimGridR, dimBlockR>>>((Bspace*)adds[2], upd*(N/Block_sizeR), (sol_space*)adds[4] ); // THIS IS THE MAJOR CULPRIT
}
// ROUTINES CALLED AFTER THE TIMER
void memcpycu(double* adds[5], double* out){
// Copy the 252-element solution array back from the GPU after timing.
// NOTE(review): `out` is never written — presumably the result should be
// unpacked into it; confirm intent against the Fortran caller.
sol_space* cpuphi=(sol_space*)malloc(252*sizeof(sol_space));
cudaMemcpy(cpuphi,(sol_space*)adds[4],252*sizeof(sol_space), cudaMemcpyDeviceToHost);
CUError_("PROBLEM WITH memcpy ");
free(cpuphi); // BUG FIX: the original leaked this host buffer on every call
}
void deletegpu(double* adds[5])
{
// Release every device allocation made in allocate_gpu, slot by slot:
// 0 = gpu_space, 1 = tempvar, 2 = ypass, 3 = ff, 4 = sol.
CUdeleterealarraysS_((vec_space*)adds[0]);   // gpu_space
CUdeleterealarraysB_((Bspace*)adds[1]);      // tempvar
CUdeleterealarraysB_((Bspace*)adds[2]);      // ypass
CUdeleterealarraysBS_((BIGspace*)adds[3]);   // ff
CUdeleterealarrayssol_((sol_space*)adds[4]); // sol
}
}