#include <stdio.h>
#include <stdlib.h>
#include "mkl.h"

#define N 100
//#define min(x,y) (((x) < (y)) ? (x) : (y))

// Declare global arrays
__declspec(target(mic)) static float *A, *B, *C; 
//float *A, *B, *C;

int not_main();

// Init global arrays C is set to zero (being global)
int init_arrays()
{
	long int i;
	long int nz = N*N;
	// Allocate memory with MKL tools.
	A = (float *)mkl_malloc( N*N*sizeof( float ), 64 );
	B = (float *)mkl_malloc( N*N*sizeof( float ), 64 );
	C = (float *)mkl_malloc( N*N*sizeof( float ), 64 );

	//  Check for succesful allocation
	if (A == NULL || B == NULL || C == NULL) {
		printf( "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n");
		mkl_free(A);
		mkl_free(B);
		mkl_free(C);
		return (1);
	}

	// Matrix initialization
	printf(" Intializing matrix data \n\n");
	for (i = 0; i < nz; i++) 
	{
		A[i] = (float)(i+1);
	}

	for (i = 0; i < nz; i++) 
	{
		B[i] = (float)(-i-1.f);
	}

	for (i = 0; i < nz; i++) 
	{
		C[i] = 0.0f;
	}

	return (0);
}

int free_arrays()
{
	printf ("\n Deallocating memory... ");
	mkl_free(A);
	mkl_free(B);
	mkl_free(C);
	printf ("Deallocating completed \n\n");
	return (0);
}

void print_arrays_snap_shot()
{
	int i, j;
	printf (" Top left corner of matrix A: \n");
	for (i=0; i<min(N,6); i++) {
		printf("[ ");
		for (j=0; j<min(N,6); j++) {
			printf ("%5.2f ", A[j+i*N]);
		}
		printf ("]\n");
	}

	printf ("\n Top left corner of matrix B: \n");
	for (i=0; i<min(N,6); i++) {
		printf("[ ");
		for (j=0; j<min(N,6); j++) {
			printf ("%5.2f ", B[j+i*N]);
		}
		printf ("]\n");
	}

	printf ("\n Top left corner of matrix C: \n");
	for (i=0; i<min(N,6); i++) {
		printf("[ ");
		for (j=0; j<min(N,6); j++) {
			printf ("%5.2f ", C[j+i*N]);
		}
		printf ("]\n");
	}
}

void sgemm_serial()
{
	float alpha = 1.0; 
	float beta = 0.0;
	// Compute the Gemm operation
	cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, 
		N, N, N, alpha, A, N, B, N, beta, C, N);
	/*cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, 
		N, N, N, alpha, A, N, B, N, beta, C, N);*/
	printf (" Completed Multiplication. \n\n");
}

void main()
{
	double elapsed_t;
	float alpha = 1.0; 
	float beta = 0.0;
	const char transA = CblasNoTrans, transB = CblasNoTrans;

	const __int64 n = N;
	//const int64 n = N;
	//const int n = N;
	init_arrays();

	// Serial processor test
	sgemm_serial();
	elapsed_t = elapsed_time_T64();
	/*printf("Time %lf(seconds) \n", ((double) clock() - startTime ) / (double)CLOCKS_PER_SEC);*/

	print_arrays_snap_shot();

	//mkl_mic_enable();
#define PHI_DEV 0 
/*#pragma offload_attribute(push, target(mic)) */
// Global definition of the input/output matrices
#pragma offload_transfer target(mic:PHI_DEV) \
	in(A:length(n*n) free_if(0)) \
	in(B:length(n*n) free_if(0)) \
	in(C:length(n*n) free_if(0))

// Call the srgmm inside the offload section using the MKL "Native Acceleration"
#pragma offload target(mic:PHI_DEV) \
	in(transA, transB, n, alpha, beta) \
	nocopy(A: alloc_if(0) free_if(0)) \
	nocopy(B: alloc_if(0) free_if(0)) \
	out(C:length(n*n) alloc_if(0) free_if(0)) // output data
	{
		//omp_set_num_threads(64); // set num threads in openmp
		sgemm(&transA, &transB, &n, &n, &n, &alpha, A, &n, B, &n, &beta, C, &n);
	}
	
#pragma offload_transfer target(mic:PHI_DEV) \
	nocopy(A:length(n*n) alloc_if(0) free_if(1)) \
	nocopy(B:length(n*n) alloc_if(0) free_if(1)) \
	nocopy(C:length(n*n) alloc_if(0) free_if(1))
/* #pragma offload_attribute(pop) */
	
	print_arrays_snap_shot(); 
}

