/*******************************************************************************
* Copyright(C) 2004-2014 Intel Corporation. All Rights Reserved.
*
* The source code, information and material ("Material") contained herein is
* owned by Intel Corporation or its suppliers or licensors, and title to such
* Material remains with Intel Corporation or its suppliers or licensors. The
* Material contains proprietary information of Intel or its suppliers and
* licensors. The Material is protected by worldwide copyright laws and treaty
* provisions. No part of the Material may be used, copied, reproduced,
* modified, published, uploaded, posted, transmitted, distributed or disclosed
* in any way without Intel's prior express written permission. No license
* under any patent, copyright or other intellectual property rights in the
* Material is granted to or conferred upon you, either expressly, by
* implication, inducement, estoppel or otherwise. Any license under such
* intellectual property rights must be express and approved by Intel in
* writing.
*
* *Third Party trademarks are the property of their respective owners.
*
* Unless otherwise agreed by Intel in writing, you may not remove or alter
* this notice or any other notice embedded in Materials by Intel or Intel's
* suppliers or licensors in any way.
*
********************************************************************************
*
* MKL Cluster Sparse Solver example demonstrating the case when the initial
* data (matrix and RHS) is distributed between several MPI processes, and the
* final solution is distributed between the MPI processes in the same way as
* they hold the initial data.
*
*******************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "mpi.h"
#include "mkl.h"
#include "mkl_cluster_sparse_solver.h"

int main(void)
{
    /* Matrix data. */
    MKL_INT n = 5;
    MKL_INT mtype = 11; /* Real unsymmetric matrix */
    MKL_INT *ia = NULL;
    MKL_INT *ja = NULL;
    double *a = NULL;
    /* RHS and solution vectors. */
    double *b = NULL;
    double *x = NULL;
    MKL_INT nrhs = 1; /* Number of right-hand sides. */
    /* Internal solver memory pointer pt:
       32-bit: int pt[64]; 64-bit: long int pt[64];
       void *pt[64] is OK on both architectures. */
    void *pt[64] = { 0 };
    /* Cluster Sparse Solver control parameters. */
    MKL_INT iparm[64] = { 0 };
    MKL_INT maxfct, mnum, phase, msglvl, error;
    /* Auxiliary variables. */
    double ddum;  /* Double dummy.  */
    MKL_INT idum; /* Integer dummy. */
    MKL_INT j;
    int mpi_stat = 0;
    int argc = 0;
    int comm, rank, size;
    char** argv = NULL;

    /* -------------------------------------------------------------------- */
    /* .. Initialize MPI. */
    /* -------------------------------------------------------------------- */
    mpi_stat = MPI_Init( &argc, &argv );
    mpi_stat = MPI_Comm_rank( MPI_COMM_WORLD, &rank );
    mpi_stat = MPI_Comm_size( MPI_COMM_WORLD, &size );
    comm = MPI_Comm_c2f( MPI_COMM_WORLD );
    if ( size < 2 )
    {
        printf("\nERROR: this example requires at least 2 MPI processes");
        mpi_stat = MPI_Finalize();
        return 1;
    }
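    /* Assembled from the per-rank CSR data initialized below, the global
       5x5 system solved in this example is (rank 0 holds rows 1..3 and
       rank 1 holds rows 3..5; the overlapping contributions to row 3 of
       the matrix and of the RHS, 0.25 + 0.75, are summed by the solver,
       since iparm[39] = 2 selects the distributed assembled input format):

               |  1 -1 -3  0  0 |        | 1 |
               | -2  5  0  0  0 |        | 1 |
           A = |  0  0  4  6  4 | ,  b = | 1 |
               | -4  2  0  7  0 |        | 1 |
               |  0  8  0  0 -5 |        | 1 |
    */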
    /* -------------------------------------------------------------------- */
    /* .. Setup Cluster Sparse Solver control parameters. */
    /* -------------------------------------------------------------------- */
    iparm[ 0] = 1;  /* Solver default parameters overridden by the values supplied in iparm */
    iparm[ 1] = 2;  /* Use METIS for fill-in reordering */
    iparm[ 5] = 0;  /* Write solution into x */
    iparm[ 7] = 2;  /* Max number of iterative refinement steps */
    iparm[ 9] = 13; /* Perturb the pivot elements with 1E-13 */
    iparm[10] = 1;  /* Use nonsymmetric permutation and scaling MPS */
    iparm[12] = 1;  /* Switch on Maximum Weighted Matching algorithm (default for nonsymmetric) */
    iparm[17] = -1; /* Output: number of nonzeros in the factor LU */
    iparm[18] = -1; /* Output: Mflops for LU factorization */
    iparm[26] = 1;  /* Check input data for correctness */
    iparm[39] = 2;  /* Input: matrix/RHS/solution are distributed between MPI processes */
    /* With iparm[39] = 2 the matrix is provided in the distributed assembled
       format: each MPI process stores only a part (domain) of the matrix A.
       The bounds of the domain are set via iparm[40] and iparm[41] (1-based
       row numbers). The solution vector is distributed across processes in
       the same manner as the RHS. */
    maxfct = 1; /* Maximum number of numerical factorizations. */
    mnum = 1;   /* Which factorization to use. */
    msglvl = 1; /* Print statistical information. */
    error = 0;  /* Initialize error flag. */

    /* Initialize matrix and RHS components on each process. In this example
       the initial matrix is distributed between 2 processes, so for MPI
       processes with rank > 1 the input domains are empty. */
    if ( rank == 0 )
    {
        iparm[40] = 1; /* Number of the row in the global matrix, RHS and solution vector that begins the input domain belonging to this MPI process */
        iparm[41] = 3; /* Number of the row in the global matrix, RHS and solution vector that ends the input domain belonging to this MPI process */
        ia = (MKL_INT*) MKL_malloc (sizeof (MKL_INT) * 4, 64);
        ja = (MKL_INT*) MKL_malloc (sizeof (MKL_INT) * 6, 64);
        a  = (double*)  MKL_malloc (sizeof (double)  * 6, 64);
        x  = (double*)  MKL_malloc (sizeof (double)  * 3, 64);
        b  = (double*)  MKL_malloc (sizeof (double)  * 3, 64);
        if ( ia == NULL || ja == NULL || a == NULL || x == NULL || b == NULL )
        {
            printf ("\nERROR during memory allocation on rank %d", rank);
            mpi_stat = MPI_Finalize();
            return 1;
        }
        ia[0] = 1; ia[1] = 4; ia[2] = 6; ia[3] = 7;
        ja[0] = 1; ja[1] = 2; ja[2] = 3;
        ja[3] = 1; ja[4] = 2;
        ja[5] = 4;
        a[0] = 1.; a[1] = -1.; a[2] = -3.;
        a[3] = -2.; a[4] = 5.;
        a[5] = 6.;
        b[0] = 1.; b[1] = 1.; b[2] = 0.25;
    }
    else if ( rank == 1 )
    {
        iparm[40] = 3; /* Number of the row in the global matrix, RHS and solution vector that begins the input domain belonging to this MPI process */
        iparm[41] = 5; /* Number of the row in the global matrix, RHS and solution vector that ends the input domain belonging to this MPI process */
        ia = (MKL_INT*) MKL_malloc (sizeof (MKL_INT) * 4, 64);
        ja = (MKL_INT*) MKL_malloc (sizeof (MKL_INT) * 7, 64);
        a  = (double*)  MKL_malloc (sizeof (double)  * 7, 64);
        x  = (double*)  MKL_malloc (sizeof (double)  * 3, 64);
        b  = (double*)  MKL_malloc (sizeof (double)  * 3, 64);
        if ( ia == NULL || ja == NULL || a == NULL || x == NULL || b == NULL )
        {
            printf ("\nERROR during memory allocation on rank %d", rank);
            mpi_stat = MPI_Finalize();
            return 1;
        }
        ia[0] = 1; ia[1] = 3; ia[2] = 6; ia[3] = 8;
        ja[0] = 3; ja[1] = 5;
        ja[2] = 1; ja[3] = 2; ja[4] = 4;
        ja[5] = 2; ja[6] = 5;
        a[0] = 4.; a[1] = 4.;
        a[2] = -4.; a[3] = 2.; a[4] = 7.;
        a[5] = 8.; a[6] = -5.;
        b[0] = 0.75; b[1] = 1.; b[2] = 1.;
    }
    else
    {
        /* In this example MPI processes with rank > 1 do not have an input
           domain, so iparm[40] needs to be greater than iparm[41]. */
        iparm[40] = 2;
        iparm[41] = 1;
    }
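    /* Reference for the solver phase values used below (the standard
       PARDISO / Cluster Sparse Solver convention):
         11 - analysis (reordering and symbolic factorization)
         22 - numerical factorization
         23 - numerical factorization + solve + iterative refinement
         33 - solve + iterative refinement
         -1 - release all internal solver memory */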
    /* -------------------------------------------------------------------- */
    /* .. Reordering and symbolic factorization. This step also allocates */
    /*    all memory that is necessary for the factorization. */
    /* -------------------------------------------------------------------- */
    phase = 11;
    cluster_sparse_solver ( pt, &maxfct, &mnum, &mtype, &phase,
            &n, a, ia, ja, &idum, &nrhs, iparm, &msglvl, &ddum, &ddum, &comm, &error );
    if ( error != 0 )
    {
        if ( rank == 0 ) printf ("\nERROR during symbolic factorization: %lli", (long long int)error);
        mpi_stat = MPI_Finalize();
        return 1;
    }
    if ( rank == 0 ) printf ("\nReordering completed ... ");

    /* -------------------------------------------------------------------- */
    /* .. Numerical factorization, back substitution and iterative          */
    /*    refinement. Phase 23 performs the numerical factorization         */
    /*    together with the solve, so no separate phase 22 call is needed.  */
    /* -------------------------------------------------------------------- */
    phase = 23;
    if ( rank == 0 ) printf ("\nSolving system...");
    cluster_sparse_solver ( pt, &maxfct, &mnum, &mtype, &phase,
            &n, a, ia, ja, &idum, &nrhs, iparm, &msglvl, b, x, &comm, &error );
    if ( error != 0 )
    {
        if ( rank == 0 ) printf ("\nERROR during solution: %lli", (long long int)error);
        mpi_stat = MPI_Finalize();
        return 4;
    }

    /* The solution of the system is distributed between the MPI processes in
       the same way as the input matrix, so the processes with rank 0 and 1
       each keep only their part of the solution. */
    if ( rank == 0 )
    {
        printf ("\nThe solution of the system is: ");
        for ( j = 0; j < 3; j++ )
        {
            printf ("\n on zero process x [%lli] = % f", (long long int)j, x[j]);
        }
        printf ("\n");
    }
    MPI_Barrier(MPI_COMM_WORLD);
    if ( rank == 1 )
    {
        printf ("\nThe solution of the system is: ");
        for ( j = 0; j < 3; j++ )
        {
            printf ("\n on first process x [%lli] = % f", (long long int)j, x[j]);
        }
        printf ("\n");
    }
    MPI_Barrier(MPI_COMM_WORLD);

    /* -------------------------------------------------------------------- */
    /* .. Termination and release of memory. */
    /* -------------------------------------------------------------------- */
    phase = -1; /* Release internal memory. */
    cluster_sparse_solver ( pt, &maxfct, &mnum, &mtype, &phase,
            &n, &ddum, ia, ja, &idum, &nrhs, iparm, &msglvl, &ddum, &ddum, &comm, &error );
    if ( error != 0 )
    {
        if ( rank == 0 ) printf ("\nERROR during release of memory: %lli", (long long int)error);
        mpi_stat = MPI_Finalize();
        return 5;
    }
    if ( rank < 2 )
    {
        MKL_free(ia);
        MKL_free(ja);
        MKL_free(a);
        MKL_free(x);
        MKL_free(b);
    }
    mpi_stat = MPI_Finalize();
    return 0;
}
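/* Build-and-run sketch (illustrative only: the source file name below is a
   placeholder, and the exact link line depends on the installed MKL version
   and MPI implementation; consult the Intel MKL Link Line Advisor for the
   correct one for your system):

       mpiicc cl_solver_unsym_distr.c -o cl_solver_unsym_distr \
           -L${MKLROOT}/lib/intel64 -lmkl_intel_lp64 -lmkl_intel_thread \
           -lmkl_core -lmkl_blacs_intelmpi_lp64 -liomp5 -lpthread -lm

       mpirun -n 2 ./cl_solver_unsym_distr
*/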