Large sparse matrix solving problem with cluster

YONGHEE_L_ · ‎02-06-2017

Dear Intel

With your help a month ago, I could set up the 'cluster sparse solver 64' program using 'iparm[1]=10 (The MPI version of the nested dissection and symbolic factorization algorithms)' with my 4 cluster computers.
This program is devised for very large sparse matrix, which contains about 10^8 - 10^9 rows.
However, it shows the lower performance(4MPI & 4OpenMP) than the result of a single machine(1MPI & 4OpenMP)

Following table is the result of the test.

target : 4*10^8 rows matrix
time consumption (1MPI & 4OpenMP || 4MPI & 4OpenMP(iparm[1]=3) || 4MPI & 4OpenMP(iparm[1]=10))
-------------------------------------------------------------------------------------------------------------------------
Reorder time (1021.6 s || 2094.3 s || 7644.3 s)
Factorization time (1403.2 s || 2136.6 s || 9263.1 s)
Solution time (158.6 s || 684.9 s || 554.14 s)
-------------------------------------------------------------------------------------------------------------------------

Could you please look into this issue?
I attache my code below.

Thank you very much in advance!!!

Regards,
Yong-hee

P.S. This is my code. (almost same with the example code)
=================================================================================

/*******************************************************************************
* Copyright 2004-2015 Intel Corporation All Rights Reserved.
*
* The source code,  information  and material  ("Material") contained  herein is
* owned by Intel Corporation or its  suppliers or licensors,  and  title to such
* Material remains with Intel  Corporation or its  suppliers or  licensors.  The
* Material  contains  proprietary  information  of  Intel or  its suppliers  and
* licensors.  The Material is protected by  worldwide copyright  laws and treaty
* provisions.  No part  of  the  Material   may  be  used,  copied,  reproduced,
* modified, published,  uploaded, posted, transmitted,  distributed or disclosed
* in any way without Intel's prior express written permission.  No license under
* any patent,  copyright or other  intellectual property rights  in the Material
* is granted to  or  conferred  upon  you,  either   expressly,  by implication,
* inducement,  estoppel  or  otherwise.  Any  license   under such  intellectual
* property rights must be express and approved by Intel in writing.
*
* Unless otherwise agreed by Intel in writing,  you may not remove or alter this
* notice or  any  other  notice   embedded  in  Materials  by  Intel  or Intel's
* suppliers or licensors in any way.
*******************************************************************************/

/*
*
*   MKL Cluster Sparse Solver example demonstrating the case when initial data (matrix
*   and rhs) distributed between several MPI processes, final solution is
*   distributed between MPI processes in the same way as they hold initial data.
*
********************************************************************************
*/
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <math.h>
#include <time.h>
#include "mpi.h"
#include "mkl.h"
#include "mkl_cluster_sparse_solver.h"

#ifdef MKL_ILP64
#define MPI_DT MPI_LONG
#else
#define MPI_DT MPI_INT
#endif
#define MPI_REDUCE_AND_BCAST \
        MPI_Reduce(&err_mem, &error, 1, MPI_DT, MPI_SUM, 0, MPI_COMM_WORLD); \
        MPI_Bcast(&error, 1, MPI_DT, 0, MPI_COMM_WORLD);

int main(void)

{
	clock_t before;
	double result;

	MKL_INT64 AllocatedBytes; 

	int N_AllocatedBuffers;
	mkl_peak_mem_usage(MKL_PEAK_MEM_ENABLE);
	AllocatedBytes = mkl_mem_stat(&N_AllocatedBuffers);

	mkl_set_num_threads(2);
	FILE* pre_inputFile=fopen("/user/source/CREATE_MATRIX/SPM_test_20000_whole.txt", "r");
	if (pre_inputFile==NULL){
		puts( "There is no file." );}

	MKL_INT64 n;
	fscanf(pre_inputFile, "%lld", &n);
	n -= 1;
    fclose(pre_inputFile);
	
    /* Matrix data. */
	//MKL_INT64 n = 5;

    MKL_INT64 mtype = 2; /* Real symmetric definite matrix */
    MKL_INT64 *ia = NULL;
    MKL_INT64 *ja = NULL;
    double  *a = NULL;
    /* RHS and solution vectors. */
    double  *b = NULL;
    double  *x = NULL;

	char	hostName[1024]	= "\0";

    MKL_INT64 nrhs = 1; /* Number of right hand sides. */
    /* Internal solver memory pointer pt, */
    /* 32-bit: int pt[64]; 64-bit: long int pt[64] */
    /* or void *pt[64] should be OK on both architectures */
    void *pt[64] = { 0 };
    /* Cluster Sparse Solver control parameters. */
    MKL_INT64 iparm[64] = { 0 };
    MKL_INT64 maxfct, mnum, phase, msglvl, error, err_mem;

    /* Auxiliary variables. */
    double  ddum; /* Double dummy   */
    MKL_INT64 idum; /* Integer dummy. */
    MKL_INT64 j;
    int     mpi_stat = 0;
    int     argc = 0;
    int     comm, rank, size;
    char**  argv;

	printf("at the beginning of the program, peak memory : %ld bytes\n", mkl_peak_mem_usage(MKL_PEAK_MEM));
    
	/* -------------------------------------------------------------------- */
    /* .. Init MPI.                                                         */
    /* -------------------------------------------------------------------- */
    mpi_stat = MPI_Init( &argc, &argv );
    mpi_stat = MPI_Comm_rank( MPI_COMM_WORLD, &rank );
	mpi_stat = MPI_Comm_size( MPI_COMM_WORLD, &size );
    comm =  MPI_Comm_c2f( MPI_COMM_WORLD );
	//printf ("comm : %d, rank : %d, size : %d", comm, rank, size);

	if( size < 2 )
	{
		printf("\nERROR: this example doesn't work on number of MPI less than 2");
		mpi_stat = MPI_Finalize();
		return 1;
	}

    /* -------------------------------------------------------------------- */
    /* .. Setup Cluster Sparse Solver control parameters.                                 */
    /* -------------------------------------------------------------------- */
    iparm[ 0] =  1; /* Solver default parameters overriden with provided by iparm */
    iparm[ 1] =  10; /* Use METIS for fill-in reordering */
    iparm[ 5] =  0; /* Write solution into x */
    iparm[ 7] =  2; /* Max number of iterative refinement steps */
    iparm[ 9] = 13; /* Perturb the pivot elements with 1E-13 */
    iparm[10] =  0; /* Don't use nonsymmetric permutation and scaling MPS */
    iparm[12] =  1; /* Switch on Maximum Weighted Matching algorithm (default for non-symmetric) */
    //iparm[17] = -1; /* Output: Number of nonzeros in the factor LU */
    //iparm[18] = -1; /* Output: Mflops for LU factorization */
    //iparm[26] =  1; /* Check input data for correctness */
	//iparm[34] =  1; /* Cluster Sparse Solver use C-style indexing for ia and ja arrays */
    //iparm[39] =  2; /* Input: matrix/rhs/solution are distributed between MPI processes  */
    /* If iparm[39]=2, the matrix is provided in distributed assembled matrix input          
       format. In this case, each MPI process stores only a part (or domain) of the matrix A 
       data. The bounds of the domain should be set via iparm(41) and iparm(42). Solution    
       vector is distributed between process in same manner with rhs. */		

    maxfct = 1; /* Maximum number of numerical factorizations. */
    mnum   = 1; /* Which factorization to use. */
    msglvl = 1; /* Print statistical information in file */
    error  = 0; /* Initialize error flag */
	err_mem = 0; /* Initialize error flag for memory allocation */

    /* Initialize matrix and rhs components on each process:
       In this example initial matrix is distributed between 2 processes
       so for MPI processes with rank > 1 input domains are empty */
	
	MKL_INT64 ii, ia_index, ja_index, up_r, down_r;
	FILE * inputFile;

    if (rank == 0)
    {
		//mkl_set_num_threads(2);
		inputFile=fopen("/user/rail7/source/CREATE_MATRIX/SPM_test_20000_n_1.txt", "r");

		fscanf(inputFile, "%lld %lld", &ia_index, &ja_index);

		iparm[40] = 1; /* The number of row in global matrix, rhs element and solution vector
                          that begins the input domain belonging to this MPI process */
        iparm[41] = n/4; /* The number of row in global matrix, rhs element and solution vector
                          that ends the input domain belonging to this MPI process   */
		
		//printf("%d %d %d %d", ia_index, ja_index, up_r, down_r);

        ia = (MKL_INT64*) MKL_malloc (sizeof (MKL_INT64) * ia_index, 64);
        ja = (MKL_INT64*) MKL_malloc (sizeof (MKL_INT64) * ja_index, 64);
        a = (double*) MKL_malloc (sizeof (double) * ja_index, 64);
        x = (double*) MKL_malloc (sizeof (double) * n, 64);
        b = (double*) MKL_malloc (sizeof (double) * n, 64);
        MPI_REDUCE_AND_BCAST;

		for (ii=0; ii<ia_index; ii++)
			fscanf(inputFile, "%lld", &ia[ii]);
		for (ii=0; ii<ja_index; ii++)
			fscanf(inputFile, "%lld", &ja[ii]);
		for (ii=0; ii<ja_index; ii++)
			fscanf(inputFile, "%lf", &a[ii]);
		for (ii=0; ii<n; ii++)
			b[ii] = 1.0;
	
	    fclose(inputFile);
		//printf("%d %d %d %d %lf %lf", ia[0], ia[ia_index-1], ja[0], ja[ja_index-1], a[0], a[ja_index-1]);
		//printf("%d %d", iparm[40], iparm[41]);
		//printf("%lf %lf %lf %lf",b[0],b[1],b[ia_index-3],b[ia_index-1]);
        if ( ia == NULL || ja == NULL || a == NULL || x == NULL || b == NULL )
        {
            if ( rank == 0 ) printf ("\nERROR during memory allocation: %lli", (long long int)error);
            mpi_stat = MPI_Finalize();
            return 1;
        }
    }
    
	else if (rank == 1)
    {
		//mkl_set_num_threads(2);
		inputFile=fopen("/user/rail7/source/CREATE_MATRIX/SPM_test_20000_n_2.txt", "r");

		fscanf(inputFile, "%lld %lld", &ia_index, &ja_index);

		iparm[40] = (n/4)+1; /* The number of row in global matrix, rhs element and solution vector
                          that begins the input domain belonging to this MPI process */
        iparm[41] = (n/4)*2; /* The number of row in global matrix, rhs element and solution vector
                          that ends the input domain belonging to this MPI process   */

        ia = (MKL_INT64*) MKL_malloc (sizeof (MKL_INT64) * ia_index, 64);
        ja = (MKL_INT64*) MKL_malloc (sizeof (MKL_INT64) * ja_index, 64);
        a = (double*) MKL_malloc (sizeof (double) * ja_index, 64);
        x = (double*) MKL_malloc (sizeof (double) * n, 64);
        b = (double*) MKL_malloc (sizeof (double) * n, 64);
        MPI_REDUCE_AND_BCAST;

		for (ii=0; ii<ia_index; ii++)
			fscanf(inputFile, "%lld", &ia[ii]);
		for (ii=0; ii<ja_index; ii++)
			fscanf(inputFile, "%lld", &ja[ii]);
		for (ii=0; ii<ja_index; ii++)
			fscanf(inputFile, "%lf", &a[ii]);
	
	    fclose(inputFile);

        if ( ia == NULL || ja == NULL || a == NULL || x == NULL || b == NULL )
        {
            if ( rank == 1 ) printf ("\nERROR during memory allocation: %lli", (long long int)error);
            mpi_stat = MPI_Finalize();
            return 1;
        }
    }

	else if (rank == 2)
    {
		//mkl_set_num_threads(2);
		inputFile=fopen("/user/rail7/source/CREATE_MATRIX/SPM_test_20000_n_3.txt", "r");

		fscanf(inputFile, "%lld %lld", &ia_index, &ja_index);

		iparm[40] = (n/4)*2+1; /* The number of row in global matrix, rhs element and solution vector
                          that begins the input domain belonging to this MPI process */
        iparm[41] = (n/4)*3; /* The number of row in global matrix, rhs element and solution vector
                          that ends the input domain belonging to this MPI process   */

        ia = (MKL_INT64*) MKL_malloc (sizeof (MKL_INT64) * ia_index, 64);
        ja = (MKL_INT64*) MKL_malloc (sizeof (MKL_INT64) * ja_index, 64);
        a = (double*) MKL_malloc (sizeof (double) * ja_index, 64);
        x = (double*) MKL_malloc (sizeof (double) * n, 64);
        b = (double*) MKL_malloc (sizeof (double) * n, 64);
        MPI_REDUCE_AND_BCAST;

		for (ii=0; ii<ia_index; ii++)
			fscanf(inputFile, "%lld", &ia[ii]);
		for (ii=0; ii<ja_index; ii++)
			fscanf(inputFile, "%lld", &ja[ii]);
		for (ii=0; ii<ja_index; ii++)
			fscanf(inputFile, "%lf", &a[ii]);
	
	    fclose(inputFile);

        if ( ia == NULL || ja == NULL || a == NULL || x == NULL || b == NULL )
        {
            if ( rank == 0 ) printf ("\nERROR during memory allocation: %lli", (long long int)error);
            mpi_stat = MPI_Finalize();
            return 1;
        }
    }

	else if (rank == 3)
    {
		//mkl_set_num_threads(2);
		inputFile=fopen("/user/rail7/source/CREATE_MATRIX/SPM_test_20000_n_4.txt", "r");

		fscanf(inputFile, "%lld %lld", &ia_index, &ja_index);

		iparm[40] = (n/4)*3+1; /* The number of row in global matrix, rhs element and solution vector
                          that begins the input domain belonging to this MPI process */
        iparm[41] = (n/4)*4; /* The number of row in global matrix, rhs element and solution vector
                          that ends the input domain belonging to this MPI process   */

        ia = (MKL_INT64*) MKL_malloc (sizeof (MKL_INT64) * ia_index, 64);
        ja = (MKL_INT64*) MKL_malloc (sizeof (MKL_INT64) * ja_index, 64);
        a = (double*) MKL_malloc (sizeof (double) * ja_index, 64);
        x = (double*) MKL_malloc (sizeof (double) * n, 64);
        b = (double*) MKL_malloc (sizeof (double) * n, 64);
        MPI_REDUCE_AND_BCAST;

		for (ii=0; ii<ia_index; ii++)
			fscanf(inputFile, "%lld", &ia[ii]);
		for (ii=0; ii<ja_index; ii++)
			fscanf(inputFile, "%lld", &ja[ii]);
		for (ii=0; ii<ja_index; ii++)
			fscanf(inputFile, "%lf", &a[ii]);
	
	    fclose(inputFile);

        if ( ia == NULL || ja == NULL || a == NULL || x == NULL || b == NULL )
        {
            if ( rank == 0 ) printf ("\nERROR during memory allocation: %lli", (long long int)error);
            mpi_stat = MPI_Finalize();
            return 1;
        }
    }

	else {
		MPI_REDUCE_AND_BCAST;
        /* In this example MPI processes with rank > 1 doesn't have input domain
           so iparm[40] need to be greater then iparm[41] */
        iparm[40] = 2;
        iparm[41] = 1;
    }

	//***************************************************************************************************************************************************

	gethostname(hostName, 1023);
    printf("\n%d th rank at the end of the loading phase, hostname : %s peak memory : %ld bytes\n", rank, hostName, mkl_peak_mem_usage(MKL_PEAK_MEM));
	if (rank == 0 )
		before = clock();
	/* -------------------------------------------------------------------- */
    /* .. Reordering and Symbolic Factorization. This step also allocates   */
    /* all memory that is necessary for the factorization.                  */
    /* -------------------------------------------------------------------- */
    phase = 11;
    cluster_sparse_solver_64 ( pt, &maxfct, &mnum, &mtype, &phase,
        &n, a, ia, ja, &idum, &nrhs, iparm, &msglvl, &ddum, &ddum, &comm, &error );
    if ( error != 0 )
    {
        if ( rank == 0 ) printf ("\nERROR during symbolic factorization: %lli", (long long int)error);
        mpi_stat = MPI_Finalize();
        return 1;
    }

    if ( rank == 0 ) printf ("\nReordering completed ... ");
		
	printf("\n%d th rank at the end of the phase 11, hostname : %s peak memory : %ld bytes\n", rank, hostName, mkl_peak_mem_usage(MKL_PEAK_MEM));
	mkl_peak_mem_usage(MKL_PEAK_MEM_RESET);
	printf("\n%d th rank at the end of the phase 11, hostname : %s peak memory : %ld bytes\n", rank, hostName, mkl_peak_mem_usage(MKL_PEAK_MEM));
	if (rank == 0 )
	{
		result = (double)(clock()-before)/CLOCKS_PER_SEC;
		printf("##### Time consumption for reordering is %7.2lf seconds.\n", result);
		before = clock();
	}

	//***************************************************************************************************************************************************
    /* -------------------------------------------------------------------- */
    /* .. Numerical factorization.                                          */
    /* -------------------------------------------------------------------- */
    phase = 22;
    cluster_sparse_solver_64 ( pt, &maxfct, &mnum, &mtype, &phase,
        &n, a, ia, ja, &idum, &nrhs, iparm, &msglvl, &ddum, &ddum, &comm, &error );
    if ( error != 0 )
    {
        if ( rank == 0 ) printf ("\nERROR during numerical factorization: %lli", (long long int)error);
        mpi_stat = MPI_Finalize();
        return 2;
    }
    if ( rank == 0 ) printf ("\nFactorization completed ... ");
	
	printf("\n%d th rank at the end of the phase 22, hostname : %s peak memory : %ld bytes\n", rank, hostName, mkl_peak_mem_usage(MKL_PEAK_MEM));
	mkl_peak_mem_usage(MKL_PEAK_MEM_RESET);
	printf("\n%d th rank at the end of the phase 22, hostname : %s peak memory : %ld bytes\n", rank, hostName, mkl_peak_mem_usage(MKL_PEAK_MEM));
	if (rank == 0 )
	{
		result = (double)(clock()-before)/CLOCKS_PER_SEC;
		printf("##### Time consumption for factorization is %7.2lf seconds.\n", result);
		before = clock();
	}

	//***************************************************************************************************************************************************
    /* -------------------------------------------------------------------- */
    /* .. Back substitution and iterative refinement.                       */
    /* -------------------------------------------------------------------- */
    phase = 33;

    if ( rank == 0 ) printf ("\nSolving system...");
    cluster_sparse_solver_64 ( pt, &maxfct, &mnum, &mtype, &phase,
        &n, a, ia, ja, &idum, &nrhs, iparm, &msglvl, b, x, &comm, &error );
    if ( error != 0 )
    {
        if ( rank == 0 ) printf ("\nERROR during solution: %lli", (long long int)error);
        mpi_stat = MPI_Finalize();
        return 4;
    }
    /* The solution of the system is distributed between MPI processes like as input matrix
       so MPI processes with rank 0 and 1 keep only part of solution */
	
	printf("\n%d th rank at the end of the program, hostname : %s peak memory : %ld bytes\n", rank, hostName, mkl_peak_mem_usage(MKL_PEAK_MEM));
	mkl_peak_mem_usage(MKL_PEAK_MEM_RESET);
	printf("\n%d th rank at the end of the program, hostname : %s peak memory : %ld bytes\n", rank, hostName, mkl_peak_mem_usage(MKL_PEAK_MEM));
	if (rank == 0 )
	{
		result = (double)(clock()-before)/CLOCKS_PER_SEC;
		printf("##### Time consumption for solving is %7.2lf seconds.\n", result);
	}

	//***************************************************************************************************************************************************
    if ( rank == 0 )
    {
        printf ("\nThe solution of the system is: ");
        for ( j = 0; j < 10; j++ )
        {
            printf ("\n on zero process x [%lli] = % f", (long long int)j, x);
        }
        printf ("\n");
    }
    MPI_Barrier(MPI_COMM_WORLD);

    if ( rank == 1 )
    {
        printf ("\nThe solution of the system is: ");
        for ( j = n-10; j < n; j++ )
        {
            printf ("\n on first process x [%lli] = % f", (long long int)j, x);
        }
        printf ("\n");
    }
    MPI_Barrier(MPI_COMM_WORLD);

	//double res, res0;
    //char*   uplo;
	//uplo = "Upper-triangle";
    //mkl_cspblas_scsrsymv ( uplo, &n, a, ia, ja, x, bs );
    //res  = 0.0;
    //res0 = 0.0;

	//printf ("%lf %lf %lf %lf \n", b[0], b[1], b[2], b[3]);
	//printf ("%lf %lf %lf %lf \n", bs[0], bs[1], bs[2], bs[3]);
	//printf("XXXX");
	
    /* -------------------------------------------------------------------- */
    /* .. Termination and release of memory. */
    /* -------------------------------------------------------------------- */
    phase = -1; /* Release internal memory. */
    cluster_sparse_solver_64 ( pt, &maxfct, &mnum, &mtype, &phase,
        &n, &ddum, ia, ja, &idum, &nrhs, iparm, &msglvl, &ddum, &ddum, &comm, &error );

	//printf ("%lf %lf %lf %lf \n", b[0], b[1], b[2], b[3]);
	//printf ("%lf %lf %lf %lf \n", bs[0], bs[1], bs[2], bs[3]);

    if ( error != 0 )
    {
        if ( rank == 0 ) printf ("\nERROR during release memory: %lli", (long long int)error);
        mpi_stat = MPI_Finalize();
        return 5;
    }
    if ( rank < size )
    {
        MKL_free(ia);
        MKL_free(ja);
        MKL_free(a);
        MKL_free(x);
        MKL_free(b);
    }
	
	printf("\n%d th rank at the end of the phase 33, hostname : %s peak memory : %ld bytes\n", rank, hostName, mkl_peak_mem_usage(MKL_PEAK_MEM));
	mkl_peak_mem_usage(MKL_PEAK_MEM_RESET);

	//gethostname(hostName, 1023);
	//printf("\n%d th rank hostname : %s peak memory : %ld bytes\n", rank, hostName, mkl_peak_mem_usage(MKL_PEAK_MEM));
	//printf("\nPeak memory allocated by Intel MKL memory allocator after reset of peak memory counter %ld bytes\n", mkl_peak_mem_usage(MKL_PEAK_MEM));

    mpi_stat = MPI_Finalize();
    return 0;
}

Alexander_K_Intel2 · ‎02-08-2017

Hi Yong-hee,

The results that you shows are quite strange, can i ask you to send us the tested matrix? Something goes wrong and we need to play with reproducer on our side

Thanks,

Alex

YONGHEE_L_ · ‎02-12-2017

Hi Alexander Kalinkin,

Below, I attach code for matrix generation..
If you execute this code, you can input the matrix size.
First, input 10000 at size and input 25 at interconnection, then you can generate a matrix having 4*10^8 rows same that I used.

I really appreciate your consideration and help.

Regards,
Yong-hee

#include <stdio.h>
#include <stdlib.h>
#include <tgmath.h>

void main()
{
	unsigned long long int i, j, row, col, input_n, percentage, temp_1;
	unsigned long long int spRow, spColumn, ia_index, ja_index, ia_check, rate_of_intercon;		
	unsigned long long int total_ia, total_ja;
	unsigned long long int spRowrow, spRowrowcol, spRC;
	unsigned long long int *ia_data, *ja_data, *division_index;
	double *a_data;
	FILE *SP_0;
	FILE *SP_1;
	FILE *SP_2;
	FILE *SP_3;
	FILE *SP_4;
	FILE *SP_0_1;
	FILE *SP_0_2;
	FILE *SP_0_3;
	FILE *SP_0_4;
	char title[300] = "SPM_test_";
	char title2;

	printf("Size of a die(int) : ");
	scanf("%llu", &input_n);
	printf("Perc. of interconnection(int) : ");
	scanf("%llu", &percentage);

	//sprintf(title2, "%llu", input_n);
	//title = title+title2;
	//puts(title);
	
	spRow = spColumn = input_n;	//size of 1-die of PDN
	spRC = spRow*spColumn;
	rate_of_intercon = percentage;		//percentage of interconnection
	total_ia = 4*(spRow*spRow)+1;
	total_ja = 4*3*spRow*spRow;
	//total_ja = 4*(3*(spRow-1)*(spRow-1)+2*2*(spRow-1)+1)+3*(spRow*(rate_of_intercon/100.0))pRow

	
	printf("Total number of the 'ia' is %llu and 'ja' is %llu\n", total_ia, total_ja);
	
	ia_data = (unsigned long long int*)malloc(sizeof(unsigned long long int)*total_ia);
	ja_data = (unsigned long long int*)malloc(sizeof(unsigned long long int)*total_ja);
	a_data = (double*)malloc(sizeof(double)*total_ja);
	division_index = (unsigned long long int*)malloc(sizeof(unsigned long long int)*5);
	
	ia_index = 0;
	ja_index = 0;
	ia_check = 1;
	division_index[0] = 0;
	ia_data[0] = 1;
	
	for (row=0; row<spRow; row++)		//1st-die
	{
		spRowrow = spRow*row;
		for (col=0; col<spColumn; col++)
		{
			spRowrowcol = spRowrow+(col+1);
			//printf("%llu %llu|", row, col);
			if (((row+1)==spRow) && ((col+1)==spColumn))
			{

				division_index[1] = ja_index;
				a_data[ia_index] = 4;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
			}
			else if ((row+1)==spRow)
			{
				a_data[ia_index] = 4;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+1;
				ia_index++;
				ja_index++;
			}	
			else if ((col+1)==spColumn)
			{
				a_data[ia_index] = 4;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+spRow;
				ia_index++;
				ja_index++;
			}	
			else if (((row+1)==(spRow/2)) && (((col+1)%(100/rate_of_intercon))==0))
			{
				//printf("row : %llu, column : %llu\n", row+1, col+1);
				a_data[ia_index] = 5;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+1;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+spRow;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+spRC;
				ia_index++;
				ja_index++;
			}
			else
			{	
				a_data[ia_index] = 4;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+1;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+spRow;
				ia_index++;
				ja_index++;
			}
			ia_data[ia_check] = ia_index+1;
			ia_check++;
		}
	}
	for (row=0; row<spRow; row++)		//2nd-die
	{
		spRowrow = spRC + spRow*row;
		for (col=0; col<spColumn; col++)
		{
			spRowrowcol = spRowrow+(col+1);
			if (((row+1)==spRow) && ((col+1)==spColumn))
			{
				division_index[2] = ja_index;
				a_data[ia_index] = 4;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
			}
			else if ((row+1)==spRow)
			{
				a_data[ia_index] = 4;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+1;
				ia_index++;
				ja_index++;
			}	
			else if ((col+1)==spColumn)
			{
				a_data[ia_index] = 4;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+spRow;
				ia_index++;
				ja_index++;
			}	
			else if (((row+1)==(spRow/2)) && (((col+1)%(100/rate_of_intercon))==0))
			{
				a_data[ia_index] = 6;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+1;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+spRow;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+spRC;
				ia_index++;
				ja_index++;
			}
			else
			{	
				a_data[ia_index] = 4;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+1;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+spRow;
				ia_index++;
				ja_index++;
			}
			ia_data[ia_check] = ia_index+1;
			ia_check++;
		}
	}
	for (row=0; row<spRow; row++)		//3rd-die
	{
		spRowrow = spRC*2 + spRow*row;
		for (col=0; col<spColumn; col++)
		{
			spRowrowcol = spRowrow+(col+1);
			if (((row+1)==spRow) && ((col+1)==spColumn))
			{
				division_index[3] = ja_index;
				a_data[ia_index] = 4;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
			}
			else if ((row+1)==spRow)
			{
				a_data[ia_index] = 4;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+1;
				ia_index++;
				ja_index++;
			}	
			else if ((col+1)==spColumn)
			{
				a_data[ia_index] = 4;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+spRow;
				ia_index++;
				ja_index++;
			}	
			else if (((row+1)==(spRow/2)) && (((col+1)%(100/rate_of_intercon))==0))
			{
				a_data[ia_index] = 6;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+1;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+spRow;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+spRC;
				ia_index++;
				ja_index++;
			}
			else
			{	
				a_data[ia_index] = 4;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+1;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+spRow;
				ia_index++;
				ja_index++;
			}
			ia_data[ia_check] = ia_index+1;
			ia_check++;
		}
	}
	for (row=0; row<spRow; row++)		//4th-die
	{
		spRowrow = spRC*3 + spRow*row;
		for (col=0; col<spColumn; col++)
		{
			spRowrowcol = spRowrow+(col+1);
			if (((row+1)==spRow) && ((col+1)==spColumn))
			{
				division_index[4] = ja_index;
				a_data[ia_index] = 4;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
			}
			else if ((row+1)==spRow)
			{
				a_data[ia_index] = 4;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+1;
				ia_index++;
				ja_index++;
			}	
			else if ((col+1)==spColumn)
			{
				a_data[ia_index] = 4;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+spRow;
				ia_index++;
				ja_index++;
			}	
			else if (((row+1)==(spRow/2)) && (((col+1)%(100/rate_of_intercon))==0))
			{
				a_data[ia_index] = 5;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+1;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+spRow;
				ia_index++;
				ja_index++;
			}
			else
			{	
				a_data[ia_index] = 4;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+1;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+spRow;
				ia_index++;
				ja_index++;
			}
			ia_data[ia_check] = ia_index+1;
			ia_check++;
		}
	}
	
	a_data[0] = 1000.0;	//Voltage source

	//********************************************

	SP_0 = fopen("SPM_test_20000_whole.txt", "w");
	fprintf(SP_0, "%llu %llu\n", ia_check, ja_index);
	for (i=0; i<ia_check; i++)
		fprintf(SP_0, "%llu ", ia_data);
	fprintf(SP_0, "\n");
	for (i=0; i<ja_index; i++)
		fprintf(SP_0, "%llu ", ja_data);
	fprintf(SP_0, "\n");
	for (i=0; i<ja_index; i++)
		fprintf(SP_0, "%lf ", a_data);
	fprintf(SP_0, "\n");

	//********************************************

	SP_0_1 = fopen("SPM_test_20000_n_1.txt", "w");
	SP_0_2 = fopen("SPM_test_20000_n_2.txt", "w");
	SP_0_3 = fopen("SPM_test_20000_n_3.txt", "w");
	SP_0_4 = fopen("SPM_test_20000_n_4.txt", "w");

	fprintf(SP_0_1, "%llu %llu\n", input_n*input_n+1, ia_data[input_n*input_n]-ia_data[0]);
	fprintf(SP_0_2, "%llu %llu\n", input_n*input_n+1, ia_data[input_n*input_n*2]-ia_data[input_n*input_n]);
	fprintf(SP_0_3, "%llu %llu\n", input_n*input_n+1, ia_data[input_n*input_n*3]-ia_data[input_n*input_n*2]);
	fprintf(SP_0_4, "%llu %llu\n", input_n*input_n+1, ia_data[input_n*input_n*4]-ia_data[input_n*input_n*3]);

	for (i=0; i<=input_n*input_n; i++)
		fprintf(SP_0_1, "%llu ", ia_data);
	fprintf(SP_0_1, "\n");
	temp_1 = ia_data[input_n*input_n]-1;
	for (i=input_n*input_n; i<=input_n*input_n*2; i++)
		fprintf(SP_0_2, "%llu ", ia_data-temp_1);
	fprintf(SP_0_2, "\n");
	temp_1 = ia_data[input_n*input_n*2]-1;
	for (i=input_n*input_n*2; i<=input_n*input_n*3; i++)
		fprintf(SP_0_3, "%llu ", ia_data-temp_1);
	fprintf(SP_0_3, "\n");
	temp_1 = ia_data[input_n*input_n*3]-1;
	for (i=input_n*input_n*3; i<=input_n*input_n*4; i++)
		fprintf(SP_0_4, "%llu ", ia_data-temp_1);
	fprintf(SP_0_4, "\n");

	for (i=ia_data[0]-1; i<ia_data[input_n*input_n]-1; i++)
		fprintf(SP_0_1, "%llu ", ja_data);
	fprintf(SP_0_1, "\n");
	for (i=ia_data[input_n*input_n]-1; i<ia_data[input_n*input_n*2]-1; i++)
		fprintf(SP_0_2, "%llu ", ja_data);
	fprintf(SP_0_2, "\n");
	for (i=ia_data[input_n*input_n*2]-1; i<ia_data[input_n*input_n*3]-1; i++)
		fprintf(SP_0_3, "%llu ", ja_data);
	fprintf(SP_0_3, "\n");
	for (i=ia_data[input_n*input_n*3]-1; i<ia_data[input_n*input_n*4]-1; i++)
		fprintf(SP_0_4, "%llu ", ja_data);
	fprintf(SP_0_4, "\n");

	for (i=ia_data[0]-1; i<ia_data[input_n*input_n]-1; i++)
		fprintf(SP_0_1, "%lf ", a_data);
	fprintf(SP_0_1, "\n");
	for (i=ia_data[input_n*input_n]-1; i<ia_data[input_n*input_n*2]-1; i++)
		fprintf(SP_0_2, "%lf ", a_data);
	fprintf(SP_0_2, "\n");
	for (i=ia_data[input_n*input_n*2]-1; i<ia_data[input_n*input_n*3]-1; i++)
		fprintf(SP_0_3, "%lf ", a_data);
	fprintf(SP_0_3, "\n");
	for (i=ia_data[input_n*input_n*3]-1; i<ia_data[input_n*input_n*4]-1; i++)
		fprintf(SP_0_4, "%lf ", a_data);
	fprintf(SP_0_4, "\n");

	//********************************************
	
	a_data[division_index[1]] = a_data[division_index[1]]/2.0;
	a_data[division_index[2]] = a_data[division_index[2]]/2.0;
	a_data[division_index[3]] = a_data[division_index[3]]/2.0;

	SP_1 = fopen("SPM_test_20000_1.txt", "w");
	fprintf(SP_1, "%llu %llu %llu %llu\n", spRC+1, division_index[1]-division_index[0]+1, 1, spRC);
	for (i=0; i<=spRC; i++)
		fprintf(SP_1, "%llu ", ia_data);
	//fprintf(SP_1, "%llu ", ia_data);
	fprintf(SP_1, "\n");
	for (i=division_index[0]; i<=division_index[1]; i++)
		fprintf(SP_1, "%llu ", ja_data);
	fprintf(SP_1, "\n");
	for (i=division_index[0]; i<=division_index[1]; i++)
		fprintf(SP_1, "%lf ", a_data);
	fprintf(SP_1, "\n");

	SP_2 = fopen("SPM_test_20000_2.txt", "w");
	fprintf(SP_2, "%llu %llu %llu %llu\n", spRC+2, division_index[2]-division_index[1]+1, spRC, spRC*2);
	for (i=spRC-1; i<=spRC*2; i++)
		fprintf(SP_2, "%llu ", ia_data-ia_data[spRC-1]+1);
	//fprintf(SP_2, "%llu ", ia_data-ia_data[spRC-1]+1);
	fprintf(SP_2, "\n");
	for (i=division_index[1]; i<=division_index[2]; i++)
		fprintf(SP_2, "%llu ", ja_data);
	fprintf(SP_2, "\n");
	for (i=division_index[1]; i<=division_index[2]; i++)
		fprintf(SP_2, "%lf ", a_data);
	fprintf(SP_2, "\n");

	SP_3 = fopen("SPM_test_20000_3.txt", "w");
	fprintf(SP_3, "%llu %llu %llu %llu\n", spRC+2, division_index[3]-division_index[2]+1, spRC*2, spRC*3);
	for (i=spRC*2-1; i<=spRC*3; i++)
		fprintf(SP_3, "%llu ", ia_data-ia_data[spRC*2-1]+1);
	//fprintf(SP_3, "%llu ", ia_data-ia_data[spRC*2-1]+1);
	fprintf(SP_3, "\n");
	for (i=division_index[2]; i<=division_index[3]; i++)
		fprintf(SP_3, "%llu ", ja_data);
	fprintf(SP_3, "\n");
	for (i=division_index[2]; i<=division_index[3]; i++)
		fprintf(SP_3, "%lf ", a_data);
	fprintf(SP_3, "\n");

	SP_4 = fopen("SPM_test_20000_4.txt", "w");
	fprintf(SP_4, "%llu %llu %llu %llu\n", spRC+2, division_index[4]-division_index[3]+1, spRC*3, spRC*4);
	for (i=spRC*3-1; i<=spRC*4; i++)
		fprintf(SP_4, "%llu ", ia_data-ia_data[spRC*3-1]+1);
	//fprintf(SP_4, "%llu ", ia_data-ia_data[spRC*3-1]+1);
	fprintf(SP_4, "\n");
	for (i=division_index[3]; i<=division_index[4]; i++)
		fprintf(SP_4, "%llu ", ja_data);
	fprintf(SP_4, "\n");
	for (i=division_index[3]; i<=division_index[4]; i++)
		fprintf(SP_4, "%lf ", a_data);
	fprintf(SP_4, "\n");

	//********************************************

	fclose(SP_0);
	fclose(SP_1);
	fclose(SP_2);
	fclose(SP_3);
	fclose(SP_4);
	fclose(SP_0_1);
	fclose(SP_0_2);
	fclose(SP_0_3);
	fclose(SP_0_4);

	free(ia_data);
	free(ja_data);
	free(a_data);
	free(division_index);

}