<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Hi Yong-hee, in Intel® oneAPI Math Kernel Library</title>
    <link>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Large-sparse-matrix-solving-problem-with-cluster/m-p/1089785#M23168</link>
    <description>&lt;P&gt;Hi&amp;nbsp;&lt;SPAN style="font-size: 12px;"&gt;Yong-hee,&lt;/SPAN&gt;&lt;/P&gt;

&lt;P&gt;&lt;SPAN style="font-size: 12px;"&gt;The results that you show are quite strange. Can I ask you to send us the tested matrix? Something is going wrong, and we need to experiment with a reproducer on our side.&lt;/SPAN&gt;&lt;/P&gt;

&lt;P&gt;&lt;SPAN style="font-size: 12px;"&gt;Thanks,&lt;/SPAN&gt;&lt;/P&gt;

&lt;P&gt;&lt;SPAN style="font-size: 12px;"&gt;Alex&lt;/SPAN&gt;&lt;/P&gt;</description>
    <pubDate>Wed, 08 Feb 2017 09:44:37 GMT</pubDate>
    <dc:creator>Alexander_K_Intel2</dc:creator>
    <dc:date>2017-02-08T09:44:37Z</dc:date>
    <item>
      <title>Large sparse matrix solving problem with cluster</title>
      <link>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Large-sparse-matrix-solving-problem-with-cluster/m-p/1089784#M23167</link>
      <description>&lt;P&gt;Dear Intel&lt;/P&gt;

&lt;P&gt;With your help a month ago, I could set up the 'cluster sparse solver 64' program using 'iparm[1]=10 (The MPI version of the nested dissection and symbolic factorization algorithms)' with my 4 cluster computers.&lt;BR /&gt;
	This program is designed for very large sparse matrices, which contain about 10^8 - 10^9 rows.&lt;BR /&gt;
	However, it shows lower performance (4MPI &amp;amp; 4OpenMP) than a single machine (1MPI &amp;amp; 4OpenMP).&lt;/P&gt;

&lt;P&gt;The following table shows the results of the test.&lt;/P&gt;

&lt;P&gt;&amp;nbsp; target : 4*10^8 rows matrix&lt;BR /&gt;
	&amp;nbsp; time consumption &amp;nbsp; (1MPI &amp;amp; 4OpenMP &amp;nbsp;|| &amp;nbsp;4MPI &amp;amp; 4OpenMP(iparm[1]=3) &amp;nbsp;|| &amp;nbsp;4MPI &amp;amp; 4OpenMP(iparm[1]=10))&lt;BR /&gt;
	-------------------------------------------------------------------------------------------------------------------------&lt;BR /&gt;
	&amp;nbsp; &amp;nbsp; Reorder time &amp;nbsp; &amp;nbsp; &amp;nbsp; (1021.6 s &amp;nbsp; &amp;nbsp;|| &amp;nbsp; &amp;nbsp;2094.3 s &amp;nbsp; &amp;nbsp;|| &amp;nbsp; &amp;nbsp;7644.3 s)&lt;BR /&gt;
	&amp;nbsp; &amp;nbsp; Factorization time &amp;nbsp;(1403.2 s &amp;nbsp; &amp;nbsp;|| &amp;nbsp; &amp;nbsp;2136.6 s &amp;nbsp; &amp;nbsp;|| &amp;nbsp; &amp;nbsp;9263.1 s)&lt;BR /&gt;
	&amp;nbsp; &amp;nbsp; Solution time &amp;nbsp; &amp;nbsp; &amp;nbsp; (158.6 s &amp;nbsp; &amp;nbsp;|| &amp;nbsp; &amp;nbsp;684.9 s &amp;nbsp; &amp;nbsp;|| &amp;nbsp; &amp;nbsp;554.14 s)&lt;BR /&gt;
	-------------------------------------------------------------------------------------------------------------------------&lt;/P&gt;

&lt;P&gt;Could you please look into this issue?&lt;BR /&gt;
	I attach my code below.&lt;/P&gt;

&lt;P&gt;Thank you very much in advance!!!&lt;/P&gt;

&lt;P&gt;Regards,&lt;BR /&gt;
	&amp;nbsp;Yong-hee&lt;/P&gt;

&lt;P&gt;P.S. This is my code (almost the same as the example code).&lt;BR /&gt;
	=================================================================================&lt;/P&gt;

&lt;PRE class="brush:cpp;"&gt;/*******************************************************************************
* Copyright 2004-2015 Intel Corporation All Rights Reserved.
*
* The source code,  information  and material  ("Material") contained  herein is
* owned by Intel Corporation or its  suppliers or licensors,  and  title to such
* Material remains with Intel  Corporation or its  suppliers or  licensors.  The
* Material  contains  proprietary  information  of  Intel or  its suppliers  and
* licensors.  The Material is protected by  worldwide copyright  laws and treaty
* provisions.  No part  of  the  Material   may  be  used,  copied,  reproduced,
* modified, published,  uploaded, posted, transmitted,  distributed or disclosed
* in any way without Intel's prior express written permission.  No license under
* any patent,  copyright or other  intellectual property rights  in the Material
* is granted to  or  conferred  upon  you,  either   expressly,  by implication,
* inducement,  estoppel  or  otherwise.  Any  license   under such  intellectual
* property rights must be express and approved by Intel in writing.
*
* Unless otherwise agreed by Intel in writing,  you may not remove or alter this
* notice or  any  other  notice   embedded  in  Materials  by  Intel  or Intel's
* suppliers or licensors in any way.
*******************************************************************************/

/*
*
*   MKL Cluster Sparse Solver example demonstrating the case when initial data (matrix
*   and rhs) distributed between several MPI processes, final solution is
*   distributed between MPI processes in the same way as they hold initial data.
*
********************************************************************************
*/
#include &amp;lt;stdio.h&amp;gt;
#include &amp;lt;stdlib.h&amp;gt;
#include &amp;lt;unistd.h&amp;gt;
#include &amp;lt;math.h&amp;gt;
#include &amp;lt;time.h&amp;gt;
#include "mpi.h"
#include "mkl.h"
#include "mkl_cluster_sparse_solver.h"

#ifdef MKL_ILP64
#define MPI_DT MPI_LONG
#else
#define MPI_DT MPI_INT
#endif
#define MPI_REDUCE_AND_BCAST \
        MPI_Reduce(&amp;amp;err_mem, &amp;amp;error, 1, MPI_DT, MPI_SUM, 0, MPI_COMM_WORLD); \
        MPI_Bcast(&amp;amp;error, 1, MPI_DT, 0, MPI_COMM_WORLD);

int main(void)

{
	clock_t before;
	double result;

	MKL_INT64 AllocatedBytes; 

	int N_AllocatedBuffers;
	mkl_peak_mem_usage(MKL_PEAK_MEM_ENABLE);
	AllocatedBytes = mkl_mem_stat(&amp;amp;N_AllocatedBuffers);

	mkl_set_num_threads(2);
	FILE* pre_inputFile=fopen("/user/source/CREATE_MATRIX/SPM_test_20000_whole.txt", "r");
	if (pre_inputFile==NULL){
		puts( "There is no file." );}

	MKL_INT64 n;
	fscanf(pre_inputFile, "%lld", &amp;amp;n);
	n -= 1;
    fclose(pre_inputFile);
	
    /* Matrix data. */
	//MKL_INT64 n = 5;

    MKL_INT64 mtype = 2; /* Real symmetric definite matrix */
    MKL_INT64 *ia = NULL;
    MKL_INT64 *ja = NULL;
    double  *a = NULL;
    /* RHS and solution vectors. */
    double  *b = NULL;
    double  *x = NULL;

	char	hostName[1024]	= "\0";

    MKL_INT64 nrhs = 1; /* Number of right hand sides. */
    /* Internal solver memory pointer pt, */
    /* 32-bit: int pt[64]; 64-bit: long int pt[64] */
    /* or void *pt[64] should be OK on both architectures */
    void *pt[64] = { 0 };
    /* Cluster Sparse Solver control parameters. */
    MKL_INT64 iparm[64] = { 0 };
    MKL_INT64 maxfct, mnum, phase, msglvl, error, err_mem;

    /* Auxiliary variables. */
    double  ddum; /* Double dummy   */
    MKL_INT64 idum; /* Integer dummy. */
    MKL_INT64 j;
    int     mpi_stat = 0;
    int     argc = 0;
    int     comm, rank, size;
    char**  argv;

	printf("at the beginning of the program, peak memory : %ld bytes\n", mkl_peak_mem_usage(MKL_PEAK_MEM));
    
	/* -------------------------------------------------------------------- */
    /* .. Init MPI.                                                         */
    /* -------------------------------------------------------------------- */
    mpi_stat = MPI_Init( &amp;amp;argc, &amp;amp;argv );
    mpi_stat = MPI_Comm_rank( MPI_COMM_WORLD, &amp;amp;rank );
	mpi_stat = MPI_Comm_size( MPI_COMM_WORLD, &amp;amp;size );
    comm =  MPI_Comm_c2f( MPI_COMM_WORLD );
	//printf ("comm : %d, rank : %d, size : %d", comm, rank, size);

	if( size &amp;lt; 2 )
	{
		printf("\nERROR: this example doesn't work on number of MPI less than 2");
		mpi_stat = MPI_Finalize();
		return 1;
	}

    /* -------------------------------------------------------------------- */
    /* .. Setup Cluster Sparse Solver control parameters.                                 */
    /* -------------------------------------------------------------------- */
    iparm[ 0] =  1; /* Solver default parameters overriden with provided by iparm */
    iparm[ 1] =  10; /* Use the MPI version of the nested dissection and symbolic factorization algorithms */
    iparm[ 5] =  0; /* Write solution into x */
    iparm[ 7] =  2; /* Max number of iterative refinement steps */
    iparm[ 9] = 13; /* Perturb the pivot elements with 1E-13 */
    iparm[10] =  0; /* Don't use nonsymmetric permutation and scaling MPS */
    iparm[12] =  1; /* Switch on Maximum Weighted Matching algorithm (default for non-symmetric) */
    //iparm[17] = -1; /* Output: Number of nonzeros in the factor LU */
    //iparm[18] = -1; /* Output: Mflops for LU factorization */
    //iparm[26] =  1; /* Check input data for correctness */
	//iparm[34] =  1; /* Cluster Sparse Solver use C-style indexing for ia and ja arrays */
    //iparm[39] =  2; /* Input: matrix/rhs/solution are distributed between MPI processes  */
    /* If iparm[39]=2, the matrix is provided in distributed assembled matrix input          
       format. In this case, each MPI process stores only a part (or domain) of the matrix A 
       data. The bounds of the domain should be set via iparm(41) and iparm(42). Solution    
       vector is distributed between process in same manner with rhs. */		

    maxfct = 1; /* Maximum number of numerical factorizations. */
    mnum   = 1; /* Which factorization to use. */
    msglvl = 1; /* Print statistical information in file */
    error  = 0; /* Initialize error flag */
	err_mem = 0; /* Initialize error flag for memory allocation */

    /* Initialize matrix and rhs components on each process:
       In this example initial matrix is distributed between 2 processes
       so for MPI processes with rank &amp;gt; 1 input domains are empty */
	
	MKL_INT64 ii, ia_index, ja_index, up_r, down_r;
	FILE * inputFile;

    if (rank == 0)
    {
		//mkl_set_num_threads(2);
		inputFile=fopen("/user/rail7/source/CREATE_MATRIX/SPM_test_20000_n_1.txt", "r");

		fscanf(inputFile, "%lld %lld", &amp;amp;ia_index, &amp;amp;ja_index);

		iparm[40] = 1; /* The number of row in global matrix, rhs element and solution vector
                          that begins the input domain belonging to this MPI process */
        iparm[41] = n/4; /* The number of row in global matrix, rhs element and solution vector
                          that ends the input domain belonging to this MPI process   */
		
		//printf("%d %d %d %d", ia_index, ja_index, up_r, down_r);

        ia = (MKL_INT64*) MKL_malloc (sizeof (MKL_INT64) * ia_index, 64);
        ja = (MKL_INT64*) MKL_malloc (sizeof (MKL_INT64) * ja_index, 64);
        a = (double*) MKL_malloc (sizeof (double) * ja_index, 64);
        x = (double*) MKL_malloc (sizeof (double) * n, 64);
        b = (double*) MKL_malloc (sizeof (double) * n, 64);
        MPI_REDUCE_AND_BCAST;

		for (ii=0; ii&amp;lt;ia_index; ii++)
			fscanf(inputFile, "%lld", &amp;amp;ia[ii]);
		for (ii=0; ii&amp;lt;ja_index; ii++)
			fscanf(inputFile, "%lld", &amp;amp;ja[ii]);
		for (ii=0; ii&amp;lt;ja_index; ii++)
			fscanf(inputFile, "%lf", &amp;amp;a[ii]);
		for (ii=0; ii&amp;lt;n; ii++)
			b[ii] = 1.0;
	
	    fclose(inputFile);
		//printf("%d %d %d %d %lf %lf", ia[0], ia[ia_index-1], ja[0], ja[ja_index-1], a[0], a[ja_index-1]);
		//printf("%d %d", iparm[40], iparm[41]);
		//printf("%lf %lf %lf %lf",b[0],b[1],b[ia_index-3],b[ia_index-1]);
        if ( ia == NULL || ja == NULL || a == NULL || x == NULL || b == NULL )
        {
            if ( rank == 0 ) printf ("\nERROR during memory allocation: %lli", (long long int)error);
            mpi_stat = MPI_Finalize();
            return 1;
        }
    }
    
	else if (rank == 1)
    {
		//mkl_set_num_threads(2);
		inputFile=fopen("/user/rail7/source/CREATE_MATRIX/SPM_test_20000_n_2.txt", "r");

		fscanf(inputFile, "%lld %lld", &amp;amp;ia_index, &amp;amp;ja_index);

		iparm[40] = (n/4)+1; /* The number of row in global matrix, rhs element and solution vector
                          that begins the input domain belonging to this MPI process */
        iparm[41] = (n/4)*2; /* The number of row in global matrix, rhs element and solution vector
                          that ends the input domain belonging to this MPI process   */

        ia = (MKL_INT64*) MKL_malloc (sizeof (MKL_INT64) * ia_index, 64);
        ja = (MKL_INT64*) MKL_malloc (sizeof (MKL_INT64) * ja_index, 64);
        a = (double*) MKL_malloc (sizeof (double) * ja_index, 64);
        x = (double*) MKL_malloc (sizeof (double) * n, 64);
        b = (double*) MKL_malloc (sizeof (double) * n, 64);
        MPI_REDUCE_AND_BCAST;

		for (ii=0; ii&amp;lt;ia_index; ii++)
			fscanf(inputFile, "%lld", &amp;amp;ia[ii]);
		for (ii=0; ii&amp;lt;ja_index; ii++)
			fscanf(inputFile, "%lld", &amp;amp;ja[ii]);
		for (ii=0; ii&amp;lt;ja_index; ii++)
			fscanf(inputFile, "%lf", &amp;amp;a[ii]);
	
	    fclose(inputFile);

        if ( ia == NULL || ja == NULL || a == NULL || x == NULL || b == NULL )
        {
            if ( rank == 1 ) printf ("\nERROR during memory allocation: %lli", (long long int)error);
            mpi_stat = MPI_Finalize();
            return 1;
        }
    }

	else if (rank == 2)
    {
		//mkl_set_num_threads(2);
		inputFile=fopen("/user/rail7/source/CREATE_MATRIX/SPM_test_20000_n_3.txt", "r");

		fscanf(inputFile, "%lld %lld", &amp;amp;ia_index, &amp;amp;ja_index);

		iparm[40] = (n/4)*2+1; /* The number of row in global matrix, rhs element and solution vector
                          that begins the input domain belonging to this MPI process */
        iparm[41] = (n/4)*3; /* The number of row in global matrix, rhs element and solution vector
                          that ends the input domain belonging to this MPI process   */

        ia = (MKL_INT64*) MKL_malloc (sizeof (MKL_INT64) * ia_index, 64);
        ja = (MKL_INT64*) MKL_malloc (sizeof (MKL_INT64) * ja_index, 64);
        a = (double*) MKL_malloc (sizeof (double) * ja_index, 64);
        x = (double*) MKL_malloc (sizeof (double) * n, 64);
        b = (double*) MKL_malloc (sizeof (double) * n, 64);
        MPI_REDUCE_AND_BCAST;

		for (ii=0; ii&amp;lt;ia_index; ii++)
			fscanf(inputFile, "%lld", &amp;amp;ia[ii]);
		for (ii=0; ii&amp;lt;ja_index; ii++)
			fscanf(inputFile, "%lld", &amp;amp;ja[ii]);
		for (ii=0; ii&amp;lt;ja_index; ii++)
			fscanf(inputFile, "%lf", &amp;amp;a[ii]);
	
	    fclose(inputFile);

        if ( ia == NULL || ja == NULL || a == NULL || x == NULL || b == NULL )
        {
            if ( rank == 0 ) printf ("\nERROR during memory allocation: %lli", (long long int)error);
            mpi_stat = MPI_Finalize();
            return 1;
        }
    }

	else if (rank == 3)
    {
		//mkl_set_num_threads(2);
		inputFile=fopen("/user/rail7/source/CREATE_MATRIX/SPM_test_20000_n_4.txt", "r");

		fscanf(inputFile, "%lld %lld", &amp;amp;ia_index, &amp;amp;ja_index);

		iparm[40] = (n/4)*3+1; /* The number of row in global matrix, rhs element and solution vector
                          that begins the input domain belonging to this MPI process */
        iparm[41] = (n/4)*4; /* The number of row in global matrix, rhs element and solution vector
                          that ends the input domain belonging to this MPI process   */

        ia = (MKL_INT64*) MKL_malloc (sizeof (MKL_INT64) * ia_index, 64);
        ja = (MKL_INT64*) MKL_malloc (sizeof (MKL_INT64) * ja_index, 64);
        a = (double*) MKL_malloc (sizeof (double) * ja_index, 64);
        x = (double*) MKL_malloc (sizeof (double) * n, 64);
        b = (double*) MKL_malloc (sizeof (double) * n, 64);
        MPI_REDUCE_AND_BCAST;

		for (ii=0; ii&amp;lt;ia_index; ii++)
			fscanf(inputFile, "%lld", &amp;amp;ia[ii]);
		for (ii=0; ii&amp;lt;ja_index; ii++)
			fscanf(inputFile, "%lld", &amp;amp;ja[ii]);
		for (ii=0; ii&amp;lt;ja_index; ii++)
			fscanf(inputFile, "%lf", &amp;amp;a[ii]);
	
	    fclose(inputFile);

        if ( ia == NULL || ja == NULL || a == NULL || x == NULL || b == NULL )
        {
            if ( rank == 0 ) printf ("\nERROR during memory allocation: %lli", (long long int)error);
            mpi_stat = MPI_Finalize();
            return 1;
        }
    }

	else {
		MPI_REDUCE_AND_BCAST;
        /* In this example MPI processes with rank &amp;gt; 1 doesn't have input domain
           so iparm[40] need to be greater then iparm[41] */
        iparm[40] = 2;
        iparm[41] = 1;
    }

	//***************************************************************************************************************************************************

	gethostname(hostName, 1023);
    printf("\n%d th rank at the end of the loading phase, hostname : %s peak memory : %ld bytes\n", rank, hostName, mkl_peak_mem_usage(MKL_PEAK_MEM));
	if (rank == 0 )
		before = clock();
	/* -------------------------------------------------------------------- */
    /* .. Reordering and Symbolic Factorization. This step also allocates   */
    /* all memory that is necessary for the factorization.                  */
    /* -------------------------------------------------------------------- */
    phase = 11;
    cluster_sparse_solver_64 ( pt, &amp;amp;maxfct, &amp;amp;mnum, &amp;amp;mtype, &amp;amp;phase,
        &amp;amp;n, a, ia, ja, &amp;amp;idum, &amp;amp;nrhs, iparm, &amp;amp;msglvl, &amp;amp;ddum, &amp;amp;ddum, &amp;amp;comm, &amp;amp;error );
    if ( error != 0 )
    {
        if ( rank == 0 ) printf ("\nERROR during symbolic factorization: %lli", (long long int)error);
        mpi_stat = MPI_Finalize();
        return 1;
    }

    if ( rank == 0 ) printf ("\nReordering completed ... ");
		
	printf("\n%d th rank at the end of the phase 11, hostname : %s peak memory : %ld bytes\n", rank, hostName, mkl_peak_mem_usage(MKL_PEAK_MEM));
	mkl_peak_mem_usage(MKL_PEAK_MEM_RESET);
	printf("\n%d th rank at the end of the phase 11, hostname : %s peak memory : %ld bytes\n", rank, hostName, mkl_peak_mem_usage(MKL_PEAK_MEM));
	if (rank == 0 )
	{
		result = (double)(clock()-before)/CLOCKS_PER_SEC;
		printf("##### Time consumption for reordering is %7.2lf seconds.\n", result);
		before = clock();
	}

	//***************************************************************************************************************************************************
    /* -------------------------------------------------------------------- */
    /* .. Numerical factorization.                                          */
    /* -------------------------------------------------------------------- */
    phase = 22;
    cluster_sparse_solver_64 ( pt, &amp;amp;maxfct, &amp;amp;mnum, &amp;amp;mtype, &amp;amp;phase,
        &amp;amp;n, a, ia, ja, &amp;amp;idum, &amp;amp;nrhs, iparm, &amp;amp;msglvl, &amp;amp;ddum, &amp;amp;ddum, &amp;amp;comm, &amp;amp;error );
    if ( error != 0 )
    {
        if ( rank == 0 ) printf ("\nERROR during numerical factorization: %lli", (long long int)error);
        mpi_stat = MPI_Finalize();
        return 2;
    }
    if ( rank == 0 ) printf ("\nFactorization completed ... ");
	
	printf("\n%d th rank at the end of the phase 22, hostname : %s peak memory : %ld bytes\n", rank, hostName, mkl_peak_mem_usage(MKL_PEAK_MEM));
	mkl_peak_mem_usage(MKL_PEAK_MEM_RESET);
	printf("\n%d th rank at the end of the phase 22, hostname : %s peak memory : %ld bytes\n", rank, hostName, mkl_peak_mem_usage(MKL_PEAK_MEM));
	if (rank == 0 )
	{
		result = (double)(clock()-before)/CLOCKS_PER_SEC;
		printf("##### Time consumption for factorization is %7.2lf seconds.\n", result);
		before = clock();
	}

	//***************************************************************************************************************************************************
    /* -------------------------------------------------------------------- */
    /* .. Back substitution and iterative refinement.                       */
    /* -------------------------------------------------------------------- */
    phase = 33;

    if ( rank == 0 ) printf ("\nSolving system...");
    cluster_sparse_solver_64 ( pt, &amp;amp;maxfct, &amp;amp;mnum, &amp;amp;mtype, &amp;amp;phase,
        &amp;amp;n, a, ia, ja, &amp;amp;idum, &amp;amp;nrhs, iparm, &amp;amp;msglvl, b, x, &amp;amp;comm, &amp;amp;error );
    if ( error != 0 )
    {
        if ( rank == 0 ) printf ("\nERROR during solution: %lli", (long long int)error);
        mpi_stat = MPI_Finalize();
        return 4;
    }
    /* The solution of the system is distributed between MPI processes like as input matrix
       so MPI processes with rank 0 and 1 keep only part of solution */
	
	printf("\n%d th rank at the end of the program, hostname : %s peak memory : %ld bytes\n", rank, hostName, mkl_peak_mem_usage(MKL_PEAK_MEM));
	mkl_peak_mem_usage(MKL_PEAK_MEM_RESET);
	printf("\n%d th rank at the end of the program, hostname : %s peak memory : %ld bytes\n", rank, hostName, mkl_peak_mem_usage(MKL_PEAK_MEM));
	if (rank == 0 )
	{
		result = (double)(clock()-before)/CLOCKS_PER_SEC;
		printf("##### Time consumption for solving is %7.2lf seconds.\n", result);
	}

	//***************************************************************************************************************************************************
    if ( rank == 0 )
    {
        printf ("\nThe solution of the system is: ");
        for ( j = 0; j &amp;lt; 10; j++ )
        {
            printf ("\n on zero process x [%lli] = % f", (long long int)j, x[j]);
        }
        printf ("\n");
    }
    MPI_Barrier(MPI_COMM_WORLD);

    if ( rank == 1 )
    {
        printf ("\nThe solution of the system is: ");
        for ( j = n-10; j &amp;lt; n; j++ )
        {
            printf ("\n on first process x [%lli] = % f", (long long int)j, x[j]);
        }
        printf ("\n");
    }
    MPI_Barrier(MPI_COMM_WORLD);

	//double res, res0;
    //char*   uplo;
	//uplo = "Upper-triangle";
    //mkl_cspblas_scsrsymv ( uplo, &amp;amp;n, a, ia, ja, x, bs );
    //res  = 0.0;
    //res0 = 0.0;

	//printf ("%lf %lf %lf %lf \n", b[0], b[1], b[2], b[3]);
	//printf ("%lf %lf %lf %lf \n", bs[0], bs[1], bs[2], bs[3]);
	//printf("XXXX");
	
    /* -------------------------------------------------------------------- */
    /* .. Termination and release of memory. */
    /* -------------------------------------------------------------------- */
    phase = -1; /* Release internal memory. */
    cluster_sparse_solver_64 ( pt, &amp;amp;maxfct, &amp;amp;mnum, &amp;amp;mtype, &amp;amp;phase,
        &amp;amp;n, &amp;amp;ddum, ia, ja, &amp;amp;idum, &amp;amp;nrhs, iparm, &amp;amp;msglvl, &amp;amp;ddum, &amp;amp;ddum, &amp;amp;comm, &amp;amp;error );

	//printf ("%lf %lf %lf %lf \n", b[0], b[1], b[2], b[3]);
	//printf ("%lf %lf %lf %lf \n", bs[0], bs[1], bs[2], bs[3]);

    if ( error != 0 )
    {
        if ( rank == 0 ) printf ("\nERROR during release memory: %lli", (long long int)error);
        mpi_stat = MPI_Finalize();
        return 5;
    }
    if ( rank &amp;lt; size )
    {
        MKL_free(ia);
        MKL_free(ja);
        MKL_free(a);
        MKL_free(x);
        MKL_free(b);
    }
	
	printf("\n%d th rank at the end of the phase 33, hostname : %s peak memory : %ld bytes\n", rank, hostName, mkl_peak_mem_usage(MKL_PEAK_MEM));
	mkl_peak_mem_usage(MKL_PEAK_MEM_RESET);

	//gethostname(hostName, 1023);
	//printf("\n%d th rank hostname : %s peak memory : %ld bytes\n", rank, hostName, mkl_peak_mem_usage(MKL_PEAK_MEM));
	//printf("\nPeak memory allocated by Intel MKL memory allocator after reset of peak memory counter %ld bytes\n", mkl_peak_mem_usage(MKL_PEAK_MEM));

    mpi_stat = MPI_Finalize();
    return 0;
}
&lt;/PRE&gt;

&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Tue, 07 Feb 2017 02:09:12 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Large-sparse-matrix-solving-problem-with-cluster/m-p/1089784#M23167</guid>
      <dc:creator>YONGHEE_L_</dc:creator>
      <dc:date>2017-02-07T02:09:12Z</dc:date>
    </item>
    <item>
      <title>Hi Yong-hee,</title>
      <link>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Large-sparse-matrix-solving-problem-with-cluster/m-p/1089785#M23168</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;SPAN style="font-size: 12px;"&gt;Yong-hee,&lt;/SPAN&gt;&lt;/P&gt;

&lt;P&gt;&lt;SPAN style="font-size: 12px;"&gt;The results that you show are quite strange. Can I ask you to send us the tested matrix? Something is going wrong, and we need to experiment with a reproducer on our side.&lt;/SPAN&gt;&lt;/P&gt;

&lt;P&gt;&lt;SPAN style="font-size: 12px;"&gt;Thanks,&lt;/SPAN&gt;&lt;/P&gt;

&lt;P&gt;&lt;SPAN style="font-size: 12px;"&gt;Alex&lt;/SPAN&gt;&lt;/P&gt;</description>
      <pubDate>Wed, 08 Feb 2017 09:44:37 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Large-sparse-matrix-solving-problem-with-cluster/m-p/1089785#M23168</guid>
      <dc:creator>Alexander_K_Intel2</dc:creator>
      <dc:date>2017-02-08T09:44:37Z</dc:date>
    </item>
    <item>
      <title>Hi Alexander Kalinkin,</title>
      <link>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Large-sparse-matrix-solving-problem-with-cluster/m-p/1089786#M23169</link>
      <description>&lt;P&gt;Hi Alexander Kalinkin,&lt;/P&gt;

&lt;P&gt;Below, I attach the code for matrix generation.&lt;BR /&gt;
	If you execute this code, you can input the matrix size.&lt;BR /&gt;
	First, enter 10000 for the size and 25 for the interconnection percentage; this generates a matrix with 4*10^8 rows, the same one that I used.&lt;/P&gt;

&lt;P&gt;&lt;SPAN style="font-size: 1em;"&gt;I really appreciate your consideration and help.&lt;/SPAN&gt;&lt;/P&gt;

&lt;P&gt;Regards,&amp;nbsp;&lt;BR /&gt;
	Yong-hee&lt;/P&gt;

&lt;P&gt;&amp;nbsp;&lt;/P&gt;

&lt;PRE class="brush:cpp;"&gt;#include &amp;lt;stdio.h&amp;gt;
#include &amp;lt;stdlib.h&amp;gt;
#include &amp;lt;tgmath.h&amp;gt;

void main()
{
	unsigned long long int i, j, row, col, input_n, percentage, temp_1;
	unsigned long long int spRow, spColumn, ia_index, ja_index, ia_check, rate_of_intercon;		
	unsigned long long int total_ia, total_ja;
	unsigned long long int spRowrow, spRowrowcol, spRC;
	unsigned long long int *ia_data, *ja_data, *division_index;
	double *a_data;
	FILE *SP_0;
	FILE *SP_1;
	FILE *SP_2;
	FILE *SP_3;
	FILE *SP_4;
	FILE *SP_0_1;
	FILE *SP_0_2;
	FILE *SP_0_3;
	FILE *SP_0_4;
	char title[300] = "SPM_test_";
	char title2;

	printf("Size of a die(int) : ");
	scanf("%llu", &amp;amp;input_n);
	printf("Perc. of interconnection(int) : ");
	scanf("%llu", &amp;amp;percentage);

	//sprintf(title2, "%llu", input_n);
	//title = title+title2;
	//puts(title);
	
	spRow = spColumn = input_n;	//size of 1-die of PDN
	spRC = spRow*spColumn;
	rate_of_intercon = percentage;		//percentage of interconnection
	total_ia = 4*(spRow*spRow)+1;
	total_ja = 4*3*spRow*spRow;
	//total_ja = 4*(3*(spRow-1)*(spRow-1)+2*2*(spRow-1)+1)+3*(spRow*(rate_of_intercon/100.0))pRow

	
	printf("Total number of the 'ia' is %llu and 'ja' is %llu\n", total_ia, total_ja);
	
	ia_data = (unsigned long long int*)malloc(sizeof(unsigned long long int)*total_ia);
	ja_data = (unsigned long long int*)malloc(sizeof(unsigned long long int)*total_ja);
	a_data = (double*)malloc(sizeof(double)*total_ja);
	division_index = (unsigned long long int*)malloc(sizeof(unsigned long long int)*5);
	
	ia_index = 0;
	ja_index = 0;
	ia_check = 1;
	division_index[0] = 0;
	ia_data[0] = 1;
	
	for (row=0; row&amp;lt;spRow; row++)		//1st-die
	{
		spRowrow = spRow*row;
		for (col=0; col&amp;lt;spColumn; col++)
		{
			spRowrowcol = spRowrow+(col+1);
			//printf("%llu %llu|", row, col);
			if (((row+1)==spRow) &amp;amp;&amp;amp; ((col+1)==spColumn))
			{

				division_index[1] = ja_index;
				a_data[ia_index] = 4;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
			}
			else if ((row+1)==spRow)
			{
				a_data[ia_index] = 4;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+1;
				ia_index++;
				ja_index++;
			}	
			else if ((col+1)==spColumn)
			{
				a_data[ia_index] = 4;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+spRow;
				ia_index++;
				ja_index++;
			}	
			else if (((row+1)==(spRow/2)) &amp;amp;&amp;amp; (((col+1)%(100/rate_of_intercon))==0))
			{
				//printf("row : %llu, column : %llu\n", row+1, col+1);
				a_data[ia_index] = 5;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+1;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+spRow;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+spRC;
				ia_index++;
				ja_index++;
			}
			else
			{	
				a_data[ia_index] = 4;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+1;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+spRow;
				ia_index++;
				ja_index++;
			}
			ia_data[ia_check] = ia_index+1;
			ia_check++;
		}
	}
	for (row=0; row&amp;lt;spRow; row++)		//2nd-die
	{
		spRowrow = spRC + spRow*row;
		for (col=0; col&amp;lt;spColumn; col++)
		{
			spRowrowcol = spRowrow+(col+1);
			if (((row+1)==spRow) &amp;amp;&amp;amp; ((col+1)==spColumn))
			{
				division_index[2] = ja_index;
				a_data[ia_index] = 4;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
			}
			else if ((row+1)==spRow)
			{
				a_data[ia_index] = 4;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+1;
				ia_index++;
				ja_index++;
			}	
			else if ((col+1)==spColumn)
			{
				a_data[ia_index] = 4;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+spRow;
				ia_index++;
				ja_index++;
			}	
			else if (((row+1)==(spRow/2)) &amp;amp;&amp;amp; (((col+1)%(100/rate_of_intercon))==0))
			{
				a_data[ia_index] = 6;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+1;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+spRow;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+spRC;
				ia_index++;
				ja_index++;
			}
			else
			{	
				a_data[ia_index] = 4;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+1;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+spRow;
				ia_index++;
				ja_index++;
			}
			ia_data[ia_check] = ia_index+1;
			ia_check++;
		}
	}
	for (row=0; row&amp;lt;spRow; row++)		//3rd-die
	{
		spRowrow = spRC*2 + spRow*row;
		for (col=0; col&amp;lt;spColumn; col++)
		{
			spRowrowcol = spRowrow+(col+1);
			if (((row+1)==spRow) &amp;amp;&amp;amp; ((col+1)==spColumn))
			{
				division_index[3] = ja_index;
				a_data[ia_index] = 4;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
			}
			else if ((row+1)==spRow)
			{
				a_data[ia_index] = 4;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+1;
				ia_index++;
				ja_index++;
			}	
			else if ((col+1)==spColumn)
			{
				a_data[ia_index] = 4;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+spRow;
				ia_index++;
				ja_index++;
			}	
			else if (((row+1)==(spRow/2)) &amp;amp;&amp;amp; (((col+1)%(100/rate_of_intercon))==0))
			{
				a_data[ia_index] = 6;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+1;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+spRow;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+spRC;
				ia_index++;
				ja_index++;
			}
			else
			{	
				a_data[ia_index] = 4;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+1;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+spRow;
				ia_index++;
				ja_index++;
			}
			ia_data[ia_check] = ia_index+1;
			ia_check++;
		}
	}
	for (row=0; row&amp;lt;spRow; row++)		//4th-die
	{
		spRowrow = spRC*3 + spRow*row;
		for (col=0; col&amp;lt;spColumn; col++)
		{
			spRowrowcol = spRowrow+(col+1);
			if (((row+1)==spRow) &amp;amp;&amp;amp; ((col+1)==spColumn))
			{
				division_index[4] = ja_index;
				a_data[ia_index] = 4;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
			}
			else if ((row+1)==spRow)
			{
				a_data[ia_index] = 4;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+1;
				ia_index++;
				ja_index++;
			}	
			else if ((col+1)==spColumn)
			{
				a_data[ia_index] = 4;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+spRow;
				ia_index++;
				ja_index++;
			}	
			else if (((row+1)==(spRow/2)) &amp;amp;&amp;amp; (((col+1)%(100/rate_of_intercon))==0))
			{
				a_data[ia_index] = 5;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+1;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+spRow;
				ia_index++;
				ja_index++;
			}
			else
			{	
				a_data[ia_index] = 4;
				ja_data[ja_index] = spRowrowcol;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+1;
				ia_index++;
				ja_index++;
				a_data[ia_index] = -1;
				ja_data[ja_index] = spRowrowcol+spRow;
				ia_index++;
				ja_index++;
			}
			ia_data[ia_check] = ia_index+1;
			ia_check++;
		}
	}
	
	a_data[0] = 1000.0;	//Voltage source

	//********************************************

	SP_0 = fopen("SPM_test_20000_whole.txt", "w");
	fprintf(SP_0, "%llu %llu\n", ia_check, ja_index);
	for (i=0; i&amp;lt;ia_check; i++)
		fprintf(SP_0, "%llu ", ia_data[i]);
	fprintf(SP_0, "\n");
	for (i=0; i&amp;lt;ja_index; i++)
		fprintf(SP_0, "%llu ", ja_data[i]);
	fprintf(SP_0, "\n");
	for (i=0; i&amp;lt;ja_index; i++)
		fprintf(SP_0, "%lf ", a_data[i]);
	fprintf(SP_0, "\n");

	//********************************************

	SP_0_1 = fopen("SPM_test_20000_n_1.txt", "w");
	SP_0_2 = fopen("SPM_test_20000_n_2.txt", "w");
	SP_0_3 = fopen("SPM_test_20000_n_3.txt", "w");
	SP_0_4 = fopen("SPM_test_20000_n_4.txt", "w");

	fprintf(SP_0_1, "%llu %llu\n", input_n*input_n+1, ia_data[input_n*input_n]-ia_data[0]);
	fprintf(SP_0_2, "%llu %llu\n", input_n*input_n+1, ia_data[input_n*input_n*2]-ia_data[input_n*input_n]);
	fprintf(SP_0_3, "%llu %llu\n", input_n*input_n+1, ia_data[input_n*input_n*3]-ia_data[input_n*input_n*2]);
	fprintf(SP_0_4, "%llu %llu\n", input_n*input_n+1, ia_data[input_n*input_n*4]-ia_data[input_n*input_n*3]);

	for (i=0; i&amp;lt;=input_n*input_n; i++)
		fprintf(SP_0_1, "%llu ", ia_data[i]);
	fprintf(SP_0_1, "\n");
	temp_1 = ia_data[input_n*input_n]-1;
	for (i=input_n*input_n; i&amp;lt;=input_n*input_n*2; i++)
		fprintf(SP_0_2, "%llu ", ia_data[i]-temp_1);
	fprintf(SP_0_2, "\n");
	temp_1 = ia_data[input_n*input_n*2]-1;
	for (i=input_n*input_n*2; i&amp;lt;=input_n*input_n*3; i++)
		fprintf(SP_0_3, "%llu ", ia_data[i]-temp_1);
	fprintf(SP_0_3, "\n");
	temp_1 = ia_data[input_n*input_n*3]-1;
	for (i=input_n*input_n*3; i&amp;lt;=input_n*input_n*4; i++)
		fprintf(SP_0_4, "%llu ", ia_data[i]-temp_1);
	fprintf(SP_0_4, "\n");

	for (i=ia_data[0]-1; i&amp;lt;ia_data[input_n*input_n]-1; i++)
		fprintf(SP_0_1, "%llu ", ja_data[i]);
	fprintf(SP_0_1, "\n");
	for (i=ia_data[input_n*input_n]-1; i&amp;lt;ia_data[input_n*input_n*2]-1; i++)
		fprintf(SP_0_2, "%llu ", ja_data[i]);
	fprintf(SP_0_2, "\n");
	for (i=ia_data[input_n*input_n*2]-1; i&amp;lt;ia_data[input_n*input_n*3]-1; i++)
		fprintf(SP_0_3, "%llu ", ja_data[i]);
	fprintf(SP_0_3, "\n");
	for (i=ia_data[input_n*input_n*3]-1; i&amp;lt;ia_data[input_n*input_n*4]-1; i++)
		fprintf(SP_0_4, "%llu ", ja_data[i]);
	fprintf(SP_0_4, "\n");

	for (i=ia_data[0]-1; i&amp;lt;ia_data[input_n*input_n]-1; i++)
		fprintf(SP_0_1, "%lf ", a_data[i]);
	fprintf(SP_0_1, "\n");
	for (i=ia_data[input_n*input_n]-1; i&amp;lt;ia_data[input_n*input_n*2]-1; i++)
		fprintf(SP_0_2, "%lf ", a_data[i]);
	fprintf(SP_0_2, "\n");
	for (i=ia_data[input_n*input_n*2]-1; i&amp;lt;ia_data[input_n*input_n*3]-1; i++)
		fprintf(SP_0_3, "%lf ", a_data[i]);
	fprintf(SP_0_3, "\n");
	for (i=ia_data[input_n*input_n*3]-1; i&amp;lt;ia_data[input_n*input_n*4]-1; i++)
		fprintf(SP_0_4, "%lf ", a_data[i]);
	fprintf(SP_0_4, "\n");

	//********************************************
	
	a_data[division_index[1]] = a_data[division_index[1]]/2.0;
	a_data[division_index[2]] = a_data[division_index[2]]/2.0;
	a_data[division_index[3]] = a_data[division_index[3]]/2.0;

	SP_1 = fopen("SPM_test_20000_1.txt", "w");
	fprintf(SP_1, "%llu %llu %llu %llu\n", spRC+1, division_index[1]-division_index[0]+1, 1, spRC);
	for (i=0; i&amp;lt;=spRC; i++)
		fprintf(SP_1, "%llu ", ia_data[i]);
	//fprintf(SP_1, "%llu ", ia_data[i]);
	fprintf(SP_1, "\n");
	for (i=division_index[0]; i&amp;lt;=division_index[1]; i++)
		fprintf(SP_1, "%llu ", ja_data[i]);
	fprintf(SP_1, "\n");
	for (i=division_index[0]; i&amp;lt;=division_index[1]; i++)
		fprintf(SP_1, "%lf ", a_data[i]);
	fprintf(SP_1, "\n");

	SP_2 = fopen("SPM_test_20000_2.txt", "w");
	fprintf(SP_2, "%llu %llu %llu %llu\n", spRC+2, division_index[2]-division_index[1]+1, spRC, spRC*2);
	for (i=spRC-1; i&amp;lt;=spRC*2; i++)
		fprintf(SP_2, "%llu ", ia_data[i]-ia_data[spRC-1]+1);
	//fprintf(SP_2, "%llu ", ia_data[i]-ia_data[spRC-1]+1);
	fprintf(SP_2, "\n");
	for (i=division_index[1]; i&amp;lt;=division_index[2]; i++)
		fprintf(SP_2, "%llu ", ja_data[i]);
	fprintf(SP_2, "\n");
	for (i=division_index[1]; i&amp;lt;=division_index[2]; i++)
		fprintf(SP_2, "%lf ", a_data[i]);
	fprintf(SP_2, "\n");

	SP_3 = fopen("SPM_test_20000_3.txt", "w");
	fprintf(SP_3, "%llu %llu %llu %llu\n", spRC+2, division_index[3]-division_index[2]+1, spRC*2, spRC*3);
	for (i=spRC*2-1; i&amp;lt;=spRC*3; i++)
		fprintf(SP_3, "%llu ", ia_data[i]-ia_data[spRC*2-1]+1);
	//fprintf(SP_3, "%llu ", ia_data[i]-ia_data[spRC*2-1]+1);
	fprintf(SP_3, "\n");
	for (i=division_index[2]; i&amp;lt;=division_index[3]; i++)
		fprintf(SP_3, "%llu ", ja_data[i]);
	fprintf(SP_3, "\n");
	for (i=division_index[2]; i&amp;lt;=division_index[3]; i++)
		fprintf(SP_3, "%lf ", a_data[i]);
	fprintf(SP_3, "\n");

	SP_4 = fopen("SPM_test_20000_4.txt", "w");
	fprintf(SP_4, "%llu %llu %llu %llu\n", spRC+2, division_index[4]-division_index[3]+1, spRC*3, spRC*4);
	for (i=spRC*3-1; i&amp;lt;=spRC*4; i++)
		fprintf(SP_4, "%llu ", ia_data[i]-ia_data[spRC*3-1]+1);
	//fprintf(SP_4, "%llu ", ia_data[i]-ia_data[spRC*3-1]+1);
	fprintf(SP_4, "\n");
	for (i=division_index[3]; i&amp;lt;=division_index[4]; i++)
		fprintf(SP_4, "%llu ", ja_data[i]);
	fprintf(SP_4, "\n");
	for (i=division_index[3]; i&amp;lt;=division_index[4]; i++)
		fprintf(SP_4, "%lf ", a_data[i]);
	fprintf(SP_4, "\n");

	//********************************************

	fclose(SP_0);
	fclose(SP_1);
	fclose(SP_2);
	fclose(SP_3);
	fclose(SP_4);
	fclose(SP_0_1);
	fclose(SP_0_2);
	fclose(SP_0_3);
	fclose(SP_0_4);

	free(ia_data);
	free(ja_data);
	free(a_data);
	free(division_index);

}
&lt;/PRE&gt;

&lt;P&gt;&amp;nbsp;&lt;/P&gt;

&lt;P&gt;&amp;nbsp;&lt;/P&gt;

&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Mon, 13 Feb 2017 01:16:00 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Large-sparse-matrix-solving-problem-with-cluster/m-p/1089786#M23169</guid>
      <dc:creator>YONGHEE_L_</dc:creator>
      <dc:date>2017-02-13T01:16:00Z</dc:date>
    </item>
  </channel>
</rss>

