Copy the first array to a second array at a regular loop stride, and run parallel loops over arrays of different lengths

bo__john · ‎01-30-2020

I want to copy the first array into a second array at a regular loop stride, like this:

 

// Global tables used by the snippets below. struct myStruct is declared
// elsewhere; the code in this post only shows a string member `a` (written
// with strcpy) and an int member `id`.
struct myStruct dt_v1[10];   // source table with 10 entries, filled by abc1()
struct myStruct dt_v2[10];   // second 10-entry table
struct myStruct dt_v13[3];   // three 3-entry destination tables
struct myStruct dt_v14[3];
struct myStruct dt_v15[3];



// Loads the ten sample strings into dt_v1[0..9].a, in order.
void abc1(){
    static const char* const kSamples[] = {
        "102755703", "ab10", "cd10", "13", "aa5",
        "aa184", "1", "&&13", "%%14", "!!1",
    };

    int slot = 0;
    for (const char* text : kSamples) {
        strcpy(dt_v1[slot].a, text);
        ++slot;
    }
}

int i, j1, j2, j3, k;


// This loop belongs inside main(). It splits dt_v1 into consecutive triples:
// group k supplies dt_v13[k], dt_v14[k] and dt_v15[k].
//
// Reported symptom: the code compiles, but the run fails once the amount of
// data grows (the real data set has more than 1000 entries).
//
// dt_v1 holds 10 entries and each destination table holds 3, so only the three
// complete triples (source indices 0..8) are copied. The earlier bound of 10
// groups read dt_v1 up to index 29, past the end of the array, and the
// destination tables were used without an element index.
for(i=0; i<3; i++)
{
    j1 = i*3;      // first element of the triple
    j2 = j1+1;     // second element
    j3 = j1+2;     // third element
    k  = i;        // destination slot (equal to j1/3)
    strcpy(dt_v13[k].a, dt_v1[j1].a);
    strcpy(dt_v14[k].a, dt_v1[j2].a);
    strcpy(dt_v15[k].a, dt_v1[j3].a);
}


// This loop runs in main(): three iterations, each printing member `a`
// followed by a newline.
// NOTE(review): dt_v3 is not declared in the snippets shown, and no element
// index is applied here — presumably dt_v3[i].a was intended; confirm.
 for (int i = 0; i < 3; i++) {
    std::cout << dt_v3.a <<std::endl;
  }

my last code and result

// Driver for the run whose log follows: initialises dt_v1, runs the device
// pass, then prints one line per loop iteration.
// NOTE(review): j = i*25 reaches 41250 at i = 1650, while dpcpp_parallel()
// wraps only 1650 elements of dt_v1. If j is the intended index of the print
// below, it leaves the table once i >= 66. The print as shown applies no index
// at all (presumably lost in the post); confirm.
int main(){

 
   
     int j, test1;   // test1 is never used in this snippet
     
     
 

    abc(); // Initialisation for dt_v1[] (NOTE(review): the initialiser shown in this post is named abc1())
 
    dpcpp_parallel();    
 
     
      for (int i = 1; i < 1651; i++) {
		  j=i*25;   // stride of 25 entries per iteration
    std::cout << dt_v1.a <<std::endl;
 
  }
    return 0;
}

 

u35272@login-2:~/exc/dpc1/dpc_5$ cat *o*917

########################################################################
#      Date:           Wed Jan 29 12:59:16 PST 2020
#    Job ID:           477917.v-qsvr-1.aidevcloud
#      User:           u35272
# Resources:           neednodes=2:gpu:ppn=2,nodes=2:gpu:ppn=2,walltime=06:00:00
########################################################################

./t29
Intel(R) Gen9 HD Graphics NEO
102809408
102755703
102772414
102756988
102782321
102755748
102743577
102824987
102750121
102784113
102752389
102834161
102818434
102829292
102813096
102744544
102838772
102847134
102738468
102740481
102844083
102752193
102757704
102822715
102815659
102807175
102801481
102760240
102744845
102746524
102750141
102812579
102813198
102817848
102775056
102826594
102738494
102753935
102761366
102745783
102797165
102749800
102754511
102843550
102749845
102805684
102739925
102822741
102825670
102834045
102771447
102749888
102781802
102834752
102829958
102990200
102782250
102756310
102831321
102827544
102820133
102754463
102829326
102752257
102738743

Makefile:25: recipe for target 'run_dpcpp' failed

########################################################################
# End of output for job 477917.v-qsvr-1.aidevcloud
# Date: Wed Jan 29 12:59:19 PST 2020
########################################################################

then

First, copy dt_v1[0].a and dt_v1[5].a into a new array as dt_v3[0].a and dt_v3[1].a, then convert them to integers with atoi,

then perform an operation on the resulting integer array,

for example an arithmetic operation such as a sum stored into a new array (C[index] = sum), and show where those arrays can be printed with cout.

Please give an example like this:

        // Submit a command group to the queue that computes the matrix product c = a * b.
        device_queue.submit([&](handler &cgh){
            // Read from a and b, write to c
            auto A = a.get_access<access::mode::read>(cgh);
            auto B = b.get_access<access::mode::read>(cgh);
            auto C = c.get_access<access::mode::write>(cgh);

            // Extent of a's second dimension: the length of each dot product.
            int WidthA = a.get_range()[1];

            // Executing kernel: one work-item per element of the M x P range
            cgh.parallel_for<class MatrixMult>(range<2>{M, P}, [=](id<2> index){
                // Get global position in Y direction (row of c and of a)
                int row = index[0];
                // Get global position in X direction (column of c and of b)
                int col = index[1];

                double sum = 0.0;
                // Compute the result of one element in c: the dot product of
                // row `row` of a with column `col` of b. The inner index i
                // must subscript both operands; without it the loop never
                // uses i and multiplies whole accessor rows.
                for (int i = 0; i < WidthA; i++) {
                    sum += A[row][i] * B[i][col];
                }

                C[index] = sum;
            });

        });
    }    //End of scope, so we wait for kernel producing result data to host memory c_back to complete

And if I run parallel loops over arrays of different lengths, I would use kernels like the ones below:

// Matrix size constants
// Each derived macro is parenthesized so that it expands safely inside larger
// expressions (e.g. `x / M` divides by 150, not by SIZE and then by 8).
#define SIZE     1200     // Must be a multiple of 8.
#define M        (SIZE/8)
#define N        (SIZE/4)
#define P        (SIZE/2)

     // Submitting command group to queue to initialize matrix a
        device_queue.submit([&](handler &cgh) {
            // Getting write only access to the buffer on a device
            auto Accessor = a.get_access<access::mode::write>(cgh);
            // Executing kernel: one work-item per element of the M x N range
            cgh.parallel_for<class FillBuffer_a>( range<2>{M, N}, [=](id<2> index) {
                // Every element of a is set to 1.0 (a matrix of ones, not an identity matrix)
                Accessor[index] = 1.0;
            });
        });
    
        //Submitting command group to queue to initialize matrix b
        device_queue.submit([&](handler &cgh) {
            // Getting write only access to the buffer on a device
            auto Accessor = b.get_access<access::mode::write>(cgh);
            //Executing kernel: one work-item per element of the N x P range
            cgh.parallel_for<class FillBuffer_b>( range<2>{N, P}, [=](id<2> index){
	        // index[0] is the first-dimension position, so each column of b is the sequence 1,2,...,N
                Accessor[index] = index[0] + 1.;
            });    
        });

Please give a complete code example.

Thank You!

GouthamK_Intel · ‎01-30-2020

Hi John,

We couldn't understand the exact problem which you are facing. Can you please give some more clarity on the issue which you are facing.

If possible can you share your complete source code and also .e*** file (error file) which is produced along with .o**** file(output file) in devcloud. So that we can investigate more on this issue.

Thanks

Goutham

GouthamK_Intel · ‎02-04-2020

Hi John,

Can you please elaborate more on the issue which you are facing. Are you looking for a way to parallelize the copying of the first array to the second array? or Is there any other issue you are facing?

Thanks

Goutham

bo__john · ‎02-05-2020

Hi, Goutham!

Sorry so late reply to you!

I mean how to do it fast.

u35272@login-2:~/exc/dpc1/dpc_5$ cat *e*741
=>> PBS: job killed: walltime 21617 exceeded limit 21600

I have a program with an array of about 1.8 million 4-character strings, but on DevCloud it runs for more than 6 hours (21600 seconds), so the system kills it.

What I am asking is which is faster: keeping the loop in main(), or moving all of the array-copy work into the parallel loop, as in the example I gave last time?

And how can I keep the program's run time under 6 hours? In your experience, what is the maximum number of array entries that is practical?

Last time I had the wrong array size and loop count; I have fixed that problem myself.

This is new program without mistake.

Program like that :

// Wraps the first 1650 entries of dt_v1 in a SYCL buffer and launches a kernel
// with 1650 work-items on the default-selected device, printing the device
// name first. As shown, the kernel only fetches a pointer to member `a` and the
// value of member `id` for its element and does nothing with them: it is a
// template into which the per-element logic is meant to be written.
// NOTE(review): the accessor is requested in write mode, yet the kernel reads
// myAcc1->id — read_write is presumably what is wanted; confirm.
void dpcpp_parallel(){
    // ---------SYCL SCOPE STARTS------------
    {
        default_selector device_selector;
        queue device_queue(device_selector);
        cout<<device_queue.get_device().get_info<info::device::name>()<<std::endl;  //print name of the device it is running on.
        buffer<struct myStruct,1> buff_dt_v1(dt_v1,range<1>{1650});
   //      buffer<struct myStruct,1> buff_dt_v2(dt_v2,range<1>{1650/25});
        device_queue.submit([&](handler &cgh){        
            auto acc_dt_v1 =buff_dt_v1.get_access<access::mode::write>(cgh);        
       //     auto acc_dt_v2 =buff_dt_v2.get_access<access::mode::write>(cgh);
            // One work-item per wrapped entry; `index` selects the entry.
            cgh.parallel_for<class StructClass>(range<1>{1650},[=](id<1> index){
             // Pointer to this work-item's element inside the accessor
             struct myStruct* myAcc1=(struct myStruct*)(&acc_dt_v1[index]);
           //  struct myStruct* myAcc2=(struct myStruct*)(&acc_dt_v2[index]);
            //**************your code logic starts from here**************************
                // to access array "a" use this    
                     char* myArray1=myAcc1->a;
                //     char* myArray2=myAcc2->a;
                // To access int "id" use this
                    int myId=myAcc1->id;
            });
        });
    }   // end of the SYCL scope opened above
}

// Writes the helper script "w1_bs2_2.sh" in the current directory. The script
// consists of a bash shebang, the command `echo > seq1/<str>`, and `exit 0`.
//
// str: text placed after "seq1/" in the script. It is pasted into the script
//      unquoted, so it must not contain shell metacharacters.
//      NOTE(review): callers pass table data here — confirm it is trusted.
//
// If the script cannot be opened for writing, the error is reported with
// perror() and nothing is written.
void f3(string str)
{
    FILE *mf = fopen("w1_bs2_2.sh", "w");
    if (mf == nullptr) {
        perror("w1_bs2_2.sh");
        return;
    }

    fprintf(mf, "#!/bin/bash\n");
    // %s expects a pointer to a C string; a std::string object is not a valid
    // argument for it, so hand over c_str().
    fprintf(mf, "echo > seq1/%s\n", str.c_str());
    fprintf(mf, "exit 0\n");

    fclose(mf);
}

 
// Driver: initialises dt_v1, runs the device pass, then for i = 1..65 prints an
// entry, copies it into dt_v2, regenerates the helper script through f3() and
// executes it.
// The loop counts to 1650 but the body is guarded by i < 66, so only 65
// iterations do any work. Each of them rewrites the script file and launches
// two commands through system().
// NOTE(review): j is computed but not used in the code as shown, and
// dt_v1 / dt_v2 are used without an element index — the subscripts were
// presumably lost in the post; confirm the intended indices.
int main(){


   
     int j, test1;   // test1 is never used in this snippet
     
     

    abc(); // Initialisation for dt_v1[]
 
    dpcpp_parallel();    
 
   
       
      for (int i = 1; i < 1651; i++) {
          if(i<66)
          {
          j=(i*25)+1;   // one entry out of every 25, offset by one
    std::cout << dt_v1.a <<std::endl;
    
        strcpy(dt_v2.a, dt_v1.a);
        
         std::cout << "dt2\t" << dt_v2.a <<std::endl;
        
     // Regenerate w1_bs2_2.sh for this entry
     f3(dt_v2.a);

 
   // Make every *.sh in the working directory executable, then run the script
   system("chmod 777 *.sh");
   system("./w1_bs2_2.sh");
        
 }
  }
 
    return 0;
}



// Driver (repeated in the post): initialises dt_v1, runs the device pass, then
// for i = 1..65 prints an entry, copies it into dt_v2, regenerates the helper
// script through f3() and executes it.
// The loop counts to 1650 but the body is guarded by i < 66, so only 65
// iterations do any work. Each of them rewrites the script file and launches
// two commands through system().
// NOTE(review): j is computed but not used in the code as shown, and
// dt_v1 / dt_v2 are used without an element index — the subscripts were
// presumably lost in the post; confirm the intended indices.
int main(){


   
     int j, test1;   // test1 is never used in this snippet
     
     
 

    abc(); // Initialisation for dt_v1[]
 
    dpcpp_parallel();    
 
   
       
      for (int i = 1; i < 1651; i++) {
		  if(i<66)
		  {
		  j=(i*25)+1;   // one entry out of every 25, offset by one
    std::cout << dt_v1.a <<std::endl;
    
    	strcpy(dt_v2.a, dt_v1.a);
    	
    	 std::cout << "dt2\t" << dt_v2.a <<std::endl;
    	 
     // Regenerate w1_bs2_2.sh for this entry
     f3(dt_v2.a);

  
   // Make every *.sh in the working directory executable, then run the script
   system("chmod 777 *.sh");
   system("./w1_bs2_2.sh");
    	 
 }
  }
  
    return 0;
}

I want to move the for loop from main() into the function "dpcpp_parallel()"; would that make the program faster?

This is an example of putting the array operations inside the parallel loop of the function. I tried this in some programs, but it failed.

// Matrix size constants
// Each derived macro is parenthesized so that it expands safely inside larger
// expressions (e.g. `x / M` divides by 150, not by SIZE and then by 8).
#define SIZE     1200     // Must be a multiple of 8.
#define M        (SIZE/8)
#define N        (SIZE/4)
#define P        (SIZE/2)

     // Submitting command group to queue to initialize matrix a
        device_queue.submit([&](handler &cgh) {
            // Getting write only access to the buffer on a device
            auto Accessor = a.get_access<access::mode::write>(cgh);
            // Executing kernel: one work-item per element of the M x N range
            cgh.parallel_for<class FillBuffer_a>( range<2>{M, N}, [=](id<2> index) {
                // Every element of a is set to 1.0 (a matrix of ones, not an identity matrix)
                Accessor[index] = 1.0;
            });
        });
    
        //Submitting command group to queue to initialize matrix b
        device_queue.submit([&](handler &cgh) {
            // Getting write only access to the buffer on a device
            auto Accessor = b.get_access<access::mode::write>(cgh);
            //Executing kernel: one work-item per element of the N x P range
            cgh.parallel_for<class FillBuffer_b>( range<2>{N, P}, [=](id<2> index){
	        // index[0] is the first-dimension position, so each column of b is the sequence 1,2,...,N
                Accessor[index] = index[0] + 1.;
            });    
        });

Thanks a lot!

John

GouthamK_Intel · ‎02-06-2020

Hi John,

We tried running your code snippet in DevCloud and we didn't face any problem while copying the data from one array to another array for 1.8 Million data items.

but we observed that you are doing some file I/O operations inside for loop. as quoted below.

void f3(string str)
027{
028
029  FILE *mf;
030
031     mf=fopen("w1_bs2_2.sh", "w");
032         fprintf(mf,"#!/bin/bash\n");
033         fprintf(mf, "echo > seq1/%s\n", str );
034         fprintf(mf, "exit 0\n");
035
036        fclose(mf);
037}

and we also observed that you are trying to run a script file inside for loop which may have taken more time to execute, resulting in your code execution time exceed in DevCloud.

107 system("./w1_bs2_2.sh");

Can you please provide time taken to execute for below codes individually? So that we can investigate more and provide you a better solution.

087 abc(); // Initialisation for dt_v1[]

089 dpcpp_parallel();

  for (int i = 1; i < 1651; i++) {
094          if(i<66)
095          {
096          j=(i*25)+1;
097    std::cout << dt_v1.a <<std::endl;
098
099        strcpy(dt_v2.a, dt_v1.a);
100
101         std::cout << "dt2\t" << dt_v2.a <<std::endl;
102
103     f3(dt_v2.a);
104
105
106   system("chmod 777 *.sh");
107   system("./w1_bs2_2.sh");
108
109 }
110  }

Thanks

Goutham

bo__john · ‎02-06-2020

This is the code for the array with about 1.7 million entries:

strcpy(dic_1[1675730].a, "4EBA");
strcpy(dic_1[1675731].a, "7684");
strcpy(dic_1[1675732].a, "7956");
strcpy(dic_1[1675733].a, "9C81");
strcpy(dic_1[1675734].a, "8BED");
strcpy(dic_1[1675735].a, "7684");

 }




// Wraps the 1675736 entries of dic_1 in a SYCL buffer and launches a kernel
// with one work-item per entry on the default-selected device, printing the
// device name first. As shown, the kernel only fetches a pointer to member `a`
// and the value of member `id` for its element and does nothing with them: it
// is a template into which the per-element logic is meant to be written.
// NOTE(review): the accessor is requested in write mode, yet the kernel reads
// myAcc1->id — read_write is presumably what is wanted; confirm.
void dpcpp_parallel(){ 
    // ---------SYCL SCOPE STARTS------------
    {
        default_selector device_selector; 
        queue device_queue(device_selector);
        cout<<device_queue.get_device().get_info<info::device::name>()<<std::endl;  //print name of the device it is running on.
        buffer<struct myStruct,1> buff_dic_1(dic_1,range<1>{1675736});
   //      buffer<struct myStruct,1> buff_dt_v2(dt_v2,range<1>{1650/25});
        device_queue.submit([&](handler &cgh){        
            auto acc_dic_1 =buff_dic_1.get_access<access::mode::write>(cgh);        
       //     auto acc_dt_v2 =buff_dt_v2.get_access<access::mode::write>(cgh); 
            // One work-item per wrapped entry; `index` selects the entry.
            cgh.parallel_for<class StructClass>(range<1>{1675736},[=](id<1> index){
             // Pointer to this work-item's element inside the accessor
             struct myStruct* myAcc1=(struct myStruct*)(&acc_dic_1[index]);
           //  struct myStruct* myAcc2=(struct myStruct*)(&acc_dt_v2[index]);
            //**************your code logic starts from here**************************
                // to access array "a" use this    
                     char* myArray1=myAcc1->a;
				//	 char* myArray2=myAcc2->a;
                // To access int "id" use this
                    int myId=myAcc1->id;
            });
        });
    }   // end of the SYCL scope opened above
}



// Rewrites the helper script "w1_bs2_2.sh" in the current directory so that it
// contains a bash shebang, the command `echo > seq1/<str1>`, and `exit 0`.
void f3(string str1)
{
    ofstream script("w1_bs2_2.sh");

    script << "#!/bin/bash\n"
           << "echo > seq1/" << str1 << std::endl
           << "exit 0\n";

    script.close();
}

 
// Entry point: calls dt_dic1() (presumably the routine that fills dic_1[]),
// runs the device pass, then for every one of the 1675736 entries regenerates
// the helper script through f3() and executes it.
//
// NOTE: each iteration rewrites a file and launches two commands through
// system(), i.e. more than three million command launches in total. This is
// most likely where the job's wall-clock time goes.
int main(){

    dt_dic1();

    dpcpp_parallel();

    for (int i = 0; i < 1675736; i++) {
        // Pass the i-th entry: dic_1 is an array, so the bare name has no
        // member `a` and the loop index has to be applied.
        f3(dic_1[i].a);

        // Make every *.sh in the working directory executable, then run the script
        system("chmod 777 *.sh");
        system("./w1_bs2_2.sh");
    }

    return 0;
}

This is the result file

########################################################################
#      Date:           Thu Jan 30 14:54:06 PST 2020
#    Job ID:           478741.v-qsvr-1.aidevcloud
#      User:           u35272
# Resources:           neednodes=2:gpu:ppn=2,nodes=2:gpu:ppn=2,walltime=06:00:00
########################################################################

dpcpp -o t34 src/t34.cpp 

########################################################################
# End of output for job 478741.v-qsvr-1.aidevcloud
# Date: Thu Jan 30 20:54:23 PST 2020
########################################################################

and e file

cat *e*741
=>> PBS: job killed: walltime 21617 exceeded limit 21600

I will try to rewrite the code so that it operates on the array without disk input/output and without calling the system() function.

Thank You very much!

RahulV_intel · ‎02-17-2020

Hi,

Thanks for the confirmation. We are closing this thread. Feel free to reach out to us, in case of any more issues.