Intel® oneAPI DPC++/C++ Compiler
Talk to fellow users of Intel® oneAPI DPC++/C++ Compiler and companion tools like Intel® oneAPI DPC++ Library, Intel® DPC++ Compatibility Tool, and Intel® Distribution for GDB*

Copying the first array to a second array at a regular stride, and running parallel loops over arrays of different lengths

bo__john
Beginner
1,419 Views

I want to copy elements of the first array into a second array at a regular interval (stride), like this:

 

// Example arrays used by the snippets below. `struct myStruct` is declared
// elsewhere; the code here fills its member `a` with strcpy.
struct myStruct dt_v1[10];   // 10 elements
struct myStruct dt_v2[10];   // 10 elements
struct myStruct dt_v13[3];   // 3 elements
struct myStruct dt_v14[3];   // 3 elements
struct myStruct dt_v15[3];   // 3 elements



// Fills dt_v1[0] .. dt_v1[9] with the ten sample strings, in order.
void abc1(){
    static const char* const kSamples[] = {
        "102755703", "ab10", "cd10", "13", "aa5",
        "aa184", "1", "&&13", "%%14", "!!1",
    };

    int slot = 0;
    for (const char* text : kSamples) {
        strcpy(dt_v1[slot].a, text);
        ++slot;
    }
}

int i, j1, j2, j3, k;


// I put this loop in main() but compile ok,  run failed for a large number, 
//it only running more than 100, but my dat is more than 1000++

for(i=0; i<10; i++)
{
	j1 =i*3;
    j2 = j1+1;
    j3 = j1+2;
    k=j1/3;
strcpy(dt_v13.a, dt_v1[j1].a);
strcpy(dt_v14.a, dt_v1[j2].a);
strcpy(dt_v15.a, dt_v1[j3].a);

}


// This loop goes in main(): prints the first three elements of the result array.
// NOTE(review): the subscript was missing in the posted code (`dt_v3.a`), which
// does not compile for an array; [i] is presumed. `dt_v3` itself is not declared
// in this snippet (only dt_v13/dt_v14/dt_v15 are) - confirm the intended name.
 for (int i = 0; i < 3; i++) {
    std::cout << dt_v3[i].a <<std::endl;
  }

 

 

 

my last code and result

 

 

 

// Entry point: initialises dt_v1, runs the SYCL kernel over it, then prints
// every 25th element of dt_v1.
int main(){
    // NOTE(review): assumes dt_v1 holds 1650 elements (the posted loop bound was
    // 1651) - confirm against the declaration.
    const int kElementCount = 1650;
    const int kStride = 25;

    abc(); // Initialisation for dt_v1[]

    dpcpp_parallel();

    // The posted loop ran i = 1..1650 with j = i*25, so j left the array from
    // i = 66 onwards; the job log posted with this code shows 65 values followed
    // by a failure. Stepping j directly and stopping at the array size keeps
    // every access in range.
    // NOTE(review): the subscript was missing in the posted code (`dt_v1.a`);
    // [j] is presumed because j was computed and otherwise unused - confirm.
    for (int j = kStride; j < kElementCount; j += kStride) {
        std::cout << dt_v1[j].a <<std::endl;
    }
    return 0;
}

 

 

  

 

u35272@login-2:~/exc/dpc1/dpc_5$ cat *o*917

########################################################################
#      Date:           Wed Jan 29 12:59:16 PST 2020
#    Job ID:           477917.v-qsvr-1.aidevcloud
#      User:           u35272
# Resources:           neednodes=2:gpu:ppn=2,nodes=2:gpu:ppn=2,walltime=06:00:00
########################################################################

./t29
Intel(R) Gen9 HD Graphics NEO
102809408
102755703
102772414
102756988
102782321
102755748
102743577
102824987
102750121
102784113
102752389
102834161
102818434
102829292
102813096
102744544
102838772
102847134
102738468
102740481
102844083
102752193
102757704
102822715
102815659
102807175
102801481
102760240
102744845
102746524
102750141
102812579
102813198
102817848
102775056
102826594
102738494
102753935
102761366
102745783
102797165
102749800
102754511
102843550
102749845
102805684
102739925
102822741
102825670
102834045
102771447
102749888
102781802
102834752
102829958
102990200
102782250
102756310
102831321
102827544
102820133
102754463
102829326
102752257
102738743

Makefile:25: recipe for target 'run_dpcpp' failed

########################################################################
# End of output for job 477917.v-qsvr-1.aidevcloud
# Date: Wed Jan 29 12:59:19 PST 2020
########################################################################

 

 

then

First, copy dt_v1[0].a and dt_v1[5].a into a new array as dt_v3[0].a and dt_v3[1].a, and convert them to integers with atoi.

Then perform operations on the resulting integer array,

for example an arithmetic operation such as a sum stored into a new array (C[index] = sum), and show where those arrays can be printed with cout.

Please give an example like this:

 

 

 

 

        // Submit a command group that computes the matrix product c = a * b.
        device_queue.submit([&](handler &cgh){
            // Read from a and b, write to c
            auto A = a.get_access<access::mode::read>(cgh);
            auto B = b.get_access<access::mode::read>(cgh);
            auto C = c.get_access<access::mode::write>(cgh);

            // Extent of a's second dimension: the length of each dot product.
            int WidthA = a.get_range()[1];

            // One work-item per element of c.
            cgh.parallel_for<class MatrixMult>(range<2>{M, P}, [=](id<2> index){
	        // Position of this work-item in the first (row) dimension
	        int row = index[0];
	        // Position of this work-item in the second (column) dimension
	        int col = index[1];

	        double sum = 0.0;
	        // Dot product of row `row` of a with column `col` of b. The posted
	        // code read `A[row] * B[col]`; the inner subscript [i] was missing,
	        // so the loop variable was never used.
	        for (int i = 0; i < WidthA; i++) {
	            sum += A[row][i] * B[i][col];
	        }

	        C[index] = sum;
            });

        });
    }    //End of scope, so we wait for kernel producing result data to host memory c_back to complete
   

 

 

 

And if I run parallel loops over arrays of different lengths, should I use functions like the ones below?

 

 

 

 

// Matrix size constants.
// Each derived value is parenthesised so the macro expands to a single operand;
// without the parentheses an expression such as `x / M` would expand to
// `x / SIZE/8` and evaluate as `(x / SIZE) / 8`.
#define SIZE     1200     // Must be a multiple of 8.
#define M        (SIZE/8)
#define N        (SIZE/4)
#define P        (SIZE/2)

     // Command group 1: set every element of matrix a to 1.0.
        device_queue.submit([&](handler &cgh) {
            // Write-only access to buffer a
            auto a_out = a.get_access<access::mode::write>(cgh);
            // One work-item per element over an M x N range
            cgh.parallel_for<class FillBuffer_a>( range<2>{M, N}, [=](id<2> cell) {
                a_out[cell] = 1.0;
            });
        });

        // Command group 2: fill matrix b from the row index.
        device_queue.submit([&](handler &cgh) {
            // Write-only access to buffer b
            auto b_out = b.get_access<access::mode::write>(cgh);
            // One work-item per element over an N x P range
            cgh.parallel_for<class FillBuffer_b>( range<2>{N, P}, [=](id<2> cell){
                // The value depends only on the first index, so every column
                // of b holds the sequence 1, 2, ..., N.
                b_out[cell] = cell[0] + 1.;
            });
        });

 

 

 

 

 

Please give a complete code example.

 

Thank You!

0 Kudos
6 Replies
GouthamK_Intel
Moderator
1,419 Views

Hi John,

We could not understand the exact problem you are facing. Could you please describe the issue in more detail?

If possible, please share your complete source code and also the .e*** file (error file) that is produced along with the .o**** file (output file) in DevCloud, so that we can investigate this issue further.

 

Thanks

Goutham

0 Kudos
GouthamK_Intel
Moderator
1,419 Views

Hi John,

Can you please elaborate more on the issue which you are facing. Are you looking for a way to parallelize the copying of the first array to the second array? or Is there any other issue you are facing?

 

Thanks

Goutham

0 Kudos
bo__john
Beginner
1,419 Views

Hi, Goutham!

Sorry for the late reply!

 

   I mean how to do it fast.

 

u35272@login-2:~/exc/dpc1/dpc_5$ cat *e*741
=>> PBS: job killed: walltime 21617 exceeded limit 21600

 

I have a program with an array of about 1.8 million data strings of 4 characters each, but it ran on DevCloud for more than 6 hours (21600 seconds), so it was killed by the system.

 

My question is: should I put the loop in main(), or put all of the array-copy operations inside the parallel loop, as in the example I gave last time? Which is faster?

 

And how can I keep the program's run time under 6 hours? In your experience, what is the maximum number of array entries?

Last time I used the wrong total count for the array and the loop; I have fixed that problem myself.

This is the new program, without that mistake.

Program like that :

  • void dpcpp_parallel(){
        // ---------SYCL SCOPE STARTS------------
        {
            default_selector device_selector;
            queue device_queue(device_selector);
            cout<<device_queue.get_device().get_info<info::device::name>()<<std::endl;  //print name of the device it is running on.
            buffer<struct myStruct,1> buff_dt_v1(dt_v1,range<1>{1650});
       //      buffer<struct myStruct,1> buff_dt_v2(dt_v2,range<1>{1650/25});
            device_queue.submit([&](handler &cgh){        
                auto acc_dt_v1 =buff_dt_v1.get_access<access::mode::write>(cgh);        
           //     auto acc_dt_v2 =buff_dt_v2.get_access<access::mode::write>(cgh);
                cgh.parallel_for<class StructClass>(range<1>{1650},[=](id<1> index){
                 struct myStruct* myAcc1=(struct myStruct*)(&acc_dt_v1[index]);
               //  struct myStruct* myAcc2=(struct myStruct*)(&acc_dt_v2[index]);
                //**************your code logic starts from here**************************
                    // to access array "a" use this    
                         char* myArray1=myAcc1->a;
                    //     char* myArray2=myAcc2->a;
                    // To access int "id" use this
                        int myId=myAcc1->id;
                });
            });
        }
    }
    
    // Writes the helper script "w1_bs2_2.sh" into the current directory.
    // The script consists of a bash shebang, the line `echo > seq1/<str>`,
    // and `exit 0`.
    //
    // str: name placed after "seq1/" in the script. It is inserted verbatim
    //      (unquoted), so it must not contain characters the shell interprets.
    //
    // If the script cannot be opened for writing, an error is reported on
    // stderr and nothing is written.
    void f3(string str)
    {
        FILE *mf = fopen("w1_bs2_2.sh", "w");
        if (mf == nullptr) {
            // fprintf/fclose on a null stream is undefined; report and bail out.
            perror("w1_bs2_2.sh");
            return;
        }

        fprintf(mf, "#!/bin/bash\n");
        // %s expects a C string; a std::string object cannot be passed
        // through a variadic argument list.
        fprintf(mf, "echo > seq1/%s\n", str.c_str());
        fprintf(mf, "exit 0\n");

        fclose(mf);
    }
    
     
    // Entry point: initialises dt_v1, runs the SYCL kernel over it, then for
    // i = 1..65 copies a strided dt_v1 entry into dt_v2, prints both, writes the
    // helper script for that entry with f3() and executes it.
    int main(){
        abc(); // Initialisation for dt_v1[]

        dpcpp_parallel();

        // The posted loop ran i up to 1650 but only did work while i < 66, so
        // the bound is stated directly.
        for (int i = 1; i < 66; i++) {
            // NOTE(review): the subscripts were missing in the posted code
            // (`dt_v1.a`, `dt_v2.a`), which does not compile for arrays.
            // dt_v1[j] / dt_v2[i] are presumed because j was computed and
            // otherwise unused - confirm.
            const int j = (i * 25) + 1;
            std::cout << dt_v1[j].a << std::endl;

            strcpy(dt_v2[i].a, dt_v1[j].a);

            std::cout << "dt2\t" << dt_v2[i].a << std::endl;

            f3(dt_v2[i].a);

            // The script is rewritten in place on every pass and keeps its
            // permissions, so it only has to be made executable once.
            if (i == 1) {
                system("chmod 777 *.sh");
            }
            system("./w1_bs2_2.sh");
        }

        return 0;
    }
    
    
    
    // Entry point: initialises dt_v1, runs the SYCL kernel over it, then for
    // i = 1..65 copies a strided dt_v1 entry into dt_v2, prints both, writes the
    // helper script for that entry with f3() and executes it.
    int main(){
        abc(); // Initialisation for dt_v1[]

        dpcpp_parallel();

        // The posted loop ran i up to 1650 but only did work while i < 66, so
        // the bound is stated directly.
        for (int i = 1; i < 66; i++) {
            // NOTE(review): the subscripts were missing in the posted code
            // (`dt_v1.a`, `dt_v2.a`), which does not compile for arrays.
            // dt_v1[j] / dt_v2[i] are presumed because j was computed and
            // otherwise unused - confirm.
            const int j = (i * 25) + 1;
            std::cout << dt_v1[j].a << std::endl;

            strcpy(dt_v2[i].a, dt_v1[j].a);

            std::cout << "dt2\t" << dt_v2[i].a << std::endl;

            f3(dt_v2[i].a);

            // The script is rewritten in place on every pass and keeps its
            // permissions, so it only has to be made executable once.
            if (i == 1) {
                system("chmod 777 *.sh");
            }
            system("./w1_bs2_2.sh");
        }

        return 0;
    }
    
    

     

I want to move the for loop from main() into the parallel loop inside the function dpcpp_parallel(). Would that make the program faster?

 

This is an example of putting the array operation inside the function's parallel loop. I tried this in some programs, but it failed.

 

// Matrix size constants.
// Each derived value is parenthesised so the macro expands to a single operand;
// without the parentheses an expression such as `x / M` would expand to
// `x / SIZE/8` and evaluate as `(x / SIZE) / 8`.
#define SIZE     1200     // Must be a multiple of 8.
#define M        (SIZE/8)
#define N        (SIZE/4)
#define P        (SIZE/2)

     // Command group 1: set every element of matrix a to 1.0.
        device_queue.submit([&](handler &cgh) {
            // Write-only access to buffer a
            auto a_out = a.get_access<access::mode::write>(cgh);
            // One work-item per element over an M x N range
            cgh.parallel_for<class FillBuffer_a>( range<2>{M, N}, [=](id<2> cell) {
                a_out[cell] = 1.0;
            });
        });

        // Command group 2: fill matrix b from the row index.
        device_queue.submit([&](handler &cgh) {
            // Write-only access to buffer b
            auto b_out = b.get_access<access::mode::write>(cgh);
            // One work-item per element over an N x P range
            cgh.parallel_for<class FillBuffer_b>( range<2>{N, P}, [=](id<2> cell){
                // The value depends only on the first index, so every column
                // of b holds the sequence 1, 2, ..., N.
                b_out[cell] = cell[0] + 1.;
            });
        });

 

 

Thanks a lot!

 

John

 

0 Kudos
GouthamK_Intel
Moderator
1,419 Views

Hi John,

We tried running your code snippet in DevCloud and did not face any problem while copying the data from one array to another for 1.8 million data items.

However, we observed that you are doing file I/O operations inside the for loop, as quoted below.

void f3(string str)

027{       

028     

029  FILE *mf;

030      

031     mf=fopen("w1_bs2_2.sh", "w");

032         fprintf(mf,"#!/bin/bash\n");

033         fprintf(mf, "echo > seq1/%s\n", str );

034         fprintf(mf, "exit 0\n");

035         

036        fclose(mf);

037}

 

We also observed that you run a script file inside the for loop, which may take a long time to execute and cause your job to exceed the DevCloud time limit.

107   system("./w1_bs2_2.sh");

 

Can you please provide the time taken to execute each of the code sections below individually, so that we can investigate further and provide you with a better solution?

 

087    abc(); // Initialisation for dt_v1[]

 

 

 089    dpcpp_parallel();   

 

  for (int i = 1; i < 1651; i++) {

094          if(i<66)

095          {

096          j=(i*25)+1;

097    std::cout << dt_v1.a <<std::endl;

098     

099        strcpy(dt_v2.a, dt_v1.a);

100         

101         std::cout << "dt2\t" << dt_v2.a <<std::endl;

102          

103     f3(dt_v2.a);

104 

105   

106   system("chmod 777 *.sh");

107   system("./w1_bs2_2.sh");

108          

109 }

110  }

 

 

 

Thanks

Goutham

0 Kudos
bo__john
Beginner
1,419 Views

This is the code for the array of about 1.7 million entries:

 

 

strcpy(dic_1[1675730].a, "4EBA");
strcpy(dic_1[1675731].a, "7684");
strcpy(dic_1[1675732].a, "7956");
strcpy(dic_1[1675733].a, "9C81");
strcpy(dic_1[1675734].a, "8BED");
strcpy(dic_1[1675735].a, "7684");

 }




void dpcpp_parallel(){ 
    // ---------SYCL SCOPE STARTS------------
    {
        default_selector device_selector; 
        queue device_queue(device_selector);
        cout<<device_queue.get_device().get_info<info::device::name>()<<std::endl;  //print name of the device it is running on.
        buffer<struct myStruct,1> buff_dic_1(dic_1,range<1>{1675736});
   //      buffer<struct myStruct,1> buff_dt_v2(dt_v2,range<1>{1650/25});
        device_queue.submit([&](handler &cgh){        
            auto acc_dic_1 =buff_dic_1.get_access<access::mode::write>(cgh);        
       //     auto acc_dt_v2 =buff_dt_v2.get_access<access::mode::write>(cgh); 
            cgh.parallel_for<class StructClass>(range<1>{1675736},[=](id<1> index){
             struct myStruct* myAcc1=(struct myStruct*)(&acc_dic_1[index]);
           //  struct myStruct* myAcc2=(struct myStruct*)(&acc_dt_v2[index]);
            //**************your code logic starts from here**************************
                // to access array "a" use this    
                     char* myArray1=myAcc1->a;
				//	 char* myArray2=myAcc2->a;
                // To access int "id" use this
                    int myId=myAcc1->id;
            });
        });
    }
}



// Writes the helper script "w1_bs2_2.sh" into the current directory: a bash
// shebang, the line `echo > seq1/<str1>`, and `exit 0`.
//
// str1: name placed after "seq1/" in the script, inserted verbatim.
void f3(string str1)
{
    ofstream script("w1_bs2_2.sh");

    script << "#!/bin/bash\n"
           << "echo > seq1/" << str1 << std::endl
           << "exit 0\n";

    script.close();
}

 
// Entry point: initialises dic_1, runs the SYCL kernel over it, then for every
// one of the 1675736 entries writes the helper script with f3() and executes it.
int main(){
    dt_dic1();

    dpcpp_parallel();

    for (int i = 0; i < 1675736; i++) {
        // NOTE(review): the subscript was missing in the posted code
        // (`dic_1.a`), which does not compile for an array; [i] is presumed -
        // confirm.
        f3(dic_1[i].a);

        // The script is rewritten in place on every pass and keeps its
        // permissions, so it only has to be made executable once. This
        // removes one of the two shell launches per entry.
        if (i == 0) {
            system("chmod 777 *.sh");
        }
        // NOTE(review): each pass still rewrites the script and launches a
        // shell. Over ~1.7 million entries this is the likely cause of the
        // walltime kill in the job log; creating the seq1/ files directly
        // from C++ would avoid it.
        system("./w1_bs2_2.sh");
    }

    return 0;
}

 

 

 

This is the result file

 

 

########################################################################
#      Date:           Thu Jan 30 14:54:06 PST 2020
#    Job ID:           478741.v-qsvr-1.aidevcloud
#      User:           u35272
# Resources:           neednodes=2:gpu:ppn=2,nodes=2:gpu:ppn=2,walltime=06:00:00
########################################################################

dpcpp -o t34 src/t34.cpp 

########################################################################
# End of output for job 478741.v-qsvr-1.aidevcloud
# Date: Thu Jan 30 20:54:23 PST 2020
########################################################################

 

 

and e file

 

cat *e*741
=>> PBS: job killed: walltime 21617 exceeded limit 21600

 

 

 

I will try to rewrite the code so that it operates on the arrays without disk input/output and without calling the system() function.

 

Thank You very much!

0 Kudos
RahulV_intel
Moderator
1,419 Views

Hi,

Thanks for the confirmation. We are closing this thread. Feel free to reach out to us, in case of any more issues.

 

0 Kudos
Reply