Hi I want to parallelize the

lto · ‎10-01-2007

Could anyone show me how to implement two-dimensional matrix multiplication with variable size 2D arrays? The example illustrated in Intel TBB book uses fixed size arrays. Thanks!

Long

ARCH_R_Intel · ‎10-02-2007

Representations of 2D variable-size matrices very widely in C++. If you sketch your preferred approach for serial multiplication of variable-size matrices, then perhaps someone can post the TBBified version.

- Arch Robison

lto · ‎10-02-2007

Here is the code from Intel TBB book that I've modified to use Numerical Recipe C++ matrix class, Mat_SP ( matrix single precision). The code compiles and executes, but calculations from serial and parallel portionsof the code do not match. I think the code using NR pass in matrix data ok but having problem returning matrix data back to the main program. Your help is greatly appreciated!

Long

p.s. I think the problem occurs within operator() code(i.e, Mat_SP c = my_c, this creates a new copy of matrix c insteads of the original matrix c; I have no idea how to pass this new copy of the matrix c back to the main program). Thanks in advance for any suggestion!

//

#include

"tbb/task_scheduler_init.h"

#include

"tbb/tick_count.h"

// C++ numerical recipe header

#include

"nr.h"

#include

"tbb/parallel_for.h"

#include

"tbb/blocked_range2d.h"

using

namespace tbb;

using

namespace std;

//! Working threads count for parallel version

static

int NThread = 2;

//const size_t L = 150;

//const size_t M = 225;

//const size_t N = 300;

//void SerialMatrixMultiply( float c, float a, float b ) {

void

SerialMatrixMultiply( Mat_SP &c, Mat_SP a, Mat_SP b ) {

for( int i=0; i

for( int j=0; j

float sum = 0;

for( int k=0; k

sum += a*b;

c = sum;

}

/ *

class MatrixMultiplyBody2D {

float (*my_a);

float (*my_b);

float (*my_c);

public:

void operator()( const blocked_range2d& r ) const {

float (*a) = my_a; // a,b,c used in example to emphasize

float (*b) = my_b; // commonality with serial code

float (*c) = my_c;

for( size_t i=r.rows().begin(); i!=r.rows().end(); ++i ){

for( size_t j=r.cols().begin(); j!=r.cols().end(); ++j ) {

float sum = 0;

for( size_t k=0; k

sum += a*b;

c = sum;

}

// MatrixMultiplyBody2D( float c, float a, float b ) :

MatrixMultiplyBody2D( Mat_SP c, Mat_SP a, Mat_SP b ) :

my_a(a), my_b(b), my_c(c)

{}

};

*/

class

MatrixMultiplyBody2D {

Mat_SP my_a, my_b, my_c;

public

:

void operator()( const blocked_range2d<int>& r ) const {

Mat_SP a = my_a;

// a,b,c used in example to emphasize

Mat_SP b = my_b;

// commonality with serial code

Mat_SP c = my_c;

for( int i=r.rows().begin(); i!=r.rows().end(); ++i ){

for( int j=r.cols().begin(); j!=r.cols().end(); ++j ) {

float sum = 0;

for( int k=0; k

sum += a*b;

c = sum;

}

// MatrixMultiplyBody2D( float c, float a, float b ) :

MatrixMultiplyBody2D( Mat_SP &c, Mat_SP a, Mat_SP b ) :

my_a(a), my_b(b), my_c(c)

{}

};

//void ParallelMatrixMultiply(float c, float a, float b){

void

ParallelMatrixMultiply( Mat_SP &c, Mat_SP a, Mat_SP b){

parallel_for( blocked_range2d<

int>(0, a.nrows(), 16, 0, b.ncols(), 32),

MatrixMultiplyBody2D(c,a,b) );

}

//static void CreateData( float a, float b ) {

static

void CreateData( Mat_SP &a, Mat_SP &b) {

for( int i=0; i

for( < /FONT>int j=0; j

a = (rand()%1000)/100.0f;

}

for( int i=0; i

for( int j=0; j

b = (rand()%1000)/100.0f;

}

//static bool VerifyResult( float c1, float c2 ) {

static

bool VerifyResult(Mat_SP c1, Mat_SP c2 ) {

for( int i=0; i

for( int j=0; j

if ( abs(c1 - c2) > 0.0001f ) {

return false;

}

return true;

}

static

void ParseCommandLine( int argc, char* argv[] ) {

int i = 1;

if( i[0]) ) {

fprintf(stderr,

"Usage: %s number-of-threads ",argv[0]);

exit(1);

}

if( i

}

int

main( int argc, char* argv[] ) {

srand(2);

ParseCommandLine( argc, argv );

int L,M,N;

cout <<

"please enter L,M,N (ex. 150 225 300) = " << endl;

cin >> L >> M >> N;

//const size_t L = 150;

//const size_t M = 225;

//const size_t N = 300;

task_scheduler_init init(NThread);

// float a;

// float b;

// using numerical recipe matrix class

Mat_SP a(M,L);

Mat_SP b(L,N);

CreateData(a, b);

// float c1;

// using numerical recipe matrix class

Mat_SP c1(M,N);

tick_count t0 = tick_count::now();< /P>

SerialMatrixMultiply(c1, a, b);

tick_count t1 = tick_count::now();

printf(

"Serial code: %.4f sec ", (t1-t0).seconds());

// float c2;

// using numerical recipe matrix class

Mat_SP c2(M,N);

t0 = tick_count::now();

ParallelMatrixMultiply(c2, a, b);

t1 = tick_count::now();

printf(

"Parallel code (%d threads): %.4f sec ",

NThread, (t1-t0).seconds());

if ( VerifyResult(c1, c2) ) {

printf(

"Results match! ");

return 0;

}

else {

printf(

"Results don't match! ");

return 2;

}

ARCH_R_Intel · ‎10-04-2007

The problem was values versus references. Each time a Mat_SP is passed by value, a copy is made. That's unnecessary expense for matrices a and b, and as you noted, gives wrong answers for matrix c.

The solution is to pass a and b by const reference, and pass c by reference.Appended isa diff that shows the necessary changes. The changes are all in MatrixMultiplyBody2D:

Changemy_a, my_b, and my_c to references. Fields my_a and my_b, because they refer to input arguments, should be const references.
Pass the arguments to the constructor by reference also, with const qualifiers on the input arguments.

With these changes, the results match.

- Arch Robison

*** matrix.cpp.orig 2007-10-04 08:48:52.936144000 -0500
--- matrix.cpp.fixed 2007-10-04 09:08:05.805223000 -0500
***************
*** 29,54 ****
 }
 }
 }

 class MatrixMultiplyBody2D {
! Mat_SP my_a, my_b, my_c;
 public:
 void operator()( const blocked_range2d& r ) const {
! Mat_SP a = my_a; // a,b,c used in example to emphasize
! Mat_SP b = my_b; // commonality with serial code
! Mat_SP c = my_c;
 for( int i=r.rows().begin(); i!=r.rows().end(); ++i ){
 for( int j=r.cols().begin(); j!=r.cols().end(); ++j ) {
 float sum = 0;
 for( int k=0; k sum += a*b;
 c = sum;
 }
 }
 }
! MatrixMultiplyBody2D( Mat_SP &c, Mat_SP a, Mat_SP b ) :
 my_a(a), my_b(b), my_c(c)
 {}
 };

 void ParallelMatrixMultiply( Mat_SP &c, Mat_SP a, Mat_SP b){
--- 29,55 ----
 }
 }
 }

 class MatrixMultiplyBody2D {
! const Mat_SP &my_a, &my_b;
! Mat_SP &my_c;
 public:
 void operator()( const blocked_range2d& r ) const {
! const Mat_SP& a = my_a; // a,b,c used in example to emphasize
! const Mat_SP& b = my_b; // commonality with serial code
!&
nbsp; Mat_SP& c = my_c;
 for( int i=r.rows().begin(); i!=r.rows().end(); ++i ){
 for( int j=r.cols().begin(); j!=r.cols().end(); ++j ) {
 float sum = 0;
 for( int k=0; k sum += a*b;
 c = sum;
 }
 }
 }
! MatrixMultiplyBody2D( Mat_SP &c, const Mat_SP& a, const Mat_SP& b ) :
 my_a(a), my_b(b), my_c(c)
 {}
 };

 void ParallelMatrixMultiply( Mat_SP &c, Mat_SP a, Mat_SP b){

ARCH_R_Intel · ‎10-04-2007

For the record, appended is thecomplete code that I ran successfully, with the commented-out code removed.

I discovereda quirkabout this forum's text editor. If I copy-and-paste the code directly from Visual Studio, all the indentation is lost. But if I copy-and-past into Word first, and then copy-and-paste from Word, the indentation is retained.

- Arch Robison

#include 
#include 
#include 
#include 
#include 
// TBB headers
#include "tbb/task_scheduler_init.h"
#include "tbb/tick_count.h"
#include "tbb/parallel_for.h"
#include "tbb/blocked_range2d.h"
// C++ numerical recipe header
#include "nr.h"
using namespace tbb;
using namespace std;
//! Working threads count for parallel version
static int NThread = 2;
void SerialMatrixMultiply( Mat_SP &c, Mat_SP a, Mat_SP b ) {
 for( int i=0; i
 for( int j=0; j
 float sum = 0;
 for( int k=0; k
 sum += a*b;
 c = sum;
 }
 }
}
class MatrixMultiplyBody2D {
 const Mat_SP &my_a, &my_b;
 Mat_SP &my_c;
public:
 void operator()( const blocked_range2d<int>& r ) const {
 const Mat_SP& a = my_a; // a,b,c used in example to emphasize
 const Mat_SP& b = my_b; // commonality with serial code
 Mat_SP& c = my_c;
 for( int i=r.rows().begin(); i!=r.rows().end(); ++i ){
 for( int j=r.cols().begin(); j!=r.cols().end(); ++j ) {
 float sum = 0;
 for( int k=0; k
 sum += a*b;
 c = sum;
 }
 }
 }
 MatrixMultiplyBody2D( Mat_SP &c, const Mat_SP& a, const Mat_SP& b ) :
 my_a(a), my_b(b), my_c(c)
 {}
};
void ParallelMatrixMultiply( Mat_SP &c, Mat_SP a, Mat_SP b){
 parallel_for( blocked_range2d<int>(0, a.nrows(), 16, 0, b.ncols(), 32), 
 
MatrixMultiplyBody2D(c,a,b) );
}
static void CreateData( Mat_SP &a, Mat_SP &b) {
 for( int i=0; i
 for( int j=0; j
 a = (rand()%1000)/100.0f;
 }
 }
 for( int i=0; i
 for( int j=0; j
 b = (rand()%1000)/100.0f;
 }
 }
}
static bool VerifyResult(Mat_SP c1, Mat_SP c2 ) {
 for( int i=0; i
 for( int j=0; j
 if ( abs(c1 - c2) > 0.0001f ) {
 return false;
 }
 }
 }
 return true;
}
static void ParseCommandLine( int argc, char* argv[] ) {
 int i = 1;
 if( i[0]) ) {
 fprintf(stderr,"Usage: %s number-of-threads
",argv[0]);
 exit(1);
 }
 if( i
} 
int main( int argc, char* argv[] ) {
 srand(2);
 ParseCommandLine( argc, argv );
 int L,M,N;
 cout << "please enter L,M,N (ex. 150 225 300) = " << endl;
 cin >> L >> M >> N;
 task_scheduler_init init(NThread);
 // using numerical recipe matrix class
 Mat_SP a(M,L);
 Mat_SP b(L,N);
 CreateData(a, b);
 // using numerical recipe matrix class
 Mat_SP c1(M,N);
 tick_count t0 = tick_count::now();
 SerialMatrixMultiply(c1, a, b);
 tick_count t1 = tick_count::now();
 printf("Serial code: %.4f sec
", (t1-t0).seconds());
 // using numerical recipe matrix class
 Mat_SP c2(M,N);
 t0 = tick_count::now();
 ParallelMatrixMultiply(c2, a, b);
 t1 = tick_count::now();
 printf("Parallel code (%d threads): %.4f sec
",
 NThread, (t1-t0).seconds());
 if ( VerifyResult(c1, c2) ) {
 printf("Results match!
");
 return 0;
 } else {
 printf("Results don't match!
");
 return 2;
 }
}

lto · ‎10-04-2007

Hello Arch,

Thanks very much for your help, and your explanation of the 'const ref' makes sense; it clearly shows how to properly use the parallel_for routine.

Best regards,

Long

lto · ‎10-06-2007

It's hard to learn and apply something new so that it would work efficiently and correctly. I hope my experience on using tbb will help others.

Thanks for the tbb.dll and the help of Intel TBB developers, my image edit and reconstrunction program which operates on large 2d matriceshas more than double its execution speed. I also notice that by replacing the default new and delete operators used in NR's Mat_DP with TBB book's new and delete operators, I can furthur improve the program significantly on my duo core laptop.

Reason for using TBB book's new/delete operators was due to mytesting that the same program ran twice as fast on my laptop was more than 4x slow down on my 4 processors Xeon desktop machine at work. I have not tried the new version using tbbmalloc.dll on my work desktop yet. I will report the result here nextweekif no one here object.

Long

lto · ‎10-10-2007

Hello,

Tested this morning at work, the new version of my application using tbbmalloc.dll run twice as fast as my duo core laptop. I compiled the application using ms visual studio2005 professional (I guess their memory allcation routine is not optimal for parallel processing.)with autopartioned grain size.Btw evenon single processor machine, TBB actually speeds up the application.

Regards,

Long

ARCH_R_Intel · ‎10-10-2007

It's quite possible for TBB's parallel_for to speed up a matrix multiply even on a single processor, because of cache blocking effects. Can you describe the typical dimensions of the matrices that you are multiplying? Knowing that, I or someone else on this thread can suggest potentially faster reordings of the loops.

lto · ‎10-10-2007

The 2D matrix multiplication application was just my testing to get aquainted with TBB library. The actual application I am working on is basically a 2D SAR (synthetic aperture radar) polar reformat application that computes 2D interpolations for data matrices as large as 2048x2048, sometime larger. It has two for loops with 2d interpolations done within inner loop. I am experimenting with different grain size to see if it would furthur speed up the processing. Thanks!

Long

handerpeder · ‎09-28-2009

Hi.

Im trying to get the example code from this post working, but I'm having some problems regarding blocked_range2d.

I couldn't just copy/paste the code because I don't have the NR library, so I made a few modifications and used newmat instead.

I compile with

g++ matrix_tbb.cpp -ltbb -lnewmat

and get the following error:

matrix_tbb.cpp: In member function void MatrixMultiplyBody2D::operator()(const tbb::blocked_range2d&) const:

matrix_tbb.cpp:34: error: r->tbb::blocked_range2d::rows [with RowValue = int, ColValue = int] does not have class type

matrix_tbb.cpp:35: error: r->tbb::blocked_range2d::cols [with RowValue = int, ColValue = int] does not have class type

The relevant code looks pretty much the same as the one posted above:

class MatrixMultiplyBody2D {

const Matrix &my_a, &my_b;

Matrix &my_c;

public:

void operator()(const blocked_range2d& r) const {

const Matrix& a = my_a;

const Matrix& b = my_b;

Matrix& c = my_c;

for(int i=r.rows().begin(); i!=r.rows.end(); ++i){

for(int j=r.cols().begin(); j!=r.cols.end(); ++j){

float sum = 0;

for(int k=1;k
sum += a(i,k)*b(k,j);

c(i,j) = sum;

}

}

}

void parallelMult(Matrix &c, const Matrix &a, const Matrix &b){

parallel_for( blocked_range2d(1, a.Nrows(), 16, 1, b.Ncols()+1, 32),

MatrixMultiplyBody2D(c,a,b) );

}

Im pretty new to c++ and have been trying to figure out what the whole 'does not have class type' thing means.

Any help would be appreciated.

RafSchietekat · ‎09-28-2009

rows and cols are member functions: you have to call them with end() just like you do with begin(), so adding two pairs of empty braces might be enough, but I have not checked that myself.

handerpeder · ‎09-29-2009

Quoting - Raf Schietekat

rows and cols are member functions: you have to call them with end() just like you do with begin(), so adding two pairs of empty braces might be enough, but I have not checked that myself.

Wow, must have been at it too long yesterday. That was the problem indeed.

Thanks so much!

uh18104 · ‎01-01-2010

I am new to TBB. In particular, I am curious of the usage of blocked_range2d in the parallel_for. It appears that the parallel_for for the matrix multiplication exploites tiling. If my guess is correct, then I wonder whey the column_range_size is 2 * row_range_size.

Quoting - Arch Robison (Intel)

The problem was values versus references. Each time a Mat_SP is passed by value, a copy is made. That's unnecessary expense for matrices a and b, and as you noted, gives wrong answers for matrix c.

The solution is to pass a and b by const reference, and pass c by reference.Appended isa diff that shows the necessary changes. The changes are all in MatrixMultiplyBody2D:

Changemy_a, my_b, and my_c to references. Fields my_a and my_b, because they refer to input arguments, should be const references.

Pass the arguments to the constructor by reference also, with const qualifiers on the input arguments.

With these changes, the results match.

- Arch Robison

*** matrix.cpp.orig 2007-10-04 08:48:52.936144000 -0500
--- matrix.cpp.fixed 2007-10-04 09:08:05.805223000 -0500
***************
*** 29,54 ****
 }
 }
 }

 class MatrixMultiplyBody2D {
! Mat_SP my_a, my_b, my_c;
 public:
 void operator()( const blocked_range2d& r ) const {
! Mat_SP a = my_a; // a,b,c used in example to emphasize
! Mat_SP b = my_b; // commonality with serial code
! Mat_SP c = my_c;
 for( int i=r.rows().begin(); i!=r.rows().end(); ++i ){
 for( int j=r.cols().begin(); j!=r.cols().end(); ++j ) {
 float sum = 0;
 for( int k=0; k sum += a*b;
 c = sum;
 }
 }
 }
! MatrixMultiplyBody2D( Mat_SP &c, Mat_SP a, Mat_SP b ) :
 my_a(a), my_b(b), my_c(c)
 {}
 };

 void ParallelMatrixMultiply( Mat_SP &c, Mat_SP a, Mat_SP b){
--- 29,55 ----
 }
 }
 }

 class MatrixMultiplyBody2D {
! const Mat_SP &my_a, &my_b;
! Mat_SP &my_c;
 public:
 void operator()( const blocked_range2d& r ) const {
! const Mat_SP& a = my_a; // a,b,c used in example to emphasize
! const Mat_SP& b = my_b; // commonality with serial code
!&
nbsp; Mat_SP& c = my_c;
 for( int i=r.rows().begin(); i!=r.rows().end(); ++i ){
 for( int j=r.cols().begin(); j!=r.cols().end(); ++j ) {
 float sum = 0;
 for( int k=0; k sum += a*b;
 c = sum;
 }
 }
 }
! MatrixMultiplyBody2D( Mat_SP &c, const Mat_SP& a, const Mat_SP& b ) :
 my_a(a), my_b(b), my_c(c)
 {}
 };

 void ParallelMatrixMultiply( Mat_SP &c, Mat_SP a, Mat_SP b){

RafSchietekat · ‎01-02-2010

"I am new to TBB. In particular, I am curious of the usage of blocked_range2d in the parallel_for. It appears that the parallel_for for the matrix multiplication exploites tiling."
Each tile in the product results from two corresponding bands in the multiplicands, which can be a lot faster, I suppose, than a naive application of the matrix product definition if the bands and the tile fit in the cache but not the three matrices.

"If my guess is correct, then I wonder whey the column_range_size is 2 * row_range_size."
I don't see what you mean?

RafSchietekat · ‎01-02-2010

I see the forum has decided to leave out the editing option today. :-)

Sorry, I have to correct myself, the naive definition would not require all three matrices to fit, either. Probably "just" one of the matrices plus a row/column in the other, but that's already quadratic compared to two bands and a tile.

sukhad_a_ · ‎07-02-2017

Hi I want to parallelize the following code :

This is the code for the same :

for(long j = range.first ;j<range.second ;j++)
{
for (unsigned long k = 0; k < samples.shapeResiduals.size(); ++k)
{
samples.shapeResiduals=samples.shapeResiduals-learning_rate*temp;
}
}

shapeResidual is a vector which is stored in a C++ structure object. samples is an array of those structure objects.temp is a vector storing elements of same type as shapeResiduals.

variable size two dimensional matrix multiplication