Re: performance of parallel_for

may_max11 · ‎02-28-2008

hi ...
I am new to TBB, I developed a simple program to access the performance of parallel_reduce and parallel_for against serial execution. the program simpy sums up the content of a large array (order of 10M) and display the seq in which objects are copied and joined. To my astonishment the parallized program is taking a lot more time compared to the serial program to sum up the contents. Please give me some explanation and solution... I am copying the program below.

I am using :
Processor : Core 2 Duo
OS : RHEL 4.0

PS : I even timed the program using time cmd

Program :
//-------------------------------------------------starts here-----------------------------------------

//to disable printing of values undef PRINT
#include
#include
#include
#include
#include

using namespace std;
using namespace tbb;

#define REAL double
#define VALUE 0.025;

#define DEBUG
#define MAX 10000000
#define GRAINSIZE 500000
#define PRINT

class ApplyFoo {
REAL *a;
size_t len;

#ifdef DEBUG
static unsigned int noo;
static unsigned int nob;
int objNo;
clock_t obtime;
static clock_t max_obtime;
static clock_t main_obtime;
#endif

public :

ApplyFoo(REAL array[], size_t length) {
a=array;
len=length;

#ifdef DEBUG
#ifdef PRINT
cout<<" --------------------------ApplyFoo -------------------------------- ";
cout<<"inside ApplyFoo(REAL array[], size_t length) ";
cout<<"Number of object new objt destroying objt"< #endif
objNo = nob;
main_obtime = -1;
noo++;
nob++;
while(main_obtime == -1) main_obtime = clock();
#ifdef PRINT
cout<<<"("<<<") "<< #endif
#endif
}

ApplyFoo(const ApplyFoo& f) {
a = f.a;
len = f.len;

#ifdef DEBUG
obtime = -1;
noo++;
nob++;
objNo = nob;
while(obtime==-1) obtime = clock();
#ifdef PRINT
cout<<<" "<< #endif
#endif
}

void operator()(const blocked_range& r) const {
REAL *arr = a;
for(size_t i=r.begin(); i!=r.end(); i++) arr = VALUE;

}

~ApplyFoo() {
#ifdef DEBUG
noo--;
obtime = clock() - obtime;
REAL fobtime = (double)obtime/CLOCKS_PER_SEC;
#ifdef PRINT
cout<<" "<<<" ("<<<")"< #endif
if(max_obtime max_obtime = obtime;
}else if(objNo == 1) {
main_obtime = clock() - main_obtime;
#ifdef PRINT
cout<<"max_obtime : "<<<" --->"<<(double)max_obtime/CLOCKS_PER_SEC< cout<<"main_obtime : "<<<" --->"<<(double)main_obtime/CLOCKS_PER_SEC< #endif
}
#endif
}

#ifdef DEBUG
void display_time() {
cout<<"max_obtime : "<< cout<<"main_obtime : "<< }
#endif
};

#ifdef DEBUG
unsigned int ApplyFoo::noo = 0;
unsigned int ApplyFoo::nob = 0;
clock_t ApplyFoo::max_obtime;
clock_t ApplyFoo::main_obtime;
#endif

#undef DEBUG
void ParallelApplyFoo(REAL a[], size_t n, size_t gs) {
ApplyFoo af(a,n);
parallel_for(blocked_range(0,n,gs), af);
#ifdef DEBUG
af.display_time();
#endif
}

#define DEBUG

class SumFoo {
REAL *a;
size_t len;
REAL sum;

#ifdef DEBUG
static unsigned int noo;
static unsigned int nob;
int objNo;
clock_t obtime;
static clock_t max_obtime;
static clock_t main_obtime;
#endif

public :

SumFoo(REAL array[], size_t length) {
a=array;
len=length;
sum = 0.0;
#ifdef PRINT
cout<<" ----------------------------- Summing using SumFoo ----------------------- ";
#endif

#ifdef DEBUG
main_obtime = -1;
#ifdef PRINT
cout<<"inside SumFoo(REAL array[], size_t length) ";
cout<<"Number of object new objt destroying objt"< #endif
objNo = nob;
noo++;
nob++;
while(main_obtime == -1) main_obtime = clock();
#ifdef PRINT
cout<<<"("<<<") "<< #endif
#endif
}

SumFoo(const SumFoo& f, split) {
a = f.a;
len = f.len;
; sum = 0.0;
#ifdef DEBUG
obtime = -1;
noo++;
nob++;
objNo = nob;
while(obtime==-1) obtime = clock();
#ifdef PRINT
cout<<<" "<< #endif
#endif
}

void operator()(const blocked_range& r) {
REAL *arr = a;
for(size_t i=r.begin(); i!=r.end(); i++) sum+=arr;
}

void displaysum() {
cout<< }

void join(SumFoo &f) {

sum += f.sum;
#ifdef DEBUG
noo--;
f.obtime = clock() - f.obtime;
REAL fobtime = (double)f.obtime/CLOCKS_PER_SEC;
#ifdef PRINT
cout<<" "<<<" ("<<<") sum = "<< #endif
if(max_obtime max_obtime = f.obtime;
}else if(f.objNo == 1) {
main_obtime = clock() - main_obtime;
#ifdef PRINT
cout<<"max_obtime : "<<<" --->"<<(double)max_obtime/CLOCKS_PER_SEC< cout<<"main_obtime : "<<<" --->"<<(double)main_obtime/CLOCKS_PER_SEC< #endif
}
#endif
}

#i fdef DEBUG
void display_time() {
cout<<"max_obtime : "<< cout<<"main_obtime : "<< }
#endif
};

#ifdef DEBUG
unsigned int SumFoo::noo = 0;
unsigned int SumFoo::nob = 0;
clock_t SumFoo::max_obtime;
clock_t SumFoo::main_obtime;
#endif

//#undef DEBUG
void ParallelSumFoo(REAL a[], size_t n, size_t gs) {

/*#ifdef COMPARE_ALL
#ifdef AUTO_PART
#undef AUTO_PART
#endif
#ifdef SIMPLE_PART
#undef SIMPLE_PART
#endif
SumFoo sf1(a,n), sf2(a,n);
parallel_reduce(blocked_range(0,n), sf1, simple_partitioner());
cout<<"using SIMPLE_PART sum : "; sf1.displaysum();
parallel_reduce(blocked_range(0,n), sf2, auto_partitioner());
cout<<"using AUTO_PART sum : "; sf2.displaysum();
#endif
*/
SumFoo sf(a,n);
#ifdef AUTO_PART
parallel_reduce(blocked_range(0,n), sf, auto_partitioner());
cout<<"using AUTO_PART ";
#else
#ifdef SIMPLE_PART
parallel_reduce(blocked_range(0,n), sf, simple_partitioner());
cout<<"using SIMPLE_PART ";
#else
parallel_reduce(blocked_range(0,n,gs), sf);
cout<<"using GRAINSIZE = "<<<" ";
#endif
#endif

cout<<"sum : ";
sf.displaysum();
#ifdef DEBUG
sf.display_time();
#endif
}

#define SERIAL

#ifdef SERIAL
class SerialApplyFoo {
REAL *a;
size_t len;
double sum;
clock_t tserial;

public :
SerialApplyFoo(REAL *array, size_t length) : a(array), len(length){
cout<<" ------------------------ Summing using SerialApplyFoo --------------------- ";
&nb sp; sum = 0.0;
tserial = -1;
while(tserial == -1) tserial = clock();
for(size_t i=0; i sum += a;
}
tserial = clock() - tserial;
}

void displaysum() {
cout<<"SerialApplyFoo sum : "<< }

~SerialApplyFoo() {
displaysum();
cout<<"Approx time taken by SerialApplyFoo = "<<<" --->"<<(double)tserial/CLOCKS_PER_SEC< }
};
#endif

int main() {
task_scheduler_init init;
REAL *array = new REAL[MAX];
if(array == NULL) {
cout<<"array == NULL ";
exit(0);
}
ParallelApplyFoo(array, MAX, GRAINSIZE);

#ifdef SERIAL
SerialApplyFoo sf(array, MAX);
#endif

ParallelSumFoo(array, MAX, GRAINSIZE);
return 0;
}

may_max11 · ‎02-28-2008

quick note : the ApplyFoo class simply intitializes the contents of array to VALUE using parallel_for and SumFoo class sums the array using parallel_reduce.

AJ13 · ‎02-28-2008

I just looked at your code quickly now, but you might be interested to know TBB also includes a "tick_count" which is designed to help you with timing. I'll have to look at your code in more detail in a few hours.

AJ

Alexey-Kukanov · ‎02-29-2008

may_max11:
I developed a simple program to access the performance of parallel_reduce and parallel_for against serial execution. the program simpy sums up the content of a large array (order of 10M) and display the seq in which objects are copied and joined. To my astonishment the parallized program is taking a lot more time compared to the serial program to sum up the contents. Please give me some explanation and solution...

It's almost always the case with these simple and naive parallel benchmark programs that their authors are astonished by bad parallel performance :)

Just imagine you are the compiler, and compare the two summing loops from this point of view. First, the serial loop:

for(size_t i=0; i    sum += a;
}

It's very simple to optimize, having in mind that len is a constant to the compiler. For example, VC++ 2005 made the following code out of that:

         fldz
         lea eax, DWORD PTR [edi+010h]
  mov ecx, 0x2625A0h
main+0x105:fadd QWORD PTR [eax-16]
            add eax, 0x20h
            sub ecx, 0x1h
            fadd QWORD PTR [eax-40]
            fadd QWORD PTR [eax-32]
            fadd QWORD PTR [eax-24]
            jnz main+0x105

This magic 0x2625A0 constant is 2500000, one fourth of the array length. There are four floating point additions, loop index (ecx) decrement, and bumping array pointer (eax) 4 elements forward. That's it, very small code which has exactly one memory load operation per iteration followed by one store at the end (not shown here). It's very efficient.

Now consider the TBB variant of it:

for(size_t i=r.begin(); i!=r.end(); i++) {
    sum += a;
}

The compiler is much more conservative in interpreting this loop because its boundaries are variable, and it's length is unknown. Also sum is the object data field which possibly adds other restrictions. In the above serial code the compiler put sum onto a register, might be due to the fact that the codeis in constructor and no data field can be visible until it finishes? Anyway, in the TBB code it did not do that:

execute+0x52:mov edi, DWORD PTR [edx]
           lea edi, DWORD PTR [edi+ecx*8]
execute+0x57:fld QWORD PTR [edi]
              add ecx, 0x1h
              fadd QWORD PTR [edx+08h]
              add edi, 0x8h
              fstp QWORD PTR [edx+08h]
              cmp ecx, DWORD PTR [esi+08h]
              jnz execute+0x57

The array pointer is in edi, and you see in the first two lines it is loaded in twosteps: first base, then incremented by offset. On each iterati on, it is incremented by 0x8h, i.e. just one element. The loop variable (ecx) is also incremented by one, and compared to a memory location (range.end()). Finally, the result is saved back to sum each time, and consequently sum is loaded each time. So we have 3 memory loads per loop iteration, comparing to 1 in serial case.

Do you now understand better why your benchmark "favors" serial case?

Alexey-Kukanov · ‎02-29-2008

And by the way I think you now should be able to find how your code in SumFoo should be improved to gain some speedup over the serial case; but if you do not, please don't hesitate to ask.

may_max11 · ‎03-01-2008

I tried out the code after incorporating your suggestions. I also used tick_cout as suggested. But the results are still the same ..... serial way of adding the array is still the fastest of the two. I am attaching the code and the output. Even the answer given by parallel summing method is wrong this time. Please suggest me what is happening and the solution .

code :
--------------------------------------code--------------------------------------------------

#include

#include
#include
#include
#include

#define REAL double
#define VALUE 1.125
using namespace std;
using namespace tbb;

class SumFoo{
public :
REAL *a;
REAL sum;

SumFoo() {
a = NULL;
sum = 0.0;
}

SumFoo(SumFoo &f, split): a(f.a), sum(0.0) {

}

void operator()(blocked_range &r) {
REAL *al = a;
for(int i = r.begin(); i<=r.end(); i++) {
sum += al;
}
}

void join(SumFoo f) {
sum+=f.sum;
}
};

SumFoo sf;

REAL ParallelSum(REAL *a_, unsigned long int n) {
sf.a = a_;
sf.sum = 0.0;
parallel_reduce(blocked_range(0, n), sf, simple_partitioner());
return sf.sum;
}

REAL SerialSum(REAL *a_, unsigned long int start, unsigned long int end) {
REAL sum = 0.0;
for(unsigned long int i=start; i sum += a_;
return sum;
}

int main() {
task_scheduler_init init;
REAL *array = NULL;
unsigned long int n = 0;
cout<<"n = "; cin>>n;
array = new REAL;
for(unsigned long int i=0; i = VALUE;
tick_count t0 = tick_count::now();
cout< tick_count t1 = tick_count::now();
cout<<"ParallelSum took : "<<(t1 - t0).seconds()<<"seconds ";
t0 = tick_count::now();
cout< t1 = tick_count::now();
cout<<"SerialSum took : "<<(t1 - t0).seconds()<<"seconds ";
delete array;
return 0;
}

------------------------output-------------------------------------------

n = 10000000
2.25e+07
ParallelSum took : 2.70939seconds
1.125e+07
SerialSum took : 0.036173seconds

may_max11 · ‎03-01-2008

I even tried with grainsize = 10000. the improvement in speed is still not significant the output is:

n = 100000000
1.12518e+08
ParallelSum took : 0.54274seconds
1.125e+08
SerialSum took : 0.523009seconds

may_max11 · ‎03-01-2008

and with grain size = 1000000 the output is:

n = 100000000
1.125e+08
ParallelSum took : 0.547122seconds
1.125e+08
SerialSum took : 0.558095seconds

Alexey-Kukanov · ‎03-02-2008

Wrong results are due to i<=r.end() in your operator(). The blocked_range r is half-open [r.begin,r.end); in other words, the end point shouldn't count and the comparison should be < or !=.

If no grainsize specified explicitly, blocked_range assumes the grainsize is 1. For your example, this value may work well with auto_partitioner but not with simple_partitioner, because one iteration contains too little work to justify the overhead. Thus you see much better time with grain size of 10000 than with that of 1. If you use simple_partitioner, do not use the default grainsize (of 1) unless you know each iteration has a plenty of work.

I will comment more on performance later. In fact I am going to write a blog entry about this case; it should not bemuch work considering how much I can copy from here :)

Alexey-Kukanov · ‎03-04-2008

may_max11:
I tried out the code after incorporating your suggestions. I also used tick_cout as suggested. But the results are still the same ..... serial way of adding the array is still the fastest of the two.

As promised, I wrote a blog post about this case: http://softwareblogs.intel.com/2008/03/04/why-a-simple-test-can-get-parallel-slowdown/

I hope it helps.