- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
I am new to TBB. I developed a simple program to assess the performance of parallel_reduce and parallel_for against serial execution. The program simply sums up the contents of a large array (on the order of 10M elements) and displays the sequence in which objects are copied and joined. To my astonishment, the parallelized program is taking a lot more time than the serial program to sum up the contents. Please give me some explanation and a solution... I am copying the program below.
I am using :
Processor : Core 2 Duo
OS : RHEL 4.0
PS : I even timed the program using time cmd
Program :
//-------------------------------------------------starts here-----------------------------------------
//to disable printing of values undef PRINT
#include
#include
#include
#include
#include
using namespace std;
using namespace tbb;
#define REAL double
#define VALUE 0.025;
#define DEBUG
#define MAX 10000000
#define GRAINSIZE 500000
#define PRINT
// tbb::parallel_for body: writes VALUE into every element of a[0..len),
// with optional debug bookkeeping that counts and times body objects.
// NOTE(review): this forum paste lost everything after a '<' on several
// lines (HTML-stripped). Lines tagged [truncated in paste] are kept
// verbatim rather than guessed at; only unambiguous losses (the operator()
// signature and the dropped array subscript) are reconstructed.
class ApplyFoo {
    REAL *a;        // target array (not owned)
    size_t len;
#ifdef DEBUG
    static unsigned int noo;      // live-object counter
    static unsigned int nob;      // total-objects-built counter
    int objNo;                    // ordinal assigned at construction
    clock_t obtime;               // construction timestamp of this copy
    static clock_t max_obtime;    // longest copy lifetime seen so far
    static clock_t main_obtime;   // lifetime of the original object
#endif
public :
    ApplyFoo(REAL array[], size_t length) {
        a=array;
        len=length;
#ifdef DEBUG
#ifdef PRINT
        cout<<" --------------------------ApplyFoo -------------------------------- ";
        cout<<"inside ApplyFoo(REAL array[], size_t length) ";
        cout<<"Number of object new objt destroying objt"< // [truncated in paste]
        objNo = nob;
        main_obtime = -1;
        noo++;
        nob++;
        // clock() may legitimately return -1 when no timer is available;
        // spin until a valid reading is obtained.
        while(main_obtime == -1) main_obtime = clock();
#ifdef PRINT
        cout< // [truncated in paste]
#endif
    }
    // Copy constructor: parallel_for copies the body per task; shares the
    // same array and length.
    ApplyFoo(const ApplyFoo& f) {
        a = f.a;
        len = f.len;
#ifdef DEBUG
        obtime = -1;
        noo++;
        nob++;
        objNo = nob;
        while(obtime==-1) obtime = clock();
#ifdef PRINT
        cout< // [truncated in paste]
#endif
    }
    // NOTE(review): signature truncated in the paste; reconstructed per the
    // standard tbb::parallel_for body contract (const, half-open range).
    void operator()(const blocked_range<size_t>& r) const {
        REAL *arr = a;
        // paste dropped the subscript: original line read "arr = VALUE;",
        // which would assign a double to a pointer and cannot compile.
        for(size_t i=r.begin(); i!=r.end(); i++) arr[i] = VALUE;
    }
    ~ApplyFoo() {
#ifdef DEBUG
        noo--;
        obtime = clock() - obtime;
        REAL fobtime = (double)obtime/CLOCKS_PER_SEC;
#ifdef PRINT
        cout<<" "< // [truncated in paste]
        if(max_obtime // [truncated in paste]
        }else if(objNo == 1) {
            main_obtime = clock() - main_obtime;
#ifdef PRINT
            cout<<"max_obtime : "< // [truncated in paste]
        }
#endif
    }
#ifdef DEBUG
    void display_time() {
        cout<<"max_obtime : "< // [truncated in paste; closing brace lost too]
#endif
};
#ifdef DEBUG
// Out-of-class definitions for ApplyFoo's static debug bookkeeping.
unsigned int ApplyFoo::noo = 0u;          // live-object counter
unsigned int ApplyFoo::nob = 0u;          // total-objects-built counter
clock_t ApplyFoo::max_obtime = 0;         // zero-initialized either way (static storage)
clock_t ApplyFoo::main_obtime = 0;
#endif
#undef DEBUG
// Fills a[0..n) in parallel using ApplyFoo, splitting the iteration space
// into chunks of grainsize gs.
// NOTE(review): the forum paste lost everything after '<' on the
// parallel_for line; reconstructed per the standard TBB idiom
// (half-open range [0,n) with explicit grainsize).
void ParallelApplyFoo(REAL a[], size_t n, size_t gs) {
    ApplyFoo af(a,n);
    parallel_for(blocked_range<size_t>(0,n,gs), af);
#ifdef DEBUG
    af.display_time();
#endif
}
#define DEBUG
// tbb::parallel_reduce body: sums a[0..len), with optional debug
// bookkeeping that counts and times split copies.
// NOTE(review): lines tagged [truncated in paste] lost everything after a
// '<' to the forum's HTML rendering and are kept verbatim, not guessed at.
class SumFoo {
    REAL *a;       // array to sum (not owned)
    size_t len;
    REAL sum;      // partial sum accumulated by this body
#ifdef DEBUG
    static unsigned int noo;      // live-object counter
    static unsigned int nob;      // total-objects-built counter
    int objNo;                    // ordinal assigned at construction
    clock_t obtime;               // construction timestamp of this split copy
    static clock_t max_obtime;    // longest split-copy lifetime seen
    static clock_t main_obtime;   // lifetime of the original object
#endif
public :
    SumFoo(REAL array[], size_t length) {
        a=array;
        len=length;
        sum = 0.0;
#ifdef PRINT
        cout<<" ----------------------------- Summing using SumFoo ----------------------- ";
#endif
#ifdef DEBUG
        main_obtime = -1;
#ifdef PRINT
        cout<<"inside SumFoo(REAL array[], size_t length) ";
        cout<<"Number of object new objt destroying objt"< // [truncated in paste]
        objNo = nob;
        noo++;
        nob++;
        // clock() may return -1 when no timer is available; spin until valid.
        while(main_obtime == -1) main_obtime = clock();
#ifdef PRINT
        cout< // [truncated in paste]
#endif
    }
    // Splitting constructor used by parallel_reduce: shares the array but
    // starts its own partial sum at zero.
    SumFoo(const SumFoo& f, split) {
        a = f.a;
        len = f.len;
        sum = 0.0; // was "; sum = 0.0;" — stray leading semicolon removed
#ifdef DEBUG
        obtime = -1;
        noo++;
        nob++;
        objNo = nob;
        while(obtime==-1) obtime = clock();
#ifdef PRINT
        cout< // [truncated in paste]
#endif
    }
    // NOTE(review): signature truncated in the paste; reconstructed per the
    // standard tbb::parallel_reduce body contract (half-open range).
    void operator()(const blocked_range<size_t>& r) {
        REAL *arr = a;
        // paste dropped the subscript: original read "sum+=arr;", which
        // adds a pointer to a double and cannot compile.
        for(size_t i=r.begin(); i!=r.end(); i++) sum+=arr[i];
    }
    void displaysum() {
        cout< // [truncated in paste; closing brace of displaysum lost too]
    // Folds another body's partial sum into this one; also does the debug
    // lifetime accounting for the body being merged away.
    void join(SumFoo &f) {
        sum += f.sum;
#ifdef DEBUG
        noo--;
        f.obtime = clock() - f.obtime;
        REAL fobtime = (double)f.obtime/CLOCKS_PER_SEC;
#ifdef PRINT
        cout<<" "< // [truncated in paste]
        if(max_obtime // [truncated in paste]
        }else if(f.objNo == 1) {
            main_obtime = clock() - main_obtime;
#ifdef PRINT
            cout<<"max_obtime : "< // [truncated in paste]
        }
#endif
    }
#ifdef DEBUG
    // was "#i fdef DEBUG" — space inside the directive is a compile error
    void display_time() {
        cout<<"max_obtime : "< // [truncated in paste; closing brace lost too]
#endif
};
#ifdef DEBUG
// Out-of-class definitions for SumFoo's static debug bookkeeping.
unsigned int SumFoo::noo = 0u;          // live-object counter
unsigned int SumFoo::nob = 0u;          // total-objects-built counter
clock_t SumFoo::max_obtime = 0;         // zero-initialized either way (static storage)
clock_t SumFoo::main_obtime = 0;
#endif
//#undef DEBUG
// Runs the parallel reduction over a[0..n) and prints the resulting sum,
// choosing the partitioning strategy at compile time via AUTO_PART /
// SIMPLE_PART / explicit grainsize gs.
// NOTE(review): every blocked_range / partitioner argument below lost its
// angle-bracket content to the forum's HTML rendering; reconstructed per
// the standard TBB idiom (half-open range [0,n)) — TODO confirm against
// the author's original if available.
void ParallelSumFoo(REAL a[], size_t n, size_t gs) {
/*#ifdef COMPARE_ALL
#ifdef AUTO_PART
#undef AUTO_PART
#endif
#ifdef SIMPLE_PART
#undef SIMPLE_PART
#endif
SumFoo sf1(a,n), sf2(a,n);
parallel_reduce(blocked_range
cout<<"using SIMPLE_PART sum : "; sf1.displaysum();
parallel_reduce(blocked_range
cout<<"using AUTO_PART sum : "; sf2.displaysum();
#endif
*/
    SumFoo sf(a,n);
#ifdef AUTO_PART
    parallel_reduce(blocked_range<size_t>(0,n), sf, auto_partitioner());
    cout<<"using AUTO_PART ";
#else
#ifdef SIMPLE_PART
    // simple_partitioner with the default grainsize of 1 — the thread notes
    // this is far too fine-grained for this workload.
    parallel_reduce(blocked_range<size_t>(0,n), sf, simple_partitioner());
    cout<<"using SIMPLE_PART ";
#else
    parallel_reduce(blocked_range<size_t>(0,n,gs), sf);
    cout<<"using GRAINSIZE = "<<gs<<" "; // output tail reconstructed; original truncated
#endif
#endif
    cout<<"sum : ";
    sf.displaysum();
#ifdef DEBUG
    sf.display_time();
#endif
}
#define SERIAL
#ifdef SERIAL
class SerialApplyFoo {
REAL *a;
size_t len;
double sum;
clock_t tserial;
public :
SerialApplyFoo(REAL *array, size_t length) : a(array), len(length){
cout<<" ------------------------ Summing using SerialApplyFoo --------------------- ";
&nb sp; sum = 0.0;
tserial = -1;
while(tserial == -1) tserial = clock();
for(size_t i=0; i
}
tserial = clock() - tserial;
}
void displaysum() {
cout<<"SerialApplyFoo sum : "<
~SerialApplyFoo() {
displaysum();
cout<<"Approx time taken by SerialApplyFoo = "<
};
#endif
int main() {
task_scheduler_init init;
REAL *array = new REAL[MAX];
if(array == NULL) {
cout<<"array == NULL ";
exit(0);
}
ParallelApplyFoo(array, MAX, GRAINSIZE);
#ifdef SERIAL
SerialApplyFoo sf(array, MAX);
#endif
ParallelSumFoo(array, MAX, GRAINSIZE);
return 0;
}
Link Copied
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
AJ
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
may_max11: I developed a simple program to assess the performance of parallel_reduce and parallel_for against serial execution. The program simply sums up the contents of a large array (on the order of 10M elements) and displays the sequence in which objects are copied and joined. To my astonishment, the parallelized program is taking a lot more time than the serial program to sum up the contents. Please give me some explanation and a solution...
It's almost always the case with these simple and naive parallel benchmark programs that their authors are astonished by bad parallel performance :)
Just imagine you are the compiler, and compare the two summing loops from this point of view. First, the serial loop:
for(size_t i=0; i<len; i++) {
    sum += a[i];
}
It's very simple to optimize, having in mind that len is a constant to the compiler. For example, VC++ 2005 made the following code out of that:
fldz
lea eax, DWORD PTR [edi+010h]
mov ecx, 0x2625A0h
main+0x105:fadd QWORD PTR [eax-16]
add eax, 0x20h
sub ecx, 0x1h
fadd QWORD PTR [eax-40]
fadd QWORD PTR [eax-32]
fadd QWORD PTR [eax-24]
jnz main+0x105
This magic 0x2625A0 constant is 2500000, one fourth of the array length. There are four floating point additions, loop index (ecx) decrement, and bumping array pointer (eax) 4 elements forward. That's it, very small code which has exactly one memory load operation per iteration followed by one store at the end (not shown here). It's very efficient.
Now consider the TBB variant of it:
for(size_t i=r.begin(); i!=r.end(); i++) {
    sum += a[i];
}
The compiler is much more conservative in interpreting this loop because its boundaries are variable, and its length is unknown. Also, sum is an object data field, which possibly adds other restrictions. In the serial code above, the compiler put sum into a register — perhaps because the code is in the constructor and no data field can be visible until it finishes? Anyway, in the TBB code it did not do that:
execute+0x52:mov edi, DWORD PTR [edx]
lea edi, DWORD PTR [edi+ecx*8]
execute+0x57:fld QWORD PTR [edi]
add ecx, 0x1h
fadd QWORD PTR [edx+08h]
add edi, 0x8h
fstp QWORD PTR [edx+08h]
cmp ecx, DWORD PTR [esi+08h]
jnz execute+0x57
The array pointer is in edi, and you see in the first two lines it is loaded in two steps: first the base, then incremented by the offset. On each iteration, it is incremented by 0x8h, i.e. just one element. The loop variable (ecx) is also incremented by one, and compared to a memory location (range.end()). Finally, the result is saved back to sum each time, and consequently sum is loaded each time. So we have 3 memory loads per loop iteration, compared to 1 in the serial case.
Do you now understand better why your benchmark "favors" serial case?
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
code :
--------------------------------------code--------------------------------------------------
// NOTE(review): header names were stripped by the forum's HTML rendering;
// reconstructed from the identifiers used below (cout/cin, task_scheduler_init,
// blocked_range, parallel_reduce, tick_count).
#include <iostream>
#include "tbb/task_scheduler_init.h"
#include "tbb/blocked_range.h"
#include "tbb/parallel_reduce.h"
#include "tbb/tick_count.h"
#define REAL double
#define VALUE 1.125
using namespace std;
using namespace tbb;
class SumFoo{
public :
REAL *a;
REAL sum;
SumFoo() {
a = NULL;
sum = 0.0;
}
SumFoo(SumFoo &f, split): a(f.a), sum(0.0) {
}
void operator()(blocked_range
REAL *al = a;
for(int i = r.begin(); i<=r.end(); i++) {
sum += al;
}
}
void join(SumFoo f) {
sum+=f.sum;
}
};
// NOTE(review): file-scope body object, reused (and reset) on every call;
// this makes ParallelSum non-reentrant — confirm that is acceptable.
SumFoo sf;

// Sums a_[0..n) with tbb::parallel_reduce.
// NOTE(review): the paste lost the blocked_range template/constructor
// arguments; reconstructed with grainsize 10000, the value the thread's
// replies identify as working well for this workload.
REAL ParallelSum(REAL *a_, unsigned long int n) {
    sf.a = a_;
    sf.sum = 0.0;
    parallel_reduce(blocked_range<size_t>(0, n, 10000), sf);
    return sf.sum;
}
// Sums a_[start..end) sequentially (half-open range). REAL is #defined as
// double in this file; spelled out here so the signature is self-contained.
// NOTE(review): the paste truncated the loop after "i"; reconstructed as
// the obvious half-open accumulation loop.
double SerialSum(double *a_, unsigned long int start, unsigned long int end) {
    double sum = 0.0;
    for(unsigned long int i = start; i < end; i++)
        sum += a_[i];
    return sum;
}
int main() {
task_scheduler_init init;
REAL *array = NULL;
unsigned long int n = 0;
cout<<"n = "; cin>>n;
array = new REAL
for(unsigned long int i=0; i
tick_count t0 = tick_count::now();
cout<
cout<<"ParallelSum took : "<<(t1 - t0).seconds()<<"seconds ";
t0 = tick_count::now();
cout<
cout<<"SerialSum took : "<<(t1 - t0).seconds()<<"seconds ";
delete array;
return 0;
}
------------------------output-------------------------------------------
n = 10000000
2.25e+07
ParallelSum took : 2.70939seconds
1.125e+07
SerialSum took : 0.036173seconds
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
n = 100000000
1.12518e+08
ParallelSum took : 0.54274seconds
1.125e+08
SerialSum took : 0.523009seconds
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
n = 100000000
1.125e+08
ParallelSum took : 0.547122seconds
1.125e+08
SerialSum took : 0.558095seconds
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Wrong results are due to i<=r.end() in your operator(). The blocked_range r is half-open [r.begin,r.end); in other words, the end point shouldn't count and the comparison should be < or !=.
If no grainsize specified explicitly, blocked_range assumes the grainsize is 1. For your example, this value may work well with auto_partitioner but not with simple_partitioner, because one iteration contains too little work to justify the overhead. Thus you see much better time with grain size of 10000 than with that of 1. If you use simple_partitioner, do not use the default grainsize (of 1) unless you know each iteration has a plenty of work.
I will comment more on performance later. In fact, I am going to write a blog entry about this case; it should not be much work considering how much I can copy from here :)
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
may_max11: I tried out the code after incorporating your suggestions. I also used tick_count as suggested. But the results are still the same... the serial way of adding the array is still the fastest of the two.
As promised, I wrote a blog post about this case: http://softwareblogs.intel.com/2008/03/04/why-a-simple-test-can-get-parallel-slowdown/
I hope it helps.
- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page