- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi,
I am considering writing a benchmark/unit test for parallel_for that would help in determining whether the parallelized loop outperforms the raw loop. Specifically I was imagining parameterizing: loop size, time/body execution, grain_size and wrapping both the parallel_for and the raw loop in high precision timers (e.g. QuereyPerformanceCounter API on windows) - Has anyone already done this as would be willing to share the code ?
thanks
Sean
Link Copied
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
I'm working on a micro-benchmark, and am including parallel_for as part of it. Following Gustafsons approach, it measures how much work can be done in a fixed amount of time. E.g., for how large a value of N can the machine run parallel_for(blocked_range(0,N,1),b,simple_partitioner()) ?
I'm using 1 msec as my fixed amount of time, which seems the right order of magnitude for desktop parallelism (e.g. think video frames).
parallel_for(blocked_range(0,N,1),b,simple_partitioner()) is a good thing to time because it tells you the overhead of a loop slice.
Replacing simple_partitioner() with auto_partitioner results in a test that is difficult to interpret, because the auto_partitioner games the test. It does tell you how efficient auto_partitioner is at eliminating unnecessary subdivision, so maybe that is the point.
I've appended the current draft of the test after my signature. [Weird feature of blogging software: cutting-and-pasting from Visual Studio results in loss of leading whitespace. But cutting-and-pasting from Visual Studio into Outlook message, and then cutting-and-pasting from there to here retains the correct whitespace.]
I've only run it on 64-bit Linux so far. There's may be Linuxisms that have to be corrected.
- Arch Robison (Intel TBB Developer)
#include "tbb/parallel_for.h"
#include "tbb/parallel_reduce.h"
#include "tbb/blocked_range.h"
#include "tbb/task_scheduler_init.h"
#include "tbb/tick_count.h"
#include
#include
#include
#include
#include
// This file is a microbenchmark for TBB.& nbsp;
//
// Command line:
// test_unit.exe
filter1 filter2 ...
// where
// p is of the form m or m:n, which indicates the number of threads to use. m:n indicates a closed range
// filtern is a string indicating which tests to select. If the filter is xyz, only tests with xyz in their name will be run.
//
// Results are printed in tab-separate form suitable for importing into a spreadsheet.
// Each line has the form
// name P N rate0...rate4
// where
// name describes the test
// P is the number of threads
// N is the value of N used. It is set so that the test runs in 1 millisecond.
// rate0...rate4 are the rates that were measured. Rates are in printed in per-millisecond.
int filterc;
//! Pointer into argv where filter arguments begin.
char** filterv;
//! Miniumum number of threads.
int min_thread=1;
//! Maximum number of thread
int max_thread=1;
//! Type of a count.
typedef size_t count_type;
typedef void (*benchmark_type)(count_type);
//! Desired time in which each test must run.
/** It is a millisecond, because TBB targets desktop parallelism, and
interactive programs have to do calculations in fractions of a video frame. */
const double DESIRED_TEST_TIME = 0.001;
//! Number of times to repeat a test.
const int REPETITIONS = 10;
//! Each run must not vary more than a relative tolerand of (+/-)TOLERANCE
const double TOLERANCE = 0.1;
//! Number of times to run each test.
const int N_TRIAL = 5;
//! Measure time it takes to run test with n iterations.
double time_one( benchmark_type test, count_type n ) {
tbb::tick_count t0 = tbb::tick_count::now();
for( int i=0; i
test(n);
tbb::tick_count t1 = tbb::tick_count::now();
return (t1-t0).seconds()/REPETITIONS;
}
//! Print a rate.
/** This routine prints a typical rate in only 7 characters, instead of the 9
that would be print if a %g format were used. The savings are because
print_one_rate prints the exponent without a "+", and without a "0"
if the exponent is a single digit. */
void print_one_rate( double rate ) {
int exponent = floor(log10(rate));
printf("%.3fe%d", rate/pow(10.0,exponent), exponent );
}
//! Time a benchmark
/** Fills in elements of rate[] with per-second rates.
Returns nominal number of times the benchmark was run in DESIRED_TIME */
count_type run_one( const char* name, benchmark_type benchmark, double rate[N_TRIAL] ) {
double time;
count_type n;
for(n=1;;n*=2) {
time = time_one(benchmark,n);
// Quit early if overflow might happen
if( time>=DESIRED_TEST_TIME ) break;
if( n*2
printf("error: overflow for %s ",name);
exit(1);
}
}
n = (n*DESIRED_TEST_TIME/time+0.5);
int n_retry = 0;
retry:
time = time_one(benchmark,n);
n = (n*DESIRED_TEST_TIME/time+0.5);
double expected_rate = n/time;
for( int i=0; i
// Test is run n+wobble times, to reveal any sensitivity to small variations in n.
int wobble = (i-N_TRIAL)/2;
time = time_one(benchmark,n+wobble);
rate = n/time;
if( rate
if( ++n_retry==3 ) {
printf("error: cannot get repeatable time for %s, n=%ld ",name,long(n));
&nb sp; exit(1);
}
goto retry;
}
}
return n;
}
void run( const char* name, benchmark_type benchmark ) {
for( int k=0; k
if( std::strstr( name, filterv
return;
double rate[N_TRIAL];
for( int nthread=min_thread; nthread<=max_thread; ++nthread ) {
tbb::task_scheduler_init init(nthread);
count_type n = run_one( name, benchmark, rate );
printf("%-64s %4u %10lu",name,nthread,(unsigned long)n);
for( count_type i=0; i
printf(" ");
// Rates in rate[] are per-second. We print them in per-milliscond form
print_one_rate(rate*DESIRED_TEST_TIME);
}
printf(" ");
}
}
//! Print table header
void print_header() {
p rintf("%-64s %4s %10s","TEST","P","N");
for( int i=0; i
printf(" rate%d",i);
printf(" ");
}
void time_tick_count( count_type n ) {
for( count_type i=0; i
volatile tbb::tick_count t = tbb::tick_count::now();
}
}
struct trivial_body {
void operator()( const tbb::blocked_range
volatile long x;
int end = r.end();
for( int i=r.begin(); i
x = 0;
}
};
template<typename Partitioner>
void time_empty_parallel_for( count_type n ) {
Partitioner partitioner;
volatile count_type zero = 0;
for( count_type i=0; i
tbb::parallel_for( tbb::blocked_range
}
}
template<typename Partitioner, const size_t chunk_size>
void time_n_parallel_for( count_type n ) {
Par titioner partitioner;
tbb::parallel_for( tbb::blocked_range
}
struct trivial_reduction_body {
int sum;
trivial_reduction_body() {sum=0;}
trivial_reduction_body( trivial_reduction_body& other, tbb::split ) {sum=0;}
void join( trivial_reduction_body& other ) {sum+=other.sum;}
void operator()( const tbb::blocked_range
volatile long x;
int end = r.end();
for( int i=r.begin(); i
x = 0;
sum = 0;
}
};
template<typename Partitioner>
void time_empty_parallel_reduce( count_type n ) {
Partitioner partitioner;
trivial_reduction_body body;
volatile count_type zero = 0;
for( count_type i=0; i
tbb::parallel_reduce( tbb::blocked_range
}
}
template<typename Partitioner, const size_t chunk_size>
void time_n_parallel_reduce( count_type n ) {
Partitioner partitioner;
trivial_reduction_body bod y;
tbb::parallel_reduce( tbb::blocked_range
}
//! Parse the command line
void parse_command_line( int argc, char* argv[] ) {
int i = 1;
if( i
char* endptr;
min_thread = strtol( argv, &endptr, 0 );
if( *endptr==':' )
max_thread = strtol( endptr+1, &endptr, 0 );
else if( *endptr=='' )
max_thread = min_thread;
if( *endptr!='' ) {
fprintf(stderr,"garbled nthread range ");
exit(1);
}
if( min_thread<0 ) {
fprintf(stderr,"nthread must be nonnegative ");
exit(1);
}
if( max_thread
fprintf(stderr,"nthread range is backwards ");
exit(1);
}
++i;
}
filterv = argv+i;
filterc = argc-i;
}
int main( int argc, char* argv[] ) {
parse_command_line( argc, argv );
print_header();
run( "tick_count::now()", time_tick_count );
// parallel_for
run( "parallel_for(blocked_range(0,zero,1),b,simple_partitioner())", time_empty_parallel_for<:SIMPLE_PARTITIONER> );
run( "parallel_for(blocked_range(0,N,1),b,simple_partitioner())", time_n_parallel_for<:SIMPLE_PARTITIONER> );
run( "parallel_for(blocked_range(0,N,1),b,auto_partitioner())", time_n_parallel_for<:AUTO_PARTITIONER> );
#if TBB_TASK_AFFINITY
run( "parallel_for(blocked_range(0,N,1),b,affinity_partitioner())", time_n_parallel_for<:AFFINITY_PARTITIONER> );
#endif /* TBB_TASK_AFFINITY */
run( "parallel_for(blocked_range(0,N,10000),b,simple_partitioner())", time_n_parallel_for<:SIMPLE_PARTITIONER> );
run( "parallel_for(blocked_range(0,N,10000),b,auto_partitioner())", time_n_parallel_for<:AUTO_PARTITIONER> );
#if TBB_TASK_AFFINITY
run( "parallel_for(blocked_range(0,N,10000),b,affinity_partitioner())", time_n_parallel_for<:AFFINITY_PARTITIONER> );
#endif /* TBB_TASK_AFFINITY */
// parallel_reduce
run( "parallel_reduce(blocked_range(0,zero,1),b,simple_partitioner())", time_empty_parallel_reduce<:SIMPLE_PARTITIONER> );
run( "parallel_reduce(blocked_range(0,N,1),b,simple_partitioner())", time_n_parallel_reduce<:SIMPLE_PARTITIONER>
run( "parallel_reduce(blocked_range(0,N,1),b,auto_partitioner())", time_n_parallel_reduce<:AUTO_PARTITIONER> );
#if TBB_TASK_AFFINITY
run( "parallel_reduce(blocked_range(0,N,1),b,affinity_partitioner())", time_n_parallel_reduce<:AFFINITY_PARTITIONER> );
#endif /* TBB_TASK_AFFINITY */
run( "parallel_reduce(blocked_range(0,N,10000),b,simple_partitioner())", time_n_parallel_reduce<:SIMPLE_PARTITIONER> );
run( "parallel_reduce(blocked_range(0,N,10000),b,auto_partitioner())", time_n_parallel_reduce<:AUTO_PARTITIONER> );
#if TBB_TASK_AFFINITY
run( "parallel_reduce(blocked_range(0,N,10000),b,affinity_partitioner())", time_n_parallel_reduce<:AFFINITY_PARTITIONER> );
#endif /* TBB_TASK_AFFINITY */
return 0;
}
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Sean
- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page