- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hello,
I'm new to TBB and have just started experimenting with it using the tutorials. My first attempt is to test the performance of a simple loop over a big array of floats, once using TBB and once without. Comparing the time required by each technique, the result was surprising. Check for yourself and correct me if I'm doing something wrong:
#include "tbb/task_scheduler_init.h"
#include "tbb/parallel_for.h"
#include "tbb/blocked_range.h"
#include "tbb/tick_count.h"
#include <stdio.h>
using namespace tbb;
// Number of elements in the benchmark array.
#define BIGARRSIZE 100000
// Global float array the timed loops operate on. As a namespace-scope
// object it is zero-initialized before main() runs.
float big_arr[BIGARRSIZE];
// Adds one, in place, to the float that `a` points to.
// `a` must point to a valid float; it is dereferenced unconditionally.
void Foo(float *a)
{
    *a += 1.0f;
}
class ApplyFoo {
public:
void operator()(const blocked_range& r) const {
for(size_t i = r.begin(); i != r.end(); i++) {
Foo(&big_arr);
}
}
};
void main()
{
tick_count t0, t1;
int nthreads = 2;
task_scheduler_init init(task_scheduler_init::deferred);
if (nthreads >= 1)
init.initialize(nthreads);
t0 = tick_count::now();
parallel_for(blocked_range(0,BIGARRSIZE), ApplyFoo(), auto_partitioner());
t1 = tick_count::now();
printf("\n*** work took %g seconds ***", (t1 - t0).seconds());
if (nthreads >= 1)
init.terminate();
t0 = tick_count::now();
for (int i = 0; i < BIGARRSIZE; i++)
Foo(&big_arr);
t1 = tick_count::now();
printf("\n*** work took %f seconds ***", (t1 - t0).seconds());
printf("\n");
}
Thanks.
I'm new to TBB and have just started experimenting with it using the tutorials. My first attempt is to test the performance of a simple loop over a big array of floats, once using TBB and once without. Comparing the time required by each technique, the result was surprising. Check for yourself and correct me if I'm doing something wrong:
#include "tbb/task_scheduler_init.h"
#include "tbb/parallel_for.h"
#include "tbb/blocked_range.h"
#include "tbb/tick_count.h"
#include <stdio.h>
using namespace tbb;
// Number of elements in the benchmark array.
#define BIGARRSIZE 100000
// Global float array the timed loops operate on. As a namespace-scope
// object it is zero-initialized before main() runs.
float big_arr[BIGARRSIZE];
// Adds one, in place, to the float that `a` points to.
// `a` must point to a valid float; it is dereferenced unconditionally.
void Foo(float *a)
{
    *a += 1.0f;
}
class ApplyFoo {
public:
void operator()(const blocked_range
for(size_t i = r.begin(); i != r.end(); i++) {
Foo(&big_arr);
}
}
};
void main()
{
tick_count t0, t1;
int nthreads = 2;
task_scheduler_init init(task_scheduler_init::deferred);
if (nthreads >= 1)
init.initialize(nthreads);
t0 = tick_count::now();
parallel_for(blocked_range
t1 = tick_count::now();
printf("\n*** work took %g seconds ***", (t1 - t0).seconds());
if (nthreads >= 1)
init.terminate();
t0 = tick_count::now();
for (int i = 0; i < BIGARRSIZE; i++)
Foo(&big_arr);
t1 = tick_count::now();
printf("\n*** work took %f seconds ***", (t1 - t0).seconds());
printf("\n");
}
Thanks.
Link Copied
2 Replies
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
An update to this piece of code:
#include <stdio.h>
#include <windows.h>
#include <omp.h>
#include "tbb/task_scheduler_init.h"
#include "tbb/parallel_for.h"
#include "tbb/blocked_range.h"
#include "tbb/tick_count.h"
using namespace tbb;
// Number of elements in the benchmark array.
#define BIGARRSIZE 10000000
// Global float array the timed loops operate on. As a namespace-scope
// object it is zero-initialized before main() runs.
float big_arr[BIGARRSIZE];
// Adds one, in place, to the float that `a` points to.
// `a` must point to a valid float; it is dereferenced unconditionally.
void Foo(float *a)
{
    *a += 1.0f;
}
class ApplyFoo {
public:
void operator()(const blocked_range& r) const {
for(size_t i = r.begin(); i != r.end(); i++) {
Foo(&big_arr);
}
}
};
// Calls Foo on every element of big_arr with an index in the closed range
// [start, end]; both bounds are inclusive. Does nothing when start > end.
void ApplyFooRange(int start, int end)
{
    for (int i = start; i <= end; i++) {
        // Address of element i. &big_arr alone is a pointer to the whole
        // array and does not convert to float*.
        Foo(&big_arr[i]);
    }
}
// Thread routine for CreateThread: processes the lower half of big_arr,
// indices 0 through BIGARRSIZE / 2 - 1 inclusive. `param` is not read.
// Always returns 0.
DWORD WINAPI FooPart1(LPVOID param)
{
    const int firstIndex = 0;
    const int lastIndex = BIGARRSIZE / 2 - 1;
    ApplyFooRange(firstIndex, lastIndex);
    return 0;
}
// Thread routine for CreateThread: processes the upper half of big_arr,
// indices BIGARRSIZE / 2 through BIGARRSIZE - 1 inclusive. `param` is not
// read. Always returns 0.
DWORD WINAPI FooPart2(LPVOID param)
{
    const int firstIndex = BIGARRSIZE / 2;
    const int lastIndex = BIGARRSIZE - 1;
    ApplyFooRange(firstIndex, lastIndex);
    return 0;
}
// Thread IDs and handles for the manually created Win32 threads. Four slots
// are reserved, but main() only fills and uses indices 0 and 1.
DWORD threadIDs[4];
HANDLE hThreads[4];
void main()
{
tick_count t0, t1;
int nthreads = 2;
task_scheduler_init init(task_scheduler_init::deferred);
if (nthreads >= 1)
init.initialize(nthreads);
t0 = tick_count::now();
parallel_for(blocked_range(0,BIGARRSIZE), ApplyFoo(), auto_partitioner());
t1 = tick_count::now();
printf("n*** work took %g seconds ***", (t1 - t0).seconds());
if (nthreads >= 1)
init.terminate();
t0 = tick_count::now();
for (int i = 0; i < BIGARRSIZE; i++)
Foo(&big_arr);
t1 = tick_count::now();
printf("n*** work took %f seconds ***", (t1 - t0).seconds());
omp_set_num_threads(2);
t0 = tick_count::now();
#pragma omp default(none) private(i) shared(big_arr)
{
#pragma omp for
for (int i = 0; i < BIGARRSIZE; i++)
#pragma omp atomic
Foo(&big_arr);
}
t1 = tick_count::now();
printf("n*** work took %f seconds ***", (t1 - t0).seconds());
t0 = tick_count::now();
hThreads[0] = CreateThread(NULL, 0, FooPart1, NULL, 0, &threadIDs[0]);
hThreads[1] = CreateThread(NULL, 0, FooPart2, NULL, 0, &threadIDs[1]);
WaitForMultipleObjects(2, hThreads, TRUE, INFINITE);
t1 = tick_count::now();
printf("n*** work took %f seconds ***", (t1 - t0).seconds());
CloseHandle(hThreads[0]);
CloseHandle(hThreads[1]);
printf("n");
}
Try it yourself. Really impressive! :)
#include <stdio.h>
#include <windows.h>
#include <omp.h>
#include "tbb/task_scheduler_init.h"
#include "tbb/parallel_for.h"
#include "tbb/blocked_range.h"
#include "tbb/tick_count.h"
using namespace tbb;
// Number of elements in the benchmark array.
#define BIGARRSIZE 10000000
// Global float array the timed loops operate on. As a namespace-scope
// object it is zero-initialized before main() runs.
float big_arr[BIGARRSIZE];
// Adds one, in place, to the float that `a` points to.
// `a` must point to a valid float; it is dereferenced unconditionally.
void Foo(float *a)
{
    *a += 1.0f;
}
class ApplyFoo {
public:
void operator()(const blocked_range
for(size_t i = r.begin(); i != r.end(); i++) {
Foo(&big_arr);
}
}
};
// Calls Foo on every element of big_arr with an index in the closed range
// [start, end]; both bounds are inclusive. Does nothing when start > end.
void ApplyFooRange(int start, int end)
{
    for (int i = start; i <= end; i++) {
        // Address of element i. &big_arr alone is a pointer to the whole
        // array and does not convert to float*.
        Foo(&big_arr[i]);
    }
}
// Thread routine for CreateThread: processes the lower half of big_arr,
// indices 0 through BIGARRSIZE / 2 - 1 inclusive. `param` is not read.
// Always returns 0.
DWORD WINAPI FooPart1(LPVOID param)
{
    const int firstIndex = 0;
    const int lastIndex = BIGARRSIZE / 2 - 1;
    ApplyFooRange(firstIndex, lastIndex);
    return 0;
}
// Thread routine for CreateThread: processes the upper half of big_arr,
// indices BIGARRSIZE / 2 through BIGARRSIZE - 1 inclusive. `param` is not
// read. Always returns 0.
DWORD WINAPI FooPart2(LPVOID param)
{
    const int firstIndex = BIGARRSIZE / 2;
    const int lastIndex = BIGARRSIZE - 1;
    ApplyFooRange(firstIndex, lastIndex);
    return 0;
}
// Thread IDs and handles for the manually created Win32 threads. Four slots
// are reserved, but main() only fills and uses indices 0 and 1.
DWORD threadIDs[4];
HANDLE hThreads[4];
void main()
{
tick_count t0, t1;
int nthreads = 2;
task_scheduler_init init(task_scheduler_init::deferred);
if (nthreads >= 1)
init.initialize(nthreads);
t0 = tick_count::now();
parallel_for(blocked_range
t1 = tick_count::now();
printf("n*** work took %g seconds ***", (t1 - t0).seconds());
if (nthreads >= 1)
init.terminate();
t0 = tick_count::now();
for (int i = 0; i < BIGARRSIZE; i++)
Foo(&big_arr);
t1 = tick_count::now();
printf("n*** work took %f seconds ***", (t1 - t0).seconds());
omp_set_num_threads(2);
t0 = tick_count::now();
#pragma omp default(none) private(i) shared(big_arr)
{
#pragma omp for
for (int i = 0; i < BIGARRSIZE; i++)
#pragma omp atomic
Foo(&big_arr);
}
t1 = tick_count::now();
printf("n*** work took %f seconds ***", (t1 - t0).seconds());
t0 = tick_count::now();
hThreads[0] = CreateThread(NULL, 0, FooPart1, NULL, 0, &threadIDs[0]);
hThreads[1] = CreateThread(NULL, 0, FooPart2, NULL, 0, &threadIDs[1]);
WaitForMultipleObjects(2, hThreads, TRUE, INFINITE);
t1 = tick_count::now();
printf("n*** work took %f seconds ***", (t1 - t0).seconds());
CloseHandle(hThreads[0]);
CloseHandle(hThreads[1]);
printf("n");
}
Try it yourself. Really impressive! :)
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hope my conclusion will help. As the complexity of the inner threaded loop increases and the size of the array goes up by orders of magnitude, the gain in performance becomes very apparent, and TBB makes a real difference. Creating threads manually can be slightly faster, but as the code becomes more complicated, TBB rules and rocks. My vote: TBB is awesome!
Reply
Topic Options
- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page