[cpp]void PtIterationDestinationRunner::operator()( const blocked_range<int>& range ) const
{
    // tbb::spin_mutex::scoped_lock lock(m_lock); // 41.5 seconds
    m_timestamps.push_back(make_tuple(tick_count::now(), m_id, "operator()-entered"));
    PTPathBuilder ptPathBuilder(*m_ptCommand);
    PtOutputRecorderController ptRecorderController(m_ptCommand, m_output);
    shared_ptr<PtPath> path = shared_ptr<PtPath>(new PtPath(*m_ptCommand, m_options));
    m_timestamps.push_back(make_tuple(tick_count::now(), m_id, "operator()-initialised"));
    // tbb::spin_mutex::scoped_lock lock(m_lock); // 43.4 seconds
    for (int dest = range.begin(); dest != range.end(); ++dest)
    {
        runDestination(dest, ptPathBuilder, path, ptRecorderController);
    }
    m_timestamps.push_back(make_tuple(tick_count::now(), m_id, "operator()-finished-" +
        ToString(range.begin()) + "-" + ToString(range.end())));
}

void PtIterationDestinationRunner::runDestination(int dest, PTPathBuilder& ptPathBuilder,
                                                  shared_ptr<PtPath> path,
                                                  PtOutputRecorderController& ptRecorderController) const
{
    int destOtNr = m_model->get_centroid(dest)->ot_pointnr();
    int numOrigins = m_props->GetOriginsToAssignForDestination(destOtNr).size();
    if (numOrigins == 0 && !m_options.doSkim()) return; // nothing to do here
    // tbb::spin_mutex::scoped_lock lock(m_lock); // 45.2 seconds
    ptPathBuilder.build(path, dest, m_options);
    // tbb::spin_mutex::scoped_lock lock(m_lock); // 71.2 seconds
    set<int>::iterator classIterator = m_clazzes.begin();
    for (; classIterator != m_clazzes.end(); ++classIterator)
    {
        int clazz = *classIterator;
        int access_mode = m_props->GetAccessMode(clazz);
        path->add_access_mode(access_mode);
        ptPathBuilder.calc_access(path, access_mode);
        ptRecorderController.Record(path, clazz);
    }
    // with no mutex at all, run time is 112 seconds
}
[/cpp]
which is called like this:
[cpp]PtIterationDestinationRunner destinationRunner(ptCommand, ptPathBuildOptions, output, timestamps);
tbb::task_scheduler_init init(4);
parallel_for(blocked_range<int>(0, num_centroids, 100), destinationRunner);
[/cpp]
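If the slowdown comes from each chunk constructing heavyweight scratch objects (PTPathBuilder, PtPath) and contending on a shared heap or other shared state, one common TBB pattern to try is keeping one scratch object per worker thread with tbb::enumerable_thread_specific. The sketch below is only illustrative: it substitutes a hypothetical Scratch struct for the real path-building classes and assumes the scratch state can safely be reset and reused between destinations.
[cpp]#include <vector>
#include "tbb/parallel_for.h"
#include "tbb/blocked_range.h"
#include "tbb/enumerable_thread_specific.h"

// Hypothetical stand-in for the heavyweight per-chunk state
// (PTPathBuilder / PtPath in the code above).
struct Scratch
{
    std::vector<double> labels;          // e.g. a path-building workspace
    Scratch() : labels(100000, 0.0) {}   // pay the allocation once per thread
};

typedef tbb::enumerable_thread_specific<Scratch> ScratchPool;

struct DestinationBody
{
    ScratchPool* pool;
    explicit DestinationBody(ScratchPool* p) : pool(p) {}

    void operator()( const tbb::blocked_range<int>& range ) const
    {
        Scratch& scratch = pool->local();   // constructed once per worker thread
        for (int dest = range.begin(); dest != range.end(); ++dest)
        {
            // run one destination against the thread-private scratch state,
            // e.g. runDestination(dest, scratch);
            (void)scratch;
        }
    }
};

// usage:
//   ScratchPool pool;
//   tbb::parallel_for(tbb::blocked_range<int>(0, num_centroids, 100),
//                     DestinationBody(&pool));
[/cpp]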
[cpp]// ---- run the work ---- 1:36, 1:22
for (size_t i = 0; i < m_work_vec.size(); ++i)
    run_trend( i );
[/cpp]
---
[cpp]// ---- run the work tbb ---- 7:21, 9:41
parallel_for( blocked_range<std::size_t>( 0, m_work_vec.size() ),
              compute_trends_tbb(this), auto_partitioner() );
[/cpp]
[cpp]// ---- run the work boost ---- 1 thread: 1:40, 1:32   2 threads: 5:58, 5:43
size_t midpoint = m_work_vec.size() / 2, endpoint = m_work_vec.size();
boost::thread thrd1( compute_trends_boost(this, 0, midpoint /*endpoint*/) );
boost::thread thrd2( compute_trends_boost(this, midpoint, endpoint) );
thrd1.join();
thrd2.join();
[/cpp]
---
[cpp]class mod_trend_runner::compute_trends_tbb
{
    mod_trend_runner * m_module;
public:
    // constructor
    compute_trends_tbb(mod_trend_runner * module) : m_module(module) {}

    void operator()( const tbb::blocked_range<std::size_t>& r ) const
    {
        for (std::size_t i = r.begin(); i != r.end(); ++i)
            m_module->run_trend( i );
    }
};

class mod_trend_runner::compute_trends_boost
{
    size_t start, end;
    mod_trend_runner * m_module;
public:
    // constructor
    compute_trends_boost(mod_trend_runner * module, size_t startIndex, size_t endIndex)
        : m_module(module), start(startIndex), end(endIndex) {}

    void operator()()
    {
        for (size_t i = start; i < end; ++i)
            m_module->run_trend(i);
    }
};
[/cpp]
A typical range size is about 10,000 trends. A telling result comes from the boost version: one thread running all 10,000 trends takes nearly the same time as the purely serial version, but two threads, each running 5,000 trends, take more than double the serial time, instead of the half I was expecting. This seems to support data sharing and cache miss issues, as Alexey suggests.
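One classic mechanism that produces exactly this pattern (one thread roughly matches serial, two threads are much slower than one) is false sharing: two threads repeatedly writing to data that happens to sit in the same cache line, so the line bounces between cores. Whether that is the actual culprit here depends on what run_trend writes to, but the effect is easy to see in isolation. The toy program below is not the code from this thread, just an illustration: two boost threads each increment their own counter; with the counters in one cache line the run can be slower than a single thread, and uncommenting the padding puts them on separate lines.
[cpp]#include <iostream>
#include <boost/thread.hpp>
#include <boost/bind.hpp>

// 'a' and 'b' sit in the same 64-byte cache line, so every increment by one
// thread invalidates the other core's cached copy of that line.
struct Counters
{
    volatile long a;
    // char pad[64];   // uncomment to give 'b' its own cache line
    volatile long b;
};

void bump(volatile long* counter, long iterations)
{
    for (long i = 0; i < iterations; ++i)
        ++*counter;
}

int main()
{
    Counters c;
    c.a = 0;
    c.b = 0;
    const long n = 200000000;

    boost::thread t1(boost::bind(&bump, &c.a, n));
    boost::thread t2(boost::bind(&bump, &c.b, n));
    t1.join();
    t2.join();

    std::cout << c.a + c.b << std::endl;   // keep the work observable
    return 0;
}
[/cpp]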