Community
cancel
Showing results for 
Search instead for 
Did you mean: 
Seunghwa_Kang
Beginner
96 Views

TBB's malloc segfaulting if used with MPI and TBB

Jump to solution
Hello,
I am developing a program using TBB (v4.0 update 1), MPI (mpich2-1.4.1p1), and TBB's scalable memory allocation library.
The program is segfaulting inside TBB's malloc library, but the program works OK if I do not use TBB's scalable memory allocation library or if I do not use the MPI library (the bug occurs even when I do not use any MPI routine except for MPI_Init_thread). The bug also does not happen if I use OpenMP instead of TBB for parallelization (using MPI, OpenMP, and TBB's malloc library is OK).
The bug is a heisenbug and does not happen consistently, but it appears more frequently with a larger number of MPI nodes and appears almost every time with more than 40 MPI nodes in my test environment.
Is there any known inter-operability issue among MPI, TBB, and TBB's memory allocator?
I attached a simplified version of the cpp source code to reproduce the bug (though this bug is a heisenbug and I am not sure the bug will appear on different systems) and the gdb call stack print from the original version of the program. The simple version also segfaults at the same point (../../src/tbbmalloc/backref.cpp:158) but I cannot print the call stack as the process becomes a zombie process after the segmentation fault.
* the simple version of cpp source code (compiled with -std=c++0x -g -O1 -openmp and used link options -openmp-link static -openmp -ltbb_debug -ltbbmalloc_proxy_debug -ltbbmalloc_debug -lgfortran -lmpich -lmpl -lpthread)
#include
#include
#include
#include
#include "mpi.h"
#include
#include
#define DIMENSION 3
#define NUM_PARTITIONS 60
#define NUM_SUBPARTITIONS 8000
using namespace std;
using namespace tbb;
/* Plain aggregate holding DIMENSION float components. */
struct vector_t {
    float a_val[DIMENSION];    /* one float per dimension */
};
/* Plain aggregate holding 18 ints; the element type stored per subpartition. */
struct dummy_t {
    int a_dummy[18];    /* payload; never read by the code in this listing */
};
/*
 * Data of one partition: one vector of dummy_t elements per subpartition.
 * The template argument was missing in the posted listing; dummy_t is restored
 * from Sim::init, which inserts ranges of dummy_t into these vectors.
 */
struct partitionData_t {
    vector<dummy_t> av_subpartData[NUM_SUBPARTITIONS];
};
/*
 * Temporary force storage: one vector of vector_t per subpartition.
 * The template argument was missing in the posted listing; vector_t is restored
 * from updatePartitionData, which resizes these vectors with a vector_t fill value.
 */
struct tmpForceData_t {
    vector<vector_t> av_subpartForceData[NUM_SUBPARTITIONS];
};
/*
 * Simulation driver. Callers are expected to invoke init(), run() and term()
 * in that order (see main).
 */
class Sim {
public:
    /*
     * Heap-allocated partitions: filled by init(), deleted by term().
     * The element type was missing in the posted listing; partitionData_t* is
     * restored from init(), which pushes back partitionData_t pointers.
     */
    vector<partitionData_t*> vp_partitionData;

    Sim( void );
    ~Sim( void );
    void init( void );    /* initialize MPI and build the partitions */
    void run( void );     /* update every partition once */
    void term( void );    /* free the partitions and finalize MPI */
};
static void updatePartitionData( partitionData_t& partitionData );
static void updateForce( const vector& v_data0, const vector& v_data1, vector& v_force0, vector& v_force1 );
int main( void ) {
Sim mySim;
mySim.init();
mySim.run();
mySim.term();
return 0;
}
/* Constructor: performs no work. */
Sim::Sim( void ) {
}
/* Destructor: performs no work. */
Sim::~Sim( void ) {
}
/*
 * Initializes MPI and builds the partition data.
 *
 * Requests MPI_THREAD_MULTIPLE and terminates the process with exit( -1 ) if
 * MPI_Init_thread fails or provides a different thread level. Then allocates
 * NUM_PARTITIONS partitions and, for every subpartition, appends between 0 and
 * 19 dummy_t elements (count drawn from rand_r with the fixed seed 100).
 */
void Sim::init( void ) {
    unsigned int seed = 100;
    int level;
    int ret;

    /* initialize MPI */
    ret = MPI_Init_thread( NULL, NULL, MPI_THREAD_MULTIPLE, &level );/* program does not segfault if I do not call MPI_Init_thread */
    if( ret != MPI_SUCCESS ) {
        exit( -1 );
    }
    else if( level != MPI_THREAD_MULTIPLE ) {
        exit( -1 );
    }

    /* initialize partitions */
    for( int i = 0 ; i < NUM_PARTITIONS ; i++ ) {
        partitionData_t* p_data = new partitionData_t;
        this->vp_partitionData.push_back( p_data );
        partitionData_t& partitionData = *p_data;
        for( int j = 0 ; j < NUM_SUBPARTITIONS ; j++ ) {
            int numDummies = rand_r( &seed ) % 20;
            if( numDummies != 0 ) {
                /* element type and [j] subscript were missing in the posted listing */
                vector<dummy_t>& v_data = partitionData.av_subpartData[j];
                /*
                 * The temporary array is kept on purpose: the allocate / copy /
                 * free pattern is part of the reproducer. It is value-initialized
                 * so that no indeterminate ints are copied into the vector.
                 */
                dummy_t* p_dummies = new dummy_t[numDummies]();
                v_data.insert( v_data.end(), p_dummies, p_dummies + numDummies );
                delete[] p_dummies;
            }
        }
    }

    cout << "initialization finished." << endl;
}
/* Runs updatePartitionData once on every partition, in index order. */
void Sim::run( void ) {
    for( int i = 0 ; i < ( int )this->vp_partitionData.size() ; i++ ) {
        /* the [i] subscript was missing in the posted listing; each element is a partitionData_t pointer */
        updatePartitionData( *( this->vp_partitionData[i] ) );
    }

    cout << "update finished." << endl;
}
void Sim::term( void ) {
int ret;
for( int i = 0 ; i < ( int )this->vp_partitionData.size() ; i++ ) {
delete this->vp_partitionData;
}
/* terminate MPI */
ret = MPI_Finalize();
if( ret != 0 ) {
exit( -1 );
}
cout << "termination finished." << endl;
return;
}
/*
 * Updates one partition.
 *
 * Allocates a temporary tmpForceData_t, sizes each per-subpartition force
 * vector to match the corresponding data vector (zero-filled), then calls
 * updateForce for every pair of neighbouring subpartitions (i, i + 1) inside a
 * TBB parallel_for. The temporary force data is freed before returning.
 *
 * Template arguments and [i] subscripts were missing in the posted listing and
 * are restored here; blocked_range<int> is inferred from "int i = r.begin()".
 */
static void updatePartitionData( partitionData_t& partitionData ) {
    tmpForceData_t* p_tmpForceData = new tmpForceData_t;
    vector_t nullVector = { 0.0, 0.0, 0.0 };

    for( int i = 0 ; i < NUM_SUBPARTITIONS ; i++ ) {
        const vector<dummy_t>& v_data = partitionData.av_subpartData[i];
        p_tmpForceData->av_subpartForceData[i].resize( v_data.size(), nullVector );
    }

    /* the range stops at NUM_SUBPARTITIONS - 1 because the body also touches index i + 1 */
    parallel_for( blocked_range<int>( 0, NUM_SUBPARTITIONS - 1 ), [&]( const blocked_range<int>& r ) {
        for( int i = r.begin() ; i < r.end() ; i++ ) {
            const vector<dummy_t>& v_data0 = partitionData.av_subpartData[i];
            vector<vector_t>& v_force0 = p_tmpForceData->av_subpartForceData[i];
            const vector<dummy_t>& v_data1 = partitionData.av_subpartData[i + 1];
            vector<vector_t>& v_force1 = p_tmpForceData->av_subpartForceData[i + 1];
            updateForce( v_data0, v_data1, v_force0, v_force1 );
        }
    } );

    delete p_tmpForceData;
}
static void updateForce( const vector& v_data0, const vector& v_data1, vector& v_force0, vector& v_force1 ) {
vector v_idx0;
vector v_idx1;
vector v_tmpForce;
vector_t nullVector = { 0.0, 0.0, 0.0 };
assert( v_data0.size() == v_force0.size() );
assert( v_data1.size() == v_force1.size() );
if( ( v_data0.size() == 0 ) || ( v_data1.size() == 0 ) ) {
return;
}
for( unsigned short i = 0 ; i < ( unsigned short )v_data0.size() ; i++ ) {
for( unsigned short j = 0 ; j < ( unsigned short )v_data1.size() ; j++ ) {
v_idx0.push_back( i );
v_idx1.push_back( j );
}
}
v_tmpForce.resize( v_idx0.size(), nullVector );
for( int i = 0 ; i < ( int )v_tmpForce.size() ; i++ ) {
assert( v_idx0 < v_force0.size() );
assert( v_idx1 < v_force1.size() );
vector_t& force0 = v_force0[v_idx0];
vector_t& force1 = v_force1[v_idx1];
for( int j = 0 ; j < DIMENSION ; j++ ) {
#pragma omp atomic
force0.a_val += v_tmpForce.a_val;
#pragma omp atomic
force1.a_val -= v_tmpForce.a_val;
}
}
return;
}
* segfault call stack
Program received signal SIGSEGV, Segmentation fault.
[Switching to Thread 0x41c55940 (LWP 31735)]
0x00002b2a88b88d09 in rml::internal::getBackRef
(backRefIdx=..., $56=)
at ../../src/tbbmalloc/backref.cpp:158
158 ../../src/tbbmalloc/backref.cpp: No such file or directory.
in ../../src/tbbmalloc/backref.cpp
(gdb) where
#0 0x00002b2a88b88d09 in rml::internal::getBackRef (backRefIdx=..., $56=)
at ../../src/tbbmalloc/backref.cpp:158
#1 0x00002b2a88b82389 in rml::internal::isLargeObject (object=0xd000c,
$@5=)
at ../../src/tbbmalloc/frontend.cpp:1818
#2 0x00002b2a88b8390f in scalable_free (
object=0xd000c)
at ../../src/tbbmalloc/frontend.cpp:2163
---Type to continue, or q to quit--- #3 0x00002b2a88a7a737 in operator delete (
ptr=0xd000c)
at ../../src/tbbmalloc/proxy.cpp:169
#4 0x000000000042fd53 in std::vector >::push_back
(this=0xd000c, __x=@0x2b2aa997c2a0,
$\\4=,
$\\5=)
at /usr/include/c++/4.1.2/bits/stl_vector.h:610
---Type to continue, or q to quit---
#5 0x000000000042a917 in enumerateNbrPairs (
dMaxSquare=1, nbrVIdxOffset=...,
lv3Subpart0=..., lv3Subpart1=...,
v_idx0=std::vector of length 17, capacity 32 = {...},
v_idx1=std::vector of length 16, capacity 16 = {...}) at position.cpp:1498
#6 0x0000000000428eff in updateForce (
sameLv3Partition=12, nbrVIdxOffset=...,
lv2PartitionData=...,
---Type to continue, or q to quit---
lv3PartitionData0=..., lv3SubpartData0=...,
lv3PartitionData1=..., lv3SubpartData1=...,
lv3ForceData0=..., lv3ForceData1=...)
at position.cpp:535
#7 0x000000000042ce69 in tbb::interface6::internal::partition_type_base<:INTERFACE6::INTERNAL::AUTO_PARTITION_TYPE>::execute (this=0xd000c,
start=..., range=...,
$alue optimized out>,
$
---Type to continue, or q to quit---
$
at /home/kang697/install/tbb/include/tbb/partitioner.h:265
#8 0x000000000042ad46 in tbb::interface6::internal::start_for<:BLOCKED_RANGE>, lambda [], tbb::auto_partitioner>::execute (
this=0xd000c, $alue optimized out>)
at /home/kang697/install/tbb/include/tbb/parallel_for.h:116
#9 0x00002b2a889564fc in tbb::internal::custom_s---Type to continue, or q to quit---
cheduler<:INTERNAL::INTELSCHEDULERTRAITS>::local_wait_for_all (this=0x2b2aa6b83d80,
parent=..., child=0x6,
$F6=,
$F7=,
$F8=)
at ../../src/tbb/custom_scheduler.h:441
#10 0x00002b2a88953f6f in tbb::internal::generic_scheduler::local_spawn_root_and_wait (
this=0xd000c, first=..., next=@0x6,
---Type to continue, or q to quit---
$V4=,
$V5=,
$V6=)
at ../../src/tbb/scheduler.cpp:619
#11 0x00002b2a88953e7a in tbb::internal::generic_scheduler::spawn_root_and_wait (this=0xd000c,
first=..., next=@0x6,
$Z4=,
$Z5=,
$Z6=)
---Type to continue, or q to quit---
at ../../src/tbb/scheduler.h:527
#12 0x000000000042d734 in tbb::interface6::internal::partition_type_base<:INTERFACE6::INTERNAL::AUTO_PARTITION_TYPE>::execute (this=0xd000c,
start=..., range=...,
$4=,
$8=,
$9=)
at /home/kang697/install/tbb/include/tbb/partitioner.h:265
---Type to continue, or q to quit---
#13 0x000000000042ad82 in tbb::interface6::internal::start_for<:BLOCKED_RANGE>, lambda [], tbb::auto_partitioner>::execute (
this=0xd000c, $5=)
at /home/kang697/install/tbb/include/tbb/parallel_for.h:116
#14 0x00002b2a889564fc in tbb::internal::custom_scheduler<:INTERNAL::INTELSCHEDULERTRAITS>::local_wait_for_all (this=0x2b2aa6b83d80,
parent=..., child=0x6,
---Type to continue, or q to quit---
$F6=,
$F7=,
$F8=)
at ../../src/tbb/custom_scheduler.h:441
#15 0x00002b2a889537d9 in tbb::internal::arena::process (this=0xd000c, s=...,
$T1=,
$T2=)
at ../../src/tbb/arena.cpp:91
#16 0x00002b2a88951c8b in tbb::internal::market::---Type to continue, or q to quit---
process (this=0xd000c, j=...,
$V0=,
$V1=)
at ../../src/tbb/market.cpp:385
#17 0x00002b2a8894f2f7 in tbb::internal::rml::private_worker::run (this=0xd000c,
$;6=)
at ../../src/tbb/private_server.cpp:255
#18 0x00002b2a8894f206 in tbb::internal::rml::pri
---Type to continue, or q to qu
$;8=)
at ../../src/tbb/private_server.cpp:223
#19 0x000000329920673d in start_thread ()
from /lib64/libpthread.so.0
#20 0x00000032986d44bd in clone ()
from /lib64/libc.so.6
0 Kudos
1 Solution
96 Views
Hello Seunghwa,

The issue was reproduced without MPI usage. We will address it.
Thanks for the report and reproducer.
--Vladimir

View solution in original post

4 Replies
97 Views
Hello Seunghwa,

The issue was reproduced without MPI usage. We will address it.
Thanks for the report and reproducer.
--Vladimir

View solution in original post

96 Views
Thank you again for the bug report and reproducer! I hope we have fixed the issue; a TBB development release with the fix can be found at http://threadingbuildingblocks.org/ver.php?fid=178

Could you please check that the issue is really fixed?
Seunghwa_Kang
Beginner
96 Views
Thank you for fixing the bug and my code is not segfaulting anymore!!!

P.S. Where can I change my account settings so that I get an email notification when someone replies to my post or reply?
96 Views
There is a "Subscribed to this Thread" checkbox at the top of the thread.
Reply