hidden text to trigger early load of fonts ПродукцияПродукцияПродукцияПродукция Các sản phẩmCác sản phẩmCác sản phẩmCác sản phẩm المنتجاتالمنتجاتالمنتجاتالمنتجات מוצריםמוצריםמוצריםמוצרים
Intel® oneAPI Threading Building Blocks
Ask questions and share information about adding parallelism to your applications when using this threading library.

memory corruption in using tbb task

missing__zlw
Beginner
2,578 Views
Hi,

I encountered the following problem.

I have an array with size of num_items * length. Pointer to array is pArray.

For each item "i" in the range of num_items, I launch a task. In the execution of this task, I update this array at pArray+i*length.The update length is "length". I used memcpy. The size of data type has been considered.

The code looks like this: pArray is pointer to float array.

memcpy(pArray+length*i, pSourceArray, length*sizeof(float));

pArray is correctly allocated before launching tbb::task.

So in theory, there should not be any memory overlap on writing. The sequential execution of this method has been proved.

However, when I run in parallel version using tbb task, I got glibc, memory corruption error.

Why did I do wrong here? Thanks.

0 Kudos
26 Replies
SergeyKostrov
Valued Contributor II
1,899 Views
>>...Why did I do wrong here?..

Please review a threaded part of your application.I have not found any problems with a
non-threaded implementation.

Here is my C Test-Case:

...
int iNumItems = 8;
int iLength = 4;
const int iSize = 32; // ( iNumItems * iLength );

float fSrcArray[ iSize ] = { 0.0f };
// float fDstArray[ iSize - 4 ] = { 0.0f }; // to simulate a buffer overflow
// float fDstArray[ iSize ] = { 0.0f };
float fDstArray[ iSize + 4 ] = { 0.0f };// 4 more floats in order to detect & seea buffer overflow

int i;

for( i = 0; i < iSize; i++ )
fSrcArray[ i ] = ( float )7.7f;

for( i = 0; i < iNumItems; i++ )
{
memcpy( &fDstArray[ iLength * i ], &fSrcArray[0], iLength * sizeof( float ) );
}
...

Best regards,
Sergey

0 Kudos
SergeyKostrov
Valued Contributor II
1,899 Views
>>...glibc memory corruption error...

Speaking about glibc library, is it a single-threaded or multi-threaded?

If you're using a single-threaded version in a multi-tasking environment it could be the reasonof a
memory corruption andcrash.

Best regards,
Sergey
0 Kudos
missing__zlw
Beginner
1,899 Views
Here is my sample program and when I run it, I got glibc error. This is the extraction from my original program.

void SerialRun(int numItem, int len, float* cache){

float* fSrcArray = new float[len];

int i, j;

for ( i = 0; i < numItem; i++ ) {

for ( j = 0; j < len; j++ )

fSrcArray[ j ] = ( float )i;

memcpy( cache+i*len, fSrcArray, len * sizeof( float ) );

}

delete fSrcArray;

}

class OneTask: public tbb::task {

public:

tbb::task* execute() {

std::cout<<"Process id "<

float* fSrcArray = new float[ilen];

for (int i = 0; i < ilen; i++ )

fSrcArray[ i ] = ( float )i;

memcpy( pCache+inum*ilen, fSrcArray, ilen * sizeof( float ) );

delete fSrcArray;

return NULL;

}

OneTask(int numItem, int len, float* cache):

inum(numItem), ilen(len), pCache(cache) {};

int inum; int ilen;

float * pCache;

};

void ParallelRun(int numItem, int len, float* cache){

int i;

tbb::task_list list;

for ( i = 0; i < numItem; i++ ) {

OneTask &rmit = *new( tbb::task::allocate_root() )OneTask ( numItem, len, cache );

list.push_back( rmit );

}

tbb::task::spawn_root_and_wait(list);

}

int main (int argc, char *argv[])

{

int iNumItems = 8;

int iLength = 4;

float* fDstArray = new float[iNumItems * iLength];

tbb::task_scheduler_init init(4);

tbb::tick_count starttick = tbb::tick_count::now();

SerialRun(iNumItems, iLength, fDstArray);

tbb::tick_count endtick = tbb::tick_count::now();

cout<<"Total time in SerialRun = " << (endtick - starttick).seconds() <

starttick = tbb::tick_count::now();

ParallelRun(iNumItems, iLength, fDstArray);

endtick = tbb::tick_count::now();

cout<<"Total time in ParallelRun = " << (endtick - starttick).seconds() <

delete fDstArray;

return 0;

}

0 Kudos
RafSchietekat
Valued Contributor III
1,899 Views
OneTask &rmit = *new( tbb::task::allocate_root() )OneTask ( numItem, len, cache );
0 Kudos
SergeyKostrov
Valued Contributor II
1,899 Views
...
void SerialRun(int numItem, int len, float* cache)
{
...
memcpy( cache+i*len, fSrcArray, len * sizeof( float ) );
...
}
...

tbb::task* execute()
{
...
memcpy( pCache+inum*ilen, fSrcArray, ilen * sizeof( float ) );
...
}

Is everything fine with the 2nd 'memcpy'?

0 Kudos
RafSchietekat
Valued Contributor III
1,899 Views
"Is everything fine with the 2nd 'memcpy'?"
Sure.

Maybe I should have been more explicit in #4 (where I simply used boldface to indicate the problem): ParallelRun() loops over all numItem items, but then passes numItem instead of i to OneTask, thus always causing it to write past the end of the destination array.
0 Kudos
missing__zlw
Beginner
1,899 Views
Sorry for my typo. I meant :

void ParallelRun(int numItem, int len, float* cache){

int i;

tbb::task_list list;

for ( i = 0; i < numItem; i++ ) {

OneTask &rmit = *new( tbb::task::allocate_root() )OneTask ( i, len, cache );

list.push_back( rmit );

}

tbb::task::spawn_root_and_wait(list);

}

The interesting part is that I found my extaction example seems to be working, while my real project is not. I have checked in the index in my project.

So you don't see any problem with using memcpy here in different tbb tasks?

Also, as the example is very small, there is only one thread. All the thread ids are the same. But in my real project, there are differnt threads.

0 Kudos
RafSchietekat
Valued Contributor III
1,899 Views
"So you don't see any problem with using memcpy here in different tbb tasks?"
Why not substitute a simple loop and see what happens?
0 Kudos
SergeyKostrov
Valued Contributor II
1,899 Views
>>...Why not substitute a simple loop and see what happens?..

It was proven twice that everything is fine. In a single-thread TBB case and in a single-thread Non-TBB case.
0 Kudos
RafSchietekat
Valued Contributor III
1,899 Views
I mean replace memcpy with a simple loop (over individual bytes), in the multithreaded case. Your program may become somewhat slower, depending on how big the data is, but, if it still fails, you'll at least know that glibc's memcpy is not to blame, or at least not the only problem. It would probably only take fifteen minutes (including a coffee break), so...
0 Kudos
SergeyKostrov
Valued Contributor II
1,899 Views
>>...that glibc's memcpy is not to blame...

It is stillnot clearwhatglibc library was used,a single-threaded or multi-threaded?
0 Kudos
RafSchietekat
Valued Contributor III
1,899 Views
"It is stillnot clearwhatglibc library was used,a single-threaded or multi-threaded?"
How does one even get single-threaded glibc and TBB together in the same program, I wonder?

My money is still on another bug (hence my suggestion to take memcpy off the table). :-)
0 Kudos
missing__zlw
Beginner
1,899 Views
Thanks.
I have tried and it works fine in a simple loop, no tbb task. It also works ok with tbb task loop when only one thread is used (using process id). But memory corrpution error when more than one processes are used.

Puzzled...
0 Kudos
SergeyKostrov
Valued Contributor II
1,899 Views
>>...But memory corrpution error when more than one processes are used...

There is one fundamental question:it is stillnot clearwhatglibc library is used,a single-threaded or multi-threaded?

I'll look at your Test-Case tomorrow on Windows XP.

Also, what version of TBB are you using? Keep an eye on the thread, please.

Best regards,
Sergey
0 Kudos
missing__zlw
Beginner
1,899 Views
I still see this problem. Is there a limit on how much memory each tbb task can allocate?
0 Kudos
RafSchietekat
Valued Contributor III
1,899 Views
Please state clearly whether you are seeing the problem without use of memcmp because you replaced it with your own equivalent. Although it would then most likely no longer be an issue, and although I'm not sure how you would even get a single-threaded glibc into a TBB program, I'm sure Sergey would still also like to know whether you are using a multi-threaded or single-threaded version of glibc.

There is no limit on each task's use of dynamic memory, and you should see allocation failures as hard-to-miss exceptions when using the new operator (although, just to make sure, why not assert() that these pointers are not NULL).

Please use a debugger to inspect program memory near the offending access to get more information.

And have you undertaken peer review of your code? Maybe there's another "typo" somethere. :-)
0 Kudos
missing__zlw
Beginner
1,899 Views
Sorry, I don't know which glibc I am using.
Is there a way to find out?
0 Kudos
SergeyKostrov
Valued Contributor II
1,899 Views
>>...Is there a limit on how much memory each tbb task can allocate?..

I don't think so. Guys, please correct me if I'm wrong.

For example, on Win32 platforms an application could allocate up to 2GB of memory.

OnaLinux platforma number could be similar...
0 Kudos
SergeyKostrov
Valued Contributor II
1,899 Views

Tested on Windows XP andeverything isworking after modifications are applied.

Here are results:

There is an incorrect application of a C++ operator 'new'. It doesn't work with Microsoft's Visual C/C++ compiler.

Instead of:
...
float *fSrcArray = new float( len );
...

Try to use:
...
float *fSrcArray = new float[ len ];
...

> Test1148 Start <
Destination Array ( 1x32 ):
7.7 7.7 7.7 7.7 7.7 7.7 7.7 7.7 7.7 7.7 7.7 7.7 7.7 7.7 7.7 7.7 7.7 7.7 7.7 7.7 7.7 7.7 7.7 7.7 7.7 7.7 7.7 7.7 7.7 7.7 7.7 7.7

SerialRun Test-Case
Total time in SerialRun : 0.00001955 secs
Destination Array ( 1x32 ):
0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0 3.0 3.0 3.0 3.0 4.0 4.0 4.0 4.0 5.0 5.0 5.0 5.0 6.0 6.0 6.0 6.0 7.0 7.0 7.0 7.0

ParallelRun Test-Case
Process NUM: 7
Process NUM: 6
Process NUM: 5
Process NUM: 4
Process NUM: 3
Process NUM: 2
Process NUM: 1
Process NUM: 0
Total time in ParallelRun : 0.00177509 secs
Destination Array ( 1x32 ):
0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0 3.0 3.0 3.0 3.0 4.0 4.0 4.0 4.0 5.0 5.0 5.0 5.0 6.0 6.0 6.0 6.0 7.0 7.0 7.0 7.0

> Test1148 End <


PS: Also, a GCC C/C++ compiler has an option for verification of the stack:

`-fstack-check'

Generate code to verify that you do not go beyond the boundary of
the stack. You should specify this flag if you are running in an
environment with multiple threads, but only rarely need to specify
it in a single-threaded environment since stack overflow is
automatically detected on nearly all systems if there is only one
stack.

Note that this switch does not actually cause checking to be done;
the operating system must do that. The switch causes generation
of code to ensure that the operating system sees the stack being
extended.

0 Kudos
SergeyKostrov
Valued Contributor II
1,813 Views
Please don't worry about the glibc library.

The source codes are enclosed AS IS and you will need to do some small modifications related to 'RT',
'Crt', 'RTU' prefixes I used in my standard test-environment.

Note:I useCRT'malloc'-'free' functions instead of 'new'-'delete' C++ operators.

RTvoid SerialRun( RTint iNumItem, RTint iLen, RTfloat *pfCache );

RTvoid SerialRun( RTint iNumItem, RTint iLen, RTfloat *pfCache )
{
RTfloat *pfSrcArray = ( RTfloat * )CrtMalloc( iLen * sizeof( RTfloat ) );

RTint i, j;

for( i = 0; i < iNumItem; i++ )
{
for( j = 0; j < iLen; j++ )
pfSrcArray[ j ] = ( RTfloat )0.0f + i;

CrtMemcpy( pfCache + ( i*iLen ), pfSrcArray, iLen * sizeof( RTfloat ) );

//CrtPrintf( RTU("\t") );
//for( RTint p = 0; p < iLen; p++ )
// CrtPrintf( RTU("%4.1f "), pfCache[ ( i*iLen ) + p ] );
//CrtPrintf( RTU("\n") );
}

if( pfSrcArray != RTnull )
CrtFree( pfSrcArray );
}

class OneTask: public tbb::task
{
public:
tbb::task *execute()
{
//std::cout << "Process ID :" << getpid() << std::endl;
//CrtPrintf( RTU("Process ID : %ld\n"), ( RTint )::GetCurrentProcess() );
//CrtPrintf( RTU("Thread ID : %ld\n"), ( RTint )::GetCurrentThread() );
CrtPrintf( RTU("Process NUM: %ld\n"), ( RTint )m_iNum );

RTfloat *pfSrcArray = ( RTfloat * )CrtMalloc( m_iLen * sizeof( RTfloat ) );

for( RTint i = 0; i < m_iLen; i++ )
pfSrcArray[ i ] = ( RTfloat )0.0f + m_iNum;

CrtMemcpy( m_pfCache + ( m_iNum*m_iLen ), pfSrcArray, m_iLen * sizeof( RTfloat ) );

//CrtPrintf( RTU("\t") );
//for( RTint p = 0; p < m_iLen; p++ )
// CrtPrintf( RTU("%4.1f "), m_pfCache[ ( m_iNum*m_iLen ) + p ] );
//CrtPrintf( RTU("\n") );

if( pfSrcArray != RTnull )
CrtFree( pfSrcArray );

return ( tbb::task * )RTnull;
}

OneTask( RTint iNumItem, RTint iLen, RTfloat *pfCache )
{
m_iNum = iNumItem;
m_iLen = iLen;
m_pfCache = pfCache;
};

RTint m_iNum;
RTint m_iLen;
RTfloat *m_pfCache;
};

RTvoid ParallelRun( RTint iNumItem, RTint iLen, RTfloat *pfCache );

RTvoid ParallelRun( RTint iNumItem, RTint iLen, RTfloat *pfCache )
{
tbb::task_list TaskList;

for( RTint i = 0; i < iNumItem; i++ )
{ // Wrong
//OneTask &rmit = *new( tbb::task::allocate_root() )OneTask ( iNumItem, iLen, pfCache );
// Correct
OneTask &rmit = *new( tbb::task::allocate_root() )OneTask ( i, iLen, pfCache );

TaskList.push_back( rmit );
}

tbb::task::spawn_root_and_wait( TaskList );
}

...
int iNumItems = 8;
int iLen = 4;
int iArraySize = ( iNumItems * iLen );

RTfloat *pfDstArray = ( RTfloat * )CrtMalloc( iArraySize * sizeof( RTfloat ) );

CrtPrintf( RTU("Destination Array ( 1x32 ):\n") );
for( int i = 0; i < iArraySize; i++ )
{
pfDstArray[ i ] = 7.7f;
CrtPrintf( RTU("%3.1f "), pfDstArray[ i ] );
}
CrtPrintf( RTU("\n\n") );

tbb::task_scheduler_init init( 4 );

CrtPrintf( RTU("SerialRun Test-Case\n") );
tbb::tick_count starttick = tbb::tick_count::now();
SerialRun( iNumItems, iLen, pfDstArray );
tbb::tick_count endtick = tbb::tick_count::now();
cout << "Total time in SerialRun : " << ( endtick - starttick ).seconds() << " secs" << endl;
CrtPrintf( RTU("Destination Array ( 1x32 ):\n") );
for( int i = 0; i < iArraySize; i++ )
CrtPrintf( RTU("%3.1f "), pfDstArray[ i ] );
CrtPrintf( RTU("\n\n") );

CrtPrintf( RTU("ParallelRun Test-Case\n") );
starttick = tbb::tick_count::now();
ParallelRun( iNumItems, iLen, pfDstArray );
endtick = tbb::tick_count::now();
cout << "Total time in ParallelRun : " << ( endtick - starttick ).seconds() << " secs" << endl;
CrtPrintf( RTU("Destination Array ( 1x32 ):\n") );
for( int i = 0; i < iArraySize; i++ )
CrtPrintf( RTU("%3.1f "), pfDstArray[ i ] );
CrtPrintf( RTU("\n\n") );

if( pfDstArray != RTnull )
CrtFree( pfDstArray );
...

0 Kudos
Reply