- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
//This is an illustration of a program which fills data into one table1,
//processes the info in table1 to produce table2, then processes the data in
//table2 to produce table3. It does this with many blocks of input data and the
//processing is fairly simple. The routines in Proc1 and Proc2 take about the
//same amount of time as in the real application.
//when we set up to do this with multi-threading on a 4 cpu system in DOS with
//no internet or other background tasks, it ran slower than if done on a single
//cpu.
//We tried variations like CREATE_SUSPENDED with ResumeThread and
//with no luck. Perhaps the overhead of sharing memory is too great.
//The align 128 is to avoid cache restores
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
/*
 * Mailbox shared between main() and the worker thread.
 *
 * The three flags are polled in empty spin loops by one thread while the
 * other thread writes them, so they are declared volatile: without it the
 * compiler is free to read a flag once and spin on the cached value forever.
 * NOTE(review): volatile only gives cross-thread ordering under MSVC's
 * /volatile:ms semantics (the x86/x64 default); confirm the build uses it.
 *
 * All fields sit next to each other, so aligning the object does not keep
 * the two threads from writing the same cache line during each handoff.
 */
typedef struct {
    volatile int do_thread;    /* main sets to 1 to hand over a block; worker clears it */
    volatile int thread_done;  /* worker sets to 1 when a block is finished; main clears it */
    volatile int end_thread;   /* main sets to 1 to make the worker leave its loop */
    int *table1_ptr;           /* first-stage input table (set by main, not read by the worker) */
    int *table2_ptr;           /* Proc1 output, used by the worker as Proc2 input */
    int *table3_ptr;           /* destination the worker passes to Proc2 */
} THREADCOM,*PTHREADCOM;
/* Worker thread entry point; pvoid is the address of the shared THREADCOM. */
unsigned int __stdcall Thread(PVOID pvoid);
/* Both stages write five ints to the FIRST argument, computed from the five
   ints at the SECOND argument (destination first, source second). */
void Proc1(int *to_table_ptr,int *from_table_ptr);
void Proc2(int *to_table_ptr,int *from_table_ptr);
int main(void)
{
int i;
int toggle;
int in_time,out_time;
int calc_time;
int x_table1[5];
int y_table1[5];
int z_table1[5];
int x_table2[5];
int y_table2[5];
int z_table2[5];
unsigned int threadid;
HANDLE hthread;
__declspec(align (128)) THREADCOM thread_com;
toggle = 0;
in_time = clock();
for (i = 0;i < 500000;i++)
{
if (toggle == 0)
{
x_table1[0] = 1;
x_table1[1] = 2;
x_table1[2] = 3;
x_table1[3] = 4;
x_table1[4] = 5;
Proc1(&y_table1[0],&x_table1[0]);
Proc2(&z_table1[0],&y_table1[0]);
toggle = 1;
}
else
{
x_table2[0] = 1;
x_table2[1] = 2;
x_table2[2] = 3;
x_table2[3] = 4;
x_table2[4] = 5;
Proc1(&y_table2[0],&x_table2[0]);
Proc2(&z_table2[0],&y_table2[0]);
toggle = 0;
}
}
out_time = clock();
calc_time = out_time-in_time;
printf("time before beginthreadex = %ld\n",
calc_time);
thread_com.end_thread = 0;
thread_com.do_thread = 0;
thread_com.thread_done = 1;
hthread = (HANDLE)_beginthreadex(NULL,
0,
Thread,
&thread_com,
0,
&threadid);
toggle = 0;
in_time = clock();
for (i = 0;i < 500000;i++)
{
if (toggle == 0)
{
x_table1[0] = 1;
x_table1[1] = 2;
x_table1[2] = 3;
x_table1[3] = 4;
x_table1[4] = 5;
Proc1(&y_table1[0],&x_table1[0]);
while (thread_com.thread_done == 0)
{
}
thread_com.table1_ptr = &x_table1[0];
thread_com.table2_ptr = &y_table1[0];
thread_com.table3_ptr = &z_table1[0];
toggle = 1;
}
else
{
x_table2[0] = 1;
x_table2[1] = 2;
x_table2[2] = 3;
x_table2[3] = 4;
x_table2[4] = 5;
Proc1(&y_table2[0],&x_table2[0]);
while (thread_com.thread_done == 0)
{
}
thread_com.table1_ptr = &x_table2[0];
thread_com.table2_ptr = &y_table2[0];
thread_com.table3_ptr = &z_table2[0];
toggle = 0;
}
thread_com.thread_done = 0;
thread_com.do_thread = 1;
}
thread_com.end_thread = 1;
out_time = clock();
calc_time = out_time-in_time;
printf("time after beginthreadex = %ld\n",
calc_time);
return(0);
}
/*
 * Worker thread entry point (started through _beginthreadex).
 *
 * pvoid is the address of the THREADCOM shared with main(). The signature
 * takes a void pointer so that it matches both the prototype and the start
 * routine type _beginthreadex expects.
 *
 * Loops until end_thread becomes non-zero. Whenever do_thread is 1 it runs
 * Proc2 from table2_ptr into table3_ptr, then clears do_thread and sets
 * thread_done to tell main() the block is finished.
 *
 * Always returns 0.
 */
unsigned int __stdcall Thread(PVOID pvoid)
{
    /* volatile: main() changes these fields while this loop polls them, so
       every iteration must re-read them from memory. */
    volatile THREADCOM *thread_com_ptr = (volatile THREADCOM *)pvoid;

    while (thread_com_ptr->end_thread == 0)
    {
        if (thread_com_ptr->do_thread == 1)
        {
            Proc2(thread_com_ptr->table3_ptr,thread_com_ptr->table2_ptr);
            /* Clear the request before reporting completion: main() only
               posts the next block after it has seen thread_done. */
            thread_com_ptr->do_thread = 0;
            thread_com_ptr->thread_done = 1;
        }
    }
    return(0);
}
/*
 * First processing stage.
 *
 * Reads five ints from from_table_ptr into a private buffer, runs 25 passes
 * over that buffer (pass p, counted from 1, adds p + k to entry k), and then
 * writes the five results to to_table_ptr. The source is fully read before
 * the destination is written, so the two pointers may refer to the same table.
 */
void Proc1(int *to_table_ptr,int *from_table_ptr)
{
    int work[5];
    int pass;
    int col;

    /* copy the input into the private buffer */
    for (col = 0; col < 5; col++)
    {
        work[col] = from_table_ptr[col];
    }

    /* 25 accumulation passes, all on the private buffer */
    for (pass = 1; pass <= 25; pass++)
    {
        for (col = 0; col < 5; col++)
        {
            work[col] += pass + col;
        }
    }

    /* copy the results out */
    for (col = 0; col < 5; col++)
    {
        to_table_ptr[col] = work[col];
    }
}
/*
 * Second processing stage; performs the same arithmetic as the first stage.
 *
 * Snapshots the five ints at from_table_ptr, applies 25 rounds of additions
 * to the snapshot (round r, counted from 0, adds r+1 .. r+5 to entries
 * 0 .. 4), and stores the five results at to_table_ptr. The input is read
 * completely before any output is written.
 */
void Proc2(int *to_table_ptr,int *from_table_ptr)
{
    int scratch[5];
    int round;
    int slot;

    slot = 0;
    while (slot < 5)
    {
        scratch[slot] = from_table_ptr[slot];
        slot++;
    }

    round = 0;
    while (round < 25)
    {
        scratch[0] += round + 1;
        scratch[1] += round + 2;
        scratch[2] += round + 3;
        scratch[3] += round + 4;
        scratch[4] += round + 5;
        round++;
    }

    slot = 0;
    while (slot < 5)
    {
        to_table_ptr[slot] = scratch[slot];
        slot++;
    }
}
Link Copied
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
//This is an illustration of a program which fills data into one table1,
//processes the info in table1 to produce table2, then processes the data in
//table2 to produce table3. It does this with many blocks of input data and the
//processing is fairly simple. The routines in Proc1 and Proc2 take about the
//same amount of time as in the real application.
//when we set up to do this with multi-threading on a 4 cpu system in DOS with
//no internet or other background tasks, it ran slower than if done on a single
//cpu.
[snip]
}
Hello,
First, please do not post your complete code here. It's simply overkill; please respect the space.
Your problem can be better identified with the Intel VTune profiler (a free evaluation copy is available). Try it and analyse the reports to find out where the bottleneck(s) are.
Also, you need to explain your code; otherwise it's difficult to investigate any further.
Thanks!
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
[cpp]//This is an illustration of a program which fills data into one table1, //processes the info in table1 to produce table2, then processes the data in //table2 to produce table3. It does this with many blocks of input data and the //processing is fairly simple. The routines in Proc1 and Proc2 take about the //same amount of time as in the real application. //when we set up to do this with multi-threading on a 4 cpu system in DOS with //no internet or other background tasks, it ran slower than if done on a single //cpu. //We tried variations line CREATE_SUSPENDED with ResumeThread and //with no luck. Perhaps the overhead of sharing memory is too great. //The align 128 is to avoid cache restores #include#include #include #include #include #include #include #include #include #include #include typedef struct { int do_thread; int thread_done; int end_thread; int *table1_ptr; int *table2_ptr; int *table3_ptr; } THREADCOM,*PTHREADCOM; unsigned int __stdcall Thread(PVOID pvoid); void Proc1(int *table_ptr1,int *table_ptr2); void Proc2(int *table_ptr1,int *table_ptr2); int main(void) { int i; int toggle; int in_time,out_time; int calc_time; int x_table1[5]; int y_table1[5]; int z_table1[5]; int x_table2[5]; int y_table2[5]; int z_table2[5]; unsigned int threadid; HANDLE hthread; __declspec(align (128)) THREADCOM thread_com; toggle = 0; in_time = clock(); for (i = 0;i < 500000;i++) { if (toggle == 0) { x_table1[0] = 1; x_table1[1] = 2; x_table1[2] = 3; x_table1[3] = 4; x_table1[4] = 5; Proc1(&y_table1[0],&x_table1[0]); Proc2(&z_table1[0],&y_table1[0]); toggle = 1; } else { x_table2[0] = 1; x_table2[1] = 2; x_table2[2] = 3; x_table2[3] = 4; x_table2[4] = 5; Proc1(&y_table2[0],&x_table2[0]); Proc2(&z_table2[0],&y_table2[0]); toggle = 0; } } out_time = clock(); calc_time = out_time-in_time; printf("time before beginthreadex = %ldn", calc_time); thread_com.end_thread = 0; thread_com.do_thread = 0; thread_com.thread_done = 1; hthread = (HANDLE)_beginthreadex(NULL, 0, Thread, 
&thread_com, 0, &threadid); toggle = 0; in_time = clock(); for (i = 0;i < 500000;i++) { if (toggle == 0) { x_table1[0] = 1; x_table1[1] = 2; x_table1[2] = 3; x_table1[3] = 4; x_table1[4] = 5; Proc1(&y_table1[0],&x_table1[0]); while (thread_com.thread_done == 0) { } thread_com.table1_ptr = &x_table1[0]; thread_com.table2_ptr = &y_table1[0]; thread_com.table3_ptr = &z_table1[0]; toggle = 1; } else { x_table2[0] = 1; x_table2[1] = 2; x_table2[2] = 3; x_table2[3] = 4; x_table2[4] = 5; Proc1(&y_table2[0],&x_table2[0]); while (thread_com.thread_done == 0) { } thread_com.table1_ptr = &x_table2[0]; thread_com.table2_ptr = &y_table2[0]; thread_com.table3_ptr = &z_table2[0]; toggle = 0; } thread_com.thread_done = 0; thread_com.do_thread = 1; } thread_com.end_thread = 1; out_time = clock(); calc_time = out_time-in_time; printf("time after beginthreadex = %ldn", calc_time); return(0); } unsigned int __stdcall Thread(PTHREADCOM thread_com_ptr) { while (thread_com_ptr->end_thread == 0) { if (thread_com_ptr->do_thread == 1) { Proc2(thread_com_ptr->table3_ptr,thread_com_ptr->table2_ptr); thread_com_ptr->do_thread = 0; thread_com_ptr->thread_done = 1; } } return(0); } void Proc1(int *to_table_ptr,int *from_table_ptr) { int i; int local_table[5]; int *local_table_ptr; local_table_ptr = &local_table[0]; *local_table_ptr = *from_table_ptr; *(local_table_ptr+1) = *(from_table_ptr+1); *(local_table_ptr+2) = *(from_table_ptr+2); *(local_table_ptr+3) = *(from_table_ptr+3); *(local_table_ptr+4) = *(from_table_ptr+4); for (i = 0;i < 25;i++) { *local_table_ptr += i+1; *(local_table_ptr+1) += i+2; *(local_table_ptr+2) += i+3; *(local_table_ptr+3) += i+4; *(local_table_ptr+4) += i+5; } *to_table_ptr = *local_table_ptr; *(to_table_ptr+1) = *(local_table_ptr+1); *(to_table_ptr+2) = *(local_table_ptr+2); *(to_table_ptr+3) = *(local_table_ptr+3); *(to_table_ptr+4) = *(local_table_ptr+4); return; } void Proc2(int *to_table_ptr,int *from_table_ptr) { int i; int local_table[5]; int *local_table_ptr; 
local_table_ptr = &local_table[0]; *local_table_ptr = *from_table_ptr; *(local_table_ptr+1) = *(from_table_ptr+1); *(local_table_ptr+2) = *(from_table_ptr+2); *(local_table_ptr+3) = *(from_table_ptr+3); *(local_table_ptr+4) = *(from_table_ptr+4); for (i = 0;i < 25;i++) { *local_table_ptr += i+1; *(local_table_ptr+1) += i+2; *(local_table_ptr+2) += i+3; *(local_table_ptr+3) += i+4; *(local_table_ptr+4) += i+5; } *to_table_ptr = *local_table_ptr; *(to_table_ptr+1) = *(local_table_ptr+1); *(to_table_ptr+2) = *(local_table_ptr+2); *(to_table_ptr+3) = *(local_table_ptr+3); *(to_table_ptr+4) = *(local_table_ptr+4); return; }[/cpp]
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Thanks,
We toggled between memory blocks so that the thread was processing one block while the main was processing another. We also copied shared memory to local (stack), processed it 25 times, and then copied result back to shared memory. Still think it is memory? We used 4 cpu system so had extra cpus to do junk that is always going on in Windows.
Appreciate your thoughts, Dave Schroeder
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Appreciate your thoughts, Dave Schroeder
The other observation regards the effort to copy parameters into and out of the local buffer in Proc1 and Proc2. Since the middle, computational section is still doing pointer derefs to manipulate the data, the same ostensible cost as doing the calculations in place, this is presumably done because the in-place location is shared memory with additional possible contention cost. However, what this also looks like is an attempt to construct a multi-buffer pipeline to process a stream. Have you considered using a real pipeline like you can get in Intel Threading Building Blocks? In such an architecture I would bucketize the in and out portions of the buffers into composite units that could be passed along the pipeline discretely. Computations would be self-contained within a buffer and a buffer per thread for as many stages as you need in the stream.

- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page