THREADX.C

das · ‎05-29-2009

//This is an illustration of a program which fills data into one table1,

//processes the info in table1 to produce table2, then processes the data in

//table2 to produce table3. It does this with many blocks of input data and the

//processing is fairly simple. The routines in Proc1 and Proc2 take about the

//same amount of time as in the real application.

//when we set up to do this with multi-threading on a 4 cpu system in DOS with

//no internet or other background tasks, it ran slower than if done on a single

//cpu.

//We tried variations like CREATE_SUSPENDED with ResumeThread, also

//with no luck. Perhaps the overhead of sharing memory is too great.

//The align 128 is to avoid cache restores

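#include <windows.h>
#include <process.h>
#include <stdio.h>
#include <time.h>
// NOTE: the header names in the original post were stripped by the forum
// software; the four headers above are the ones this listing needs.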

// Mailbox shared between main and the worker thread (see Thread).
//   do_thread   - set to 1 by main to hand a block to the worker; the worker
//                 clears it once Proc2 has run
//   thread_done - cleared by main when it hands off a block; set to 1 by the
//                 worker when that block is finished
//   end_thread  - set to 1 by main to make the worker leave its loop
//   table1_ptr..table3_ptr - the three tables of the block being handed off;
//                 the worker reads table2_ptr and writes through table3_ptr
// Every member is volatile because both threads poll these fields in busy-wait
// loops; without it the compiler may keep a flag in a register and never see
// the other thread's store.
// NOTE(review): volatile alone is not a portable memory barrier - confirm the
// compiler's volatile ordering guarantees or switch to Interlocked operations.
typedef struct {
   volatile int do_thread;
   volatile int thread_done;
   volatile int end_thread;
   int * volatile table1_ptr;
   int * volatile table2_ptr;
   int * volatile table3_ptr;
} THREADCOM,*PTHREADCOM;

// Worker thread entry point handed to _beginthreadex; pvoid is the address of
// the shared THREADCOM.
unsigned int __stdcall Thread(PVOID pvoid);

// Both stages read five ints at from_table_ptr and store five ints at
// to_table_ptr. The destination comes first; the parameter names now match
// the definitions.
void Proc1(int *to_table_ptr,int *from_table_ptr);

void Proc2(int *to_table_ptr,int *from_table_ptr);

int main(void)

{

int i;

int toggle;

int in_time,out_time;

int calc_time;

int x_table1[5];

int y_table1[5];

int z_table1[5];

int x_table2[5];

int y_table2[5];

int z_table2[5];

unsigned int threadid;

HANDLE hthread;

__declspec(align (128)) THREADCOM thread_com;

toggle = 0;

in_time = clock();

for (i = 0;i < 500000;i++)

{

if (toggle == 0)

{

x_table1[0] = 1;

x_table1[1] = 2;

x_table1[2] = 3;

x_table1[3] = 4;

x_table1[4] = 5;

Proc1(&y_table1[0],&x_table1[0]);

Proc2(&z_table1[0],&y_table1[0]);

toggle = 1;

}

else

{

x_table2[0] = 1;

x_table2[1] = 2;

x_table2[2] = 3;

x_table2[3] = 4;

x_table2[4] = 5;

Proc1(&y_table2[0],&x_table2[0]);

Proc2(&z_table2[0],&y_table2[0]);

toggle = 0;

}

out_time = clock();

calc_time = out_time-in_time;

printf("time before beginthreadex = %ld\n",

calc_time);

thread_com.end_thread = 0;

thread_com.do_thread = 0;

thread_com.thread_done = 1;

hthread = (HANDLE)_beginthreadex(NULL,

0,

Thread,

&thread_com,

0,

&threadid);

toggle = 0;

in_time = clock();

for (i = 0;i < 500000;i++)

{

if (toggle == 0)

{

x_table1[0] = 1;

x_table1[1] = 2;

x_table1[2] = 3;

x_table1[3] = 4;

x_table1[4] = 5;

Proc1(&y_table1[0],&x_table1[0]);

while (thread_com.thread_done == 0)

{

}

thread_com.table1_ptr = &x_table1[0];

thread_com.table2_ptr = &y_table1[0];

thread_com.table3_ptr = &z_table1[0];

toggle = 1;

}

else

{

x_table2[0] = 1;

x_table2[1] = 2;

x_table2[2] = 3;

x_table2[3] = 4;

x_table2[4] = 5;

Proc1(&y_table2[0],&x_table2[0]);

while (thread_com.thread_done == 0)

{

}

thread_com.table1_ptr = &x_table2[0];

thread_com.table2_ptr = &y_table2[0];

thread_com.table3_ptr = &z_table2[0];

toggle = 0;

}

thread_com.thread_done = 0;

thread_com.do_thread = 1;

}

thread_com.end_thread = 1;

out_time = clock();

calc_time = out_time-in_time;

printf("time after beginthreadex = %ld\n",

calc_time);

return(0);

}

// Worker thread entry point passed to _beginthreadex.
//   pvoid - address of the THREADCOM mailbox shared with main. The parameter
//           is PVOID so the definition matches the prototype and the start
//           routine type _beginthreadex expects.
// Polls the mailbox until end_thread becomes non-zero. Whenever do_thread is
// 1 it runs Proc2 from table2_ptr into table3_ptr, then clears do_thread
// before setting thread_done so main's next hand-off cannot be overwritten.
// Always returns 0.
unsigned int __stdcall Thread(PVOID pvoid)
{
   // Volatile access keeps the compiler from hoisting the polled flag reads
   // out of the loop.
   volatile THREADCOM *com = (volatile THREADCOM *)pvoid;

   while (com->end_thread == 0)
   {
      if (com->do_thread == 1)
      {
         Proc2(com->table3_ptr,com->table2_ptr);
         com->do_thread = 0;
         com->thread_done = 1;
      }
   }
   return(0);
}

// First processing stage. Copies the five ints at from_table_ptr into a
// private buffer, makes 25 passes over it (pass p adds p+k+1 to element k),
// then stores the five results at to_table_ptr. All input is read before any
// output is written.
// The 25-pass loop is kept as a loop on purpose: per the header comment it
// stands in for the real application's processing time.
void Proc1(int *to_table_ptr,int *from_table_ptr)
{
   int scratch[5];
   int pass;
   int slot;

   for (slot = 0;slot < 5;slot++)
   {
      scratch[slot] = from_table_ptr[slot];
   }
   for (pass = 0;pass < 25;pass++)
   {
      for (slot = 0;slot < 5;slot++)
      {
         scratch[slot] += pass+slot+1;
      }
   }
   for (slot = 0;slot < 5;slot++)
   {
      to_table_ptr[slot] = scratch[slot];
   }
   return;
}

// Second processing stage; same arithmetic as Proc1. Copies the five ints at
// from_table_ptr into a private buffer, makes 25 passes over it (pass p adds
// p+k+1 to element k), then stores the five results at to_table_ptr. All
// input is read before any output is written.
// The 25-pass loop is kept as a loop on purpose: per the header comment it
// stands in for the real application's processing time.
void Proc2(int *to_table_ptr,int *from_table_ptr)
{
   int scratch[5];
   int pass;
   int slot;

   for (slot = 0;slot < 5;slot++)
   {
      scratch[slot] = from_table_ptr[slot];
   }
   for (pass = 0;pass < 25;pass++)
   {
      for (slot = 0;slot < 5;slot++)
      {
         scratch[slot] += pass+slot+1;
      }
   }
   for (slot = 0;slot < 5;slot++)
   {
      to_table_ptr[slot] = scratch[slot];
   }
   return;
}

kv_ishl · ‎05-29-2009

Quoting - das@instantimage.com

//This is an illustration of a program which fills data into one table1,

//processes the info in table1 to produce table2, then processes the data in

//table2 to produce table3. It does this with many blocks of input data and the

//processing is fairly simple. The routines in Proc1 and Proc2 take about the

//same amount of time as in the real application.

//when we set up to do this with multi-threading on a 4 cpu system in DOS with

//no internet or other background tasks, it ran slower than if done on a single

//cpu.

[snip]

}

Hello,
First, please do not post your complete code here. It's simply overkill; please respect the space.
Your problem can be better identified with the Intel VTune profiler (a free evaluation copy is available). Try it and analyse the reports to find out what the bottleneck(s) are.
Also, you need to explain your code; otherwise it's difficult to investigate any further.

Thanks!

robert-reed · ‎05-29-2009

Quoting - das@instantimage.com

[cpp]//This is an illustration of a program which fills data into one table1,
//processes the info in table1 to produce table2, then processes the data in
//table2 to produce table3. It does this with many blocks of input data and the
//processing is fairly simple. The routines in Proc1 and Proc2 take about the
//same amount of time as in the real application.
//when we set up to do this with multi-threading on a 4 cpu system in DOS with
//no internet or other background tasks, it ran slower than if done on a single
//cpu.

//We tried variations like CREATE_SUSPENDED with ResumeThread, also
//with no luck. Perhaps the overhead of sharing memory is too great.
//The align 128 is to avoid cache restores

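#include <windows.h>
#include <process.h>
#include <stdio.h>
#include <time.h>
// NOTE: the original post listed eleven #include lines whose header names were
// stripped by the forum software; only the four headers this listing needs are
// restored here.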

// Mailbox shared between main and the worker thread (see Thread).
//   do_thread   - set to 1 by main to hand a block to the worker; the worker
//                 clears it once Proc2 has run
//   thread_done - cleared by main when it hands off a block; set to 1 by the
//                 worker when that block is finished
//   end_thread  - set to 1 by main to make the worker leave its loop
//   table1_ptr..table3_ptr - the three tables of the block being handed off;
//                 the worker reads table2_ptr and writes through table3_ptr
// Every member is volatile because both threads poll these fields in busy-wait
// loops; without it the compiler may keep a flag in a register and never see
// the other thread's store.
// NOTE(review): volatile alone is not a portable memory barrier - confirm the
// compiler's volatile ordering guarantees or switch to Interlocked operations.
typedef struct {
   volatile int do_thread;
   volatile int thread_done;
   volatile int end_thread;
   int * volatile table1_ptr;
   int * volatile table2_ptr;
   int * volatile table3_ptr;
} THREADCOM,*PTHREADCOM;

// Worker thread entry point handed to _beginthreadex; pvoid is the address of
// the shared THREADCOM.
unsigned int __stdcall Thread(PVOID pvoid);
// Both stages read five ints at from_table_ptr and store five ints at
// to_table_ptr. The destination comes first; the parameter names now match
// the definitions.
void Proc1(int *to_table_ptr,int *from_table_ptr);
void Proc2(int *to_table_ptr,int *from_table_ptr);

int main(void)
{
   int i;
   int toggle;
   int in_time,out_time;
   int calc_time;
   int x_table1[5];
   int y_table1[5];
   int z_table1[5];
   int x_table2[5];
   int y_table2[5];
   int z_table2[5];
   unsigned int threadid;
   HANDLE hthread;
   __declspec(align (128)) THREADCOM thread_com;

   toggle = 0;
   in_time = clock();
   for (i = 0;i < 500000;i++)
   {
      if (toggle == 0)
      {
         x_table1[0] = 1;
         x_table1[1] = 2;
         x_table1[2] = 3;
         x_table1[3] = 4;
         x_table1[4] = 5;
         Proc1(&y_table1[0],&x_table1[0]);
         Proc2(&z_table1[0],&y_table1[0]);
         toggle = 1;
      }
      else
      {
         x_table2[0] = 1;
         x_table2[1] = 2;
         x_table2[2] = 3;
         x_table2[3] = 4;
         x_table2[4] = 5;
         Proc1(&y_table2[0],&x_table2[0]);
         Proc2(&z_table2[0],&y_table2[0]);
         toggle = 0;
      }
   }
   out_time = clock();
   calc_time = out_time-in_time;
   printf("time before beginthreadex = %ldn", calc_time);

   thread_com.end_thread = 0;
   thread_com.do_thread = 0;
   thread_com.thread_done = 1;
   hthread = (HANDLE)_beginthreadex(NULL, 0, Thread, &thread_com, 0, &threadid);
   toggle = 0;

   in_time = clock();
   for (i = 0;i < 500000;i++)
   {
      if (toggle == 0)
      {
         x_table1[0] = 1;
         x_table1[1] = 2;
         x_table1[2] = 3;
         x_table1[3] = 4;
         x_table1[4] = 5;
         Proc1(&y_table1[0],&x_table1[0]);
         while (thread_com.thread_done == 0)
         {
            
         }
         thread_com.table1_ptr = &x_table1[0];
         thread_com.table2_ptr = &y_table1[0];
         thread_com.table3_ptr = &z_table1[0];
         toggle = 1;
      }
      else
      {
         x_table2[0] = 1;
         x_table2[1] = 2;
         x_table2[2] = 3;
         x_table2[3] = 4;
         x_table2[4] = 5;
         Proc1(&y_table2[0],&x_table2[0]);
         while (thread_com.thread_done == 0)
         {

         }
         thread_com.table1_ptr = &x_table2[0];
         thread_com.table2_ptr = &y_table2[0];
         thread_com.table3_ptr = &z_table2[0];
         toggle = 0;
      }
      thread_com.thread_done = 0;
      thread_com.do_thread = 1;
   }
   thread_com.end_thread = 1;
   out_time = clock();
   calc_time = out_time-in_time;
   printf("time after beginthreadex = %ldn", calc_time);
   return(0);
}

// Worker thread entry point passed to _beginthreadex.
//   pvoid - address of the THREADCOM mailbox shared with main. The parameter
//           is PVOID so the definition matches the prototype and the start
//           routine type _beginthreadex expects.
// Polls the mailbox until end_thread becomes non-zero. Whenever do_thread is
// 1 it runs Proc2 from table2_ptr into table3_ptr, then clears do_thread
// before setting thread_done so main's next hand-off cannot be overwritten.
// Always returns 0.
unsigned int __stdcall Thread(PVOID pvoid)
{
   // Volatile access keeps the compiler from hoisting the polled flag reads
   // out of the loop.
   volatile THREADCOM *com = (volatile THREADCOM *)pvoid;

   while (com->end_thread == 0)
   {
      if (com->do_thread == 1)
      {
         Proc2(com->table3_ptr,com->table2_ptr);
         com->do_thread = 0;
         com->thread_done = 1;
      }
   }
   return(0);
}

// First processing stage. Copies the five ints at from_table_ptr into a
// private buffer, makes 25 passes over it (pass p adds p+k+1 to element k),
// then stores the five results at to_table_ptr. All input is read before any
// output is written.
// The 25-pass loop is kept as a loop on purpose: per the header comment it
// stands in for the real application's processing time.
void Proc1(int *to_table_ptr,int *from_table_ptr)
{
   int scratch[5];
   int pass;
   int slot;

   for (slot = 0;slot < 5;slot++)
   {
      scratch[slot] = from_table_ptr[slot];
   }
   for (pass = 0;pass < 25;pass++)
   {
      for (slot = 0;slot < 5;slot++)
      {
         scratch[slot] += pass+slot+1;
      }
   }
   for (slot = 0;slot < 5;slot++)
   {
      to_table_ptr[slot] = scratch[slot];
   }
   return;
}

// Second processing stage; same arithmetic as Proc1. Copies the five ints at
// from_table_ptr into a private buffer, makes 25 passes over it (pass p adds
// p+k+1 to element k), then stores the five results at to_table_ptr. All
// input is read before any output is written.
// The 25-pass loop is kept as a loop on purpose: per the header comment it
// stands in for the real application's processing time.
void Proc2(int *to_table_ptr,int *from_table_ptr)
{
   int scratch[5];
   int pass;
   int slot;

   for (slot = 0;slot < 5;slot++)
   {
      scratch[slot] = from_table_ptr[slot];
   }
   for (pass = 0;pass < 25;pass++)
   {
      for (slot = 0;slot < 5;slot++)
      {
         scratch[slot] += pass+slot+1;
      }
   }
   for (slot = 0;slot < 5;slot++)
   {
      to_table_ptr[slot] = scratch[slot];
   }
   return;
}
//[/cpp]

I suspect that memory contention will be a big problem with this code, since it does little more than copy memory within buffers shared between threads. I also suspect that trying this on a four HW thread system is probably a waste since it only employs two threads toggling in lock-step with each other. This algorithm pays a lot of penalty for its parallelism without getting much benefit.

das · ‎05-29-2009

Quoting - Robert Reed (Intel)

I suspect that memory contention will be a big problem with this code, since it does little more than copy memory within buffers shared between threads. I also suspect that trying this on a four HW thread system is probably a waste since it only employs two threads toggling in lock-step with each other. This algorithm pays a lot of penalty for its parallelism without getting much benefit.

Thanks,
We toggled between memory blocks so that the thread was processing one block while the main was processing another. We also copied shared memory to local (stack), processed it 25 times, and then copied result back to shared memory. Still think it is memory? We used 4 cpu system so had extra cpus to do junk that is always going on in Windows.
Appreciate your thoughts, Dave Schroeder

robert-reed · ‎05-29-2009

Quoting - das@instantimage.com

We toggled between memory blocks so that the thread was processing one block while the main was processing another. We also copied shared memory to local (stack), processed it 25 times, and then copied result back to shared memory. Still think it is memory? We used 4 cpu system so had extra cpus to do junk that is always going on in Windows.

Appreciate your thoughts, Dave Schroeder

It seems clear that your example is intended to convey an example of data organization and movement without revealing the exact purpose, but it's in that purpose that the devil in the details becomes visible. For example, the pointer copying of sequential array elements could be written more simply but as a genotype could cause problems depending on the data conflicts that might result from the real gather/scatter operations implied. If the satellite thread and the main could really keep out of each other's hair (buffers), that would maximize the potential for scaling, but even in this example the base tables (table1 and table2 in main) comprise about 60 bytes and so may well share a cache line themselves.

The other observation regards the effort to copy parameters into and out of the local buffer in Proc1 and Proc2. Since the middle, computational section is still doing pointer derefs to manipulate the data, the same ostensible cost as doing the calculations in place, this is presumably done because the in-place location is shared memory with additional possible contention cost. However, what this also looks like is an attempt to construct a multi-buffer pipeline to process a stream. Have you considered using a real pipeline like you can get in Intel Threading Building Blocks? In such an architecture I would bucketize the in and out portions of the buffers into composite units that could be passed along the pipeline discretely. Computations would be self-contained within a buffer and a buffer per thread for as many stages as you need in the stream.