Also if your glibc already

YangHun_P_ · ‎02-23-2015

I have an Intel Xeon E3-1230 v3 machine, which supports TSX.

I just want to test that TSX runs well.

From the manual, I got this example pseudocode:

/*
 * Pseudocode: acquire `lock` with RTM-based lock elision.
 *
 * Starts a transaction. If the lock is observed to be free, returns while
 * still inside the transaction, so the caller's critical section runs
 * transactionally without taking the lock. If the lock is not free, the
 * transaction is aborted explicitly with code 0xff. After any abort,
 * execution falls through to the original (non-elided) locking code.
 */
void rtm_wrapped_lock(lock) {
   if (_xbegin() == _XBEGIN_STARTED) {
      if (lock is free)
         /* add lock to the read-set */
         return; /* Execute transactionally */
      _xabort(0xff);
      /* 0xff means the lock was not free */
   }
   /* come here following the transactional abort */
   original_locking_code(lock);
}

/*
 * Pseudocode: release `lock`, the counterpart of rtm_wrapped_lock().
 *
 * A free lock at unlock time is taken to mean the lock was elided, so the
 * transaction is committed with _xend(). Otherwise the lock was really
 * acquired and the original unlocking code runs.
 */
void rtm_wrapped_unlock(lock) {
   /* If lock is free, assume that the lock was elided */
   if (lock is free)
      _xend(); /* commit */
   else
      original_unlocking_code(lock);
}

My test code for RTM, which is one part of TSX, looks like this:

/*
 * RTM smoke test: four OpenMP threads each add data[i] into sum[i].
 *
 * Each iteration first tries to run inside an RTM transaction; if the
 * transaction does not start (or aborts), the update is done on the
 * fallback path under an OpenMP critical section plus the pthread mutex.
 *
 * The array subscripts ([i]) were lost when the post was rendered and are
 * restored here; the locking logic is left exactly as posted.
 */
void main(void)
{
   int i;
   int sum[20];
   int data[20];

   pthread_mutex_t mutex;

   pthread_mutex_init(&mutex,NULL);

   for(i=0;i<20;i++)
   {
      data[i]=i;
      sum[i]=0;
   }

   omp_set_num_threads(4);

#pragma omp parallel for private(i)
   for(i=0;i<20;i++)
   {
      if(_xbegin()==_XBEGIN_STARTED)
      {
         /*
          * The lock is probed by acquiring and releasing it inside the
          * transaction, i.e. the probe itself writes to the mutex.
          */
         if(pthread_mutex_trylock(&mutex)==0)                  //if lock is free
         {
            /* transactional execution */
            pthread_mutex_unlock(&mutex);
            sum[i]+=data[i];
            _xend();
         }
         else
            _xabort(0xff);
      }
      else
      {
         /* fallback path: transaction did not start or was aborted */
#pragma omp critical
         {
             pthread_mutex_lock(&mutex);
             sum[i]+=data[i];
             pthread_mutex_unlock(&mutex);
         }
      }
   }
}

and

/*
 * Value _xbegin() returns when its asm leaves EAX untouched, i.e. the
 * transaction was started. Callers compare the return value against this.
 */
#define _XBEGIN_STARTED		(~0u)
/*
 * Bit flags to test in the status returned by _xbegin() on the fallback
 * path. NOTE(review): names match the Intel RTM abort-status bits
 * (explicit XABORT, retry may succeed, data conflict, capacity overflow,
 * debug breakpoint, abort in nested transaction) -- confirm against the SDM.
 */
#define _XABORT_EXPLICIT	(1 << 0)
#define _XABORT_RETRY		(1 << 1)
#define _XABORT_CONFLICT	(1 << 2)
#define _XABORT_CAPACITY	(1 << 3)
#define _XABORT_DEBUG		(1 << 4)
#define _XABORT_NESTED		(1 << 5)
/* Extract bits 31:24 of a status word (the 8-bit code given to _xabort()). */
#define _XABORT_CODE(x)		(((x) >> 24) & 0xff)

/* Ask the compiler to always inline the RTM helpers below. */
#define __rtm_force_inline __attribute__((__always_inline__)) inline

/*
 * Start an RTM transaction.
 *
 * Emits the raw bytes 0xc7 0xf8 followed by a 32-bit zero (the XBEGIN
 * opcode with a relative offset of 0), so the assembler does not need to
 * know the mnemonic. With a zero offset the abort target is the instruction
 * right after XBEGIN, so both the "started" and the "aborted" case continue
 * at the same place and are told apart by the value in EAX.
 *
 * Returns _XBEGIN_STARTED if EAX is left unchanged by the instruction,
 * otherwise the value the hardware placed in EAX (test it with the
 * _XABORT_* flags / _XABORT_CODE()).
 *
 * NOTE(review): _XBEGIN_STARTED is the unsigned value ~0u but is stored in
 * and returned as an int; comparisons against _XBEGIN_STARTED rely on the
 * usual arithmetic conversions.
 */
static __rtm_force_inline int _xbegin(void)
{
	int ret = _XBEGIN_STARTED;
	/* "+a": ret is preloaded into EAX and read back from it afterwards. */
	asm volatile(".byte 0xc7,0xf8 ; .long 0" : "+a" (ret) :: "memory");
	return ret;
}

/*
 * Commit the current RTM transaction.
 *
 * Emits the raw bytes 0x0f 0x01 0xd5 (the XEND opcode). The "memory"
 * clobber keeps the compiler from moving memory accesses across it.
 */
static __rtm_force_inline void _xend(void)
{
	 asm volatile(".byte 0x0f,0x01,0xd5" ::: "memory");
}

/*
 * Explicitly abort the current RTM transaction with an 8-bit code.
 *
 * Emits the raw bytes 0xc6 0xf8 (the XABORT opcode) followed by `status`
 * as the immediate byte. Because of the "i" constraint, `status` must be a
 * compile-time constant at the point of inlining.
 *
 * status: abort code; callers can read it back with _XABORT_CODE() from
 *         the value returned by _xbegin().
 */
static __rtm_force_inline void _xabort(const unsigned int status)
{
	asm volatile(".byte 0xc6,0xf8,%P0" :: "i" (status) : "memory");
}

/*
 * Report whether the caller is currently executing transactionally.
 *
 * Emits the raw bytes 0x0f 0x01 0xd6 (the XTEST opcode) and then captures
 * the zero flag with SETNZ.
 *
 * Returns 1 when ZF is clear after XTEST and 0 when it is set.
 * NOTE(review): per the Intel ISA reference, ZF clear means "inside a
 * transactional region" -- confirm against the SDM.
 */
static __rtm_force_inline int _xtest(void)
{
	unsigned char out;
	asm volatile(".byte 0x0f,0x01,0xd6 ; setnz %0" : "=r" (out) :: "memory");
	return out;
}

#endif

This is rtm.h code

When I run this program, execution never enters the transactional path (_xbegin() never returns _XBEGIN_STARTED)!

All threads run the fallback path!

Does this mean the lock really is busy all the time, or is my test code wrong for testing RTM?

Roman_D_Intel · ‎02-23-2015

Hi,

The problem is in the method you use for checking the lock state (pthread_mutex_trylock + pthread_mutex_unlock). It modifies the lock state, which causes conflicts on the lock variable, so the transaction will most likely always abort. The method used to check the lock state must not modify the lock state. Unfortunately, the standard pthread lock does not provide such a method.

You should rather use one of the proven TSX/RTM lock elision implementations: www.intel.com/software/tsx (Lock implementations with Intel TSX support).

Best regards,

Roman

Andreas_K_Intel · ‎02-25-2015

Also, if your glibc already does RTM lock elision (which several modern distributions do), pthread_mutex_trylock will always force an abort in order to enforce correct semantics. Otherwise trylock cannot know whether an elided lock is held or not.

jimdempseyatthecove · ‎02-26-2015

Additionally, the fail-to-begin (fallback) section redundantly uses both an omp critical and a mutex lock. If you are only using OpenMP threads, then the critical section is sufficient. If you are using a combination of OpenMP threads and pthreads of your own, then use the mutex.

I

/*
 * Simplified RTM test: each of the 20 parallel iterations adds data[i]
 * into sum[i], transactionally if the transaction starts and under an
 * OpenMP critical section otherwise.
 *
 * The array subscripts ([i]) were lost when the post was rendered and are
 * restored here. `mutex` is initialised but not used by this variant.
 */
void main(void)
{
   int i;
   int sum[20];
   int data[20];

   pthread_mutex_t mutex;

   pthread_mutex_init(&mutex,NULL);

   for(i=0;i<20;i++)
   {
      data[i]=i;
      sum[i]=0;
   }

   omp_set_num_threads(4);

#pragma omp parallel for private(i)
   for(i=0;i<20;i++)
   {
      if(_xbegin()==_XBEGIN_STARTED)
      {
         /* transactional execution */
         sum[i]+=data[i];
         _xend();
      }
      else
      {
         /* fallback path: transaction did not start or was aborted */
#pragma omp critical
         {
             sum[i]+=data[i]; // in this case an omp atomic would be better
         }
      }
   }
}

In the above code, the TSX section is unwarranted because each thread manipulates a different range of i (each iteration touches only its own sum[i]).

/*
 * Contended RTM test: the outer loop is split across four OpenMP threads
 * and every outer iteration updates all 20 elements of sum[], so all
 * threads compete for the same elements. Each element update is attempted
 * as its own transaction; on failure it is done under an OpenMP critical
 * section and counted in nCritical.
 *
 * Repairs relative to the post as rendered:
 *  - array subscripts ([i]) restored;
 *  - LOOP_CNT had no value (syntax error in the loop condition); 1000 is an
 *    arbitrary placeholder, pick whatever iteration count you want;
 *  - the printed TSX count now accounts for the 20 attempts per outer
 *    iteration (LOOP_CNT*20 attempts in total).
 */
void main(void)
{
   int i,j, nCritical;
   int sum[20];
   int data[20];

   pthread_mutex_t mutex;

   pthread_mutex_init(&mutex,NULL);

   for(i=0;i<20;i++)
   {
      data[i]=i;
      sum[i]=0;
   }
   nCritical = 0;
   omp_set_num_threads(4);
#define LOOP_CNT 1000
#pragma omp parallel for private(i,j)
   for(j=0;j<LOOP_CNT;j++)
   {
      // each thread now competing for same sum[i]
      for(i=0;i<20;i++)
      {
         if(_xbegin()==_XBEGIN_STARTED)
         {
            /* transactional execution */
            sum[i]+=data[i];
            _xend();
         }
         else
         {
            /* fallback path; nCritical is updated under the same critical */
#pragma omp critical
            {
               sum[i]+=data[i];
               ++nCritical;
            }
         }
      }
   }
   printf("TSX: %d, critical: %d\n", LOOP_CNT*20-nCritical, nCritical);
}

The above would better approximate your test. However, the zone of contention is only 2 cache lines and is highly contended. Therefore, it is expected to experience a very high number of aborted transactions. A better facsimile might be:

/*
 * Batched RTM test: each outer iteration does a little stand-in "work"
 * (three _mm_pause() calls) and then accumulates all 20 elements of data[]
 * into sum[] as ONE transaction. If the transaction does not start or
 * aborts, the same accumulation is done under an OpenMP critical section
 * and counted in nCritical.
 *
 * Repairs relative to the post as rendered:
 *  - array subscripts ([i]) restored;
 *  - LOOP_CNT had no value (syntax error in the loop condition); 1000 is an
 *    arbitrary placeholder;
 *  - the critical section is now braced so it covers ++nCritical as well as
 *    the loop (unbraced, the pragma applied to the for statement only and
 *    the shared counter was incremented unprotected).
 */
void main(void)
{
   int i,j, nCritical;
   int sum[20];
   int data[20];

   pthread_mutex_t mutex;

   pthread_mutex_init(&mutex,NULL);

   for(i=0;i<20;i++)
   {
      data[i]=i;
      sum[i]=0;
   }
   nCritical = 0;
   omp_set_num_threads(4);
#define LOOP_CNT 1000
#pragma omp parallel for private(i,j)
   for(j=0;j<LOOP_CNT;j++)
   {
      // arbitrary do short do work
      _mm_pause(); _mm_pause(); _mm_pause();
      // now results of do work accumulated into sum[]
      if(_xbegin()==_XBEGIN_STARTED)
      {
         /* transactional execution */
         for(i=0;i<20;i++)
         {
            sum[i]+=data[i];
         }
         _xend();
      }
      else
      {
         /* fallback path: transaction did not start or was aborted */
#pragma omp critical
         {
            for(i=0;i<20;i++)
            {
               sum[i]+=data[i];
            }
            ++nCritical;
         }
      }
   }
   printf("TSX: %d, critical: %d\n", LOOP_CNT-nCritical, nCritical);
}

Jim Dempsey

Roman_D_Intel · ‎02-26-2015

If sum is potentially updated by several threads then the usage of "#pragma omp critical" is wrong (race between the TSX execution and the fall-back). You must subscribe to the fall-back lock state inside the transaction (read the lock state) or use lock-free operations in the fall-back. Please consider this blog describing the general issue with the "#pragma omp critical" usage.

Best regards,

Roman

jimdempseyatthecove · ‎02-26-2015

In the sample code above, the protected regions are correct. *** however, in a practical application where the protected region manipulates cache lines or the same cache line multiple times, the above code outline is incorrect. In order to be correct, as Roman indicates you would need to add a pthread_mutex_is_locked(&mutex) function and do something like this:

/*
 * RTM test with lock subscription: the transactional path reads the state
 * of the fallback mutex inside the transaction and aborts if it is held;
 * the fallback path takes the mutex for real.
 *
 * pthread_mutex_is_locked() is hypothetical -- it is not part of pthreads
 * and must be supplied by the reader as a read-only test of the mutex
 * state.
 *
 * The array subscripts ([i]) were lost when the post was rendered and are
 * restored here.
 */
void main(void)
{
   int i;
   int sum[20];
   int data[20];

   pthread_mutex_t mutex;

   pthread_mutex_init(&mutex,NULL);

   for(i=0;i<20;i++)
   {
      data[i]=i;
      sum[i]=0;
   }

   omp_set_num_threads(4);

#pragma omp parallel for private(i)
   for(i=0;i<20;i++)
   {
      // *** hypothetical pthread_mutex_is_locked, you must add this function
      // First test (outside the transaction) skips starting a transaction
      // at all while the mutex is visibly held.
      if(!pthread_mutex_is_locked(&mutex) && _xbegin()==_XBEGIN_STARTED)
      {
         // Second test runs inside the transaction, so the mutex state is
         // read transactionally before the update.
         if(pthread_mutex_is_locked(&mutex))
            _xabort(0xff);

         /* transactional execution */
         sum[i]+=data[i];
         _xend();
      }
      else
      {
          /* fallback path: take the mutex for real */
          pthread_mutex_lock(&mutex);
          sum[i]+=data[i];
          pthread_mutex_unlock(&mutex);
      }
   }
}

The pthread_mutex_is_locked function can be an inline test of the mutex being not in free state. (you can extend this if you wish).

Read the pthread.h and its include files to determine how to create a generic test function. This should have been part of the standard mutex functions.

Jim Dempsey

TSX example code doesn't work