Software Archive
Read-only legacy content

Intrinsic function on MIC (512)

Guillaume_S_
Beginner

Hey everyone,

 

I'm working on a simple financial application (actually benchmarking CPU vs. MIC). The first version of the code does not use intrinsic functions (the compiler vectorizes the loops), and I wanted to try intrinsics. Here is my problem: on the CPU I observe a performance gain of about 30% with the __m256 intrinsics (vs. the CPU without intrinsics), but on the MIC the __m512 version (OpenMP + intrinsics) performs worse than the MIC without intrinsics. Is this normal?

 

I cannot post the code because it is too big, but I can try to reproduce the problem in a small piece of code.

 

Thank you

 

GS

TimP
Honored Contributor III

Among many possible explanations of your observations:

Auto-vectorization in the absence of pragmas is necessarily more aggressive for the MIC target. It may also vary with the host ISA or optimization flags.

Intrinsics may inhibit the compiler's instruction scheduling, which matters more on MIC.

If your code is large, the investigation may be time consuming. You might compare opt-report4 output on your hot spots, as in the example below.
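
For example, something along these lines should emit per-loop vectorization details for both builds; the exact flag spelling varies with the compiler version, so treat this as a sketch:

icpc prog.cpp -O3 -openmp -opt-report 4 -vec-report6     (older compiler versions)
icpc prog.cpp -O3 -openmp -qopt-report=4                 (newer compiler versions)

Comparing the reports for the intrinsics and non-intrinsics builds should show where the generated code differs.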

 

Guillaume_S_
Beginner

Thank you for your response, I'll try to look into the vectorization report.

 

I posted an example below if you want to reproduce it.

 

Sunny_G_Intel
Employee

Hello Guillaume,

Can you please share the link where you posted your example? I do not see any attachment on this thread. If you do not want to share your code publicly, you can send us a private message so we can investigate the issue further.

Thanks!

Guillaume_S_
Beginner

To compile for MIC + intrinsic: icpc prog.cpp -O3 -openmp -DWITH_INTR
To compile for MIC: icpc prog.cpp -O3 -openmp

To compile for CPU + intrinsic: icpc prog.cpp -O3 -openmp -DWITH_INTR -no-offload
To compile for CPU: icpc prog.cpp -O3 -openmp -no-offload

MIC+INTR ~ 5.18 sec
MIC      ~ 4.75 sec

CPU+INTR ~ 4.63 sec
CPU      ~ 6.47 sec

CPU: Intel(R) Xeon(R) CPU E5-2680 0 @ 2.70GHz | Sandy Bridge (2 x 8 cores, 32 threads)
MIC: Intel(R) Xeon Phi(TM) coprocessor x100 family (61 cores - 244 threads)

#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <offload.h>
#include <math.h>
#include <immintrin.h>

#define N (2<<17)
#define P (2<<14)

__declspec(target(mic:0)) void testVctr( double *a, double *b, double *c )
{
	__assume_aligned( a, 64 );
	__assume_aligned( b, 64 );
	__assume_aligned( c, 64 );

	int i;
	int j;
	int k;

	#ifdef WITH_INTR
		#ifdef __MIC__
			__m512d  n1    = _mm512_set1_pd( 1. );
			__m512d  n1024 = _mm512_set1_pd( 1024. );
			__m512d  n230  = _mm512_set1_pd( 230. );
		#else
			__m256d n1    = _mm256_set1_pd( 1. );
			__m256d n1024 = _mm256_set1_pd( 1024. );
			__m256d n230  = _mm256_set1_pd( 230. );
		#endif
	#endif

	#pragma omp parallel for private( i, j, k ) schedule( dynamic )
	for( i=0; i<N; ++i )
	{
		#ifdef WITH_INTR
			#ifdef __MIC__	
				double *A = (double *) _mm_malloc( (size_t)( (8) * sizeof(double) ), 64 );

				__m512d res   = _mm512_setzero_pd(), r0, r1;

				for( j=0; j<P; j+=8 )
				{
					r0 = _mm512_set_pd( b[j+7], b[j+6], b[j+5], b[j+4], b[j+3], b[j+2], b[j+1], b[j] );
					r0 = _mm512_add_pd( r0, n1 );
					r0 = _mm512_div_pd( n1, r0 );
					r0 = _mm512_exp_pd( r0 );
					
					r1 = _mm512_set_pd( c[j+7], c[j+6], c[j+5], c[j+4], c[j+3], c[j+2], c[j+1], c[j] );
					r1 = _mm512_mul_pd( r1, n1024 );
					r1 = _mm512_add_pd( r1, n230 );
					r1 = _mm512_log_pd( r1 );
				
					r0 = _mm512_div_pd( r0, r1 );

					res = _mm512_add_pd( res, r0 );
				}

				_mm512_store_pd( A, res );

				double tmp(0.);
				for( k=0; k<8; ++k )
					tmp += A[k];

				a[i] = tmp;

				_mm_free( (double *) A );

			#else
				double *A = (double *) _mm_malloc( (size_t)( (4) * sizeof(double) ), 64 );

				__m256d res   = _mm256_setzero_pd(), r0, r1;

				for( j=0; j<P; j+=4 )
				{
					r0 = _mm256_set_pd( b[j+3], b[j+2], b[j+1], b[j] );
					r0 = _mm256_add_pd( r0, n1 );
					r0 = _mm256_div_pd( n1, r0 );
					r0 = _mm256_exp_pd( r0 );
					
					r1 = _mm256_set_pd( c[j+3], c[j+2], c[j+1], c[j] );
					r1 = _mm256_mul_pd( r1, n1024 );
					r1 = _mm256_add_pd( r1, n230 );
					r1 = _mm256_log_pd( r1 );
				
					r0 = _mm256_div_pd( r0, r1 );

					res = _mm256_add_pd( res, r0 );
				}

				_mm256_store_pd( A, res );

				double tmp(0.);
				for( k=0; k<4; ++k )
					tmp += A[k];

				a[i] = tmp;

				_mm_free( (double *) A );

			#endif
		#else
			double res = 0.;

			for( j=0; j<P; ++j )
			{
				double tmp0 = 1./(b[j]+1.);
				double tmp1 = exp( tmp0 );
				double tmp2 = c[j] * 1024;
				double tmp3 = tmp2 + 230;
				double tmp4 = log( tmp3 );
				double tmp5 = tmp1 / tmp4;
				res += tmp5;
			}

			a[i] = res;
		#endif
	}
}

int main( void )
{
	int i;

	printf("\nOuter loop (N) %d iterations \nInner loop (P) %d iterations\n", N, P );

	double * a = (double *) _mm_malloc( (size_t)( (N) * sizeof(double) ), 64 );
	double * b = (double *) _mm_malloc( (size_t)( (P) * sizeof(double) ), 64 );
	double * c = (double *) _mm_malloc( (size_t)( (P) * sizeof(double) ), 64 ); 

	for( i=0; i<P; ++i )
	{
		b[i] = rand()/RAND_MAX;
		c[i] = rand()/RAND_MAX;
	}
	#pragma offload target( mic : 0 ) \
	out( a : length( N ) align(512) ) \
	in ( b : length( P ) align(512) ) \
	in ( c : length( P ) align(512) )
	testVctr( a, b, c );		

	printf( "\nCheck last result: %f (~ 1.)\n", a[N-1]*2./(P) );

	_mm_free( (double *) a );
	_mm_free( (double *) b );
	_mm_free( (double *) c );
	
	return 0;
}
Leonardo_B_Intel
Employee

Hello,

Have you considered replacing the lines

r0 = _mm512_set_pd( b[j+7], b[j+6], b[j+5], b[j+4], b[j+3], b[j+2], b[j+1], b[j] );

and

r1 = _mm512_set_pd( c[j+7], c[j+6], c[j+5], c[j+4], c[j+3], c[j+2], c[j+1], c[j] );

by something like

r0 = _mm512_load_pd( &b[j] );

and

r1 = _mm512_load_pd( &c[j] );

respectively? Glancing at the code, I assume this should be possible.

Those SET commands can be very expensive (they generate many instructions) compared with a straight memory load of all the elements. And my guess is that the compiler can figure out from the high-level code that all the arrays are aligned for the load, since all the declarations are in the same program scope.
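
Concretely, the MIC inner loop would then look something like this (just a sketch; it assumes b and c keep the 64-byte alignment from _mm_malloc, which your __assume_aligned hints already claim):

for( j=0; j<P; j+=8 )
{
	r0 = _mm512_load_pd( &b[j] );   /* one aligned 8-element load instead of set_pd */
	r0 = _mm512_add_pd( r0, n1 );
	r0 = _mm512_div_pd( n1, r0 );
	r0 = _mm512_exp_pd( r0 );

	r1 = _mm512_load_pd( &c[j] );   /* likewise for c */
	r1 = _mm512_mul_pd( r1, n1024 );
	r1 = _mm512_add_pd( r1, n230 );
	r1 = _mm512_log_pd( r1 );

	r0 = _mm512_div_pd( r0, r1 );
	res = _mm512_add_pd( res, r0 );
}

The __m256 branch can be changed the same way with _mm256_load_pd.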

Best,

Leo.

 

 

Guillaume_S_
Beginner

Thank you for your response. I tried what you suggested; here are the results:

before

MIC+INTR ~ 5.18 sec
MIC      ~ 4.75 sec

CPU+INTR ~ 4.63 sec
CPU      ~ 6.47 sec

after

MIC+INTR ~ 4.74 sec
MIC      ~ 4.75 sec

CPU+INTR ~ 4.31 sec
CPU      ~ 6.47 sec

It's better, but it still doesn't outperform auto-vectorization on the MIC. Maybe something else is wrong in my code, or have I forgotten to specify some pragma directive?

 

GS
