Intel® ISA Extensions

Why is this AVX code slower than SSE?

missing__zlw
Beginner
I converted a linear interpolation from SSE to AVX and it is almost 2x *slower*. Why?
The number of loads and calculations has been reduced, and I also changed it to use aligned loads.
SSE code:
__m128 xxx128_x_coeff1 = _mm_loadu_ps( &interp_coef_x[0] );
__m128 xxx128_x_coeff2 = _mm_set_ps( 0.0f, 0.0f, interp_coef_x[5], interp_coef_x[4]);
__m128 xxx128_data0a = _mm_loadu_ps( &pf[index_signal_start] );
__m128 xxx128_data0b = _mm_loadu_ps( &pf[index_signal_start+4] );
__m128 xxx128_data1a = _mm_loadu_ps( &pf[index_signal_start+nx] );
__m128 xxx128_data1b = _mm_loadu_ps( &pf[index_signal_start+nx+4] );
__m128 xxx128_data2a = _mm_loadu_ps( &pf[index_signal_start+nx2] );
__m128 xxx128_data2b = _mm_loadu_ps( &pf[index_signal_start+nx2+4] );
__m128 xxx128_data3a = _mm_loadu_ps( &pf[index_signal_start+nx3] );
__m128 xxx128_data3b = _mm_loadu_ps( &pf[index_signal_start+nx3+4] );
__m128 xxx128_data4a = _mm_loadu_ps( &pf[index_signal_start+nx4] );
__m128 xxx128_data4b = _mm_loadu_ps( &pf[index_signal_start+nx4+4] );
__m128 xxx128_data5a = _mm_loadu_ps( &pf[index_signal_start+nx5] );
__m128 xxx128_data5b = _mm_loadu_ps( &pf[index_signal_start+nx5+4] );
xxx128_data0a = _mm_mul_ps( xxx128_data0a, xxx128_x_coeff1 );
xxx128_data0b = _mm_mul_ps( xxx128_data0b, xxx128_x_coeff2 );
xxx128_data1a = _mm_mul_ps( xxx128_data1a, xxx128_x_coeff1 );
xxx128_data1b = _mm_mul_ps( xxx128_data1b, xxx128_x_coeff2 );
xxx128_data2a = _mm_mul_ps( xxx128_data2a, xxx128_x_coeff1 );
xxx128_data2b = _mm_mul_ps( xxx128_data2b, xxx128_x_coeff2 );
xxx128_data3a = _mm_mul_ps( xxx128_data3a, xxx128_x_coeff1 );
xxx128_data3b = _mm_mul_ps( xxx128_data3b, xxx128_x_coeff2 );
xxx128_data4a = _mm_mul_ps( xxx128_data4a, xxx128_x_coeff1 );
xxx128_data4b = _mm_mul_ps( xxx128_data4b, xxx128_x_coeff2 );
xxx128_data5a = _mm_mul_ps( xxx128_data5a, xxx128_x_coeff1 );
xxx128_data5b = _mm_mul_ps( xxx128_data5b, xxx128_x_coeff2 );
__m128 xxx128_sum = _mm_add_ps(_mm_mul_ps(_mm_add_ps(xxx128_data0a, xxx128_data0b),xxx128_y_coeff0), _mm_mul_ps(_mm_add_ps(xxx128_data1a, xxx128_data1b), xxx128_y_coeff1) );
__m128 xxx128_sum1 = _mm_add_ps(_mm_mul_ps(_mm_add_ps(xxx128_data2a, xxx128_data2b),xxx128_y_coeff2) , _mm_mul_ps(_mm_add_ps(xxx128_data3a, xxx128_data3b),xxx128_y_coeff3));
__m128 xxx128_sum2 = _mm_add_ps(_mm_mul_ps(_mm_add_ps(xxx128_data4a, xxx128_data4b),xxx128_y_coeff4), _mm_mul_ps(_mm_add_ps(xxx128_data5a, xxx128_data5b),xxx128_y_coeff5) );
xxx128_sum = _mm_add_ps(xxx128_sum, _mm_add_ps(xxx128_sum1, xxx128_sum2) );
xxx128_sum = _mm_add_ps(xxx128_sum, _mm_movehl_ps(xxx128_sum, xxx128_sum));     // fold the upper two elements onto the lower two
xxx128_sum = _mm_add_ss(xxx128_sum, _mm_shuffle_ps(xxx128_sum, xxx128_sum, 1)); // element 0 now holds the total
_mm_store_ss( &signal_value, xxx128_sum );
AVX code:
__m256 xxx256_x_coeff1 = _mm256_load_ps( &interp_coef_x[0] ); // load 8, only use 6 though
__m256 xxx256_data0a = _mm256_load_ps( &pf[index_signal_start] ); // load 8
__m256 xxx256_data1a = _mm256_load_ps( &pf[index_signal_start+nx] );
__m256 xxx256_data2a = _mm256_load_ps( &pf[index_signal_start+nx2] );
__m256 xxx256_data3a = _mm256_load_ps( &pf[index_signal_start+nx3] );
__m256 xxx256_data4a = _mm256_load_ps( &pf[index_signal_start+nx4] );
__m256 xxx256_data5a = _mm256_load_ps( &pf[index_signal_start+nx5] );
xxx256_data0a = _mm256_mul_ps( xxx256_data0a, xxx256_x_coeff1 );
xxx256_data1a = _mm256_mul_ps( xxx256_data1a, xxx256_x_coeff1 );
xxx256_data2a = _mm256_mul_ps( xxx256_data2a, xxx256_x_coeff1 );
xxx256_data3a = _mm256_mul_ps( xxx256_data3a, xxx256_x_coeff1 );
xxx256_data4a = _mm256_mul_ps( xxx256_data4a, xxx256_x_coeff1 );
xxx256_data5a = _mm256_mul_ps( xxx256_data5a, xxx256_x_coeff1 );
__m256 xxx256_sum = _mm256_add_ps(_mm256_mul_ps(xxx256_data0a,xxx256_y_coeff0),
_mm256_mul_ps(xxx256_data1a, xxx256_y_coeff1) );
__m256 xxx256_sum1 = _mm256_add_ps(_mm256_mul_ps(xxx256_data2a,xxx256_y_coeff2),
_mm256_mul_ps(xxx256_data3a, xxx256_y_coeff3) );
__m256 xxx256_sum2 = _mm256_add_ps(_mm256_mul_ps(xxx256_data4a,xxx256_y_coeff4),
_mm256_mul_ps(xxx256_data5a, xxx256_y_coeff5) );
xxx256_sum = _mm256_add_ps(xxx256_sum, _mm256_add_ps(xxx256_sum1, xxx256_sum2) );
xxx256_sum = _mm256_add_ps(xxx256_sum, _mm256_movehdup_ps(xxx256_sum)); // pairwise sums land in the even elements
xxx256_sum1 = _mm256_unpackhi_ps(xxx256_sum, xxx256_sum) ;
xxx256_sum1 = _mm256_add_ps(xxx256_sum, xxx256_sum1) ;                  // per-128-bit-lane totals in elements 0 and 4
xxx256_sum = _mm256_permute2f128_ps(xxx256_sum1, xxx256_sum1, 0x01) ;   // swap the two 128-bit lanes
xxx256_sum = _mm256_add_ps(xxx256_sum, xxx256_sum1);                    // grand total ends up in element 0
_mm256_store_ps( f, xxx256_sum );
signal_value = f[0];
Please help!!
7 Replies
capens__nicolas
New Contributor I
Quoting zlw
load 8, only use 6 though

What are the values of the other elements? Make sure they can't cause overflow/underflow and the like, as that may cause interrupts to be triggered. I'm not sure whether that still applies to Sandy Bridge, since it is supposed to handle these cases in hardware instead of microcode, but there might be exceptions.
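For example, a minimal sketch that forces the two unused coefficient lanes to zero with an AVX masked load (assuming interp_coef_x holds at least six valid floats; coeff_mask is just an illustrative name):

__m256i coeff_mask = _mm256_setr_epi32( -1, -1, -1, -1, -1, -1, 0, 0 ); // illustrative: load elements 0-5, zero elements 6-7
__m256 xxx256_x_coeff1 = _mm256_maskload_ps( &interp_coef_x[0], coeff_mask );

That at least makes the unused coefficient lanes a well-defined zero instead of whatever happens to follow the array.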

Also, have you taken a look at the actual assembly to check for unexpected instructions?

Brijender_B_Intel

One thing I could not understand: the SSE code uses unaligned loads but the AVX code uses aligned loads, yet it looks like the data is aligned in both cases. For the AVX code, however, I suggest using 128-bit loads and then vperm2f128 to put the data from the second 128-bit load into the upper lane. If you can guarantee there are no page faults or cache-line splits, the code above is fine; otherwise, 256-bit loads are more prone to them.
This kind of code has a big block of loads at the beginning, and that load cost is the same for SSE and AVX. AVX only gives an advantage by processing twice as many elements (8) per instruction compared to SSE (4), so you should expect a gain only from those instructions. It won't be 2x, because the code is memory limited.

missing__zlw
Beginner
Thanks.
The code is extracted from a real example, where we have to use unaligned loads. I tried to make AVX run faster, so I used aligned loads for AVX in the extracted code, but it is still slower than the SSE code.
How do you load 128 bits and then use _mm256_permute2f128_ps to build an __m256? The parameters of _mm256_permute2f128_ps are __m256 only.
Brijender_B_Intel

__m256 tmp0 = _mm256_castps128_ps256(_mm_load_ps(first 4element address));

__m256 tmp1 = _mm256_castps128_ps256(_mm_load_ps(first 4element address + 4)); <-- reading next 4 elements

dest = _mm256_insertf128_ps(tmp0, _mm256_castps256_ps128(tmp1), 0x01);

you can do the same thing with vperm2f128 (the _mm256_permute2f128_ps intrinsic):

dest = _mm256_permute2f128_ps( tmp0, _mm256_castps128_ps256(_mm_load_ps(first 4element address + 4)), cntrl );

If your code gets faster with this approach, then you are being hit by page faults/cache-line splits.
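Putting it together, a small helper along these lines can replace each 256-bit load in the original kernel (a sketch; load256_from_2x128 is just an illustrative name, and it assumes each source address is 16-byte aligned, otherwise use _mm_loadu_ps):

#include <immintrin.h>

static inline __m256 load256_from_2x128( const float *p )
{
    __m256 lo = _mm256_castps128_ps256( _mm_load_ps( p ) ); // elements 0-3 into the lower lane
    __m128 hi = _mm_load_ps( p + 4 );                       // elements 4-7
    return _mm256_insertf128_ps( lo, hi, 0x01 );            // place them in the upper lane
}

// usage, e.g.: __m256 xxx256_data0a = load256_from_2x128( &pf[index_signal_start] );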

missing__zlw
Beginner
Thanks, I will try.
bronxzv
New Contributor II

__m256 tmp0 = _mm256_castps128_ps256(_mm_load_ps(first 4element address));

__m256 tmp1 = _mm256_castps128_ps256(_mm_load_ps(first 4element address + 4)); <-- reading next 4 elements

dest = _mm256_insertf128_ps(tmp0, _mm256_castps256_ps128(tmp1), 0x01);


hint: with the Intel compiler, _mm256_loadu_ps does all of that for you
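In source form that is just a single unaligned load, e.g.:

__m256 xxx256_data0a = _mm256_loadu_ps( &pf[index_signal_start] ); // unaligned 256-bit load; the Intel compiler emits the split 128-bit load sequence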

missing__zlw
Beginner
Thanks. That seems to go back to my original implementation.