I converted a linear-interpolation routine from SSE to AVX, and the AVX version is almost 2x *slower*. Why?
The AVX version performs fewer loads and fewer calculations, and I also switched to aligned loads.
SSE code:
// SSE version: weights six rows of pf by six x coefficients, scales each row
// by its y coefficient, and reduces everything to the scalar signal_value.
// xxx128_y_coeff0..5 are defined outside this excerpt.

// x coefficients 0..3 in the first vector; coefficients 4 and 5 in lanes 0
// and 1 of the second vector, with lanes 2 and 3 explicitly zero.
__m128 xxx128_x_coeff1 = _mm_loadu_ps( &interp_coef_x[0] );
__m128 xxx128_x_coeff2 = _mm_set_ps( 0.0f, 0.0f, interp_coef_x[5], interp_coef_x[4]);
// Unaligned loads of 8 consecutive floats (two vectors, "a" and "b") from each
// of six rows, starting at index_signal_start plus 0, nx, nx2, nx3, nx4, nx5.
// NOTE(review): nx2..nx5 are presumably 2*nx..5*nx -- confirm against caller.
__m128 xxx128_data0a = _mm_loadu_ps( &pf[index_signal_start] );
__m128 xxx128_data0b = _mm_loadu_ps( &pf[index_signal_start+4] );
__m128 xxx128_data1a = _mm_loadu_ps( &pf[index_signal_start+nx] );
__m128 xxx128_data1b = _mm_loadu_ps( &pf[index_signal_start+nx+4] );
__m128 xxx128_data2a = _mm_loadu_ps( &pf[index_signal_start+nx2] );
__m128 xxx128_data2b = _mm_loadu_ps( &pf[index_signal_start+nx2+4] );
__m128 xxx128_data3a = _mm_loadu_ps( &pf[index_signal_start+nx3] );
__m128 xxx128_data3b = _mm_loadu_ps( &pf[index_signal_start+nx3+4] );
__m128 xxx128_data4a = _mm_loadu_ps( &pf[index_signal_start+nx4] );
__m128 xxx128_data4b = _mm_loadu_ps( &pf[index_signal_start+nx4+4] );
__m128 xxx128_data5a = _mm_loadu_ps( &pf[index_signal_start+nx5] );
__m128 xxx128_data5b = _mm_loadu_ps( &pf[index_signal_start+nx5+4] );
// Apply the x weights. The zero lanes of xxx128_x_coeff2 cancel samples 6 and
// 7 of every row, so only six samples per row contribute to the result.
xxx128_data0a = _mm_mul_ps( xxx128_data0a, xxx128_x_coeff1 );
xxx128_data0b = _mm_mul_ps( xxx128_data0b, xxx128_x_coeff2 );
xxx128_data1a = _mm_mul_ps( xxx128_data1a, xxx128_x_coeff1 );
xxx128_data1b = _mm_mul_ps( xxx128_data1b, xxx128_x_coeff2 );
xxx128_data2a = _mm_mul_ps( xxx128_data2a, xxx128_x_coeff1 );
xxx128_data2b = _mm_mul_ps( xxx128_data2b, xxx128_x_coeff2 );
xxx128_data3a = _mm_mul_ps( xxx128_data3a, xxx128_x_coeff1 );
xxx128_data3b = _mm_mul_ps( xxx128_data3b, xxx128_x_coeff2 );
xxx128_data4a = _mm_mul_ps( xxx128_data4a, xxx128_x_coeff1 );
xxx128_data4b = _mm_mul_ps( xxx128_data4b, xxx128_x_coeff2 );
xxx128_data5a = _mm_mul_ps( xxx128_data5a, xxx128_x_coeff1 );
xxx128_data5b = _mm_mul_ps( xxx128_data5b, xxx128_x_coeff2 );
// For each row, add its two halves lane-wise and scale by that row's y
// coefficient; rows are accumulated pairwise into three partial sums.
__m128 xxx128_sum = _mm_add_ps(_mm_mul_ps(_mm_add_ps(xxx128_data0a, xxx128_data0b),xxx128_y_coeff0), _mm_mul_ps(_mm_add_ps(xxx128_data1a, xxx128_data1b), xxx128_y_coeff1) );
__m128 xxx128_sum1 = _mm_add_ps(_mm_mul_ps(_mm_add_ps(xxx128_data2a, xxx128_data2b),xxx128_y_coeff2) , _mm_mul_ps(_mm_add_ps(xxx128_data3a, xxx128_data3b),xxx128_y_coeff3));
__m128 xxx128_sum2 = _mm_add_ps(_mm_mul_ps(_mm_add_ps(xxx128_data4a, xxx128_data4b),xxx128_y_coeff4), _mm_mul_ps(_mm_add_ps(xxx128_data5a, xxx128_data5b),xxx128_y_coeff5) );
// Combine the three partial sums into one vector of four lane totals.
xxx128_sum = _mm_add_ps(xxx128_sum, _mm_add_ps(xxx128_sum1, xxx128_sum2) );
// Horizontal sum: fold the upper two lanes onto the lower two ...
xxx128_sum = _mm_add_ps(xxx128_sum, _mm_movehl_ps(xxx128_sum, xxx128_sum));
// ... then add lane 1 into lane 0 (shuffle immediate 1 moves lane 1 to lane 0).
xxx128_sum = _mm_add_ss(xxx128_sum, _mm_shuffle_ps(xxx128_sum, xxx128_sum, 1));
// Lane 0 now holds the complete sum; write it to the scalar result.
_mm_store_ss( &signal_value, xxx128_sum );
AVX code:
__m256 xxx256_x_coeff1 = _mm256_load_ps( &interp_coef_x[0] ); // load 8, only use 6 though
__m256 xxx256_data0a = _mm256_load_ps( &pf[index_signal_start] ); // load 8
__m256 xxx256_data1a = _mm256_load_ps( &pf[index_signal_start+nx] );
__m256 xxx256_data2a = _mm256_load_ps( &pf[index_signal_start+nx2] );
__m256 xxx256_data3a = _mm256_load_ps( &pf[index_signal_start+nx3] );
__m256 xxx256_data4a = _mm256_load_ps( &pf[index_signal_start+nx4] );
__m256 xxx256_data5a = _mm256_load_ps( &pf[index_signal_start+nx5] );
xxx256_data0a = _mm256_mul_ps( xxx256_data0a, xxx256_x_coeff1 );
xxx256_data1a = _mm256_mul_ps( xxx256_data1a, xxx256_x_coeff1 );
xxx256_data2a = _mm256_mul_ps( xxx256_data2a, xxx256_x_coeff1 );
xxx256_data3a = _mm256_mul_ps( xxx256_data3a, xxx256_x_coeff1 );
xxx256_data4a = _mm256_mul_ps( xxx256_data4a, xxx256_x_coeff1 );
xxx256_data5a = _mm256_mul_ps( xxx256_data5a, xxx256_x_coeff1 );
__m256 xxx256_sum = _mm256_add_ps(_mm256_mul_ps(xxx256_data0a,xxx256_y_coeff0),
_mm256_mul_ps(xxx256_data1a, xxx256_y_coeff1) );
__m256 xxx256_sum1 = _mm256_add_ps(_mm256_mul_ps(xxx256_data2a,xxx256_y_coeff2),
_mm256_mul_ps(xxx256_data3a, xxx256_y_coeff3) );
__m256 xxx256_sum2 = _mm256_add_ps(_mm256_mul_ps(xxx256_data4a,xxx256_y_coeff4),
_mm256_mul_ps(xxx256_data5a, xxx256_y_coeff5) );
xxx256_sum = _mm256_add_ps(xxx256_sum, _mm256_add_ps(xxx256_sum1, xxx256_sum2) );
xxx256_sum = _mm256_add_ps(xxx256_sum, _mm256_movehdup_ps(xxx256_sum));
xxx256_sum1 = _mm256_unpackhi_ps(xxx256_sum, xxx256_sum) ;
xxx256_sum1 = _mm256_add_ps(xxx256_sum, xxx256_sum1) ;
xxx256_sum = _mm256_permute2f128_ps(xxx256_sum1, xxx256_sum1, 0x01) ;
xxx256_sum = _mm256_add_ps(xxx256_sum, xxx256_sum1);
_mm256_store_ps( f, xxx256_sum );
signal_value = f[0];
Please help!!