/// Horizontal minimum of the four packed floats in `a`.
///
/// Reduces in two steps: the upper pair of lanes is folded onto the lower
/// pair, then lane 1 is folded onto lane 0.
///
/// @param a  Four single-precision lanes a0..a3 (a0 is the lowest lane).
/// @return   The smallest of a0, a1, a2 and a3.
float min_reduce(__m128 a) {
    // movehl copies lanes 2,3 into lanes 0,1 of the second operand, so:
    //   a = { min(a0, a2), min(a1, a3), a2, a3 }
    a = _mm_min_ps(a, _mm_movehl_ps(a, a));

    // Broadcast lane 1; min_ss only touches lane 0, so:
    //   a = { min(a0, a1, a2, a3), min(a1, a3), a2, a3 }
    a = _mm_min_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));

    // Extract lane 0 directly instead of storing through a temporary.
    return _mm_cvtss_f32(a);
}
// Original poster's note: It seems to work (at least for the cases I tested).
// But it looks more complicated to me than necessary. Is there something more
// efficient, or is this the best I can get already?
Link Copied
/// Horizontal minimum of the eight signed 16-bit lanes of `a`.
///
/// Three shuffle+min stages halve the number of candidate lanes each time
/// (8 -> 4 -> 2 -> 1); the result ends up in word 0.
///
/// @param a  Eight packed signed 16-bit values w0..w7 (w0 is the lowest word).
/// @return   The smallest of w0..w7.
short min(__m128i a) {
    // Swap the two 64-bit halves: words 0..3 become min(w[i], w[i+4]).
    a = _mm_min_epi16(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));

    // Swap word pairs (0,1)<->(2,3) in the low half: words 0 and 1 each
    // hold the minimum of four of the original lanes.
    a = _mm_min_epi16(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));

    // Broadcast word 1 over the low half: word 0 becomes the overall minimum.
    a = _mm_min_epi16(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));

    // The low 32 bits carry the result in their low word; narrowing to short
    // keeps only those 16 bits (the "& 0xffff" is implicit in the conversion).
    return static_cast<short>(_mm_cvtsi128_si32(a));
}
// Original poster's note: That's quite a long dependency chain:
For more complete information about compiler optimizations, see our Optimization Notice.