[cpp]void TestPreMLerpSSETBB(unsigned char* pDest, unsigned char* pSrc, int count)
{
//
// D = D + S - (D*S)/255
//
class Blend
{
public:
Blend(__m128i* dest, __m128i* source) : dest_(dest), source_(sour...*S)
rb = _mm_srli_epi16(rb, 8); // prepack and div [(D*S)]/255
// TODO: check assembly for prefetch
_mm_prefetch(reinterpret_cast(&dest_[n+PSD]) , _... *S)
ag = _mm_and_si128(ag, himask); // prepack and div [(D*S)]/255
rb = _mm_or_si128(rb, ag); // pack
rb = _mm_sub_epi8(s, rb); // sub S-[(D*S)/255]
d = _mm_add_epi8(d, rb); // add D+[S-(D*S)/255]
_mm_store_si128(&dest_, d); *S)
// UNROLL 2 //
d = _mm_load_si128(&dest_[n+1]);
s =...
rb = _mm_srli_epi16(rb, 8); // prepack and div [(D*S)]/255
ag = _mm_srli_epi16(s, 8); // unpack
ag = _mm_mullo_epi16(ag, a); // mul (D*S)
ag = _mm_and_si128(ag, himask); // prepack and div [(D*S)]/255
rb = _mm_or_si128(rb, ag); // pack
rb = _mm_sub_epi8(s, rb); // sub S-[(D*S)/255]
d = _mm_add_epi8(d, rb); // add D+[S-(D*S)/255]
_mm_store_si128(&dest_[n+1], d);
// UNROLL 3 //
d = _mm_load_si128(&dest_[n+2]);
...
*S)
rb = _mm_srli_epi16(rb, 8); // prepack and div [(D*S)]/255
_mm_prefetch(reinterpret_cast(&source_[n+PSD]) , _MM_HINT_NTA); *S)
ag = _mm_srli_epi16(s...
ag = _mm_and_si128(ag, himask); // prepack and div [(D*S)]/255
rb = _mm_or_si128(rb, ag); // pack
rb = _mm_sub_epi8(s, rb); // sub S-[(D*S)/255]
d = _mm_add_epi8(d, rb); // add D+[S-(D*S)/255]
_mm_store_si128(&dest_[n+2], d);
// UNROLL 4 //
d = _mm_load_si128(&dest_[n+3]);
...
*S)
rb = _mm_srli_epi16(rb, 8); // prepack and div [(D*S)]/255
ag = _mm_srli_epi16(s, 8); // unpack
ag = _mm_mullo_epi16(ag, a); // mul (D*S)
ag = _mm_and_si128(ag, himask); // prepack and div [(D*S)]/255
rb = _mm_or_si128(rb, ag); // pack
rb = _mm_sub_epi8(s, rb); // sub S-[(D*S)/255]
d = _mm_add_epi8(d, rb); // add D+[S-(D*S)/255]
_mm_store_si128(&dest_[n+3], d);
}
}
private:
__m128i* dest_;
__m128i* source_;
...
Link Copied
[cpp]double time = 0.0;
_asm emms;
for(int i=0;i{
InitData(pDest, pSrc);
for(int n = 0; n < dataSize()/64; ++n) // flush cache
{
_mm_clflush(pDest+n*64);
_mm_clflush(pSrc+n*64);
}
global_pTimer->GetTimespan();
(*pFn)(pDest, pSrc, count);
time += global_pTimer->GetTimespan();
}
_asm emms;
return time/(double)TestCount;[/cpp]
For more complete information about compiler optimizations, see our Optimization Notice.