- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
How can I write the following code using SSE2 intrinsics?
// Applies an in-place 4-point butterfly to each of the four consecutive
// groups of four shorts in the 16-element buffer that btr points to.
//
// For a group (a0, a1, a2, a3) the results are:
//   out0 = (a0 + a3) + (a1 + a2)
//   out1 = (a1 - a2) + 2 * (a0 - a3)
//   out2 = (a0 + a3) - (a1 + a2)
//   out3 = (a0 - a3) - 2 * (a1 - a2)
//
// btr must point to at least 16 writable shorts.
//
// NOTE(review): the statements computing f0, f3 and b[j] were truncated in
// the copy this was taken from and have been reconstructed from the symmetry
// of the remaining statements -- confirm against the original code.
// NOTE(review): j is not declared in this function; presumably it is a
// member of edwin or otherwise declared elsewhere -- confirm.
void edwin::add(void* btr)
{
    short* b = (short*)btr;
    for (j = 0; j < 16; j += 4)
    {
        // Sums and differences of the outer pair and of the inner pair.
        int f0 = (int)(b[j] + b[j+3]);
        int f3 = (int)(b[j] - b[j+3]);
        int f1 = (int)(b[j+1] + b[j+2]);
        int f2 = (int)(b[j+1] - b[j+2]);

        b[j]   = (short)(f0 + f1);
        b[j+2] = (short)(f0 - f1);
        // Multiply by 2 rather than shifting left: f2 and f3 may be
        // negative, and the product is the same value.
        b[j+1] = (short)(f2 + (f3 * 2));
        b[j+3] = (short)(f3 - (f2 * 2));
    }
}
Link Copied
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
__declspec(align(16))__m128i t0,t1,temp0,temp1,temp2,temp3,temp4,f0,f1,f2,f3;
__declspec(align(16)) __m128i*b = (__m128i*)btr;
t0 = _mm_loadu_si128(b);//1,1,1,1,2,2,2,2
t1 = _mm_loadu_si128(b+1);//3,3,3,3,4,4,4,4
f0 = _mm_loadl_epi64(b);//1,1,1,1,0,0,0,0
f1 = _mm_loadl_epi64(b+1);//3,3,3,3,0,0,0,0
f2 = _mm_unpackhi_epi64(t0,f0);//2,2,2,2,0,0,0,0
f3 = _mm_unpackhi_epi64(t1,f1);//4,4,4,4,0,0,0,0
temp0 = f0;
temp1 = f1;
temp2 = f2;
temp3 = f3;
temp0 = _mm_add_epi16(temp0, f3);
temp1 = _mm_add_epi16(temp1, f2);
f0 = _mm_sub_epi16(f0, f3);
f1 = _mm_sub_epi16(f1, f2);
temp4 = temp0;
temp4 = _mm_add_epi16(temp4, temp1);
_mm_store_si128(b, temp4);
temp0 = _mm_sub_epi16(temp0, temp1);
_mm_store_si128(b+2, temp0);
temp1 = f0;
temp4 = f1;
temp1 = _mm_slli_epi16(temp1, 1);
temp4 = _mm_slli_epi16(temp4, 1);
f0 = _mm_add_epi16(f0, temp4);
f1 = _mm_sub_epi16(f1, temp1);
_mm_store_si128(b+1, f0);
_mm_store_si128(b+3, f1);
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
If the buffer is static, use __declspec(align(...)) to align it.
If it is allocated dynamically, use _aligned_malloc and _aligned_free.
If it is just arbitrary memory whose alignment you have no control over, use the unaligned versions of the functions (replace load/store with loadu/storeu).

- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page