- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
How do I vectorise the first loop using SSE2 intrinsics to get faster code?
#include "stdafx.h"
#include <iostream>
using namespace std;

// Scalar reference version of the two-step 4x4 butterfly the question asks to
// vectorise. Each loop iteration j processes one column: step 1 combines rows
// of b into a, step 2 combines rows of a into c. Both result matrices are then
// printed row by row.
//
// NOTE(review): the [j] / [i][j] subscripts were missing from the posted
// listing and are reconstructed here as [row][j]; confirm the orientation
// against the original code.
int _tmain(int argc, _TCHAR* argv[])
{
    short b[4][4] = {{1, 1, 1, 1}, {2, 2, 2, 2}, {3, 3, 3, 3}, {4, 4, 4, 4}};
    short a[4][4];
    short c[4][4];

    for (int j = 0; j < 4; j++)
    {
        // step 1
        a[0][j] = b[0][j] + b[3][j];
        a[1][j] = b[1][j] + b[2][j];
        a[2][j] = b[1][j] - b[2][j];
        a[3][j] = b[0][j] - b[3][j];

        // step 2
        // The doubling is written as "* 2" rather than "<< 1": a[2] and a[3]
        // are negative for this input, and left-shifting a negative signed
        // value is undefined behaviour before C++20.
        c[0][j] = a[0][j] + a[1][j];
        c[1][j] = a[2][j] + a[3][j] * 2;
        c[2][j] = a[0][j] - a[1][j];
        c[3][j] = a[3][j] - a[2][j] * 2;
    }

    // Print the intermediate matrix a.
    for (int i = 0; i < 4; i++)
    {
        for (int j = 0; j < 4; j++)
            cout << a[i][j] << " ";
        cout << '\n';
    }

    // Blank line between the two matrices.
    cout << '\n';

    // Print the result matrix c.
    for (int i = 0; i < 4; i++)
    {
        for (int j = 0; j < 4; j++)
            cout << c[i][j] << " ";
        cout << '\n';
    }
    return 0;
}
#include "stdafx.h"
#include <iostream>
using namespace std;

// Scalar reference version of the two-step 4x4 butterfly the question asks to
// vectorise. Each loop iteration j processes one column: step 1 combines rows
// of b into a, step 2 combines rows of a into c. Both result matrices are then
// printed row by row.
//
// NOTE(review): the [j] / [i][j] subscripts and the right-hand sides of the
// assignments were missing from this copy of the listing and are
// reconstructed here as [row][j]; confirm against the original code.
int _tmain(int argc, _TCHAR* argv[])
{
    short b[4][4] = {{1, 1, 1, 1}, {2, 2, 2, 2}, {3, 3, 3, 3}, {4, 4, 4, 4}};
    short a[4][4];
    short c[4][4];

    for (int j = 0; j < 4; j++)
    {
        // step 1
        a[0][j] = b[0][j] + b[3][j];
        a[1][j] = b[1][j] + b[2][j];
        a[2][j] = b[1][j] - b[2][j];
        a[3][j] = b[0][j] - b[3][j];

        // step 2
        // The doubling is written as "* 2" rather than "<< 1": a[2] and a[3]
        // are negative for this input, and left-shifting a negative signed
        // value is undefined behaviour before C++20.
        c[0][j] = a[0][j] + a[1][j];
        c[1][j] = a[2][j] + a[3][j] * 2;
        c[2][j] = a[0][j] - a[1][j];
        c[3][j] = a[3][j] - a[2][j] * 2;
    }

    // Print the intermediate matrix a.
    for (int i = 0; i < 4; i++)
    {
        for (int j = 0; j < 4; j++)
            cout << a[i][j] << " ";
        cout << '\n';
    }

    // Blank line between the two matrices.
    cout << '\n';

    // Print the result matrix c.
    for (int i = 0; i < 4; i++)
    {
        for (int j = 0; j < 4; j++)
            cout << c[i][j] << " ";
        cout << '\n';
    }
    return 0;
}
Link Copied
1 Reply
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
It looks like you have asked the same question twice. I will answer here.
1. Is the loop count really 4 (j<4)? If so, there is no point in vectorizing it. To make it faster, just unroll it completely, without any loop; you may get a small benefit. To unroll, you can write a simple C macro and invoke it 4 times.
2. Assuming that you actually have a big loop, you may be able to vectorize it. (Secondly, I don't know whether you still need the array a afterwards or whether its elements are just temporary variables. Assuming that you do need them, your SSE code will look like this; you may need to fix it up a little here and there):
// SSE2 version of the scalar loop: each 16-bit lane holds one column j, so a
// single pass replaces all four loop iterations. Requires <emmintrin.h>.
//
// A row of b/c is 4 shorts = 64 bits, so the 64-bit load/store forms are used.
// A full 128-bit _mm_load_si128/_mm_store_si128 would touch 8 shorts (running
// past the row, and past the end of the array for row 3) and also requires a
// 16-byte-aligned address, which rows 1 and 3 cannot have.
// NOTE(review): if the real rows hold 8 or more shorts, switch to
// _mm_loadu_si128/_mm_storeu_si128 and step 8 columns per iteration.
const __m128i row0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(b[0]));
const __m128i row1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(b[1]));
const __m128i row2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(b[2]));
const __m128i row3 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(b[3]));

// step 1
const __m128i a0 = _mm_add_epi16(row0, row3);  // a[0] = b[0] + b[3]
const __m128i a1 = _mm_add_epi16(row1, row2);  // a[1] = b[1] + b[2]
const __m128i a2 = _mm_sub_epi16(row1, row2);  // a[2] = b[1] - b[2]
const __m128i a3 = _mm_sub_epi16(row0, row3);  // a[3] = b[0] - b[3]

// step 2
const __m128i c0 = _mm_add_epi16(a0, a1);                      // c[0] = a[0] + a[1]
const __m128i c1 = _mm_add_epi16(a2, _mm_slli_epi16(a3, 1));   // c[1] = a[2] + 2*a[3]
const __m128i c2 = _mm_sub_epi16(a0, a1);                      // c[2] = a[0] - a[1]
const __m128i c3 = _mm_sub_epi16(a3, _mm_slli_epi16(a2, 1));   // c[3] = a[3] - 2*a[2]

// Only the low 64 bits (4 shorts) of each result are written back.
_mm_storel_epi64(reinterpret_cast<__m128i*>(c[0]), c0);
_mm_storel_epi64(reinterpret_cast<__m128i*>(c[1]), c1);
_mm_storel_epi64(reinterpret_cast<__m128i*>(c[2]), c2);
_mm_storel_epi64(reinterpret_cast<__m128i*>(c[3]), c3);
You need to cast the pointers properly (to __m128i*). This is just a sample, so you need to make sure that all the operations match what you were looking for.
1. Is the loop count really 4 (j<4)? If so, there is no point in vectorizing it. To make it faster, just unroll it completely, without any loop; you may get a small benefit. To unroll, you can write a simple C macro and invoke it 4 times.
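2. Assuming that you actually have a big loop, you may be able to vectorize it. (Secondly, I don't know whether you still need the array a afterwards or whether its elements are just temporary variables. Assuming that you do need them, your SSE code will look like this; you may need to fix it up a little here and there):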
// SSE2 version of the scalar loop: each 16-bit lane holds one column j, so a
// single pass replaces all four loop iterations. Requires <emmintrin.h>.
//
// A row of b/c is 4 shorts = 64 bits, so the 64-bit load/store forms are used.
// A full 128-bit _mm_load_si128/_mm_store_si128 would touch 8 shorts (running
// past the row, and past the end of the array for row 3) and also requires a
// 16-byte-aligned address, which rows 1 and 3 cannot have.
// NOTE(review): if the real rows hold 8 or more shorts, switch to
// _mm_loadu_si128/_mm_storeu_si128 and step 8 columns per iteration.
const __m128i row0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(b[0]));
const __m128i row1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(b[1]));
const __m128i row2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(b[2]));
const __m128i row3 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(b[3]));

// step 1
const __m128i a0 = _mm_add_epi16(row0, row3);  // a[0] = b[0] + b[3]
const __m128i a1 = _mm_add_epi16(row1, row2);  // a[1] = b[1] + b[2]
const __m128i a2 = _mm_sub_epi16(row1, row2);  // a[2] = b[1] - b[2]
const __m128i a3 = _mm_sub_epi16(row0, row3);  // a[3] = b[0] - b[3]

// step 2
const __m128i c0 = _mm_add_epi16(a0, a1);                      // c[0] = a[0] + a[1]
const __m128i c1 = _mm_add_epi16(a2, _mm_slli_epi16(a3, 1));   // c[1] = a[2] + 2*a[3]
const __m128i c2 = _mm_sub_epi16(a0, a1);                      // c[2] = a[0] - a[1]
const __m128i c3 = _mm_sub_epi16(a3, _mm_slli_epi16(a2, 1));   // c[3] = a[3] - 2*a[2]

// Only the low 64 bits (4 shorts) of each result are written back.
_mm_storel_epi64(reinterpret_cast<__m128i*>(c[0]), c0);
_mm_storel_epi64(reinterpret_cast<__m128i*>(c[1]), c1);
_mm_storel_epi64(reinterpret_cast<__m128i*>(c[2]), c2);
_mm_storel_epi64(reinterpret_cast<__m128i*>(c[3]), c3);
You need to cast the pointers properly (to __m128i*). This is just a sample, so you need to make sure that all the operations match what you were looking for.

Reply
Topic Options
- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page