- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi :
I am tuning some code for image rotation in RGB565 format (simply put, rotation by 90 degrees):
/*
 * Rotation selector passed to RGB565Rotate().
 * The numeric values are the ones the compiler would assign implicitly.
 */
typedef enum _rotation
{
    Left    = 0,   /* quarter turn; output is height x width pixels */
    Flipped = 1,   /* pixel order fully reversed; output keeps width x height */
    Right   = 2    /* quarter turn the other way; output is height x width pixels */
} Rot;
/* NOTE(review): presumably the size in bytes of one RGB565 pixel; not referenced in the visible code. */
#define PIXELSIZE2 2
/* NOTE(review): looks like an alignment constant; not referenced in the visible code - confirm intent. */
#define ALIGN_NUM 4
/* Build switch: when defined, RGB565RotateDstContinuous (sequential writes to
   the destination) is compiled; when left undefined, RGB565RotateSrcContinuous
   (sequential reads from the source) is compiled instead. */
//#define _DST_CINTINUOUS
#ifdef _DST_CINTINUOUS
/*
 * Copy 16-bit pixels, writing the destination sequentially and gathering
 * from the source with a configurable stride.
 *
 * The destination receives `height` rows of `width` pixels, in order.
 * Destination pixel i of row j is read from
 *     pSrc[beginShift + j * lineStep + i * pointStep]
 * All offsets are counted in pixels (unsigned short), and may be negative.
 *
 *   pSrc       source pixels; never written
 *   width      pixels per destination row
 *   height     number of destination rows
 *   pDst       receives width * height pixels
 *   beginShift source index read for the first destination pixel
 *   lineStep   source index change from one destination row to the next
 *   pointStep  source index change from one destination pixel to the next
 *
 * Always returns 0.
 */
int RGB565RotateDstContinuous( const void *pSrc, int width, int height, void *pDst, int beginShift, int lineStep, int pointStep)
{
    const unsigned short *src = (const unsigned short *)pSrc;
    unsigned short *out = (unsigned short *)pDst;
    long long rowStart = beginShift;
    int i, j;

    for (j = 0; j < height; j++) {
        long long at = rowStart;

        for (i = 0; i < width; i++) {
            /* Index from the base pointer with an integer offset: stepping a
               pointer by pointStep after the last pixel would leave the
               buffer, which is undefined behaviour even without a read. */
            *out++ = src[at];
            at += pointStep;
        }
        rowStart += lineStep;
    }
    return 0;
}/*RGB565RotateDstContinuous*/
#else
/*
 * Copy 16-bit pixels, reading the source sequentially and scattering into
 * the destination with a configurable stride.
 *
 * The source is consumed in order as `height` rows of `width` pixels.
 * Source pixel i of row j is written to
 *     pDst[beginShift + j * lineStep + i * pointStep]
 * All offsets are counted in pixels (unsigned short), and may be negative.
 *
 *   pSrc       width * height source pixels; never written
 *   width      pixels per source row
 *   height     number of source rows
 *   pDst       destination buffer
 *   beginShift destination index written for the first source pixel
 *   lineStep   destination index change from one source row to the next
 *   pointStep  destination index change from one source pixel to the next
 *
 * Always returns 0.
 */
int RGB565RotateSrcContinuous(const void *pSrc, int width, int height, void *pDst, int beginShift, int lineStep, int pointStep)
{
    const unsigned short *in = (const unsigned short *)pSrc;
    unsigned short *dst = (unsigned short *)pDst;
    long long rowStart = beginShift;
    int i, j;

    for (j = 0; j < height; j++) {
        long long at = rowStart;

        for (i = 0; i < width; i++) {
            /* Index from the base pointer with an integer offset: stepping a
               pointer by pointStep after the last pixel would leave the
               buffer, which is undefined behaviour even without a write. */
            dst[at] = *in++;
            at += pointStep;
        }
        rowStart += lineStep;
    }
    return 0;
}/*RGB565RotateSrcContinuous*/
#endif
/*
 * Rotate an image of 16-bit pixels from pSrc into pDst.
 *
 * The source holds `height` rows of `width` pixels. For Left and Right the
 * result is `width` rows of `height` pixels; for Flipped the dimensions are
 * kept and the pixel order is reversed. pDst must have room for
 * width * height pixels and must not overlap pSrc.
 *
 * The copy strategy is chosen at build time: with _DST_CINTINUOUS defined the
 * destination is written sequentially, otherwise the source is read
 * sequentially. Both strategies produce the same output.
 *
 * Returns 0 on success (including an empty image), or -1 when a buffer is
 * NULL or `rot` is not one of Left, Flipped, Right.
 */
int RGB565Rotate( const void *pSrc, int width, int height, void *pDst, Rot rot)
{
    int lineStep, pointStep;
    int beginShift;
#ifdef _DST_CINTINUOUS
    int widthr, heightr;    /* dimensions of the destination image */
#endif

    /* An empty image needs no work. */
    if (width <= 0 || height <= 0)
        return 0;
    if (!pSrc || !pDst)
        return -1;

#ifdef _DST_CINTINUOUS
    /* Steps describe how to walk the SOURCE while the destination is filled
       in order. */
    switch (rot)
    {
    case Left:
        beginShift = width - 1;
        pointStep = width;
        lineStep = -1;
        widthr = height; heightr = width;
        break;
    case Flipped:
        beginShift = width*height - 1;
        pointStep = -1;
        lineStep = -width;
        widthr = width; heightr = height;
        break;
    case Right:
        beginShift = width*(height - 1);
        pointStep = -width;
        lineStep = 1;
        widthr = height; heightr = width;
        break;
    default:
        /* Unknown rotation: the steps would otherwise be used uninitialised. */
        return -1;
    }/*switch rot*/
    return RGB565RotateDstContinuous( pSrc, widthr, heightr, pDst, beginShift, lineStep, pointStep);
#else
    /* Steps describe how to walk the DESTINATION while the source is consumed
       in order. */
    switch (rot)
    {
    case Left:
        beginShift = height*(width - 1);
        pointStep = -height;
        lineStep = 1;
        break;
    case Flipped:
        beginShift = width*height - 1;
        pointStep = -1;
        lineStep = -width;
        break;
    case Right:
        beginShift = height - 1;
        pointStep = height;
        lineStep = -1;
        break;
    default:
        /* Unknown rotation: the steps would otherwise be used uninitialised. */
        return -1;
    }/*switch rot*/
    return RGB565RotateSrcContinuous( pSrc, width, height, pDst, beginShift, lineStep, pointStep);
#endif
}/*RGB565Rotate*/
The problem is very similar to matrix transposition.
Of course, of the pDst and pSrc buffers, only one can be accessed contiguously.
The flag _DST_CINTINUOUS selects whether pDst or pSrc is the one accessed contiguously.
In my benchmark, the result is not stable (the run time differs by up to ~15% for the same input across many runs) on an i5-2400 / i5-2410M.
It seems to make no difference whether pDst or pSrc is contiguous.
I would like to know which is better for the x86 architecture in theory.
Or does it make no difference on current x86?
If I may be greedy and ask one more thing: is there some trick for a fast matrix transpose using the SSE/AVX instruction sets, or some cache-control trick?
Thank you.
I am tuning some code for image rotation in RGB565 format (simply put, rotation by 90 degrees):
/*
 * Rotation selector passed to RGB565Rotate().
 * The numeric values are the ones the compiler would assign implicitly.
 */
typedef enum _rotation
{
    Left    = 0,   /* quarter turn; output is height x width pixels */
    Flipped = 1,   /* pixel order fully reversed; output keeps width x height */
    Right   = 2    /* quarter turn the other way; output is height x width pixels */
} Rot;
/* NOTE(review): presumably the size in bytes of one RGB565 pixel; not referenced in the visible code. */
#define PIXELSIZE2 2
/* NOTE(review): looks like an alignment constant; not referenced in the visible code - confirm intent. */
#define ALIGN_NUM 4
/* Build switch: when defined, RGB565RotateDstContinuous (sequential writes to
   the destination) is compiled; when left undefined, RGB565RotateSrcContinuous
   (sequential reads from the source) is compiled instead. */
//#define _DST_CINTINUOUS
#ifdef _DST_CINTINUOUS
/*
 * Copy 16-bit pixels, writing the destination sequentially and gathering
 * from the source with a configurable stride.
 *
 * The destination receives `height` rows of `width` pixels, in order.
 * Destination pixel i of row j is read from
 *     pSrc[beginShift + j * lineStep + i * pointStep]
 * All offsets are counted in pixels (unsigned short), and may be negative.
 *
 *   pSrc       source pixels; never written
 *   width      pixels per destination row
 *   height     number of destination rows
 *   pDst       receives width * height pixels
 *   beginShift source index read for the first destination pixel
 *   lineStep   source index change from one destination row to the next
 *   pointStep  source index change from one destination pixel to the next
 *
 * Always returns 0.
 */
int RGB565RotateDstContinuous( const void *pSrc, int width, int height, void *pDst, int beginShift, int lineStep, int pointStep)
{
    const unsigned short *src = (const unsigned short *)pSrc;
    unsigned short *out = (unsigned short *)pDst;
    long long rowStart = beginShift;
    int i, j;

    for (j = 0; j < height; j++) {
        long long at = rowStart;

        for (i = 0; i < width; i++) {
            /* Index from the base pointer with an integer offset: stepping a
               pointer by pointStep after the last pixel would leave the
               buffer, which is undefined behaviour even without a read. */
            *out++ = src[at];
            at += pointStep;
        }
        rowStart += lineStep;
    }
    return 0;
}/*RGB565RotateDstContinuous*/
#else
/*
 * Copy 16-bit pixels, reading the source sequentially and scattering into
 * the destination with a configurable stride.
 *
 * The source is consumed in order as `height` rows of `width` pixels.
 * Source pixel i of row j is written to
 *     pDst[beginShift + j * lineStep + i * pointStep]
 * All offsets are counted in pixels (unsigned short), and may be negative.
 *
 *   pSrc       width * height source pixels; never written
 *   width      pixels per source row
 *   height     number of source rows
 *   pDst       destination buffer
 *   beginShift destination index written for the first source pixel
 *   lineStep   destination index change from one source row to the next
 *   pointStep  destination index change from one source pixel to the next
 *
 * Always returns 0.
 */
int RGB565RotateSrcContinuous(const void *pSrc, int width, int height, void *pDst, int beginShift, int lineStep, int pointStep)
{
    const unsigned short *in = (const unsigned short *)pSrc;
    unsigned short *dst = (unsigned short *)pDst;
    long long rowStart = beginShift;
    int i, j;

    for (j = 0; j < height; j++) {
        long long at = rowStart;

        for (i = 0; i < width; i++) {
            /* Index from the base pointer with an integer offset: stepping a
               pointer by pointStep after the last pixel would leave the
               buffer, which is undefined behaviour even without a write. */
            dst[at] = *in++;
            at += pointStep;
        }
        rowStart += lineStep;
    }
    return 0;
}/*RGB565RotateSrcContinuous*/
#endif
/*
 * Rotate an image of 16-bit pixels from pSrc into pDst.
 *
 * The source holds `height` rows of `width` pixels. For Left and Right the
 * result is `width` rows of `height` pixels; for Flipped the dimensions are
 * kept and the pixel order is reversed. pDst must have room for
 * width * height pixels and must not overlap pSrc.
 *
 * The copy strategy is chosen at build time: with _DST_CINTINUOUS defined the
 * destination is written sequentially, otherwise the source is read
 * sequentially. Both strategies produce the same output.
 *
 * Returns 0 on success (including an empty image), or -1 when a buffer is
 * NULL or `rot` is not one of Left, Flipped, Right.
 */
int RGB565Rotate( const void *pSrc, int width, int height, void *pDst, Rot rot)
{
    int lineStep, pointStep;
    int beginShift;
#ifdef _DST_CINTINUOUS
    int widthr, heightr;    /* dimensions of the destination image */
#endif

    /* An empty image needs no work. */
    if (width <= 0 || height <= 0)
        return 0;
    if (!pSrc || !pDst)
        return -1;

#ifdef _DST_CINTINUOUS
    /* Steps describe how to walk the SOURCE while the destination is filled
       in order. */
    switch (rot)
    {
    case Left:
        beginShift = width - 1;
        pointStep = width;
        lineStep = -1;
        widthr = height; heightr = width;
        break;
    case Flipped:
        beginShift = width*height - 1;
        pointStep = -1;
        lineStep = -width;
        widthr = width; heightr = height;
        break;
    case Right:
        beginShift = width*(height - 1);
        pointStep = -width;
        lineStep = 1;
        widthr = height; heightr = width;
        break;
    default:
        /* Unknown rotation: the steps would otherwise be used uninitialised. */
        return -1;
    }/*switch rot*/
    return RGB565RotateDstContinuous( pSrc, widthr, heightr, pDst, beginShift, lineStep, pointStep);
#else
    /* Steps describe how to walk the DESTINATION while the source is consumed
       in order. */
    switch (rot)
    {
    case Left:
        beginShift = height*(width - 1);
        pointStep = -height;
        lineStep = 1;
        break;
    case Flipped:
        beginShift = width*height - 1;
        pointStep = -1;
        lineStep = -width;
        break;
    case Right:
        beginShift = height - 1;
        pointStep = height;
        lineStep = -1;
        break;
    default:
        /* Unknown rotation: the steps would otherwise be used uninitialised. */
        return -1;
    }/*switch rot*/
    return RGB565RotateSrcContinuous( pSrc, width, height, pDst, beginShift, lineStep, pointStep);
#endif
}/*RGB565Rotate*/
The problem is very similar to matrix transposition.
Of course, of the pDst and pSrc buffers, only one can be accessed contiguously.
The flag _DST_CINTINUOUS selects whether pDst or pSrc is the one accessed contiguously.
In my benchmark, the result is not stable (the run time differs by up to ~15% for the same input across many runs) on an i5-2400 / i5-2410M.
It seems to make no difference whether pDst or pSrc is contiguous.
I would like to know which is better for the x86 architecture in theory.
Or does it make no difference on current x86?
If I may be greedy and ask one more thing: is there some trick for a fast matrix transpose using the SSE/AVX instruction sets, or some cache-control trick?
Thank you.
Link Copied
2 Replies
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
I re-benchmarked the above code on Linux with GCC.
I found that, when I rotate a 480x854 input to the Left for 1000 rounds:
by RGB565RotateDstContinuous:
425 ms
by RGB565RotateSrcContinuous
535 ms
The result is similar when turning Right.
It seems that a contiguous Dst pointer is better.
What mechanism causes this result?
thank you.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
...
The problem is very similar to matrix transposition.
...
I would like to know which is better for the x86 architecture in theory.
[SergeyK] Let's consider processing by a single thread. In that case, consider an in-place algorithm for a
matrix transpose that uses as few element exchanges as possible. In practical
applications it outperforms a classic algorithm for a matrix transpose that requires a second output matrix.
...
Is there some trick for a fast matrix transpose using the SSE/AVX instruction sets, or some cache-control trick?
The problem is very similar to matrix transposition.
...
I would like to know which is better for the x86 architecture in theory.
[SergeyK] Let's consider processing by a single thread. In that case, consider an in-place algorithm for a
matrix transpose that uses as few element exchanges as possible. In practical
applications it outperforms a classic algorithm for a matrix transpose that requires a second output matrix.
...
Is there some trick for a fast matrix transpose using the SSE/AVX instruction sets, or some cache-control trick?
Please take a look at a Thread:
http://software.intel.com/en-us/forums/showthread.php?t=103465
( Post #13 has some real numbers)
Best regards,
Sergey
Reply
Topic Options
- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page