Community
cancel
Showing results for 
Search instead for 
Did you mean: 
DLake1
New Contributor I
84 Views

RGB to ARGB and xor faster

What's the fastest way to convert RGB to ARGB and optionally XOR at the same time? This is what I currently use:

// Expands packed 3-byte pixels from `a` into 32-bit pixels in `b`, forcing the
// top byte of every output pixel to 0xFF.
//
//   a      - source buffer of packed 3-byte pixels; only the first
//            (size / 3) * 3 bytes are read.
//   b      - destination buffer with room for size / 3 32-bit pixels; must not
//            overlap `a` (both pointers are __restrict).
//   size   - length of `a` in bytes. Trailing bytes that do not form a whole
//            pixel are ignored; size <= 0 writes nothing.
//   useXor - when true, the low 24 bits of each output pixel are the source
//            pixel XORed with the pixel already stored in `b`; when false the
//            destination is simply overwritten.
//
// Always returns true.
//
// Each pixel is assembled byte by byte (a[0] becomes the least significant
// byte), which yields the same value a 4-byte little-endian load produced on
// x86 but never touches the byte after the last pixel, does not depend on the
// alignment of `a`, and does not type-pun through an unsigned int pointer.
// The destination is indexed rather than written through `*b++`, so the XOR
// path no longer reads and modifies `b` within a single expression.
// The flag is named `useXor` because `xor` is a reserved alternative token in
// ISO C++; C++ callers pass it positionally, so call sites are unaffected.
bool DrawImage(const unsigned char* __restrict a, unsigned int* __restrict b, const int size, const bool useXor){
	const unsigned int kOpaque = 0xFF000000u;
	const int pixels = size / 3;
	if (useXor){
#pragma simd
		for (int i = 0; i < pixels; ++i){
			const unsigned char* p = a + i * 3;
			const unsigned int rgb = static_cast<unsigned int>(p[0])
				| (static_cast<unsigned int>(p[1]) << 8)
				| (static_cast<unsigned int>(p[2]) << 16);
			// XOR against the existing pixel first, then force the top byte.
			b[i] = (rgb ^ b[i]) | kOpaque;
		}
	}else{
#pragma ivdep
#pragma simd
		for (int i = 0; i < pixels; ++i){
			const unsigned char* p = a + i * 3;
			const unsigned int rgb = static_cast<unsigned int>(p[0])
				| (static_cast<unsigned int>(p[1]) << 8)
				| (static_cast<unsigned int>(p[2]) << 16);
			b[i] = rgb | kOpaque;
		}
	}
	return true;
}

These functions are performance critical, and I want to convert them to inline assembly. Can the following be optimized further?

; Disassembly of a loop inside DrawImage (see the jump target): the XOR path, four pixels per pass.
; esi is the pixel index: scaled by 3 it offsets the packed source at edi, scaled by 4 the destination at ebx.
; xmm0 and eax are set up outside this listing.
; NOTE(review): xmm0 presumably holds the 0xFF000000 mask in all four lanes, and eax the pixel-count bound - confirm.
lea         ecx,[esi+esi*2]  ; ecx = esi*3 = byte offset of this group of four pixels in the source
movd        xmm1,dword ptr [edi+ecx+9]  ; pixel 3; the 4-byte load covers offsets 9..12, one byte beyond this 12-byte group
movd        xmm3,dword ptr [edi+ecx+3]  ; pixel 1 (its top byte is the first byte of pixel 2)
movd        xmm2,dword ptr [edi+ecx+6]  ; pixel 2
movd        xmm4,dword ptr [edi+ecx]  ; pixel 0
punpckldq   xmm3,xmm1  ; xmm3 low dwords = {pixel 1, pixel 3}
punpckldq   xmm4,xmm2  ; xmm4 low dwords = {pixel 0, pixel 2}
punpckldq   xmm4,xmm3  ; xmm4 = {pixel 0, pixel 1, pixel 2, pixel 3}
pxor        xmm4,xmmword ptr [ebx+esi*4]  ; XOR with the four destination pixels; this memory operand must be 16-byte aligned
por         xmm4,xmm0  ; OR in xmm0, which overwrites the stray top byte picked up by each 4-byte load if xmm0 is the mask
movdqa      xmmword ptr [ebx+esi*4],xmm4  ; aligned 16-byte store of four output pixels
add         esi,4  ; advance four pixels
cmp         esi,eax  
jb          DrawImage+0A4h (0F8F96F4h)  ; unsigned compare: repeat while esi < eax
; This listing repeats the XOR loop shown immediately before it, instruction for instruction and with the same jump target.
; The author's follow-up reply says a different listing, one without the pxor, was meant to appear here.
lea         ecx,[esi+esi*2]  ; ecx = esi*3 = byte offset of this group of four pixels in the source
movd        xmm1,dword ptr [edi+ecx+9]  ; pixel 3; the 4-byte load covers offsets 9..12 of the group
movd        xmm3,dword ptr [edi+ecx+3]  ; pixel 1
movd        xmm2,dword ptr [edi+ecx+6]  ; pixel 2
movd        xmm4,dword ptr [edi+ecx]  ; pixel 0
punpckldq   xmm3,xmm1  ; xmm3 low dwords = {pixel 1, pixel 3}
punpckldq   xmm4,xmm2  ; xmm4 low dwords = {pixel 0, pixel 2}
punpckldq   xmm4,xmm3  ; xmm4 = {pixel 0, pixel 1, pixel 2, pixel 3}
pxor        xmm4,xmmword ptr [ebx+esi*4]  ; XOR with the four destination pixels (aligned memory operand)
por         xmm4,xmm0  ; xmm0 is set outside this listing; NOTE(review): presumably the 0xFF000000 mask - confirm
movdqa      xmmword ptr [ebx+esi*4],xmm4  ; aligned 16-byte store of four output pixels
add         esi,4  ; advance four pixels
cmp         esi,eax  
jb          DrawImage+0A4h (0F8F96F4h)  ; unsigned compare: repeat while esi < eax

 

0 Kudos
1 Reply
DLake1
New Contributor I
84 Views

Oh damn, I wish I could edit the original post. The second assembly code block was supposed to be this:

; Disassembly of a loop inside DrawImage (see the jump target): the path without XOR, four pixels per pass.
; esi is the pixel index: scaled by 3 it offsets the packed source at edi, scaled by 4 the destination at ebx.
; xmm0 and eax are set up outside this listing.
; NOTE(review): xmm0 presumably holds the 0xFF000000 mask in all four lanes, and eax the pixel-count bound - confirm.
lea         ecx,[esi+esi*2]  ; ecx = esi*3 = byte offset of this group of four pixels in the source
movd        xmm1,dword ptr [edi+ecx+9]  ; pixel 3; the 4-byte load covers offsets 9..12, one byte beyond this 12-byte group
movd        xmm3,dword ptr [edi+ecx+3]  ; pixel 1 (its top byte is the first byte of pixel 2)
movd        xmm2,dword ptr [edi+ecx+6]  ; pixel 2
movd        xmm4,dword ptr [edi+ecx]  ; pixel 0
punpckldq   xmm3,xmm1  ; xmm3 low dwords = {pixel 1, pixel 3}
punpckldq   xmm4,xmm2  ; xmm4 low dwords = {pixel 0, pixel 2}
punpckldq   xmm4,xmm3  ; xmm4 = {pixel 0, pixel 1, pixel 2, pixel 3}
por         xmm4,xmm0  ; OR in xmm0, which overwrites the stray top byte picked up by each 4-byte load if xmm0 is the mask
movdqa      xmmword ptr [ebx+esi*4],xmm4  ; aligned 16-byte store of four output pixels; the address must be 16-byte aligned
add         esi,4  ; advance four pixels
cmp         esi,eax  
jb          DrawImage+199h (0F8797E9h)  ; unsigned compare: repeat while esi < eax

 

Reply