topic Re: Performance regression icc->icx in Intel® oneAPI DPC++/C++ Compiler

Performance regression icc->icx

KnutInge — Fri, 16 Aug 2024 13:50:58 GMT

I am looking at a compiler issue here, reduced to a 250-line snippet, where icx seems to have a regression compared to icc. In short, I am seeing 2x the execution time of the compiled binary, 5.5x the compilation time, and 2x the file size. Execution time is my main priority; compilation time comes second. I have been playing with compiler options and reading the compiler migration guide offered by Intel, but I have been unable to reduce the regression any further. I would greatly appreciate any insights here.

(timing measurements are repeated until I get stable numbers)

# Build using ICC (i7-8665U)
source /opt/intel/bin/compilervars.sh intel64
icc --version
#icc (ICC) 19.1.0.166 20191121
time icc -Wall -Ofast -xICELAKE-SERVER -qopenmp-simd minimal_convert_slowness2_avx512_only_simpler.c -o slowness_icc.out
#real 0m0.361s

# Build using ICX (i7-8665U)
source /opt/icc-future/compiler/latest/env/vars.sh --force
icx --version
#Intel(R) oneAPI DPC++/C++ Compiler 2024.1.0 (2024.1.0.20240308)
time icx -Wall -Ofast -xrocketlake -fiopenmp minimal_convert_slowness2_avx512_only_simpler.c -o slowness_icx.out
#real 0m2.002s

# Execute ICC code (i9-11900KF)
time ./slowness_icc.out
#real 0m1.509s

# Execute ICX code (i9-11900KF) - explicitly point to libiomp5.so
time LD_LIBRARY_PATH=/home/tmp ./slowness_icx.out
#real 0m3.274s

ls -l slowness_icx.out
#99776

ls -l slowness_icc.out
#52536

#compile time factor: 2.002/0.361 = 5.5457x
#execution time factor: 3.274/1.509 = 2.1696x
#file size factor: 99776/52536 = 1.9x

Code snippet:

#include <stdint.h> #include <string.h> #include <stdlib.h> #include <immintrin.h> #define max(a,b) ((a) > (b) ? (a) : (b)) #define min(a,b) ((a) < (b) ? (a) : (b)) #define B00 ( 256) #define B01 ( 0) #define B02 ( 403) #define B10 ( 256) #define B11 ( 48) #define B12 ( 120) #define B20 ( 256) #define B21 ( 475) #define B22 ( 0) static __forceinline void interleave64_bgrx_for_a420_bt709_0_255( uint8_t *__restrict bgra, const uint8_t *__restrict b, const uint8_t *__restrict g, const uint8_t *__restrict r, const uint8_t *__restrict a) { __m512i zmm0 = _mm512_load_epi32((__m512i*)b); __m512i zmm1 = _mm512_load_epi32((__m512i*)g); __m512i zmm2 = _mm512_load_epi32((__m512i*)r); __m512i zmm3 = _mm512_load_epi32((__m512i*)a); __m512i zmm4 = _mm512_unpacklo_epi8(zmm0, zmm1); __m512i zmm5 = _mm512_unpackhi_epi8(zmm0, zmm1); __m512i zmm6 = _mm512_unpacklo_epi8(zmm2, zmm3); __m512i zmm7 = _mm512_unpackhi_epi8(zmm2, zmm3); __m512i zmm8 = _mm512_unpacklo_epi16(zmm4, zmm6); __m512i zmm9 = _mm512_unpackhi_epi16(zmm4, zmm6); __m512i zmm10 = _mm512_unpacklo_epi16(zmm5, zmm7); __m512i zmm11 = _mm512_unpackhi_epi16(zmm5, zmm7); __m512i zmm12 = _mm512_permutex2var_epi64(zmm8,_mm512_set_epi64(11,10,3,2, 9, 8,1,0),zmm9); __m512i zmm13 = _mm512_permutex2var_epi64(zmm8,_mm512_set_epi64(15,14,7,6,13,12,5,4),zmm9); __m512i zmm14 = _mm512_permutex2var_epi64(zmm10,_mm512_set_epi64(11,10,3,2, 9, 8,1,0),zmm11); __m512i zmm15 = _mm512_permutex2var_epi64(zmm10,_mm512_set_epi64(15,14,7,6,13,12,5,4),zmm11); _mm512_storeu_si512((__m512i*)&bgra[0 * 64], _mm512_permutex2var_epi64(zmm12,_mm512_set_epi64(11,10, 9, 8,3,2,1,0),zmm14)); _mm512_storeu_si512((__m512i*)&bgra[1 * 64], _mm512_permutex2var_epi64(zmm12,_mm512_set_epi64(15,14,13,12,7,6,5,4),zmm14)); _mm512_storeu_si512((__m512i*)&bgra[2 * 64], _mm512_permutex2var_epi64(zmm13,_mm512_set_epi64(11,10, 9, 8,3,2,1,0),zmm15)); _mm512_storeu_si512((__m512i*)&bgra[3 * 64], _mm512_permutex2var_epi64(zmm13,_mm512_set_epi64(15,14,13,12,7,6,5,4),zmm15)); } 
static void interleave32_bgrx_for_a420_bt709_0_255( uint8_t *__restrict bgra, const uint8_t *__restrict b, const uint8_t *__restrict g, const uint8_t *__restrict r, const uint8_t *__restrict a) { for (int i = 0; i < 32; i++) { bgra[i * 4 + 0] = b[i]; bgra[i * 4 + 1] = g[i]; bgra[i * 4 + 2] = r[i]; bgra[i * 4 + 3] = a[i]; } } static inline void color_process_32(const int16_t * cb, const int16_t * cr, const uint16_t * y1, uint8_t *__restrict b1, uint8_t *__restrict g1, uint8_t *__restrict r1, uint8_t *__restrict a1) { uint16_t r, g, b; #pragma omp simd simdlen(32) for (int k = 0; k < 32; k++) { int16_t c11 = B11 * cb[k]; int16_t c12 = B12 * cr[k]; int16_t m12 = -c12; int16_t c22 = c11 + c12; if (cr[k] > 0) r = min((uint16_t)(B00 * y1[k]) + (uint16_t)(B02 * abs(cr[k])), UINT16_MAX); else r = max((uint16_t)(B00 * y1[k]) - (uint16_t)(B02 * abs(cr[k])), 0); if (c11 < m12) g = min((uint16_t)(B00 * y1[k]) + (uint16_t)(-c22), UINT16_MAX); else g = max((uint16_t)(B00 * y1[k]) - (uint16_t)(c22), 0); if (cb[k] > 0) b = min((uint16_t)(B00 * y1[k]) + (uint16_t)(B21 * abs(cb[k])), UINT16_MAX); else b = max((uint16_t)(B00 * y1[k]) - (uint16_t)(B21 * abs(cb[k])), 0); r1[k] = max(min((int16_t)((uint16_t)(min(r + 128, UINT16_MAX)) >> 8), 255), 0); g1[k] = max(min((int16_t)((uint16_t)(min(g + 128, UINT16_MAX)) >> 8), 255), 0); b1[k] = max(min((int16_t)((uint16_t)(min(b + 128, UINT16_MAX)) >> 8), 255), 0); } } static inline void color_process_64(const int16_t * cb, const int16_t * cr, const uint16_t * y1, uint8_t *__restrict b1, uint8_t *__restrict g1, uint8_t *__restrict r1, uint8_t *__restrict a1) { uint16_t r, g, b; #pragma omp simd simdlen(64) for (int k = 0; k < 64; k++) { int16_t c11 = B11 * cb[k]; int16_t c12 = B12 * cr[k]; int16_t m12 = -c12; int16_t c22 = c11 + c12; if (cr[k] > 0) r = min((uint16_t)(B00 * y1[k]) + (uint16_t)(B02 * abs(cr[k])), UINT16_MAX); else r = max((uint16_t)(B00 * y1[k]) - (uint16_t)(B02 * abs(cr[k])), 0); if (c11 < m12) g = min((uint16_t)(B00 * y1[k]) + 
(uint16_t)(-c22), UINT16_MAX); else g = max((uint16_t)(B00 * y1[k]) - (uint16_t)(c22), 0); if (cb[k] > 0) b = min((uint16_t)(B00 * y1[k]) + (uint16_t)(B21 * abs(cb[k])), UINT16_MAX); else b = max((uint16_t)(B00 * y1[k]) - (uint16_t)(B21 * abs(cb[k])), 0); r1[k] = max(min((int16_t)((uint16_t)(min(r + 128, UINT16_MAX)) >> 8), 255), 0); g1[k] = max(min((int16_t)((uint16_t)(min(g + 128, UINT16_MAX)) >> 8), 255), 0); b1[k] = max(min((int16_t)((uint16_t)(min(b + 128, UINT16_MAX)) >> 8), 255), 0); } } void convert_a420_bt709_0_255_to_bgrx(const uint8_t * y, const uint8_t * u, const uint8_t * v, const int src_stride_y, const int src_stride_uv, uint8_t *__restrict dst, const int dst_stride, const int width, const int height) { uint16_t y0[64] = {0,}, y1[64] = {0,}; int16_t cb[64] = {0,}, cr[64] = {0,}; uint8_t r0[64] = {0,}, g0[64] = {0,}, b0[64] = {0,}, a0[64] = { 255,255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }; uint8_t r1[64] = {0,}, g1[64] = {0,}, b1[64] = {0,}, a1[64] = { 255,255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }; for (int i = 0; i < height; i++) { int j = 0; uint8_t *__restrict dstptr = dst + 2 * i * dst_stride; for (; j < width - width % 32; j += 32, dstptr += 8 * 32) { #pragma omp simd simdlen(64) for (int k = 0; k < 64; k++) { y0[k] = y[(2 * i + 0) * src_stride_y + 2 * j + k]; y1[k] = y[(2 * i + 1) * src_stride_y + 2 * j + k]; } #pragma omp simd simdlen(32) for (int k = 0; k < 32; k++) { cb[2 * k + 0] = u[i * src_stride_uv + j + 
k] - 128; cb[2 * k + 1] = u[i * src_stride_uv + j + k] - 128; cr[2 * k + 0] = v[i * src_stride_uv + j + k] - 128; cr[2 * k + 1] = v[i * src_stride_uv + j + k] - 128; } color_process_64(cb, cr, y0, b0, g0, r0, a0); interleave64_bgrx_for_a420_bt709_0_255(dstptr + 0 * dst_stride, b0, g0, r0, a0); color_process_64(cb, cr, y1, b1, g1, r1, a1); interleave64_bgrx_for_a420_bt709_0_255(dstptr + 1 * dst_stride, b1, g1, r1, a1); } for (; j < width - width % 16; j += 16, dstptr += 4 * 32) { #pragma omp simd simdlen(32) for (int k = 0; k < 32; k++) { y0[k] = y[(2 * i + 0) * src_stride_y + 2 * j + k]; y1[k] = y[(2 * i + 1) * src_stride_y + 2 * j + k]; } for (int k = 0; k < 16; k++) { cb[2 * k + 0] = u[i * src_stride_uv + j + k] - 128; cb[2 * k + 1] = u[i * src_stride_uv + j + k] - 128; cr[2 * k + 0] = v[i * src_stride_uv + j + k] - 128; cr[2 * k + 1] = v[i * src_stride_uv + j + k] - 128; } color_process_32(cb, cr, y0, b0, g0, r0, a0); interleave32_bgrx_for_a420_bt709_0_255(dstptr + 0 * dst_stride, b0, g0, r0, a0); color_process_32(cb, cr, y0, b1, g1, r1, a1); interleave32_bgrx_for_a420_bt709_0_255(dstptr + 1 * dst_stride, b1, g1, r1, a1); } if (j != width) { uint8_t bgra[4 * 32]; dstptr = bgra; int K = width - j; for (int k = 0; k < 2 * K; k++) { y0[k] = y[(2 * i + 0) * src_stride_y + 2 * j + k]; y1[k] = y[(2 * i + 1) * src_stride_y + 2 * j + k]; } for (int k = 0; k < K; k++) { cb[2 * k + 0] = u[i * src_stride_uv + j + k] - 128; cb[2 * k + 1] = u[i * src_stride_uv + j + k] - 128; cr[2 * k + 0] = v[i * src_stride_uv + j + k] - 128; cr[2 * k + 1] = v[i * src_stride_uv + j + k] - 128; } color_process_32(cb, cr, y0, b0, g0, r0, a0); interleave32_bgrx_for_a420_bt709_0_255(dstptr + 0 * dst_stride, b0, g0, r0, a0); memcpy(dst + (2 * i + 0) * dst_stride + j * 32 / 4, dstptr, K * 32 / 4); color_process_32(cb, cr, y0, b1, g1, r1, a1); interleave32_bgrx_for_a420_bt709_0_255(dstptr + 1 * dst_stride, b1, g1, r1, a1); memcpy(dst + (2 * i + 1) * dst_stride + j * 32 / 4, dstptr, K * 32 / 4); } 
} } int main () { int width = 1280; int height = 720; uint8_t *src_y_data = (uint8_t *)_mm_malloc (height * width, 64); uint8_t *src_u_data = (uint8_t *) _mm_malloc (((height + 1) / 2) * ((width + 1) / 2), 64); uint8_t *src_v_data = (uint8_t *)_mm_malloc (((height + 1) / 2) * ((width + 1) / 2), 64); memset(src_y_data, 128, height * width); memset(src_u_data, 128, ((height + 1) / 2) * ((width + 1) / 2)); memset(src_v_data, 128, ((height + 1) / 2) * ((width + 1) / 2)); uint8_t *dst_data = (uint8_t *)_mm_malloc (height * width * 4, 64); const int N = 10000; /* run code*/ for (int i = 0; i < N; i++) convert_a420_bt709_0_255_to_bgrx(src_y_data, src_u_data, src_v_data, width, (width + 1) / 2, dst_data, width * 4, width / 2, height / 2); /* cleanup */ _mm_free (src_y_data); _mm_free (src_u_data); _mm_free (src_v_data); _mm_free (dst_data); }

Re: Performance regression icc->icx

KnutInge — Tue, 20 Aug 2024 05:34:30 GMT

I did try forcing the subfunctions not to be inlined. That helps significantly with compilation time, but not so much with execution time. Anyway, it seems that this is a bug/undesired feature where icx spends a lot of time trying to do something that turns out to be the wrong thing?

Re: Performance regression icc->icx

Alex_Y_Intel — Sat, 24 Aug 2024 00:20:04 GMT

This issue is escalated to our internal team for further investigation.

Re:Performance regression icc->icx

Alex_Y_Intel — Mon, 30 Sep 2024 09:31:43 GMT

We're still investigating this issue.

Re:Performance regression icc->icx

Alex_Y_Intel — Mon, 28 Oct 2024 07:16:59 GMT

Is it acceptable for you to modify your code to use saturation builtins? This will result in reduced time. Our engineers have reported your issue to the optimization team for a possibly better solution.

#include <stdint.h>

#include <string.h>

#include <stdlib.h>

#include <immintrin.h>

/* Two-way maximum / minimum of a and b.
 * Function-like macros: an argument may be evaluated twice. */
#define max(a,b) ((b) < (a) ? (a) : (b))
#define min(a,b) ((b) > (a) ? (a) : (b))

/* Integer conversion coefficients, named Brc by row r and column c.
 * NOTE(review): the values look like BT.709 YCbCr->RGB matrix entries scaled
 * by 256 -- confirm against the intended matrix. */
#define B00 (256)
#define B01 (0)
#define B02 (403)

#define B10 (256)
#define B11 (48)
#define B12 (120)

#define B20 (256)
#define B21 (475)
#define B22 (0)

/*
 * Interleave 64 bytes from each of the planes b, g, r, a into 256 bytes of
 * packed output: bgra[4*i + 0..3] = b[i], g[i], r[i], a[i] for i in 0..63.
 *
 * The inputs are read with unaligned loads: the callers in this file pass
 * plain uint8_t stack arrays that carry no 64-byte alignment guarantee, which
 * the aligned-load intrinsic would require.  The stores were already
 * unaligned.
 */
static __forceinline void interleave64_bgrx_for_a420_bt709_0_255(
    uint8_t *__restrict bgra,
    const uint8_t *__restrict b,
    const uint8_t *__restrict g,
    const uint8_t *__restrict r,
    const uint8_t *__restrict a)
{
    __m512i zmm0 = _mm512_loadu_si512(b);
    __m512i zmm1 = _mm512_loadu_si512(g);
    __m512i zmm2 = _mm512_loadu_si512(r);
    __m512i zmm3 = _mm512_loadu_si512(a);

    /* Byte interleave inside every 128-bit lane: (b,g) pairs and (r,a) pairs. */
    __m512i zmm4 = _mm512_unpacklo_epi8(zmm0, zmm1);
    __m512i zmm5 = _mm512_unpackhi_epi8(zmm0, zmm1);
    __m512i zmm6 = _mm512_unpacklo_epi8(zmm2, zmm3);
    __m512i zmm7 = _mm512_unpackhi_epi8(zmm2, zmm3);

    /* 16-bit interleave: every 128-bit lane now holds four complete pixels. */
    __m512i zmm8  = _mm512_unpacklo_epi16(zmm4, zmm6);
    __m512i zmm9  = _mm512_unpackhi_epi16(zmm4, zmm6);
    __m512i zmm10 = _mm512_unpacklo_epi16(zmm5, zmm7);
    __m512i zmm11 = _mm512_unpackhi_epi16(zmm5, zmm7);

    /* The unpack steps work per lane, so the lanes must be re-ordered across
     * registers to bring the pixels back into ascending order. */
    __m512i zmm12 = _mm512_permutex2var_epi64(zmm8, _mm512_set_epi64(11,10,3,2, 9, 8,1,0), zmm9);
    __m512i zmm13 = _mm512_permutex2var_epi64(zmm8, _mm512_set_epi64(15,14,7,6,13,12,5,4), zmm9);
    __m512i zmm14 = _mm512_permutex2var_epi64(zmm10, _mm512_set_epi64(11,10,3,2, 9, 8,1,0), zmm11);
    __m512i zmm15 = _mm512_permutex2var_epi64(zmm10, _mm512_set_epi64(15,14,7,6,13,12,5,4), zmm11);

    _mm512_storeu_si512((__m512i*)&bgra[0 * 64], _mm512_permutex2var_epi64(zmm12, _mm512_set_epi64(11,10, 9, 8,3,2,1,0), zmm14));
    _mm512_storeu_si512((__m512i*)&bgra[1 * 64], _mm512_permutex2var_epi64(zmm12, _mm512_set_epi64(15,14,13,12,7,6,5,4), zmm14));
    _mm512_storeu_si512((__m512i*)&bgra[2 * 64], _mm512_permutex2var_epi64(zmm13, _mm512_set_epi64(11,10, 9, 8,3,2,1,0), zmm15));
    _mm512_storeu_si512((__m512i*)&bgra[3 * 64], _mm512_permutex2var_epi64(zmm13, _mm512_set_epi64(15,14,13,12,7,6,5,4), zmm15));
}

/*
 * Interleave 32 bytes from each of the planes b, g, r, a into 128 bytes of
 * packed output: bgra[4*i + 0..3] = b[i], g[i], r[i], a[i] for i in 0..31.
 */
static void interleave32_bgrx_for_a420_bt709_0_255(
    uint8_t *__restrict bgra,
    const uint8_t *__restrict b,
    const uint8_t *__restrict g,
    const uint8_t *__restrict r,
    const uint8_t *__restrict a)
{
    for (int i = 0; i < 32; i++)
    {
        bgra[i * 4 + 0] = b[i];
        bgra[i * 4 + 1] = g[i];
        bgra[i * 4 + 2] = r[i];
        bgra[i * 4 + 3] = a[i];
    }
}

/*
 * Convert 32 samples using the saturating-arithmetic builtins: for each k,
 * combine luma y1[k] with chroma cb[k]/cr[k] into r/g/b, add 128, shift right
 * by 8 and store the bytes in r1/g1/b1.
 * a1 is not used by this function.
 *
 * NOTE(review): the operands handed to the builtins are int-typed expressions
 * (e.g. B00 * y1[k]), so the saturation presumably happens at int range rather
 * than at uint16_t range before the result is narrowed -- verify that this
 * still matches the min/max clamping of the original version.
 */
static inline void color_process_32(const int16_t * cb, const int16_t * cr, const uint16_t * y1, uint8_t *__restrict b1, uint8_t *__restrict g1, uint8_t *__restrict r1, uint8_t *__restrict a1)
{
#pragma omp simd simdlen(32)
    for (int k = 0; k < 32; k++)
    {
        int16_t c11 = B11 * cb[k];
        int16_t c12 = B12 * cr[k];
        int16_t c22 = c11 + c12;

        /* Declared inside the loop body so each SIMD lane has its own copy. */
        uint16_t r = __builtin_elementwise_add_sat(B00 * y1[k], B02 * cr[k]);
        uint16_t g = __builtin_elementwise_sub_sat(B00 * y1[k], c22);
        uint16_t b = __builtin_elementwise_add_sat(B00 * y1[k], B21 * cb[k]);

        /* Rounding offset, applied to each channel's own value. */
        r = __builtin_elementwise_add_sat(r, 128);
        g = __builtin_elementwise_add_sat(g, 128);
        b = __builtin_elementwise_add_sat(b, 128);

        r1[k] = max(min((int16_t)(r >> 8), 255), 0);
        g1[k] = max(min((int16_t)(g >> 8), 255), 0);
        b1[k] = max(min((int16_t)(b >> 8), 255), 0);
    }
}

/*
 * Convert 64 samples using the saturating-arithmetic builtins: for each k,
 * combine luma y1[k] with chroma cb[k]/cr[k] into r/g/b, add 128, shift right
 * by 8 and store the bytes in r1/g1/b1.
 * a1 is not used by this function.
 *
 * NOTE(review): the operands handed to the builtins are int-typed expressions
 * (e.g. B00 * y1[k]), so the saturation presumably happens at int range rather
 * than at uint16_t range before the result is narrowed -- verify that this
 * still matches the min/max clamping of the original version.
 */
static inline void color_process_64(const int16_t * cb, const int16_t * cr, const uint16_t * y1, uint8_t *__restrict b1, uint8_t *__restrict g1, uint8_t *__restrict r1, uint8_t *__restrict a1)
{
#pragma omp simd simdlen(64)
    for (int k = 0; k < 64; k++)
    {
        int16_t c11 = B11 * cb[k];
        int16_t c12 = B12 * cr[k];
        int16_t c22 = c11 + c12;

        /* Declared inside the loop body so each SIMD lane has its own copy. */
        uint16_t r = __builtin_elementwise_add_sat(B00 * y1[k], B02 * cr[k]);
        uint16_t g = __builtin_elementwise_sub_sat(B00 * y1[k], c22);
        uint16_t b = __builtin_elementwise_add_sat(B00 * y1[k], B21 * cb[k]);

        /* Rounding offset, applied to each channel's own value. */
        r = __builtin_elementwise_add_sat(r, 128);
        g = __builtin_elementwise_add_sat(g, 128);
        b = __builtin_elementwise_add_sat(b, 128);

        r1[k] = max(min((int16_t)(r >> 8), 255), 0);
        g1[k] = max(min((int16_t)(g >> 8), 255), 0);
        b1[k] = max(min((int16_t)(b >> 8), 255), 0);
    }
}

/*
 * Convert planar y/u/v input to packed 4-byte (b, g, r, a) output.
 *
 * Iteration i reads luma rows 2*i and 2*i+1 of y (row pitch src_stride_y) and
 * row i of u and v (row pitch src_stride_uv), and writes output rows 2*i and
 * 2*i+1 of dst (row pitch dst_stride, in bytes).  Every chroma sample is
 * duplicated for two horizontally adjacent luma samples, so `width` counts
 * chroma samples per row and `height` counts chroma rows; each chroma sample
 * yields 8 output bytes per row.  The alpha byte is always 255.
 *
 * A row is handled in chunks of 32 chroma samples (vector path), then at most
 * one chunk of 16, then a remainder of fewer than 16 that is staged in a
 * scratch buffer so that nothing is written past the end of the row.
 */
void convert_a420_bt709_0_255_to_bgrx(const uint8_t * y, const uint8_t * u, const uint8_t * v, const int src_stride_y, const int src_stride_uv, uint8_t *__restrict dst, const int dst_stride, const int width, const int height)
{
    uint16_t y0[64] = {0,}, y1[64] = {0,};
    int16_t cb[64] = {0,}, cr[64] = {0,};
    uint8_t r0[64] = {0,}, g0[64] = {0,}, b0[64] = {0,}, a0[64];
    uint8_t r1[64] = {0,}, g1[64] = {0,}, b1[64] = {0,}, a1[64];

    /* Constant opaque alpha for both output rows. */
    memset(a0, 255, sizeof a0);
    memset(a1, 255, sizeof a1);

    for (int i = 0; i < height; i++)
    {
        int j = 0;
        uint8_t *__restrict dstptr = dst + 2 * i * dst_stride;

        /* 32 chroma samples -> 64 pixels on each of the two rows. */
        for (; j < width - width % 32; j += 32, dstptr += 8 * 32)
        {
#pragma omp simd simdlen(64)
            for (int k = 0; k < 64; k++)
            {
                y0[k] = y[(2 * i + 0) * src_stride_y + 2 * j + k];
                y1[k] = y[(2 * i + 1) * src_stride_y + 2 * j + k];
            }
#pragma omp simd simdlen(32)
            for (int k = 0; k < 32; k++)
            {
                cb[2 * k + 0] = u[i * src_stride_uv + j + k] - 128;
                cb[2 * k + 1] = u[i * src_stride_uv + j + k] - 128;
                cr[2 * k + 0] = v[i * src_stride_uv + j + k] - 128;
                cr[2 * k + 1] = v[i * src_stride_uv + j + k] - 128;
            }
            color_process_64(cb, cr, y0, b0, g0, r0, a0);
            interleave64_bgrx_for_a420_bt709_0_255(dstptr + 0 * dst_stride, b0, g0, r0, a0);
            color_process_64(cb, cr, y1, b1, g1, r1, a1);
            interleave64_bgrx_for_a420_bt709_0_255(dstptr + 1 * dst_stride, b1, g1, r1, a1);
        }

        /* 16 chroma samples -> 32 pixels on each of the two rows. */
        for (; j < width - width % 16; j += 16, dstptr += 4 * 32)
        {
#pragma omp simd simdlen(32)
            for (int k = 0; k < 32; k++)
            {
                y0[k] = y[(2 * i + 0) * src_stride_y + 2 * j + k];
                y1[k] = y[(2 * i + 1) * src_stride_y + 2 * j + k];
            }
            for (int k = 0; k < 16; k++)
            {
                cb[2 * k + 0] = u[i * src_stride_uv + j + k] - 128;
                cb[2 * k + 1] = u[i * src_stride_uv + j + k] - 128;
                cr[2 * k + 0] = v[i * src_stride_uv + j + k] - 128;
                cr[2 * k + 1] = v[i * src_stride_uv + j + k] - 128;
            }
            color_process_32(cb, cr, y0, b0, g0, r0, a0);
            interleave32_bgrx_for_a420_bt709_0_255(dstptr + 0 * dst_stride, b0, g0, r0, a0);
            /* Second row uses its own luma (y1), as the 64-wide path does. */
            color_process_32(cb, cr, y1, b1, g1, r1, a1);
            interleave32_bgrx_for_a420_bt709_0_255(dstptr + 1 * dst_stride, b1, g1, r1, a1);
        }

        /* Remainder of fewer than 16 chroma samples. */
        if (j != width)
        {
            uint8_t bgra[4 * 32];
            int K = width - j;

            for (int k = 0; k < 2 * K; k++)
            {
                y0[k] = y[(2 * i + 0) * src_stride_y + 2 * j + k];
                y1[k] = y[(2 * i + 1) * src_stride_y + 2 * j + k];
            }
            for (int k = 0; k < K; k++)
            {
                cb[2 * k + 0] = u[i * src_stride_uv + j + k] - 128;
                cb[2 * k + 1] = u[i * src_stride_uv + j + k] - 128;
                cr[2 * k + 0] = v[i * src_stride_uv + j + k] - 128;
                cr[2 * k + 1] = v[i * src_stride_uv + j + k] - 128;
            }

            /* Both rows are staged in the same 128-byte scratch buffer; only
             * the K * 8 valid bytes are copied out.  dstptr already points at
             * byte offset 8 * j of output row 2*i. */
            color_process_32(cb, cr, y0, b0, g0, r0, a0);
            interleave32_bgrx_for_a420_bt709_0_255(bgra, b0, g0, r0, a0);
            memcpy(dstptr + 0 * dst_stride, bgra, K * 32 / 4);

            color_process_32(cb, cr, y1, b1, g1, r1, a1);
            interleave32_bgrx_for_a420_bt709_0_255(bgra, b1, g1, r1, a1);
            memcpy(dstptr + 1 * dst_stride, bgra, K * 32 / 4);
        }
    }
}

/*
 * Benchmark driver: converts one 1280x720 frame of constant input (all
 * planes set to 128) N times.  Returns EXIT_FAILURE if a buffer cannot be
 * allocated.
 */
int
main (void)
{
    int width = 1280;
    int height = 720;
    const int N = 10000;
    const size_t luma_bytes = (size_t)height * width;
    const size_t chroma_bytes = (size_t)((height + 1) / 2) * ((width + 1) / 2);
    int status = EXIT_FAILURE;

    uint8_t *src_y_data = (uint8_t *)_mm_malloc (luma_bytes, 64);
    uint8_t *src_u_data = (uint8_t *)_mm_malloc (chroma_bytes, 64);
    uint8_t *src_v_data = (uint8_t *)_mm_malloc (chroma_bytes, 64);
    uint8_t *dst_data = (uint8_t *)_mm_malloc (luma_bytes * 4, 64);

    /* Only touch the buffers when every allocation succeeded. */
    if (src_y_data && src_u_data && src_v_data && dst_data)
    {
        memset(src_y_data, 128, luma_bytes);
        memset(src_u_data, 128, chroma_bytes);
        memset(src_v_data, 128, chroma_bytes);

        /* run code */
        for (int i = 0; i < N; i++)
            convert_a420_bt709_0_255_to_bgrx(src_y_data, src_u_data, src_v_data, width, (width + 1) / 2, dst_data, width * 4, width / 2, height / 2);

        status = EXIT_SUCCESS;
    }

    /* cleanup */
    if (src_y_data) _mm_free (src_y_data);
    if (src_u_data) _mm_free (src_u_data);
    if (src_v_data) _mm_free (src_v_data);
    if (dst_data) _mm_free (dst_data);

    return status;
}