Intel® Integrated Performance Primitives

Speeding up H264 encoding with GPU

chrisdo
Hello,

I am currently modifying the IPP H.264 encoder sample to speed it up with CUDA.
My goal is to make Motion Estimation (ME) run on the GPU.

Looking at the code, each MacroBlock of a frame (actually of a slice) is visited for ME and then for encoding inside H264CoreEncoder_Compress_Slice(). So the first part of my job is to run ME for all MBs first, and only then do the encoding.
This is where I'm stuck: when I separate ME and encoding, the final encoded video is not as expected (it's all grey...).

Here is my code:

[cpp]Status H264ENC_MAKE_NAME(H264CoreEncoder_Compress_Slice_GPU)(
void* state,
H264SliceType *curr_slice,
bool is_first_mb)
{
H264CoreEncoderType* core_enc = (H264CoreEncoderType *)state;
H264CurrentMacroblockDescriptorType &cur_mb = curr_slice->m_cur_mb;
H264BsRealType* pBitstream = (H264BsRealType *)curr_slice->m_pbitstream;
Ipp32s slice_num = curr_slice->m_slice_number;
EnumSliceType slice_type = curr_slice->m_slice_type;
Ipp8u uUsePCM = 0;

Ipp8u *pStartBits;
Ipp32u uStartBitOffset;

Ipp32u uRecompressMB;
Ipp8u iLastQP;
Ipp32u uSaved_Skip_Run;

Ipp8u bSeenFirstMB = false;

Status status = UMC_OK;

Ipp32u uNumMBs = core_enc->m_HeightInMBs * core_enc->m_WidthInMBs;
Ipp32u uFirstMB = core_enc->m_field_index * uNumMBs;

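// Per-MB snapshot of the ME/mode-decision results, filled by the first loop
// below and restored into cur_mb by the second (encoding) loop.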
H264CurrentMacroblockDescriptorType* mbTab = new H264CurrentMacroblockDescriptorType[ uNumMBs ];
for( unsigned int idx = 0 ; idx < uNumMBs; idx++ )
{
mbTab[ idx ].LocalMacroblockInfo = new H264MacroblockLocalInfo;
mbTab[ idx ].LocalMacroblockPairInfo = new H264MacroblockLocalInfo;
mbTab[ idx ].GlobalMacroblockInfo = new H264MacroblockGlobalInfo;
mbTab[ idx ].GlobalMacroblockPairInfo = new H264MacroblockGlobalInfo;
mbTab[ idx ].MacroblockCoeffsInfo = new H264MacroblockCoeffsInfo;
mbTab[ idx ].intra_types = new T_AIMode;
mbTab[ idx ].MVs[0] = new H264MacroblockMVs;
mbTab[ idx ].MVs[1] = new H264MacroblockMVs;
mbTab[ idx ].MVs[2] = new H264MacroblockMVs;
mbTab[ idx ].MVs[3] = new H264MacroblockMVs;
mbTab[ idx ].RefIdxs[0] = new H264MacroblockRefIdxs;
mbTab[ idx ].RefIdxs[1] = new H264MacroblockRefIdxs;
}

Ipp32s MBYAdjust = 0;
if (core_enc->m_field_index)
{
MBYAdjust = core_enc->m_HeightInMBs;
}

curr_slice->m_InitialOffset = core_enc->m_InitialOffsets[core_enc->m_pCurrentFrame->m_bottom_field_flag[core_enc->m_field_index]];
curr_slice->m_is_cur_mb_field = core_enc->m_pCurrentFrame->m_PictureStructureForDec < FRM_STRUCTURE;
curr_slice->m_is_cur_mb_bottom_field = core_enc->m_pCurrentFrame->m_bottom_field_flag[core_enc->m_field_index] == 1;

curr_slice->m_use_transform_for_intra_decision = 1;

// loop over all MBs in the picture
// MB Motion Estimation is done here
for (Ipp32u uMB = uFirstMB; uMB < uFirstMB + uNumMBs; uMB++)
{
// Is this MB in the current slice? If not, move on...
if (core_enc->m_pCurrentFrame->m_mbinfo.mbs[uMB].slice_id != slice_num)
{
continue;
}
else if (!bSeenFirstMB)
{
// Reset xpos and ypos in framedata struct
// This is necessary because the same slice may be recoded multiple times.

// reset intra MB counter per slice
curr_slice->m_Intra_MB_Counter = 0;
curr_slice->m_MB_Counter = 0;

// Fill in the first mb in slice field in the slice header.
curr_slice->m_first_mb_in_slice = is_first_mb ? 0 : uMB - uFirstMB;

// Fill in the current deblocking filter parameters.
curr_slice->m_slice_alpha_c0_offset = (Ipp8s)core_enc->m_info.deblocking_filter_alpha;
curr_slice->m_slice_beta_offset = (Ipp8s)core_enc->m_info.deblocking_filter_beta;
curr_slice->m_disable_deblocking_filter_idc = core_enc->m_info.deblocking_filter_idc;
curr_slice->m_cabac_init_idc = core_enc->m_info.cabac_init_idc;

// Write a slice header
H264ENC_MAKE_NAME(H264BsReal_PutSliceHeader)(
pBitstream,
core_enc->m_SliceHeader,
core_enc->m_PicParamSet,
core_enc->m_SeqParamSet,
core_enc->m_PicClass,
curr_slice);
bSeenFirstMB = true;

// Fill in the correct value for m_iLastXmittedQP, used to correctly code
// the per MB QP Delta
curr_slice->m_iLastXmittedQP = core_enc->m_PicParamSet.pic_init_qp + curr_slice->m_slice_qp_delta;
Ipp32s SliceQPy = curr_slice->m_iLastXmittedQP;

if (core_enc->m_info.entropy_coding_mode)
{
if (slice_type==INTRASLICE)
H264ENC_MAKE_NAME(H264BsReal_InitializeContextVariablesIntra_CABAC)(
pBitstream,
SliceQPy);
else
H264ENC_MAKE_NAME(H264BsReal_InitializeContextVariablesInter_CABAC)(
pBitstream,
SliceQPy,
curr_slice->m_cabac_init_idc);
}

// Initialize the MB skip run counter
curr_slice->m_uSkipRun = 0;
}

cur_mb.lambda = lambda_sq[curr_slice->m_iLastXmittedQP];
cur_mb.uMB = uMB;
cur_mb.chroma_format_idc = core_enc->m_PicParamSet.chroma_format_idc;
cur_mb.mbPtr = core_enc->m_pCurrentFrame->m_pYPlane + core_enc->m_pMBOffsets[uMB].uLumaOffset[core_enc->m_is_cur_pic_afrm][curr_slice->m_is_cur_mb_field];
cur_mb.mbPitchPixels = core_enc->m_pCurrentFrame->m_pitchPixels << curr_slice->m_is_cur_mb_field;
cur_mb.uMBx = uMB % core_enc->m_WidthInMBs;
cur_mb.uMBy = uMB / core_enc->m_WidthInMBs - MBYAdjust;
H264ENC_MAKE_NAME(H264CoreEncoder_UpdateCurrentMBInfo)(state, curr_slice);
cur_mb.lumaQP = getLumaQP(cur_mb.LocalMacroblockInfo->QP, core_enc->m_PicParamSet.bit_depth_luma);
cur_mb.lumaQP51 = getLumaQP51(cur_mb.LocalMacroblockInfo->QP, core_enc->m_PicParamSet.bit_depth_luma);
cur_mb.chromaQP = getChromaQP(cur_mb.LocalMacroblockInfo->QP, core_enc->m_PicParamSet.chroma_qp_index_offset, core_enc->m_SeqParamSet.bit_depth_chroma);
pSetMB8x8TSFlag(curr_slice->m_cur_mb.GlobalMacroblockInfo, 0);
curr_slice->m_MB_Counter++;
H264BsBase_GetState(&pBitstream->m_base, &pStartBits, &uStartBitOffset);
iLastQP = curr_slice->m_iLastXmittedQP;
uSaved_Skip_Run = curr_slice->m_uSkipRun; // To restore it if we recompress
uUsePCM = 0; // Don't use the PCM mode initially.
do
{ // this is to recompress MBs that are too big.
H264ENC_MAKE_NAME(H264CoreEncoder_MB_Decision)(state, curr_slice, uMB);

mbTab[ uMB-uFirstMB ].uMB = cur_mb.uMB;
mbTab[ uMB-uFirstMB ].uMBpair = cur_mb.uMBpair;
mbTab[ uMB-uFirstMB ].uMBx = cur_mb.uMBx;
mbTab[ uMB-uFirstMB ].uMBy = cur_mb.uMBy;
mbTab[ uMB-uFirstMB ].mbPtr = cur_mb.mbPtr;
mbTab[ uMB-uFirstMB ].mbPitchPixels = cur_mb.mbPitchPixels;
mbTab[ uMB-uFirstMB ].lambda = cur_mb.lambda;
mbTab[ uMB-uFirstMB ].chroma_format_idc = cur_mb.chroma_format_idc;
mbTab[ uMB-uFirstMB ].lumaQP = cur_mb.lumaQP;
mbTab[ uMB-uFirstMB ].lumaQP51 = cur_mb.lumaQP51;
mbTab[ uMB-uFirstMB ].chromaQP = cur_mb.chromaQP;
memcpy( mbTab[ uMB-uFirstMB ].LocalMacroblockInfo, cur_mb.LocalMacroblockInfo, sizeof(H264MacroblockLocalInfo) );
memcpy( mbTab[ uMB-uFirstMB ].LocalMacroblockPairInfo, cur_mb.LocalMacroblockPairInfo, sizeof(H264MacroblockLocalInfo) );
memcpy( mbTab[ uMB-uFirstMB ].GlobalMacroblockInfo, cur_mb.GlobalMacroblockInfo, sizeof(H264MacroblockGlobalInfo) );
memcpy( mbTab[ uMB-uFirstMB ].GlobalMacroblockPairInfo, cur_mb.GlobalMacroblockPairInfo, sizeof(H264MacroblockGlobalInfo) );
memcpy( mbTab[ uMB-uFirstMB ].MacroblockCoeffsInfo, cur_mb.MacroblockCoeffsInfo, sizeof(H264MacroblockCoeffsInfo) );
mbTab[ uMB-uFirstMB ].m_uIntraCBP4x4 = cur_mb.m_uIntraCBP4x4;
memcpy( mbTab[ uMB-uFirstMB ].m_iNumCoeffs4x4, cur_mb.m_iNumCoeffs4x4, 16*sizeof(Ipp32s) );
memcpy( mbTab[ uMB-uFirstMB ].m_iLastCoeff4x4, cur_mb.m_iLastCoeff4x4, 16*sizeof(Ipp32s) );
mbTab[ uMB-uFirstMB ].m_uIntraCBP8x8 = cur_mb.m_uIntraCBP8x8;
memcpy( mbTab[ uMB-uFirstMB ].m_iNumCoeffs8x8, cur_mb.m_iNumCoeffs8x8, 16*sizeof(Ipp32s) );
memcpy( mbTab[ uMB-uFirstMB ].m_iLastCoeff8x8, cur_mb.m_iLastCoeff8x8, 16*sizeof(Ipp32s) );
memcpy( mbTab[ uMB-uFirstMB ].intra_types, cur_mb.intra_types, sizeof(T_AIMode) );
mbTab[ uMB-uFirstMB ].mb4x4 = cur_mb.mb4x4;
mbTab[ uMB-uFirstMB ].mb8x8 = cur_mb.mb8x8;
mbTab[ uMB-uFirstMB ].mb16x16 = cur_mb.mb16x16;
mbTab[ uMB-uFirstMB ].mbInter = cur_mb.mbInter;
mbTab[ uMB-uFirstMB ].mbChromaInter = cur_mb.mbChromaInter;
mbTab[ uMB-uFirstMB ].mbChromaIntra = cur_mb.mbChromaIntra;

memcpy( mbTab[ uMB-uFirstMB ].MVs[0], cur_mb.MVs[0], sizeof(H264MacroblockMVs) );
memcpy( mbTab[ uMB-uFirstMB ].MVs[1], cur_mb.MVs[1], sizeof(H264MacroblockMVs) );
memcpy( mbTab[ uMB-uFirstMB ].MVs[2], cur_mb.MVs[2], sizeof(H264MacroblockMVs) );
memcpy( mbTab[ uMB-uFirstMB ].MVs[3], cur_mb.MVs[3], sizeof(H264MacroblockMVs) );
memcpy( mbTab[ uMB-uFirstMB ].RefIdxs[0], cur_mb.RefIdxs[0], sizeof(H264MacroblockRefIdxs) );
memcpy( mbTab[ uMB-uFirstMB ].RefIdxs[1], cur_mb.RefIdxs[1], sizeof(H264MacroblockRefIdxs) );
mbTab[ uMB-uFirstMB ].MacroblockNeighbours = cur_mb.MacroblockNeighbours;
mbTab[ uMB-uFirstMB ].BlockNeighbours = cur_mb.BlockNeighbours;

uRecompressMB = 0;
} while (uRecompressMB); // End of the MB recompression loop.
}

bSeenFirstMB = false;





// loop over all MBs in the picture
// encoding is done here
for (Ipp32u uMB = uFirstMB; uMB < uFirstMB + uNumMBs; uMB++)
{
// Is this MB in the current slice? If not, move on...
if (core_enc->m_pCurrentFrame->m_mbinfo.mbs[uMB].slice_id != slice_num)
{
continue;
}
else if (!bSeenFirstMB)
{
// Reset xpos and ypos in framedata struct
// This is necessary because the same slice may be recoded multiple times.

// reset intra MB counter per slice
curr_slice->m_Intra_MB_Counter = 0;
curr_slice->m_MB_Counter = 0;

// Fill in the first mb in slice field in the slice header.
curr_slice->m_first_mb_in_slice = is_first_mb ? 0 : uMB - uFirstMB;

// Fill in the current deblocking filter parameters.
curr_slice->m_slice_alpha_c0_offset = (Ipp8s)core_enc->m_info.deblocking_filter_alpha;
curr_slice->m_slice_beta_offset = (Ipp8s)core_enc->m_info.deblocking_filter_beta;
curr_slice->m_disable_deblocking_filter_idc = core_enc->m_info.deblocking_filter_idc;
curr_slice->m_cabac_init_idc = core_enc->m_info.cabac_init_idc;

bSeenFirstMB = true;

// Fill in the correct value for m_iLastXmittedQP, used to correctly code
// the per MB QP Delta
curr_slice->m_iLastXmittedQP = core_enc->m_PicParamSet.pic_init_qp + curr_slice->m_slice_qp_delta;
Ipp32s SliceQPy = curr_slice->m_iLastXmittedQP;

if (core_enc->m_info.entropy_coding_mode)
{
if (slice_type==INTRASLICE)
H264ENC_MAKE_NAME(H264BsReal_InitializeContextVariablesIntra_CABAC)(
pBitstream,
SliceQPy);
else
H264ENC_MAKE_NAME(H264BsReal_InitializeContextVariablesInter_CABAC)(
pBitstream,
SliceQPy,
curr_slice->m_cabac_init_idc);
}

// Initialize the MB skip run counter
curr_slice->m_uSkipRun = 0;
}

cur_mb.lambda = lambda_sq[curr_slice->m_iLastXmittedQP];
cur_mb.uMB = uMB;
cur_mb.chroma_format_idc = core_enc->m_PicParamSet.chroma_format_idc;
cur_mb.mbPtr = core_enc->m_pCurrentFrame->m_pYPlane + core_enc->m_pMBOffsets[uMB].uLumaOffset[core_enc->m_is_cur_pic_afrm][curr_slice->m_is_cur_mb_field];
cur_mb.mbPitchPixels = core_enc->m_pCurrentFrame->m_pitchPixels << curr_slice->m_is_cur_mb_field;
cur_mb.uMBx = uMB % core_enc->m_WidthInMBs;
cur_mb.uMBy = uMB / core_enc->m_WidthInMBs - MBYAdjust;
H264ENC_MAKE_NAME(H264CoreEncoder_UpdateCurrentMBInfo)(state, curr_slice);
cur_mb.lumaQP = getLumaQP(cur_mb.LocalMacroblockInfo->QP, core_enc->m_PicParamSet.bit_depth_luma);
cur_mb.lumaQP51 = getLumaQP51(cur_mb.LocalMacroblockInfo->QP, core_enc->m_PicParamSet.bit_depth_luma);
cur_mb.chromaQP = getChromaQP(cur_mb.LocalMacroblockInfo->QP, core_enc->m_PicParamSet.chroma_qp_index_offset, core_enc->m_SeqParamSet.bit_depth_chroma);
pSetMB8x8TSFlag(curr_slice->m_cur_mb.GlobalMacroblockInfo, 0);
curr_slice->m_MB_Counter++;
H264BsBase_GetState(&pBitstream->m_base, &pStartBits, &uStartBitOffset);
iLastQP = curr_slice->m_iLastXmittedQP;
uSaved_Skip_Run = curr_slice->m_uSkipRun; // To restore it if we recompress
uUsePCM = 0; // Don't use the PCM mode initially.
do
{ // this is to recompress MBs that are too big.

// we restore cur_mb
cur_mb.uMB = mbTab[ uMB-uFirstMB ].uMB;
cur_mb.uMBpair = mbTab[ uMB-uFirstMB ].uMBpair;
cur_mb.uMBx = mbTab[ uMB-uFirstMB ].uMBx;
cur_mb.uMBy = mbTab[ uMB-uFirstMB ].uMBy;
cur_mb.mbPtr = mbTab[ uMB-uFirstMB ].mbPtr;
cur_mb.mbPitchPixels = mbTab[ uMB-uFirstMB ].mbPitchPixels;
cur_mb.lambda = mbTab[ uMB-uFirstMB ].lambda;
cur_mb.chroma_format_idc = mbTab[ uMB-uFirstMB ].chroma_format_idc;
cur_mb.lumaQP = mbTab[ uMB-uFirstMB ].lumaQP;
cur_mb.lumaQP51 = mbTab[ uMB-uFirstMB ].lumaQP51;
cur_mb.chromaQP = mbTab[ uMB-uFirstMB ].chromaQP;
memcpy( cur_mb.LocalMacroblockInfo, mbTab[ uMB-uFirstMB ].LocalMacroblockInfo, sizeof(H264MacroblockLocalInfo) );
memcpy( cur_mb.LocalMacroblockPairInfo, mbTab[ uMB-uFirstMB ].LocalMacroblockPairInfo, sizeof(H264MacroblockLocalInfo) );
memcpy( cur_mb.GlobalMacroblockInfo, mbTab[ uMB-uFirstMB ].GlobalMacroblockInfo, sizeof(H264MacroblockGlobalInfo) );
memcpy( cur_mb.GlobalMacroblockPairInfo, mbTab[ uMB-uFirstMB ].GlobalMacroblockPairInfo, sizeof(H264MacroblockGlobalInfo) );
memcpy( cur_mb.MacroblockCoeffsInfo, mbTab[ uMB-uFirstMB ].MacroblockCoeffsInfo, sizeof(H264MacroblockCoeffsInfo) );
cur_mb.m_uIntraCBP4x4 = mbTab[ uMB-uFirstMB ].m_uIntraCBP4x4;
memcpy( cur_mb.m_iNumCoeffs4x4, mbTab[ uMB-uFirstMB ].m_iNumCoeffs4x4, 16*sizeof(Ipp32s) );
memcpy( cur_mb.m_iLastCoeff4x4, mbTab[ uMB-uFirstMB ].m_iLastCoeff4x4, 16*sizeof(Ipp32s) );
cur_mb.m_uIntraCBP8x8 = mbTab[ uMB-uFirstMB ].m_uIntraCBP8x8;
memcpy( cur_mb.m_iNumCoeffs8x8, mbTab[ uMB-uFirstMB ].m_iNumCoeffs8x8, 16*sizeof(Ipp32s) );
memcpy( cur_mb.m_iLastCoeff8x8, mbTab[ uMB-uFirstMB ].m_iLastCoeff8x8, 16*sizeof(Ipp32s) );
memcpy( cur_mb.intra_types, mbTab[ uMB-uFirstMB ].intra_types, sizeof(T_AIMode) );
cur_mb.mb4x4 = mbTab[ uMB-uFirstMB ].mb4x4;
cur_mb.mb8x8 = mbTab[ uMB-uFirstMB ].mb8x8;
cur_mb.mb16x16 = mbTab[ uMB-uFirstMB ].mb16x16;
cur_mb.mbInter = mbTab[ uMB-uFirstMB ].mbInter;
cur_mb.mbChromaInter = mbTab[ uMB-uFirstMB ].mbChromaInter;
cur_mb.mbChromaIntra = mbTab[ uMB-uFirstMB ].mbChromaIntra;

memcpy( cur_mb.MVs[0], mbTab[ uMB-uFirstMB ].MVs[0], sizeof(H264MacroblockMVs) );
memcpy( cur_mb.MVs[1], mbTab[ uMB-uFirstMB ].MVs[1], sizeof(H264MacroblockMVs) );
memcpy( cur_mb.MVs[2], mbTab[ uMB-uFirstMB ].MVs[2], sizeof(H264MacroblockMVs) );
memcpy( cur_mb.MVs[3], mbTab[ uMB-uFirstMB ].MVs[3], sizeof(H264MacroblockMVs) );
memcpy( cur_mb.RefIdxs[0], mbTab[ uMB-uFirstMB ].RefIdxs[0], sizeof(H264MacroblockRefIdxs) );
memcpy( cur_mb.RefIdxs[1], mbTab[ uMB-uFirstMB ].RefIdxs[1], sizeof(H264MacroblockRefIdxs) );
cur_mb.MacroblockNeighbours = mbTab[ uMB-uFirstMB ].MacroblockNeighbours;
cur_mb.BlockNeighbours = mbTab[ uMB-uFirstMB ].BlockNeighbours;

Ipp32s mb_bits;
Ipp32s bit_offset;
if (core_enc->m_PicParamSet.entropy_coding_mode)
{
bit_offset = pBitstream->m_base.m_nReadyBits;
if (pBitstream->m_base.m_nReadyBits == 9) bit_offset = 8;
}
// Code the macroblock, all planes
cur_mb.LocalMacroblockInfo->cbp_bits = 0;
cur_mb.LocalMacroblockInfo->cbp_bits_chroma = 0;
uSaved_Skip_Run = curr_slice->m_uSkipRun;
H264ENC_MAKE_NAME(H264CoreEncoder_CEncAndRecMB)(state, curr_slice);

mb_bits = 0;
status = H264ENC_MAKE_NAME(H264CoreEncoder_Put_MB_Real)(state, curr_slice);
if (status != UMC_OK)
goto done;

Ipp8u *pEndBits;
Ipp32u uEndBitOffset;
H264BsBase_GetState(&pBitstream->m_base, &pEndBits, &uEndBitOffset);

mb_bits += (Ipp32u) (pEndBits - pStartBits)*8;
if (uEndBitOffset >= uStartBitOffset)
mb_bits += uEndBitOffset - uStartBitOffset;
else
mb_bits -= uStartBitOffset - uEndBitOffset;

// Should not recompress for CABAC
if (!core_enc->m_PicParamSet.entropy_coding_mode && (mb_bits > MB_RECODE_THRESH) && core_enc->m_info.rate_controls.method == H264_RCM_QUANT)
{
// OK, this is bad, it's not compressing very much!!!
// TBD: Tune this decision to QP... Higher QPs will progressively trash PSNR,
// so if they are still using a lot of bits, then PCM coding is extra attractive.

// We're going to be recoding this MB, so reset some stuff.
H264BsBase_SetState(&pBitstream->m_base, pStartBits, uStartBitOffset);
// Zero out unused bits in buffer before OR in next op
// This removes dependency on buffer being zeroed out.
*pStartBits = (Ipp8u)((*pStartBits >> (8-uStartBitOffset)) << (8-uStartBitOffset));

curr_slice->m_iLastXmittedQP = iLastQP; // Restore the last xmitted QP
curr_slice->m_uSkipRun = uSaved_Skip_Run; // Restore the skip run

// If the QP has only been adjusted up 0 or 1 times, and QP != 51
if (((cur_mb.LocalMacroblockInfo->QP -
core_enc->m_PicParamSet.pic_init_qp + curr_slice->m_slice_qp_delta) < 2) &&
(cur_mb.LocalMacroblockInfo->QP != 51))
{
// Quantize more and try again!
cur_mb.LocalMacroblockInfo->QP++;
uRecompressMB = 1;
}
else
{
// Code this block as a PCM MB next time around.
uUsePCM = 1;
uRecompressMB = 0;
// Reset the MB QP value to the "last transmitted QP"
// Since no DeltaQP will be transmitted for a PCM block
// This is important, since the Loop Filter will use the
// this value in filtering this MB
cur_mb.LocalMacroblockInfo->QP = curr_slice->m_iLastXmittedQP;
}

}
else
{
uRecompressMB = 0;
}
} while (uRecompressMB); // End of the MB recompression loop.

// If the above MB encoding failed to efficiently predict the MB, then
// code it as raw pixels using the mb_type = PCM
if (uUsePCM)
{
cur_mb.GlobalMacroblockInfo->mbtype = MBTYPE_PCM;
cur_mb.LocalMacroblockInfo->cbp_luma = 0xffff;

memset(cur_mb.MacroblockCoeffsInfo->numCoeff, 16, 24);

Ipp32s k; // block number, 0 to 15
for (k = 0; k < 16; k++) {
cur_mb.intra_types[k] = 2;
cur_mb.MVs[LIST_0]->MotionVectors[k] = null_mv;
cur_mb.MVs[LIST_1]->MotionVectors[k] = null_mv;
cur_mb.RefIdxs[LIST_0]->RefIdxs[k] = -1;
cur_mb.RefIdxs[LIST_1]->RefIdxs[k] = -1;
}

H264ENC_MAKE_NAME(H264CoreEncoder_Put_MBHeader_Real)(state, curr_slice); // PCM values are written in the MB Header.
}

if (core_enc->m_PicParamSet.entropy_coding_mode){
H264ENC_MAKE_NAME(H264BsReal_EncodeFinalSingleBin_CABAC)(
pBitstream,
(uMB == uFirstMB + uNumMBs - 1) ||
(core_enc->m_pCurrentFrame->m_mbinfo.mbs[uMB + 1].slice_id != slice_num));
H264ENC_MAKE_NAME(H264CoreEncoder_ReconstuctCBP)(&cur_mb);
}
} // loop over MBs

#ifndef NO_FINAL_SKIP_RUN
// Check if the last N MBs were skip blocks. If so, write a final skip run
// NOTE! This is _optional_. The encoder is not required to do this, and
// decoders need to be able to handle it either way.

// Even though skip runs are not written for I Slices, m_uSkipRun can only be
// non-zero for non-I slices, so the following test is OK.
if (curr_slice->m_uSkipRun !=0 && core_enc->m_info.entropy_coding_mode==0) {
H264ENC_MAKE_NAME(H264BsReal_PutVLCCode)(pBitstream, curr_slice->m_uSkipRun);
}

#endif // NO_FINAL_SKIP_RUN

// save the frame class

done:
if (core_enc->m_PicParamSet.entropy_coding_mode) {
H264ENC_MAKE_NAME(H264BsReal_TerminateEncode_CABAC)(pBitstream);
}
else {
H264BsBase_WriteTrailingBits(&pBitstream->m_base);
}

// Release the per-MB snapshot table.
for( unsigned int idx = 0 ; idx < uNumMBs; idx++ )
{
delete mbTab[ idx ].LocalMacroblockInfo;
delete mbTab[ idx ].LocalMacroblockPairInfo;
delete mbTab[ idx ].GlobalMacroblockInfo;
delete mbTab[ idx ].GlobalMacroblockPairInfo;
delete mbTab[ idx ].MacroblockCoeffsInfo;
delete mbTab[ idx ].intra_types;
delete mbTab[ idx ].MVs[0];
delete mbTab[ idx ].MVs[1];
delete mbTab[ idx ].MVs[2];
delete mbTab[ idx ].MVs[3];
delete mbTab[ idx ].RefIdxs[0];
delete mbTab[ idx ].RefIdxs[1];
}
delete[] mbTab;

return status;

}[/cpp]

Do you have any idea what is wrong?
andrewk88
>> This is where I'm stuck: when I separate ME and encoding, the final encoded video is not as expected (it's all grey...).


Regardless of your code, I'm afraid there is a conceptual problem with your approach: you can't separate "ME and encoding", because the MB mode decision aims at minimizing the encoding cost, usually expressed as "D + lambda*R", which trades off video quality loss against the number of bits used for encoding. In other words, if you want to encode the current MB in a reasonable/"optimal" way, you have to look at the mode of the previous MB that has already been encoded, and in particular at the value of the MV predictor.
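To illustrate the coupling, here is a rough sketch (made-up names, not the IPP/UMC API): the rate part of "D + lambda*R" prices the difference between a candidate MV and the predictor taken from already-encoded neighbours, so the choice for the current MB cannot be made before those neighbours are final.

[cpp]#include <climits>
#include <cstdint>

// Illustrative sketch only (hypothetical names, not the IPP/UMC API).
// The cost of an inter candidate is J = D + lambda * R, where R is the
// price of coding (mv - predictor); the predictor comes from neighbouring
// MBs whose final modes/MVs must already be known.
struct MV { int16_t x, y; };

// Rough rate estimate: length in bits of the signed Exp-Golomb code
// for one motion-vector-difference component.
static int seBits(int v)
{
    unsigned codeNum = (v <= 0) ? (unsigned)(-2 * v) : (unsigned)(2 * v - 1);
    int len = 1;
    while (codeNum + 1 >= (2u << (len / 2)))
        len += 2;
    return len;
}

// Pick the candidate MV minimising D + lambda * R for one macroblock.
// sad[i] is the distortion of candidate i (this part could come from the GPU);
// pred is the MV predictor derived from the already-encoded neighbours.
MV chooseInterMv(const MV *cand, const int *sad, int numCand,
                 const MV &pred, int lambda)
{
    int bestCost = INT_MAX;
    MV best = cand[0];
    for (int i = 0; i < numCand; i++) {
        int rate = seBits(cand[i].x - pred.x) + seBits(cand[i].y - pred.y);
        int cost = sad[i] + lambda * rate;   // J = D + lambda * R
        if (cost < bestCost) {
            bestCost = cost;
            best = cand[i];
        }
    }
    return best;
}[/cpp]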
Hope it helps,

AndrewK

PS. I'd recommend reading some literature on H.264 inter mode decision. Depending on what exactly your ME algorithm is, you might be able to offload the low-level, processing-intensive part of ME to the GPU, but try to keep the "encoding part" and the mode decision on a single CPU / multicore with shared memory. Alternatively, you might attempt multi-slice encoding, but that has its own challenges.
chrisdo
Quoting - andrewk88


Hello Andrew, thanks for your answer!

It seems I was a little too optimistic (and hadn't read enough on the subject)... :D

Do you think I could still gain something by doing the SAD calculations for the macroblocks of a frame on the GPU, before doing the real ME and encoding?
andrewk88
Quoting - chrisdo


Definitely, you could do full-pel SADs on the GPU, assuming some knowledge of "promising" motion vector candidates/predictors, unless you can afford to run a Full Search on the GPU and send a portion of the best SADs back to the CPU to make the MB mode decision there. Depending on your ME algorithm, those MV candidates/predictors are very often derived from previously encoded MBs, so you would need to establish a CPU-GPU communication protocol on a per-MB basis, and I'm not sure whether that is your intention and how beneficial it would be compared to processing and communication on a frame-level basis (half-pel interpolation would be a good candidate for offloading on a frame basis).
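As a rough sketch of the part that maps well to the GPU (made-up names, nothing IPP-specific), the per-MB full-pel SAD evaluation over a list of candidate MVs looks like the loops below; on the GPU these loops would typically become a kernel, for example one thread block per MB and one candidate per thread.

[cpp]#include <cstdint>
#include <cstdlib>

// Illustrative sketch only (hypothetical names): the full-pel SAD work that
// could be offloaded. Written here as plain CPU code.
struct MV { int16_t x, y; };

// 16x16 sum of absolute differences between the current MB and a reference block.
static int sad16x16(const uint8_t *cur, int curPitch,
                    const uint8_t *ref, int refPitch)
{
    int sad = 0;
    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
            sad += abs((int)cur[y * curPitch + x] - (int)ref[y * refPitch + x]);
    return sad;
}

// Evaluate each candidate MV for one MB; candidates are assumed to stay
// inside the (padded) reference plane. The resulting SAD table is what would
// be sent back to the CPU for the mode decision.
void fullPelSads(const uint8_t *curMb, int curPitch,
                 const uint8_t *refPlane, int refPitch,
                 int mbX, int mbY,              // MB origin in pixels
                 const MV *cand, int numCand, int *sadOut)
{
    for (int i = 0; i < numCand; i++) {
        const uint8_t *ref = refPlane + (mbY + cand[i].y) * refPitch
                                      + (mbX + cand[i].x);
        sadOut[i] = sad16x16(curMb, curPitch, ref, refPitch);
    }
}[/cpp]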

Best regards,

Andrew
Priya_Natarajan
Quoting - andrewk88
Refer to the example/explanation in this article:
http://ieeexplore.org/stamp/stamp.jsp?tp=&arnumber=424284972

