libbpg/x265/source/encoder/search.cpp

3558 lines
148 KiB
C++
Raw Normal View History

2015-10-27 10:46:00 +00:00
/*****************************************************************************
* Copyright (C) 2013 x265 project
*
* Authors: Steve Borho <steve@borho.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at license @ x265.com.
*****************************************************************************/
#include "common.h"
#include "primitives.h"
#include "picyuv.h"
#include "cudata.h"
#include "search.h"
#include "entropy.h"
#include "rdcost.h"
#include "analysis.h" // TLD
#include "framedata.h"
using namespace X265_NS;
#if _MSC_VER
#pragma warning(disable: 4800) // 'uint8_t' : forcing value to bool 'true' or 'false' (performance warning)
#pragma warning(disable: 4244) // '=' : conversion from 'int' to 'uint8_t', possible loss of data)
#pragma warning(disable: 4127) // conditional expression is constant
#endif
#define MVP_IDX_BITS 1
/* Static table of MAX_CU_SIZE int16_t entries; the aggregate initializer { 0 } leaves every
 * element zero. Declared through the ALIGN_VAR_32 macro (presumably 32-byte alignment -- confirm
 * against the macro definition). */
ALIGN_VAR_32(const int16_t, Search::zeroShort[MAX_CU_SIZE]) = { 0 };
/* Start with every buffer pointer null and the m_rqt table zero-filled.
 * The real allocations are performed later by initSearch(). */
Search::Search()
{
    /* externally supplied context, bound later */
    m_param = NULL;
    m_slice = NULL;
    m_frame = NULL;

    /* residual quad-tree scratch data */
    m_numLayers = 0;
    memset(m_rqt, 0, sizeof(m_rqt));

    for (int plane = 0; plane < 3; plane++)
    {
        m_qtTempCbf[plane] = NULL;
        m_qtTempTransformSkipFlag[plane] = NULL;
    }

    /* intra prediction work buffers */
    m_intraPred = NULL;
    m_intraPredAngs = NULL;
    m_fencScaled = NULL;
    m_fencTransposed = NULL;

    /* transform-skip work buffers */
    m_tsCoeff = NULL;
    m_tsResidual = NULL;
    m_tsRecon = NULL;
}
bool Search::initSearch(const x265_param& param, ScalingList& scalingList)
{
uint32_t maxLog2CUSize = g_log2Size[param.maxCUSize];
m_param = &param;
m_bEnableRDOQ = !!param.rdoqLevel;
m_bFrameParallel = param.frameNumThreads > 1;
m_numLayers = g_log2Size[param.maxCUSize] - 2;
m_rdCost.setPsyRdScale(param.psyRd);
m_me.init(param.searchMethod, param.subpelRefine, param.internalCsp);
bool ok = m_quant.init(param.rdoqLevel, param.psyRdoq, scalingList, m_entropyCoder);
if (m_param->noiseReductionIntra || m_param->noiseReductionInter || m_param->rc.vbvBufferSize)
ok &= m_quant.allocNoiseReduction(param);
ok &= Predict::allocBuffers(param.internalCsp); /* sets m_hChromaShift & m_vChromaShift */
/* When frame parallelism is active, only 'refLagPixels' of reference frames will be guaranteed
* available for motion reference. See refLagRows in FrameEncoder::compressCTURows() */
m_refLagPixels = m_bFrameParallel ? param.searchRange : param.sourceHeight;
uint32_t sizeL = 1 << (maxLog2CUSize * 2);
uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
uint32_t numPartitions = 1 << (maxLog2CUSize - LOG2_UNIT_SIZE) * 2;
/* these are indexed by qtLayer (log2size - 2) so nominally 0=4x4, 1=8x8, 2=16x16, 3=32x32
* the coeffRQT and reconQtYuv are allocated to the max CU size at every depth. The parts
* which are reconstructed at each depth are valid. At the end, the transform depth table
* is walked and the coeff and recon at the correct depths are collected */
for (uint32_t i = 0; i <= m_numLayers; i++)
{
CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL + sizeC * 2);
m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[0] + sizeL;
m_rqt[i].coeffRQT[2] = m_rqt[i].coeffRQT[0] + sizeL + sizeC;
ok &= m_rqt[i].reconQtYuv.create(g_maxCUSize, param.internalCsp);
ok &= m_rqt[i].resiQtYuv.create(g_maxCUSize, param.internalCsp);
}
/* the rest of these buffers are indexed per-depth */
for (uint32_t i = 0; i <= g_maxCUDepth; i++)
{
int cuSize = g_maxCUSize >> i;
ok &= m_rqt[i].tmpResiYuv.create(cuSize, param.internalCsp);
ok &= m_rqt[i].tmpPredYuv.create(cuSize, param.internalCsp);
ok &= m_rqt[i].bidirPredYuv[0].create(cuSize, param.internalCsp);
ok &= m_rqt[i].bidirPredYuv[1].create(cuSize, param.internalCsp);
}
CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions * 3);
m_qtTempCbf[1] = m_qtTempCbf[0] + numPartitions;
m_qtTempCbf[2] = m_qtTempCbf[0] + numPartitions * 2;
CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions * 3);
m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[0] + numPartitions;
m_qtTempTransformSkipFlag[2] = m_qtTempTransformSkipFlag[0] + numPartitions * 2;
CHECKED_MALLOC(m_intraPred, pixel, (32 * 32) * (33 + 3));
m_fencScaled = m_intraPred + 32 * 32;
m_fencTransposed = m_fencScaled + 32 * 32;
m_intraPredAngs = m_fencTransposed + 32 * 32;
CHECKED_MALLOC(m_tsCoeff, coeff_t, MAX_TS_SIZE * MAX_TS_SIZE);
CHECKED_MALLOC(m_tsResidual, int16_t, MAX_TS_SIZE * MAX_TS_SIZE);
CHECKED_MALLOC(m_tsRecon, pixel, MAX_TS_SIZE * MAX_TS_SIZE);
return ok;
fail:
return false;
}
/* Release everything initSearch() allocated. Only the first pointer of each shared
 * allocation (coeffRQT[0], m_qtTempCbf[0], m_qtTempTransformSkipFlag[0], m_intraPred)
 * is freed; the others point into the same memory. */
Search::~Search()
{
    /* buffers indexed by RQT layer */
    for (uint32_t layer = 0; layer <= m_numLayers; ++layer)
    {
        m_rqt[layer].resiQtYuv.destroy();
        m_rqt[layer].reconQtYuv.destroy();
        X265_FREE(m_rqt[layer].coeffRQT[0]);
    }

    /* buffers indexed by CU depth */
    for (uint32_t depth = 0; depth <= g_maxCUDepth; ++depth)
    {
        m_rqt[depth].bidirPredYuv[1].destroy();
        m_rqt[depth].bidirPredYuv[0].destroy();
        m_rqt[depth].tmpPredYuv.destroy();
        m_rqt[depth].tmpResiYuv.destroy();
    }

    X265_FREE(m_tsRecon);
    X265_FREE(m_tsResidual);
    X265_FREE(m_tsCoeff);
    X265_FREE(m_intraPred);
    X265_FREE(m_qtTempTransformSkipFlag[0]);
    X265_FREE(m_qtTempCbf[0]);
}
/* Propagate a QP to motion estimation, RD cost and the quantiser.
 * Motion estimation and RD cost receive qp as given; the quantiser receives qp clipped to
 * [QP_MIN, QP_MAX_SPEC]. Returns the clipped value handed to the quantiser. */
int Search::setLambdaFromQP(const CUData& ctu, int qp)
{
    X265_CHECK(qp >= QP_MIN && qp <= QP_MAX_MAX, "QP used for lambda is out of range\n");

    m_me.setQP(qp);
    m_rdCost.setQP(*m_slice, qp);

    const int clippedQP = x265_clip3(QP_MIN, QP_MAX_SPEC, qp);
    m_quant.setQPforQuant(ctu, clippedQP);

    return clippedQP;
}
#if CHECKED_BUILD || _DEBUG
/* Checked/debug builds only: mark the saved entropy contexts of every depth from fromDepth
 * up to NUM_FULL_DEPTH - 1 as invalid, to catch reads without previous writes. */
void Search::invalidateContexts(int fromDepth)
{
    for (int depth = fromDepth; depth < NUM_FULL_DEPTH; ++depth)
    {
        m_rqt[depth].rqtRoot.markInvalid();
        m_rqt[depth].rqtTemp.markInvalid();
        m_rqt[depth].rqtTest.markInvalid();
        m_rqt[depth].cur.markInvalid();
    }
}
#else
/* Other builds: intentionally a no-op. */
void Search::invalidateContexts(int) {}
#endif
/* Walk the transform tree of 'cu' starting at (tuDepth, absPartIdx) and entropy-code the
 * chroma CBF flags of every node visited. */
void Search::codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx)
{
    const bool bSplit = tuDepth < cu.m_tuDepth[absPartIdx];
    const uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;

    /* chroma CBFs are only signalled while the chroma TU is at least 4 samples wide */
    if (log2TrSize - m_hChromaShift >= 2)
    {
        /* below the root, a plane's flag is coded only if the parent level had its CBF set */
        const bool bCodeU = !tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1);
        if (bCodeU)
            m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !bSplit);

        const bool bCodeV = !tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1);
        if (bCodeV)
            m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !bSplit);
    }

    if (!bSplit)
        return;

    /* descend into the four quadrants */
    const uint32_t quadParts = 1 << ((log2TrSize - 1 - LOG2_UNIT_SIZE) * 2);
    for (uint32_t quad = 0; quad < 4; quad++)
        codeSubdivCbfQTChroma(cu, tuDepth + 1, absPartIdx + quad * quadParts);
}
/* Entropy-code the coefficients of one chroma plane for the transform sub-tree rooted at
 * (tuDepth, absPartIdx).
 *
 * cu         - CU whose CBF flags and transform depths drive the traversal
 * tuDepth    - transform depth of the current node
 * absPartIdx - partition index of the current node within the CU
 * ttype      - which plane to code; it also selects the m_rqt[].coeffRQT[] plane read
 *
 * Nothing is coded for a node whose CBF for this plane is zero. */
void Search::codeCoeffQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx, TextType ttype)
{
    if (!cu.getCbf(absPartIdx, ttype, tuDepth))
        return;

    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;

    if (tuDepth < cu.m_tuDepth[absPartIdx])
    {
        /* not a leaf yet: recurse into the four quadrants */
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
            codeCoeffQTChroma(cu, tuDepth + 1, absPartIdx, ttype);
        return;
    }

    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;

    if (log2TrSizeC < 2)
    {
        /* The chroma TU would be smaller than 4x4. It is coded once, as a 4x4 block, by the
         * first of each group of four partitions; the other three have nothing to code. */
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
        if (absPartIdx & 3)
            return;
        log2TrSizeC = 2;
    }

    uint32_t qtLayer = log2TrSize - 2;

    if (m_csp != X265_CSP_I422)
    {
        /* 4:2:0 chroma has a quarter of the luma coefficients per partition, hence the
         * smaller shift; other non-4:2:2 formats use the luma offset unchanged */
        uint32_t shift = (m_csp == X265_CSP_I420) ? 2 : 0;
        uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - shift);
        coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset;
        m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype);
    }
    else
    {
        /* 4:2:2: the chroma TU is coded as two square sub-TUs, each with its own CBF bit at
         * tuDepth + 1 and its own run of subTUSize coefficients */
        uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - 1);
        coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset;
        uint32_t subTUSize = 1 << (log2TrSizeC * 2);
        uint32_t tuNumParts = 2 << ((log2TrSizeC - LOG2_UNIT_SIZE) * 2);

        if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
            m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype);
        if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1))
            m_entropyCoder.codeCoeffNxN(cu, coeff + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, ttype);
    }
}
/* Recursively evaluate intra luma coding of one TU: measure the RD cost of coding the TU at
 * its full size, measure the cost of splitting it into four sub-TUs, and keep the cheaper.
 *
 * mode        - mode under test; mode.cu, mode.fencYuv and mode.predYuv are used
 * cuGeom      - geometry of the CU (depth, log2CUSize, numPartitions, absPartIdx are read)
 * tuDepth     - transform depth of this TU relative to the CU
 * absPartIdx  - partition index of this TU within the CU
 * bAllowSplit - when false, a split is only tried if the full-size TU is not permitted
 * outCost     - accumulator: the winner's rdcost, distortion, bits and energy are ADDED to it
 * depthRange  - log2 TU size limits: the TU may be coded whole when log2TrSize <= depthRange[1]
 *               and may be split when log2TrSize > depthRange[0]
 *
 * When the full-size TU wins, its reconstruction is copied into the frame's recon picture. */
void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& outCost, const uint32_t depthRange[2])
{
CUData& cu = mode.cu;
uint32_t fullDepth = cuGeom.depth + tuDepth;
uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
uint32_t qtLayer = log2TrSize - 2;
uint32_t sizeIdx = log2TrSize - 2;
bool mightNotSplit = log2TrSize <= depthRange[1];
bool mightSplit = (log2TrSize > depthRange[0]) && (bAllowSplit || !mightNotSplit);
/* If maximum RD penalty, force splits at TU size 32x32 if SPS allows TUs of 16x16 */
if (m_param->rdPenalty == 2 && m_slice->m_sliceType != I_SLICE && log2TrSize == 5 && depthRange[0] <= 4)
{
mightNotSplit = false;
mightSplit = true;
}
Cost fullCost;
uint32_t bCBF = 0;
pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_size;
if (mightNotSplit)
{
/* ---- candidate 1: code this TU at full size ---- */
// both candidates will be tried: remember the entropy state so the split attempt
// can start from the same point (it is reloaded from rqtRoot further down)
if (mightSplit)
m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);
const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx);
pixel* pred = mode.predYuv.getLumaAddr(absPartIdx);
int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
uint32_t stride = mode.fencYuv->m_size;
// init availability pattern
uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
IntraNeighbors intraNeighbors;
initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors);
initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode);
// get prediction signal
predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
// this candidate never uses transform skip
cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
coeff_t* coeffY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
// when RDOQ is enabled, refresh the coder's bit estimates (m_estBitsSbac) for this TU size
if (m_bEnableRDOQ)
m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride);
uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
if (numSig)
{
// significant coefficients exist: recon = pred + inverse-transformed residual
m_quant.invtransformNxN(cu, residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
primitives.cu[sizeIdx].add_ps(reconQt, reconQtStride, pred, residual, stride, stride);
}
else
// no coded residual, recon = pred
primitives.cu[sizeIdx].copy_pp(reconQt, reconQtStride, pred, stride);
// CBF is stored as one bit per transform depth; kept in bCBF so it can be restored later
bCBF = !!numSig << tuDepth;
cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth);
fullCost.distortion = primitives.cu[sizeIdx].sse_pp(reconQt, reconQtStride, fenc, stride);
// count the bits needed to signal this TU
m_entropyCoder.resetBits();
if (!absPartIdx)
{
// CU-level syntax is counted only once, together with the first partition
if (!cu.m_slice->isIntra())
{
if (cu.m_slice->m_pps->bTransquantBypassEnabled)
m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
m_entropyCoder.codeSkipFlag(cu, 0);
m_entropyCoder.codePredMode(cu.m_predMode[0]);
}
m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
}
if (cu.m_partSize[0] == SIZE_2Nx2N)
{
// a single intra direction for the CU, counted with the first partition
if (!absPartIdx)
m_entropyCoder.codeIntraDirLumaAng(cu, 0, false);
}
else
{
// other partition sizes: one intra direction per quarter of the CU
uint32_t qNumParts = cuGeom.numPartitions >> 2;
if (!tuDepth)
{
for (uint32_t qIdx = 0; qIdx < 4; ++qIdx)
m_entropyCoder.codeIntraDirLumaAng(cu, qIdx * qNumParts, false);
}
else if (!(absPartIdx & (qNumParts - 1)))
m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
}
// the subdiv flag is skipped when the TU is already at the size where splitting stops
if (log2TrSize != depthRange[0])
m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
m_entropyCoder.codeQtCbfLuma(!!numSig, tuDepth);
if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
m_entropyCoder.codeCoeffNxN(cu, coeffY, absPartIdx, log2TrSize, TEXT_LUMA);
fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();
// RD penalty: quadruple the bit count of 32x32 TUs outside I slices
if (m_param->rdPenalty && log2TrSize == 5 && m_slice->m_sliceType != I_SLICE)
fullCost.bits *= 4;
if (m_rdCost.m_psyRd)
{
fullCost.energy = m_rdCost.psyCost(sizeIdx, fenc, mode.fencYuv->m_size, reconQt, reconQtStride);
fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
}
else
fullCost.rdcost = m_rdCost.calcRdCost(fullCost.distortion, fullCost.bits);
}
else
// full-size coding is not permitted: give it the worst possible cost
fullCost.rdcost = MAX_INT64;
if (mightSplit)
{
/* ---- candidate 2: split into four sub-TUs ---- */
if (mightNotSplit)
{
m_entropyCoder.store(m_rqt[fullDepth].rqtTest); // save state after full TU encode
m_entropyCoder.load(m_rqt[fullDepth].rqtRoot); // prep state of split encode
}
/* code split block */
uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
// transform skip is judged on the sub-TU size (log2TrSize - 1), not on this TU's size
int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && (log2TrSize - 1) <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
if (m_param->bEnableTSkipFast)
checkTransformSkip &= cu.m_partSize[0] != SIZE_2Nx2N;
Cost splitCost;
uint32_t cbf = 0;
for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
{
// each call adds the sub-TU's cost to splitCost
if (checkTransformSkip)
codeIntraLumaTSkip(mode, cuGeom, tuDepth + 1, qPartIdx, splitCost);
else
codeIntraLumaQT(mode, cuGeom, tuDepth + 1, qPartIdx, bAllowSplit, splitCost, depthRange);
cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
}
// set this depth's CBF bit on every covered partition if any sub-TU has coefficients
for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
cu.m_cbf[0][absPartIdx + offs] |= (cbf << tuDepth);
if (mightNotSplit && log2TrSize != depthRange[0])
{
/* If we could have coded this TU depth, include cost of subdiv flag */
m_entropyCoder.resetBits();
m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
splitCost.bits += m_entropyCoder.getNumberOfWrittenBits();
// recompute the RD cost from the totals now that the flag bits are included
if (m_rdCost.m_psyRd)
splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
else
splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);
}
if (splitCost.rdcost < fullCost.rdcost)
{
// split wins: report its cost; CU flags and entropy state are left as the sub-TU
// calls set them
outCost.rdcost += splitCost.rdcost;
outCost.distortion += splitCost.distortion;
outCost.bits += splitCost.bits;
outCost.energy += splitCost.energy;
return;
}
else
{
// full-size TU wins: undo what the split attempt changed
// recover entropy state of full-size TU encode
m_entropyCoder.load(m_rqt[fullDepth].rqtTest);
// recover transform index and Cbf values
cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth);
cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
}
}
// set reconstruction for next intra prediction blocks if full TU prediction won
PicYuv* reconPic = m_frame->m_reconPic;
pixel* picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
intptr_t picStride = reconPic->m_stride;
primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);
outCost.rdcost += fullCost.rdcost;
outCost.distortion += fullCost.distortion;
outCost.bits += fullCost.bits;
outCost.energy += fullCost.energy;
}
/* Code one intra luma TU twice -- first with the regular transform, then with transform skip --
 * and keep whichever has the lower RD cost. No further TU splitting is attempted here.
 *
 * mode       - mode under test; mode.cu, mode.fencYuv and mode.predYuv are used
 * cuGeom     - geometry of the CU (depth, log2CUSize, numPartitions, absPartIdx are read)
 * tuDepth    - transform depth of this TU relative to the CU
 * absPartIdx - partition index of this TU within the CU
 * outCost    - accumulator: the winner's rdcost, distortion, bits and energy are ADDED to it
 *
 * The winning reconstruction is left in the RQT layer buffer and copied into the frame's
 * recon picture; the winning coefficients are left in m_rqt[qtLayer].coeffRQT[0]. */
void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost)
{
uint32_t fullDepth = cuGeom.depth + tuDepth;
uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
uint32_t tuSize = 1 << log2TrSize;
X265_CHECK(tuSize <= MAX_TS_SIZE, "transform skip is only possible at 4x4 TUs\n");
CUData& cu = mode.cu;
Yuv* predYuv = &mode.predYuv;
const Yuv* fencYuv = mode.fencYuv;
Cost fullCost;
// start from the worst cost so the first candidate evaluated always wins
fullCost.rdcost = MAX_INT64;
int bTSkip = 0;
uint32_t bCBF = 0;
const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
pixel* pred = predYuv->getLumaAddr(absPartIdx);
int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
uint32_t stride = fencYuv->m_size;
uint32_t sizeIdx = log2TrSize - 2;
// init availability pattern
uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
IntraNeighbors intraNeighbors;
initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors);
initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode);
// get prediction signal
predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
uint32_t qtLayer = log2TrSize - 2;
uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
coeff_t* coeffY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_size;
// store original entropy coding status
m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);
// when RDOQ is enabled, refresh the coder's bit estimates (m_estBitsSbac) for this TU size
if (m_bEnableRDOQ)
m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
// pass 0 = regular transform, pass 1 = transform skip; clearing checkTransformSkip
// inside the loop cancels the transform-skip candidate
int checkTransformSkip = 1;
for (int useTSkip = 0; useTSkip <= checkTransformSkip; useTSkip++)
{
uint64_t tmpCost;
uint32_t tmpEnergy = 0;
// the transform-skip pass works in the m_ts* scratch buffers so that the regular
// pass's coefficients and reconstruction in the RQT layer stay intact
coeff_t* coeff = (useTSkip ? m_tsCoeff : coeffY);
pixel* tmpRecon = (useTSkip ? m_tsRecon : reconQt);
uint32_t tmpReconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);
primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride);
uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, useTSkip);
if (numSig)
{
m_quant.invtransformNxN(cu, residual, stride, coeff, log2TrSize, TEXT_LUMA, true, useTSkip, numSig);
primitives.cu[sizeIdx].add_ps(tmpRecon, tmpReconStride, pred, residual, stride, stride);
}
else if (useTSkip)
{
/* do not allow tskip if CBF=0, pretend we did not try tskip */
checkTransformSkip = 0;
break;
}
else
// no residual coded, recon = pred
primitives.cu[sizeIdx].copy_pp(tmpRecon, tmpReconStride, pred, stride);
sse_ret_t tmpDist = primitives.cu[sizeIdx].sse_pp(tmpRecon, tmpReconStride, fenc, stride);
cu.setTransformSkipSubParts(useTSkip, TEXT_LUMA, absPartIdx, fullDepth);
cu.setCbfSubParts((!!numSig) << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
// the transform-skip pass must start from the same entropy state as the regular pass
if (useTSkip)
m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
m_entropyCoder.resetBits();
if (!absPartIdx)
{
// CU-level syntax is counted only once, together with the first partition
if (!cu.m_slice->isIntra())
{
if (cu.m_slice->m_pps->bTransquantBypassEnabled)
m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
m_entropyCoder.codeSkipFlag(cu, 0);
m_entropyCoder.codePredMode(cu.m_predMode[0]);
}
m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
}
if (cu.m_partSize[0] == SIZE_2Nx2N)
{
// a single intra direction for the CU, counted with the first partition
if (!absPartIdx)
m_entropyCoder.codeIntraDirLumaAng(cu, 0, false);
}
else
{
// other partition sizes: one intra direction per quarter of the CU
uint32_t qNumParts = cuGeom.numPartitions >> 2;
if (!tuDepth)
{
for (uint32_t qIdx = 0; qIdx < 4; ++qIdx)
m_entropyCoder.codeIntraDirLumaAng(cu, qIdx * qNumParts, false);
}
else if (!(absPartIdx & (qNumParts - 1)))
m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
}
// the subdiv flag is coded unconditionally in this function
m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
m_entropyCoder.codeQtCbfLuma(!!numSig, tuDepth);
if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSize, TEXT_LUMA);
uint32_t tmpBits = m_entropyCoder.getNumberOfWrittenBits();
// keep the entropy state reached by the regular pass in case it ends up winning
if (!useTSkip)
m_entropyCoder.store(m_rqt[fullDepth].rqtTemp);
if (m_rdCost.m_psyRd)
{
tmpEnergy = m_rdCost.psyCost(sizeIdx, fenc, fencYuv->m_size, tmpRecon, tmpReconStride);
tmpCost = m_rdCost.calcPsyRdCost(tmpDist, tmpBits, tmpEnergy);
}
else
tmpCost = m_rdCost.calcRdCost(tmpDist, tmpBits);
// strict '<': on a tie the earlier (regular transform) candidate is kept
if (tmpCost < fullCost.rdcost)
{
bTSkip = useTSkip;
bCBF = !!numSig;
fullCost.rdcost = tmpCost;
fullCost.distortion = tmpDist;
fullCost.bits = tmpBits;
fullCost.energy = tmpEnergy;
}
}
if (bTSkip)
{
// transform skip won: move its coefficients and reconstruction into the RQT layer
memcpy(coeffY, m_tsCoeff, sizeof(coeff_t) << (log2TrSize * 2));
// NOTE(review): m_tsRecon was written above with stride MAX_TS_SIZE (tmpReconStride) but
// is read here with stride tuSize. The two agree only when tuSize == MAX_TS_SIZE; the
// check at the top of the function only guarantees tuSize <= MAX_TS_SIZE.
primitives.cu[sizeIdx].copy_pp(reconQt, reconQtStride, m_tsRecon, tuSize);
}
else if (checkTransformSkip)
{
// transform skip was evaluated but lost: restore the flags and the entropy state
// belonging to the regular pass
cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
cu.setCbfSubParts(bCBF << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
m_entropyCoder.load(m_rqt[fullDepth].rqtTemp);
}
// (if the transform-skip pass was abandoned, everything is still as the regular pass left it)
// set reconstruction for next intra prediction blocks
PicYuv* reconPic = m_frame->m_reconPic;
pixel* picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
intptr_t picStride = reconPic->m_stride;
primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);
outCost.rdcost += fullCost.rdcost;
outCost.distortion += fullCost.distortion;
outCost.bits += fullCost.bits;
outCost.energy += fullCost.energy;
}
/* Fast luma intra residual generation: no RD comparison is made, and a TU is split only when
 * it may not be coded at its current size. Coefficients are written to cu.m_trCoeff[0] and the
 * reconstruction goes straight into the frame's recon picture. */
void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2])
{
    CUData& cu = mode.cu;
    const uint32_t fullDepth = cuGeom.depth + tuDepth;
    const uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;

    X265_CHECK(m_slice->m_sliceType != I_SLICE, "residualTransformQuantIntra not intended for I slices\n");

    bool bCodeWhole = log2TrSize <= depthRange[1];

    /* we still respect rdPenalty == 2, we can forbid 32x32 intra TU. rdPenalty = 1 is impossible
     * since we are not measuring RD cost */
    if (m_param->rdPenalty == 2 && log2TrSize == 5 && depthRange[0] <= 4)
        bCodeWhole = false;

    if (!bCodeWhole)
    {
        X265_CHECK(log2TrSize > depthRange[0], "intra luma split state failure\n");

        /* process the four quadrants and collect their CBFs */
        const uint32_t quadParts = 1 << ((log2TrSize - 1 - LOG2_UNIT_SIZE) * 2);
        uint32_t childCbf = 0;
        for (uint32_t quad = 0; quad < 4; quad++)
        {
            const uint32_t quadIdx = absPartIdx + quad * quadParts;
            residualTransformQuantIntra(mode, cuGeom, quadIdx, tuDepth + 1, depthRange);
            childCbf |= cu.getCbf(quadIdx, TEXT_LUMA, tuDepth + 1);
        }

        /* set this depth's CBF bit on every covered partition if any quadrant was coded */
        const uint32_t endIdx = absPartIdx + 4 * quadParts;
        for (uint32_t idx = absPartIdx; idx < endIdx; idx++)
            cu.m_cbf[0][idx] |= (childCbf << tuDepth);
        return;
    }

    /* code the TU at its current size */
    const pixel* srcPixels = mode.fencYuv->getLumaAddr(absPartIdx);
    pixel* predPixels = mode.predYuv.getLumaAddr(absPartIdx);
    int16_t* resi = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
    const uint32_t stride = mode.fencYuv->m_size;

    /* neighbour availability and reference samples, then the prediction itself */
    const uint32_t intraDir = cu.m_lumaIntraDir[absPartIdx];
    IntraNeighbors neighbors;
    initIntraNeighbors(cu, absPartIdx, tuDepth, true, &neighbors);
    initAdiPattern(cu, cuGeom, absPartIdx, neighbors, intraDir);
    predIntraLumaAng(intraDir, predPixels, stride, log2TrSize);

    X265_CHECK(!cu.m_transformSkip[TEXT_LUMA][absPartIdx], "unexpected tskip flag in residualTransformQuantIntra\n");
    cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);

    coeff_t* coeffY = cu.m_trCoeff[0] + (absPartIdx << (LOG2_UNIT_SIZE * 2));
    const uint32_t sizeIdx = log2TrSize - 2;

    primitives.cu[sizeIdx].calcresidual(srcPixels, predPixels, resi, stride);

    PicYuv* reconPic = m_frame->m_reconPic;
    pixel* reconDst = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
    intptr_t reconStride = reconPic->m_stride;

    const uint32_t numSig = m_quant.transformNxN(cu, srcPixels, stride, resi, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
    if (!numSig)
    {
        /* nothing coded: reconstruction equals prediction */
        primitives.cu[sizeIdx].copy_pp(reconDst, reconStride, predPixels, stride);
        cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
    }
    else
    {
        /* reconstruction = prediction + inverse-transformed residual */
        m_quant.invtransformNxN(cu, resi, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
        primitives.cu[sizeIdx].add_ps(reconDst, reconStride, predPixels, resi, stride, stride);
        cu.setCbfSubParts(1 << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
    }
}
/* Walk the transform tree of 'cu' and, for every TU at its final depth, copy the luma
 * coefficients from the matching RQT layer into cu.m_trCoeff[0] and the luma reconstruction
 * from that layer into reconYuv. */
void Search::extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t tuDepth, uint32_t absPartIdx)
{
    const uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;

    if (tuDepth != cu.m_tuDepth[absPartIdx])
    {
        /* not at the coded depth yet: visit the four quadrants */
        const uint32_t quadParts = 1 << ((log2TrSize - 1 - LOG2_UNIT_SIZE) * 2);
        for (uint32_t quad = 0; quad < 4; quad++)
            extractIntraResultQT(cu, reconYuv, tuDepth + 1, absPartIdx + quad * quadParts);
        return;
    }

    const uint32_t layer = log2TrSize - 2;
    const uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2);

    /* transform coefficients */
    const coeff_t* src = m_rqt[layer].coeffRQT[0] + coeffOffset;
    coeff_t* dst = cu.m_trCoeff[0] + coeffOffset;
    memcpy(dst, src, sizeof(coeff_t) << (log2TrSize * 2));

    /* reconstruction */
    m_rqt[layer].reconQtYuv.copyPartToPartLuma(reconYuv, absPartIdx, log2TrSize);
}
/* Shift each of the two sub-TU CBF values up by one bit and place the OR of the two original
 * values in the bit below. Both entries are updated in place. */
inline void offsetCBFs(uint8_t subTUCBF[2])
{
    const uint8_t eitherCoded = subTUCBF[0] | subTUCBF[1];

    for (int half = 0; half < 2; half++)
        subTUCBF[half] = static_cast<uint8_t>((subTUCBF[half] << 1) | eitherCoded);
}
/* 4:2:2 post-TU split processing: read the CBFs of the two sub-TUs at tuDepth, move them
 * down one level with the combined CBF as parent bit (see offsetCBFs), and write the result
 * back over the partitions of each sub-TU. */
void Search::offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t tuDepth, uint32_t absPartIdx)
{
    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;

    if (log2TrSize == 2)
    {
        X265_CHECK(m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
        log2TrSize = 3;
    }

    /* number of partitions covered by each of the two sub-TUs */
    const uint32_t halfParts = 1 << ((log2TrSize - LOG2_UNIT_SIZE) * 2 - 1);
    const uint32_t secondIdx = absPartIdx + halfParts;

    // move the CBFs down a level and set the parent CBF
    uint8_t cbfPair[2];
    cbfPair[0] = cu.getCbf(absPartIdx, ttype, tuDepth);
    cbfPair[1] = cu.getCbf(secondIdx, ttype, tuDepth);
    offsetCBFs(cbfPair);

    cu.setCbfPartRange(cbfPair[0] << tuDepth, ttype, absPartIdx, halfParts);
    cu.setCbfPartRange(cbfPair[1] << tuDepth, ttype, secondIdx, halfParts);
}
/* Predict, transform, quantise and reconstruct the intra chroma (U and V) of the transform
 * sub-tree rooted at (tuDepth, absPartIdx), following the TU depths already stored in the CU.
 *
 * mode       - mode under test; mode.cu, mode.fencYuv and mode.predYuv are used
 * cuGeom     - geometry of the CU (depth, log2CUSize, numPartitions, absPartIdx are read)
 * tuDepth    - transform depth of the current node
 * absPartIdx - partition index of the current node within the CU
 * psyEnergy  - accumulator: psy energy of each coded chroma block is ADDED when psy-RD is on
 *
 * Side effects: sets the chroma CBF and transform-skip flags of the CU, fills the RQT layer's
 * chroma coefficients and reconstruction, and copies the reconstruction into the frame's recon
 * picture.
 *
 * returns distortion (chroma-scaled SSE summed over both planes) */
uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, uint32_t& psyEnergy)
{
CUData& cu = mode.cu;
uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
if (tuDepth < cu.m_tuDepth[absPartIdx])
{
// not a leaf: recurse into the four quadrants, summing distortion and collecting CBFs
uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
uint32_t outDist = 0, splitCbfU = 0, splitCbfV = 0;
for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
{
outDist += codeIntraChromaQt(mode, cuGeom, tuDepth + 1, qPartIdx, psyEnergy);
splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
}
// set this depth's CBF bit on every covered partition if any quadrant was coded
for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
{
cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << tuDepth);
cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << tuDepth);
}
return outDist;
}
uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
uint32_t tuDepthC = tuDepth;
if (log2TrSizeC < 2)
{
// The chroma TU would be smaller than 4x4. It is handled once, as a 4x4 block at the
// parent depth, by the first of each group of four partitions; the other three
// contribute no distortion.
X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
if (absPartIdx & 3)
return 0;
log2TrSizeC = 2;
tuDepthC--;
}
// when RDOQ is enabled, refresh the coder's bit estimates (m_estBitsSbac) for the chroma TU size
if (m_bEnableRDOQ)
m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
// transform skip needs PPS support, a small enough chroma TU and no transquant bypass; in
// fast mode it is additionally limited to TUs whose luma already chose transform skip
bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && log2TrSizeC <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
checkTransformSkip &= !m_param->bEnableTSkipFast || (log2TrSize <= MAX_LOG2_TS_SIZE && cu.m_transformSkip[TEXT_LUMA][absPartIdx]);
if (checkTransformSkip)
return codeIntraChromaTSkip(mode, cuGeom, tuDepth, tuDepthC, absPartIdx, psyEnergy);
ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
uint32_t qtLayer = log2TrSize - 2;
uint32_t stride = mode.fencYuv->m_csize;
const uint32_t sizeIdxC = log2TrSizeC - 2;
sse_ret_t outDist = 0;
uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
// for 4:2:2 the iterator is created with VERTICAL_SPLIT, otherwise with DONT_SPLIT; the
// do/while below runs once per section it reports
const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
TURecurse tuIterator(splitType, curPartNum, absPartIdx);
do
{
uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
IntraNeighbors intraNeighbors;
initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors);
// process U, then V
for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
TextType ttype = (TextType)chromaId;
const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
int16_t* residual = resiYuv.getChromaAddr(chromaId, absPartIdxC);
uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
coeff_t* coeffC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
PicYuv* reconPic = m_frame->m_reconPic;
pixel* picReconC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
intptr_t picStride = reconPic->m_strideC;
uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
// DM_CHROMA_IDX means "take the luma direction": the co-located one for 4:4:4,
// otherwise the direction of partition 0
if (chromaPredMode == DM_CHROMA_IDX)
chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
// 4:2:2 remaps the direction through a lookup table
if (m_csp == X265_CSP_I422)
chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
// init availability pattern
initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);
// get prediction signal
predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC);
// this path never uses transform skip
cu.setTransformSkipPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride);
uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
if (numSig)
{
// significant coefficients exist: recon = pred + inverse-transformed residual
m_quant.invtransformNxN(cu, residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
primitives.cu[sizeIdxC].add_ps(reconQt, reconQtStride, pred, residual, stride, stride);
cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
}
else
{
// no coded residual, recon = pred
primitives.cu[sizeIdxC].copy_pp(reconQt, reconQtStride, pred, stride);
cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
}
outDist += m_rdCost.scaleChromaDist(chromaId, primitives.cu[sizeIdxC].sse_pp(reconQt, reconQtStride, fenc, stride));
if (m_rdCost.m_psyRd)
psyEnergy += m_rdCost.psyCost(sizeIdxC, fenc, stride, reconQt, reconQtStride);
// copy the reconstruction into the frame's recon picture
primitives.cu[sizeIdxC].copy_pp(picReconC, picStride, reconQt, reconQtStride);
}
}
while (tuIterator.isNextSection());
// 4:2:2 only: the two sub-TU CBFs were stored at tuDepth; move them down one level and
// set the combined parent bit
if (splitType == VERTICAL_SPLIT)
{
offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
}
// NOTE(review): outDist is accumulated as sse_ret_t and converted to the uint32_t return type here
return outDist;
}
/* returns distortion */
uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, uint32_t& psyEnergy)
{
CUData& cu = mode.cu;
uint32_t fullDepth = cuGeom.depth + tuDepth;
uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
const uint32_t log2TrSizeC = 2;
uint32_t qtLayer = log2TrSize - 2;
uint32_t outDist = 0;
/* At the TU layers above this one, no RDO is performed, only distortion is being measured,
* so the entropy coder is not very accurate. The best we can do is return it in the same
* condition as it arrived, and to do all bit estimates from the same state. */
m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);
uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
TURecurse tuIterator(splitType, curPartNum, absPartIdx);
do
{
uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
IntraNeighbors intraNeighbors;
initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors);
for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
TextType ttype = (TextType)chromaId;
const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC);
uint32_t stride = mode.fencYuv->m_csize;
const uint32_t sizeIdxC = log2TrSizeC - 2;
uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
coeff_t* coeffC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
// init availability pattern
initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);
uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
if (chromaPredMode == DM_CHROMA_IDX)
chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
if (m_csp == X265_CSP_I422)
chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
// get prediction signal
predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC);
uint64_t bCost = MAX_INT64;
uint32_t bDist = 0;
uint32_t bCbf = 0;
uint32_t bEnergy = 0;
int bTSkip = 0;
int checkTransformSkip = 1;
for (int useTSkip = 0; useTSkip <= checkTransformSkip; useTSkip++)
{
coeff_t* coeff = (useTSkip ? m_tsCoeff : coeffC);
pixel* recon = (useTSkip ? m_tsRecon : reconQt);
uint32_t reconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);
primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride);
uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, useTSkip);
if (numSig)
{
m_quant.invtransformNxN(cu, residual, stride, coeff, log2TrSizeC, ttype, true, useTSkip, numSig);
primitives.cu[sizeIdxC].add_ps(recon, reconStride, pred, residual, stride, stride);
cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
}
else if (useTSkip)
{
checkTransformSkip = 0;
break;
}
else
{
primitives.cu[sizeIdxC].copy_pp(recon, reconStride, pred, stride);
cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
}
sse_ret_t tmpDist = primitives.cu[sizeIdxC].sse_pp(recon, reconStride, fenc, stride);
tmpDist = m_rdCost.scaleChromaDist(chromaId, tmpDist);
cu.setTransformSkipPartRange(useTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
uint32_t tmpBits = 0, tmpEnergy = 0;
if (numSig)
{
m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
m_entropyCoder.resetBits();
m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdxC, log2TrSizeC, (TextType)chromaId);
tmpBits = m_entropyCoder.getNumberOfWrittenBits();
}
uint64_t tmpCost;
if (m_rdCost.m_psyRd)
{
tmpEnergy = m_rdCost.psyCost(sizeIdxC, fenc, stride, reconQt, reconQtStride);
tmpCost = m_rdCost.calcPsyRdCost(tmpDist, tmpBits, tmpEnergy);
}
else
tmpCost = m_rdCost.calcRdCost(tmpDist, tmpBits);
if (tmpCost < bCost)
{
bCost = tmpCost;
bDist = tmpDist;
bTSkip = useTSkip;
bCbf = !!numSig;
bEnergy = tmpEnergy;
}
}
if (bTSkip)
{
memcpy(coeffC, m_tsCoeff, sizeof(coeff_t) << (log2TrSizeC * 2));
primitives.cu[sizeIdxC].copy_pp(reconQt, reconQtStride, m_tsRecon, MAX_TS_SIZE);
}
cu.setCbfPartRange(bCbf << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
cu.setTransformSkipPartRange(bTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
PicYuv* reconPic = m_frame->m_reconPic;
pixel* reconPicC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
intptr_t picStride = reconPic->m_strideC;
primitives.cu[sizeIdxC].copy_pp(reconPicC, picStride, reconQt, reconQtStride);
outDist += bDist;
psyEnergy += bEnergy;
}
}
while (tuIterator.isNextSection());
if (splitType == VERTICAL_SPLIT)
{
offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
}
m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
return outDist;
}
/* Collect the intra chroma result of one TU subtree into the CU.
 *
 * At a leaf (the luma TU depth stored in the CU equals tuDepth, or the chroma
 * transform is already 4x4, i.e. log2TrSizeC == 2) the U and V coefficients
 * are copied from the RQT layer buffers into cu.m_trCoeff and the chroma
 * reconstruction is copied into reconYuv. Otherwise the four quadrants one
 * depth further down are visited recursively.
 *
 * cu         - CU whose coefficient buffers receive the result
 * reconYuv   - destination for the chroma reconstruction
 * absPartIdx - partition index of the TU inside the CU
 * tuDepth    - depth of the TU relative to the CU */
void Search::extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t tuDepth)
{
    const uint32_t lumaTuDepth = cu.m_tuDepth[absPartIdx];
    const uint32_t log2TrSize  = cu.m_log2CUSize[0] - tuDepth;
    const uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;

    if (lumaTuDepth != tuDepth && log2TrSizeC != 2)
    {
        /* not a chroma leaf yet: descend into the four quadrants */
        const uint32_t quadParts = 1 << ((log2TrSize - 1 - LOG2_UNIT_SIZE) * 2);
        for (uint32_t quad = 0; quad < 4; quad++)
            extractIntraResultChromaQT(cu, reconYuv, absPartIdx + quad * quadParts, tuDepth + 1);
        return;
    }

    /* the RQT layer that holds this TU's data */
    const uint32_t layer = log2TrSize - 2 - (lumaTuDepth - tuDepth);

    /* transform coefficients; the count is doubled for 4:2:2 */
    const uint32_t coeffCount  = 1 << (log2TrSizeC * 2 + (m_csp == X265_CSP_I422));
    const uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
    for (uint32_t plane = 1; plane <= 2; plane++)
    {
        const coeff_t* from = m_rqt[layer].coeffRQT[plane] + coeffOffset;
        coeff_t* to = cu.m_trCoeff[plane] + coeffOffset;
        memcpy(to, from, sizeof(coeff_t) * coeffCount);
    }

    /* reconstruction */
    m_rqt[layer].reconQtYuv.copyPartToPartChroma(reconYuv, absPartIdx, log2TrSizeC + m_hChromaShift);
}
/* Intra chroma prediction + residual coding for one TU subtree without any
 * rate-distortion decisions (transform skip is never used here).
 *
 * For TUs that are split further, the four sub-TUs are coded recursively and
 * their chroma CBFs are merged into the bit for this depth. At a leaf, each
 * chroma plane is predicted, transformed/quantized into cu.m_trCoeff, and the
 * reconstruction is written straight into the frame's recon picture.
 *
 * mode       - mode whose CU, source (fencYuv) and prediction (predYuv) are used
 * cuGeom     - geometry of the CU
 * absPartIdx - partition index of the TU inside the CU
 * tuDepth    - depth of the TU relative to the CU */
void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth)
{
    CUData& cu = mode.cu;
    const uint32_t log2TrSize = cu.m_log2CUSize[absPartIdx] - tuDepth;

    if (tuDepth < cu.m_tuDepth[absPartIdx])
    {
        const uint32_t quadParts = 1 << ((log2TrSize - 1 - LOG2_UNIT_SIZE) * 2);
        uint32_t anyCbfU = 0;
        uint32_t anyCbfV = 0;

        for (uint32_t quad = 0; quad < 4; quad++)
        {
            const uint32_t subPartIdx = absPartIdx + quad * quadParts;
            residualQTIntraChroma(mode, cuGeom, subPartIdx, tuDepth + 1);
            anyCbfU |= cu.getCbf(subPartIdx, TEXT_CHROMA_U, tuDepth + 1);
            anyCbfV |= cu.getCbf(subPartIdx, TEXT_CHROMA_V, tuDepth + 1);
        }

        /* propagate "some child has coefficients" into this depth's CBF bit */
        const uint32_t cbfBitU = anyCbfU << tuDepth;
        const uint32_t cbfBitV = anyCbfV << tuDepth;
        for (uint32_t offs = 0; offs < 4 * quadParts; offs++)
        {
            cu.m_cbf[1][absPartIdx + offs] |= cbfBitU;
            cu.m_cbf[2][absPartIdx + offs] |= cbfBitV;
        }
        return;
    }

    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
    uint32_t tuDepthC = tuDepth;
    if (log2TrSizeC < 2)
    {
        /* chroma cannot go below 4x4: only the first of four luma 4x4 TUs
         * codes chroma, as a single 4x4 block one depth up */
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
        if (absPartIdx & 3)
            return;
        log2TrSizeC = 2;
        tuDepthC--;
    }

    ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
    const uint32_t stride = mode.fencYuv->m_csize;
    const uint32_t sizeIdxC = log2TrSizeC - 2;
    const uint32_t coeffShift = LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift);

    PicYuv* reconPic = m_frame->m_reconPic;
    const intptr_t picStride = reconPic->m_strideC;

    /* 4:2:2 chroma TUs are processed as two vertically stacked sections */
    const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
    const uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
    TURecurse tuIterator(splitType, curPartNum, absPartIdx);

    do
    {
        const uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;

        IntraNeighbors intraNeighbors;
        initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors);

        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
        {
            const TextType ttype = (TextType)chromaId;

            const pixel* fenc   = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
            pixel* pred         = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
            int16_t* residual   = resiYuv.getChromaAddr(chromaId, absPartIdxC);
            coeff_t* coeffC     = cu.m_trCoeff[ttype] + (absPartIdxC << coeffShift);
            pixel* picReconC    = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);

            /* resolve the actual prediction direction */
            uint32_t predDir = cu.m_chromaIntraDir[absPartIdxC];
            if (predDir == DM_CHROMA_IDX)
                predDir = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
            if (m_csp == X265_CSP_I422)
                predDir = g_chroma422IntraAngleMappingTable[predDir];

            /* set up the neighbour samples, then predict */
            initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);
            predIntraChromaAng(predDir, pred, stride, log2TrSizeC);

            X265_CHECK(!cu.m_transformSkip[ttype][0], "transform skip not supported at low RD levels\n");

            primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride);
            const uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);

            if (!numSig)
            {
                /* nothing was coded, so the reconstruction is the prediction */
                primitives.cu[sizeIdxC].copy_pp(picReconC, picStride, pred, stride);
                cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
            }
            else
            {
                m_quant.invtransformNxN(cu, residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
                primitives.cu[sizeIdxC].add_ps(picReconC, picStride, pred, residual, stride, stride);
                cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
            }
        }
    }
    while (tuIterator.isNextSection());

    if (splitType == VERTICAL_SPLIT)
    {
        offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
        offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
    }
}
/* Evaluate an intra mode with the given partition size for the CU in intraMode.
 *
 * Luma (and, unless the colour space is 4:0:0, chroma) prediction modes are
 * estimated, the CU syntax is bit-counted with the entropy coder, and the
 * resulting distortion/bits/energy fields of intraMode are filled in before
 * the mode cost is updated.
 *
 * sharedModes / sharedChromaModes are forwarded unchanged to the luma and
 * chroma estimators. */
void Search::checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize, uint8_t* sharedModes, uint8_t* sharedChromaModes)
{
    CUData& cu = intraMode.cu;

    cu.setPartSizeSubParts(partSize);
    cu.setPredModeSubParts(MODE_INTRA);

    uint32_t tuDepthRange[2];
    cu.getIntraTUQtDepthRange(tuDepthRange, 0);

    /* distortion */
    intraMode.initCosts();
    intraMode.lumaDistortion += estIntraPredQT(intraMode, cuGeom, tuDepthRange, sharedModes);
    if (m_csp != X265_CSP_I400)
    {
        intraMode.chromaDistortion += estIntraPredChromaQT(intraMode, cuGeom, sharedChromaModes);
    }
    intraMode.distortion += intraMode.lumaDistortion + intraMode.chromaDistortion;

    /* bits: CU header first ... */
    m_entropyCoder.resetBits();
    if (m_slice->m_pps->bTransquantBypassEnabled)
    {
        m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
    }
    if (!m_slice->isIntra())
    {
        /* skip flag and prediction mode are only coded outside intra slices */
        m_entropyCoder.codeSkipFlag(cu, 0);
        m_entropyCoder.codePredMode(cu.m_predMode[0]);
    }
    m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
    m_entropyCoder.codePredInfo(cu, 0);
    intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits();

    /* ... then the coefficients */
    bool bCodeDQP = m_slice->m_pps->bUseDQP;
    m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
    m_entropyCoder.store(intraMode.contexts);
    intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
    intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits;

    /* luma energy terms */
    const Yuv* source = intraMode.fencYuv;
    if (m_rdCost.m_psyRd)
    {
        intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, source->m_buf[0], source->m_size, intraMode.reconYuv.m_buf[0], intraMode.reconYuv.m_size);
    }
    intraMode.resEnergy = primitives.cu[cuGeom.log2CUSize - 2].sse_pp(source->m_buf[0], source->m_size, intraMode.predYuv.m_buf[0], intraMode.predYuv.m_size);

    updateModeCost(intraMode);
    checkDQP(intraMode, cuGeom);
}
/* Choose the best luma intra direction for a 2Nx2N CU from sa8d-based cost
 * estimates only; no transform or quantization is performed here.
 *
 * Note that this function does not save the best intra prediction, it must
 * be generated later. It records the best mode in the cu, and stores the
 * estimated bits (totalBits, sa8dBits), distortion and sa8dCost in intraMode.
 *
 * intraMode - mode whose CU is set to SIZE_2Nx2N / MODE_INTRA and evaluated
 * cuGeom    - geometry (size, depth) of the CU */
void Search::checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
{
ProfileCUScope(intraMode.cu, intraAnalysisElapsedTime, countIntraAnalysis);
CUData& cu = intraMode.cu;
uint32_t depth = cuGeom.depth;
cu.setPartSizeSubParts(SIZE_2Nx2N);
cu.setPredModeSubParts(MODE_INTRA);
// a single TU covering the whole CU is evaluated
const uint32_t initTuDepth = 0;
uint32_t log2TrSize = cuGeom.log2CUSize - initTuDepth;
uint32_t tuSize = 1 << log2TrSize;
const uint32_t absPartIdx = 0;
// Reference sample smoothing
IntraNeighbors intraNeighbors;
initIntraNeighbors(cu, absPartIdx, initTuDepth, true, &intraNeighbors);
initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, ALL_IDX);
const pixel* fenc = intraMode.fencYuv->m_buf[0];
uint32_t stride = intraMode.fencYuv->m_size;
// b* = best so far; sad/bits/cost/mode = candidate currently being measured
int sad, bsad;
uint32_t bits, bbits, mode, bmode;
uint64_t cost, bcost;
// 33 Angle modes once
int scaleTuSize = tuSize;
int scaleStride = stride;
int costShift = 0;
int sizeIdx = log2TrSize - 2;
if (tuSize > 32)
{
// CU is 64x64, we scale to 32x32 and adjust required parameters
primitives.scale2D_64to32(m_fencScaled, fenc, stride);
fenc = m_fencScaled;
pixel nScale[129];
intraNeighbourBuf[1][0] = intraNeighbourBuf[0][0];
primitives.scale1D_128to64(nScale + 1, intraNeighbourBuf[0] + 1);
// we do not estimate filtering for downscaled samples
memcpy(&intraNeighbourBuf[0][1], &nScale[1], 2 * 64 * sizeof(pixel)); // Top & Left pixels
memcpy(&intraNeighbourBuf[1][1], &nScale[1], 2 * 64 * sizeof(pixel));
scaleTuSize = 32;
scaleStride = 32;
// every cost measured on the downscaled block is shifted left by costShift
costShift = 2;
sizeIdx = 5 - 2; // log2(scaleTuSize) - 2
}
pixelcmp_t sa8d = primitives.cu[sizeIdx].sa8d;
// number of pixels in one prediction; stride between modes in m_intraPredAngs
int predsize = scaleTuSize * scaleTuSize;
m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);
/* there are three cost tiers for intra modes:
* pred[0] - mode probable, least cost
* pred[1], pred[2] - less probable, slightly more cost
* non-mpm modes - all cost the same (rbits) */
// mpms is used below as a bitmask with bit N set when mode N is a most probable mode
uint64_t mpms;
uint32_t mpmModes[3];
uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, mpmModes, mpms);
// DC
primitives.cu[sizeIdx].intra_pred[DC_IDX](m_intraPredAngs, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16));
bsad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleStride) << costShift;
bmode = mode = DC_IDX;
bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
bcost = m_rdCost.calcRdSADCost(bsad, bbits);
// PLANAR
pixel* planar = intraNeighbourBuf[0];
// TU sizes 8, 16 and 32 take the second neighbour buffer for planar
if (tuSize & (8 | 16 | 32))
planar = intraNeighbourBuf[1];
primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](m_intraPredAngs, scaleStride, planar, 0, 0);
sad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleStride) << costShift;
mode = PLANAR_IDX;
bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
cost = m_rdCost.calcRdSADCost(sad, bits);
COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
// When an all-angles primitive exists for this size, every angular prediction
// is generated once into m_intraPredAngs and TRY_ANGLE only measures it.
bool allangs = true;
if (primitives.cu[sizeIdx].intra_pred_allangs)
{
primitives.cu[sizeIdx].transpose(m_fencTransposed, fenc, scaleStride);
primitives.cu[sizeIdx].intra_pred_allangs(m_intraPredAngs, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16));
}
else
allangs = false;
/* TRY_ANGLE(angle): measure one angular mode and leave the result in sad, bits
 * and cost. With allangs, modes below 18 are compared against the transposed
 * source block; otherwise the prediction is generated on demand, picking the
 * neighbour buffer from g_intraFilterFlags. */
#define TRY_ANGLE(angle) \
if (allangs) { \
if (angle < 18) \
sad = sa8d(m_fencTransposed, scaleTuSize, &m_intraPredAngs[(angle - 2) * predsize], scaleTuSize) << costShift; \
else \
sad = sa8d(fenc, scaleStride, &m_intraPredAngs[(angle - 2) * predsize], scaleTuSize) << costShift; \
bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, angle) : rbits; \
cost = m_rdCost.calcRdSADCost(sad, bits); \
} else { \
int filter = !!(g_intraFilterFlags[angle] & scaleTuSize); \
primitives.cu[sizeIdx].intra_pred[angle](m_intraPredAngs, scaleTuSize, intraNeighbourBuf[filter], angle, scaleTuSize <= 16); \
sad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleTuSize) << costShift; \
bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, angle) : rbits; \
cost = m_rdCost.calcRdSADCost(sad, bits); \
}
if (m_param->bEnableFastIntra)
{
// a* = best angular mode found by the coarse-to-fine search
int asad = 0;
uint32_t lowmode, highmode, amode = 5, abits = 0;
uint64_t acost = MAX_INT64;
/* pick the best angle, sampling at distance of 5 */
for (mode = 5; mode < 35; mode += 5)
{
TRY_ANGLE(mode);
COPY4_IF_LT(acost, cost, amode, mode, asad, sad, abits, bits);
}
/* refine best angle at distance 2, then distance 1 */
for (uint32_t dist = 2; dist >= 1; dist--)
{
lowmode = amode - dist;
highmode = amode + dist;
X265_CHECK(lowmode >= 2 && lowmode <= 34, "low intra mode out of range\n");
TRY_ANGLE(lowmode);
COPY4_IF_LT(acost, cost, amode, lowmode, asad, sad, abits, bits);
X265_CHECK(highmode >= 2 && highmode <= 34, "high intra mode out of range\n");
TRY_ANGLE(highmode);
COPY4_IF_LT(acost, cost, amode, highmode, asad, sad, abits, bits);
}
// mode 34 is not reachable by the refinement steps above unless the best is 33
if (amode == 33)
{
TRY_ANGLE(34);
COPY4_IF_LT(acost, cost, amode, 34, asad, sad, abits, bits);
}
// best angular mode competes with the better of DC / planar
COPY4_IF_LT(bcost, acost, bmode, amode, bsad, asad, bbits, abits);
}
else // calculate and search all intra prediction angles for lowest cost
{
for (mode = 2; mode < 35; mode++)
{
TRY_ANGLE(mode);
COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
}
}
// record the winner in the CU and publish the estimates through intraMode
cu.setLumaIntraDirSubParts((uint8_t)bmode, absPartIdx, depth + initTuDepth);
intraMode.initCosts();
intraMode.totalBits = bbits;
intraMode.distortion = bsad;
intraMode.sa8dCost = bcost;
intraMode.sa8dBits = bbits;
X265_CHECK(intraMode.ok(), "intra mode is not ok");
}
/* RD-encode a 2Nx2N intra CU inside a non-intra slice.
 *
 * The luma residual quadtree is coded for the luma direction already stored in
 * the CU, chroma is estimated (unless the colour space is 4:0:0), the CU
 * syntax is bit-counted, and the distortion/bits/energy fields of intraMode
 * are filled in before the mode cost is updated.
 *
 * intraMode - mode to encode; its CU must be SIZE_2Nx2N
 * cuGeom    - geometry (size, depth) of the CU */
void Search::encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
{
    ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]);

    CUData& cu = intraMode.cu;
    Yuv* reconYuv = &intraMode.reconYuv;

    X265_CHECK(cu.m_partSize[0] == SIZE_2Nx2N, "encodeIntraInInter does not expect NxN intra\n");
    X265_CHECK(!m_slice->isIntra(), "encodeIntraInInter does not expect to be used in I slices\n");

    uint32_t tuDepthRange[2];
    cu.getIntraTUQtDepthRange(tuDepthRange, 0);

    m_entropyCoder.load(m_rqt[cuGeom.depth].cur);

    Cost icosts;
    codeIntraLumaQT(intraMode, cuGeom, 0, 0, false, icosts, tuDepthRange);
    extractIntraResultQT(cu, *reconYuv, 0, 0);

    intraMode.lumaDistortion = icosts.distortion;
    if (m_csp != X265_CSP_I400)
        intraMode.chromaDistortion = estIntraPredChromaQT(intraMode, cuGeom, NULL);
    else
    {
        /* Monochrome input has no chroma to estimate; checkIntra() skips the
         * chroma estimator for X265_CSP_I400 as well. The estimator would have
         * finished by reloading the CU-start entropy state, so do that here to
         * keep the bit counting below starting from the same contexts. */
        intraMode.chromaDistortion = 0;
        m_entropyCoder.load(m_rqt[cuGeom.depth].cur);
    }
    intraMode.distortion = intraMode.lumaDistortion + intraMode.chromaDistortion;

    /* bits: CU header first ... */
    m_entropyCoder.resetBits();
    if (m_slice->m_pps->bTransquantBypassEnabled)
        m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
    m_entropyCoder.codeSkipFlag(cu, 0);
    m_entropyCoder.codePredMode(cu.m_predMode[0]);
    m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
    m_entropyCoder.codePredInfo(cu, 0);
    /* accumulated on top of whatever mvBits already holds */
    intraMode.mvBits += m_entropyCoder.getNumberOfWrittenBits();

    /* ... then the coefficients */
    bool bCodeDQP = m_slice->m_pps->bUseDQP;
    m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);

    intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
    intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits;
    if (m_rdCost.m_psyRd)
    {
        const Yuv* fencYuv = intraMode.fencYuv;
        intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
    }
    intraMode.resEnergy = primitives.cu[cuGeom.log2CUSize - 2].sse_pp(intraMode.fencYuv->m_buf[0], intraMode.fencYuv->m_size, intraMode.predYuv.m_buf[0], intraMode.predYuv.m_size);

    m_entropyCoder.store(intraMode.contexts);
    updateModeCost(intraMode);
    checkDQP(intraMode, cuGeom);
}
/* Estimate and code the luma intra prediction for every PU of an intra CU.
 *
 * For each PU (one for 2Nx2N, four otherwise) the luma direction is either
 * taken from sharedModes or searched: all 35 modes are ranked by sa8d cost,
 * a short candidate list is measured with RDO without TU splits, and the
 * winner is re-measured with TU splits allowed. The result is extracted into
 * intraMode.reconYuv and the CU's coefficient buffers.
 *
 * intraMode   - mode whose CU is analysed
 * cuGeom      - geometry (size, depth) of the CU
 * depthRange  - TU depth range forwarded to codeIntraLumaQT
 * sharedModes - optional per-PU luma modes; when non-NULL no search is done
 * returns the summed luma distortion of all PUs */
uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2], uint8_t* sharedModes)
{
CUData& cu = intraMode.cu;
Yuv* reconYuv = &intraMode.reconYuv;
Yuv* predYuv = &intraMode.predYuv;
const Yuv* fencYuv = intraMode.fencYuv;
uint32_t depth = cuGeom.depth;
// 0 for 2Nx2N, 1 for any other partition size (four PUs)
uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N;
uint32_t numPU = 1 << (2 * initTuDepth);
uint32_t log2TrSize = cuGeom.log2CUSize - initTuDepth;
uint32_t tuSize = 1 << log2TrSize;
uint32_t qNumParts = cuGeom.numPartitions >> 2;
uint32_t sizeIdx = log2TrSize - 2;
uint32_t absPartIdx = 0;
uint32_t totalDistortion = 0;
// transform skip is only tried when enabled in the PPS, the CU is not
// transquant-bypassed and the CU is not 2Nx2N
int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0] && cu.m_partSize[0] != SIZE_2Nx2N;
// loop over partitions
for (uint32_t puIdx = 0; puIdx < numPU; puIdx++, absPartIdx += qNumParts)
{
uint32_t bmode = 0;
if (sharedModes)
bmode = sharedModes[puIdx];
else
{
uint64_t candCostList[MAX_RD_INTRA_MODES];
uint32_t rdModeList[MAX_RD_INTRA_MODES];
uint64_t bcost;
// NOTE(review): presumably never exceeds MAX_RD_INTRA_MODES for valid rdLevel/depth - confirm
int maxCandCount = 2 + m_param->rdLevel + ((depth + initTuDepth) >> 1);
{
ProfileCUScope(intraMode.cu, intraAnalysisElapsedTime, countIntraAnalysis);
// Reference sample smoothing
IntraNeighbors intraNeighbors;
initIntraNeighbors(cu, absPartIdx, initTuDepth, true, &intraNeighbors);
initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, ALL_IDX);
// determine set of modes to be tested (using prediction signal only)
const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
uint32_t stride = predYuv->m_size;
int scaleTuSize = tuSize;
int scaleStride = stride;
int costShift = 0;
if (tuSize > 32)
{
// origin is 64x64, we scale to 32x32 and setup required parameters
primitives.scale2D_64to32(m_fencScaled, fenc, stride);
fenc = m_fencScaled;
pixel nScale[129];
intraNeighbourBuf[1][0] = intraNeighbourBuf[0][0];
primitives.scale1D_128to64(nScale + 1, intraNeighbourBuf[0] + 1);
// both neighbour buffers receive the same downscaled samples
memcpy(&intraNeighbourBuf[0][1], &nScale[1], 2 * 64 * sizeof(pixel));
memcpy(&intraNeighbourBuf[1][1], &nScale[1], 2 * 64 * sizeof(pixel));
scaleTuSize = 32;
scaleStride = 32;
// every cost measured on the downscaled block is shifted left by costShift
costShift = 2;
sizeIdx = 5 - 2; // log2(scaleTuSize) - 2
}
m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);
/* there are three cost tiers for intra modes:
* pred[0] - mode probable, least cost
* pred[1], pred[2] - less probable, slightly more cost
* non-mpm modes - all cost the same (rbits) */
// mpms is used below as a bitmask with bit N set when mode N is a most probable mode
uint64_t mpms;
uint32_t mpmModes[3];
uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, mpmModes, mpms);
pixelcmp_t sa8d = primitives.cu[sizeIdx].sa8d;
// sa8d-based cost of each of the 35 luma modes, indexed by mode number
uint64_t modeCosts[35];
// DC
primitives.cu[sizeIdx].intra_pred[DC_IDX](m_intraPred, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16));
uint32_t bits = (mpms & ((uint64_t)1 << DC_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, DC_IDX) : rbits;
uint32_t sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift;
modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits);
// PLANAR
pixel* planar = intraNeighbourBuf[0];
// TU sizes 8 through 32 take the second neighbour buffer for planar
if (tuSize >= 8 && tuSize <= 32)
planar = intraNeighbourBuf[1];
primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](m_intraPred, scaleStride, planar, 0, 0);
bits = (mpms & ((uint64_t)1 << PLANAR_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, PLANAR_IDX) : rbits;
sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift;
modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits);
COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]);
// angular predictions
if (primitives.cu[sizeIdx].intra_pred_allangs)
{
// all angular predictions are generated at once; modes below 18 are
// compared against the transposed source block
primitives.cu[sizeIdx].transpose(m_fencTransposed, fenc, scaleStride);
primitives.cu[sizeIdx].intra_pred_allangs(m_intraPredAngs, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16));
for (int mode = 2; mode < 35; mode++)
{
bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
if (mode < 18)
sad = sa8d(m_fencTransposed, scaleTuSize, &m_intraPredAngs[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
else
sad = sa8d(fenc, scaleStride, &m_intraPredAngs[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
COPY1_IF_LT(bcost, modeCosts[mode]);
}
}
else
{
// no all-angles primitive: generate and measure each angular mode in turn
for (int mode = 2; mode < 35; mode++)
{
bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
int filter = !!(g_intraFilterFlags[mode] & scaleTuSize);
primitives.cu[sizeIdx].intra_pred[mode](m_intraPred, scaleTuSize, intraNeighbourBuf[filter], mode, scaleTuSize <= 16);
sad = sa8d(fenc, scaleStride, m_intraPred, scaleTuSize) << costShift;
modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
COPY1_IF_LT(bcost, modeCosts[mode]);
}
}
/* Find the top maxCandCount candidate modes with cost within 12.5% of best
* or among the most probable modes. maxCandCount is derived from the
* rdLevel and depth. In general we want to try more modes at slower RD
* levels and at higher depths */
for (int i = 0; i < maxCandCount; i++)
candCostList[i] = MAX_INT64;
uint64_t paddedBcost = bcost + (bcost >> 3); // best cost * 1.125 (+12.5%)
for (int mode = 0; mode < 35; mode++)
if (modeCosts[mode] < paddedBcost || (mpms & ((uint64_t)1 << mode)))
updateCandList(mode, modeCosts[mode], maxCandCount, rdModeList, candCostList);
}
/* measure best candidates using simple RDO (no TU splits) */
bcost = MAX_INT64;
for (int i = 0; i < maxCandCount; i++)
{
// unused candidate slots still hold MAX_INT64; stop at the first one
if (candCostList[i] == MAX_INT64)
break;
ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]);
m_entropyCoder.load(m_rqt[depth].cur);
cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTuDepth);
Cost icosts;
if (checkTransformSkip)
codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
else
codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, false, icosts, depthRange);
COPY2_IF_LT(bcost, icosts.rdcost, bmode, rdModeList[i]);
}
}
ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]);
/* remeasure best mode, allowing TU splits */
cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTuDepth);
m_entropyCoder.load(m_rqt[depth].cur);
Cost icosts;
if (checkTransformSkip)
codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
else
codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, true, icosts, depthRange);
totalDistortion += icosts.distortion;
extractIntraResultQT(cu, *reconYuv, initTuDepth, absPartIdx);
// set reconstruction for next intra prediction blocks
if (puIdx != numPU - 1)
{
/* This has important implications for parallelism and RDO. It is writing intermediate results into the
* output recon picture, so it cannot proceed in parallel with anything else when doing INTRA_NXN. Also
* it is not updating m_rdContexts[depth].cur for the later PUs which I suspect is slightly wrong. I think
* that the contexts should be tracked through each PU */
PicYuv* reconPic = m_frame->m_reconPic;
pixel* dst = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
uint32_t dststride = reconPic->m_stride;
const pixel* src = reconYuv->getLumaAddr(absPartIdx);
uint32_t srcstride = reconYuv->m_size;
primitives.cu[log2TrSize - 2].copy_pp(dst, dststride, src, srcstride);
}
}
if (numPU > 1)
{
// merge the depth-1 luma CBFs of the four PUs into every partition of the CU
uint32_t combCbfY = 0;
for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
combCbfY |= cu.getCbf(qPartIdx, TEXT_LUMA, 1);
for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
cu.m_cbf[0][offs] |= combCbfY;
}
// TODO: remove this
m_entropyCoder.load(m_rqt[depth].cur);
return totalDistortion;
}
/* Pick the chroma intra mode with the lowest summed U+V sa8d cost.
 *
 * Every mode returned by getAllowedChromaDir() is predicted for both chroma
 * planes of the whole CU and compared against the source; the cheapest mode
 * is stored in the CU with setChromIntraDirSubParts(). No transform,
 * quantization or bit estimation is involved.
 *
 * intraMode - mode whose CU, source (fencYuv) and prediction buffer are used
 * cuGeom    - geometry (size, depth) of the CU */
void Search::getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom)
{
    CUData& cu = intraMode.cu;
    const Yuv* fencYuv = intraMode.fencYuv;
    Yuv* predYuv = &intraMode.predYuv;

    uint32_t log2TrSizeC = cu.m_log2CUSize[0] - m_hChromaShift;
    uint32_t tuDepth = 0;
    int32_t costShift = 0;

    /* chroma blocks wider than 32 are measured as a 32x32 block one TU depth
     * down, with the cost shifted left to compensate */
    const uint32_t chromaSize = 1 << log2TrSizeC;
    if (chromaSize > 32)
    {
        log2TrSizeC = 5;
        tuDepth = 1;
        costShift = 2;
    }

    IntraNeighbors intraNeighbors;
    initIntraNeighbors(cu, 0, tuDepth, false, &intraNeighbors);

    uint32_t modeList[NUM_CHROMA_MODE];
    cu.getAllowedChromaDir(0, modeList);

    uint32_t winner = 0;
    uint64_t winnerCost = MAX_INT64;

    for (uint32_t cand = 0; cand < NUM_CHROMA_MODE; cand++)
    {
        /* resolve the actual prediction direction for this candidate */
        uint32_t predDir = modeList[cand];
        if (predDir == DM_CHROMA_IDX)
            predDir = cu.m_lumaIntraDir[0];
        if (m_csp == X265_CSP_I422)
            predDir = g_chroma422IntraAngleMappingTable[predDir];

        uint64_t candCost = 0;
        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
        {
            const pixel* fenc = fencYuv->m_buf[chromaId];
            pixel* pred = predYuv->m_buf[chromaId];

            Predict::initAdiPatternChroma(cu, cuGeom, 0, intraNeighbors, chromaId);
            predIntraChromaAng(predDir, pred, fencYuv->m_csize, log2TrSizeC);

            candCost += primitives.cu[log2TrSizeC - 2].sa8d(fenc, predYuv->m_csize, pred, fencYuv->m_csize) << costShift;
        }

        if (candCost < winnerCost)
        {
            winnerCost = candCost;
            winner = modeList[cand];
        }
    }

    cu.setChromIntraDirSubParts(winner, 0, cuGeom.depth);
}
/* Choose the chroma intra mode(s) of an intra CU by RD cost and code them.
 *
 * For each chroma section (one, or four for non-2Nx2N CUs in 4:4:4) every
 * candidate mode is coded with codeIntraChromaQt(), its bits are counted and
 * the cheapest result is kept: reconstruction and coefficients are extracted,
 * and the CBF / transform-skip flags are parked in the m_qtTemp* buffers until
 * all candidates have been tried.
 *
 * intraMode         - mode whose CU is analysed
 * cuGeom            - geometry (size, depth) of the CU
 * sharedChromaModes - optional chroma mode; when non-NULL (and the CU is not
 *                     split into four chroma sections) only
 *                     sharedChromaModes[0] is evaluated
 * returns the summed chroma distortion of the selected modes */
uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom, uint8_t* sharedChromaModes)
{
CUData& cu = intraMode.cu;
Yuv& reconYuv = intraMode.reconYuv;
uint32_t depth = cuGeom.depth;
// chroma is only split into four sections for non-2Nx2N CUs in 4:4:4
uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N && m_csp == X265_CSP_I444;
uint32_t log2TrSize = cuGeom.log2CUSize - initTuDepth;
uint32_t absPartStep = cuGeom.numPartitions;
uint32_t totalDistortion = 0;
int size = partitionFromLog2Size(log2TrSize);
TURecurse tuIterator((initTuDepth == 0) ? DONT_SPLIT : QUAD_SPLIT, absPartStep, 0);
do
{
uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
uint32_t bestMode = 0;
uint32_t bestDist = 0;
uint64_t bestCost = MAX_INT64;
// init mode list
uint32_t minMode = 0;
uint32_t maxMode = NUM_CHROMA_MODE;
uint32_t modeList[NUM_CHROMA_MODE];
if (sharedChromaModes && !initTuDepth)
{
// a shared mode was supplied: fill the list with it and test one entry only
for (uint32_t l = 0; l < NUM_CHROMA_MODE; l++)
modeList[l] = sharedChromaModes[0];
maxMode = 1;
}
else
cu.getAllowedChromaDir(absPartIdxC, modeList);
// check chroma modes
for (uint32_t mode = minMode; mode < maxMode; mode++)
{
// restore context models
m_entropyCoder.load(m_rqt[depth].cur);
cu.setChromIntraDirSubParts(modeList[mode], absPartIdxC, depth + initTuDepth);
uint32_t psyEnergy = 0;
uint32_t dist = codeIntraChromaQt(intraMode, cuGeom, initTuDepth, absPartIdxC, psyEnergy);
// NOTE(review): presumably the transform-skip path of codeIntraChromaQt can
// leave the entropy coder in a different state - confirm
if (m_slice->m_pps->bTransformSkipEnabled)
m_entropyCoder.load(m_rqt[depth].cur);
m_entropyCoder.resetBits();
// chroma prediction mode
if (cu.m_partSize[0] == SIZE_2Nx2N || m_csp != X265_CSP_I444)
{
// a single chroma mode per CU, counted with the first section only
if (!absPartIdxC)
m_entropyCoder.codeIntraDirChroma(cu, absPartIdxC, modeList);
}
else
{
// one chroma mode per quarter of the CU
uint32_t qNumParts = cuGeom.numPartitions >> 2;
if (!(absPartIdxC & (qNumParts - 1)))
m_entropyCoder.codeIntraDirChroma(cu, absPartIdxC, modeList);
}
codeSubdivCbfQTChroma(cu, initTuDepth, absPartIdxC);
codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_U);
codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_V);
uint32_t bits = m_entropyCoder.getNumberOfWrittenBits();
uint64_t cost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(dist, bits, psyEnergy) : m_rdCost.calcRdCost(dist, bits);
if (cost < bestCost)
{
bestCost = cost;
bestDist = dist;
bestMode = modeList[mode];
extractIntraResultChromaQT(cu, reconYuv, absPartIdxC, initTuDepth);
// park the winning flags; later candidates overwrite the CU's copies
memcpy(m_qtTempCbf[1], cu.m_cbf[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
memcpy(m_qtTempCbf[2], cu.m_cbf[2] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
memcpy(m_qtTempTransformSkipFlag[1], cu.m_transformSkip[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
memcpy(m_qtTempTransformSkipFlag[2], cu.m_transformSkip[2] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
}
}
if (!tuIterator.isLastSection())
{
// copy this section's Cb/Cr reconstruction into the frame's recon picture
uint32_t zorder = cuGeom.absPartIdx + absPartIdxC;
PicYuv* reconPic = m_frame->m_reconPic;
uint32_t dststride = reconPic->m_strideC;
const pixel* src;
pixel* dst;
dst = reconPic->getCbAddr(cu.m_cuAddr, zorder);
src = reconYuv.getCbAddr(absPartIdxC);
primitives.chroma[m_csp].cu[size].copy_pp(dst, dststride, src, reconYuv.m_csize);
dst = reconPic->getCrAddr(cu.m_cuAddr, zorder);
src = reconYuv.getCrAddr(absPartIdxC);
primitives.chroma[m_csp].cu[size].copy_pp(dst, dststride, src, reconYuv.m_csize);
}
// restore the flags and the mode of the best candidate into the CU
memcpy(cu.m_cbf[1] + absPartIdxC, m_qtTempCbf[1], tuIterator.absPartIdxStep * sizeof(uint8_t));
memcpy(cu.m_cbf[2] + absPartIdxC, m_qtTempCbf[2], tuIterator.absPartIdxStep * sizeof(uint8_t));
memcpy(cu.m_transformSkip[1] + absPartIdxC, m_qtTempTransformSkipFlag[1], tuIterator.absPartIdxStep * sizeof(uint8_t));
memcpy(cu.m_transformSkip[2] + absPartIdxC, m_qtTempTransformSkipFlag[2], tuIterator.absPartIdxStep * sizeof(uint8_t));
cu.setChromIntraDirSubParts(bestMode, absPartIdxC, depth + initTuDepth);
totalDistortion += bestDist;
}
while (tuIterator.isNextSection());
if (initTuDepth != 0)
{
// merge the depth-1 chroma CBFs of the four sections into every partition
uint32_t combCbfU = 0;
uint32_t combCbfV = 0;
uint32_t qNumParts = tuIterator.absPartIdxStep;
for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
{
combCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, 1);
combCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, 1);
}
for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
{
cu.m_cbf[1][offs] |= combCbfU;
cu.m_cbf[2][offs] |= combCbfV;
}
}
/* TODO: remove this */
m_entropyCoder.load(m_rqt[depth].cur);
return totalDistortion;
}
/* Estimate the best merge candidate for one inter PU (2Nx2N merge PUs are
 * evaluated as their own mode and must not be passed here).
 *
 * Each candidate is motion compensated into a temporary buffer and scored by
 * SATD (plus chroma SATD when enabled) plus the cost of signalling its index.
 * The winner's bits, index, MV fields and direction are written to m.
 *
 * Note: if no candidate is accepted, m.bits and m.index are left as the caller
 * passed them in, the MV fields are read from that index, and MAX_UINT is
 * returned.
 *
 * returns the cost of the best candidate */
uint32_t Search::mergeEstimation(CUData& cu, const CUGeom& cuGeom, const PredictionUnit& pu, int puIdx, MergeData& m)
{
    X265_CHECK(cu.m_partSize[0] != SIZE_2Nx2N, "mergeEstimation() called for 2Nx2N\n");

    MVField candMvField[MRG_MAX_NUM_CANDS][2];
    uint8_t candDir[MRG_MAX_NUM_CANDS];
    uint32_t numMergeCand = cu.getInterMergeCandidates(pu.puAbsPartIdx, puIdx, candMvField, candDir);

    if (cu.isBipredRestriction())
    {
        /* do not allow bidir merge candidates if PU is smaller than 8x8, drop L1 reference */
        for (uint32_t i = 0; i < numMergeCand; i++)
        {
            if (candDir[i] != 3)
                continue;
            candDir[i] = 1;
            candMvField[i][1].refIdx = REF_NOT_VALID;
        }
    }

    Yuv& predBuf = m_rqt[cuGeom.depth].tmpPredYuv;
    uint32_t bestCost = MAX_UINT;

    for (uint32_t i = 0; i < numMergeCand; i++)
    {
        const MVField& field0 = candMvField[i][0];
        const MVField& field1 = candMvField[i][1];

        /* Prevent TMVP candidates from using unavailable reference pixels */
        if (m_bFrameParallel)
        {
            if (field0.mv.y >= (m_param->searchRange + 1) * 4 ||
                field1.mv.y >= (m_param->searchRange + 1) * 4)
                continue;
        }

        /* install the candidate's motion in the CU and predict with it */
        cu.m_mv[0][pu.puAbsPartIdx] = field0.mv;
        cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)field0.refIdx;
        cu.m_mv[1][pu.puAbsPartIdx] = field1.mv;
        cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)field1.refIdx;
        motionCompensation(cu, pu, predBuf, true, m_me.bChromaSATD);

        uint32_t candCost = m_me.bufSATD(predBuf.getLumaAddr(pu.puAbsPartIdx), predBuf.m_size);
        if (m_me.bChromaSATD)
            candCost += m_me.bufChromaSATD(predBuf, pu.puAbsPartIdx);

        uint32_t candBits = getTUBits(i, numMergeCand);
        candCost += m_rdCost.getCost(candBits);

        if (candCost < bestCost)
        {
            bestCost = candCost;
            m.bits = candBits;
            m.index = i;
        }
    }

    m.mvField[0] = candMvField[m.index][0];
    m.mvField[1] = candMvField[m.index][1];
    m.dir = candDir[m.index];

    return bestCost;
}
/* Look up the lookahead's low-resolution motion vector for the block in the
 * middle of the current PU.
 *
 * Returns a zero MV when the POC distance to the reference is larger than
 * bframes + 1, or when the lookahead did not run this motion search (marked by
 * x == 0x7FFF in the first entry). Otherwise the stored vector is returned
 * shifted left by one to scale it up from lowres. */
MV Search::getLowresMV(const CUData& cu, const PredictionUnit& pu, int list, int ref)
{
    int pocDistance = abs(m_slice->m_poc - m_slice->m_refPOCList[list][ref]);

    /* poc difference is out of range for lookahead */
    if (pocDistance > m_param->bframes + 1)
        return 0;

    MV* lowresMvs = m_frame->m_lowres.lowresMvs[list][pocDistance - 1];

    /* this motion search was not estimated by lookahead */
    if (lowresMvs[0].x == 0x7FFF)
        return 0;

    /* centre of the PU in picture coordinates, divided by 16 */
    uint32_t blockCol = (cu.m_cuPelX + g_zscanToPelX[pu.puAbsPartIdx] + pu.width / 2) >> 4;
    uint32_t blockRow = (cu.m_cuPelY + g_zscanToPelY[pu.puAbsPartIdx] + pu.height / 2) >> 4;

    X265_CHECK(blockCol < m_frame->m_lowres.maxBlocksInRow, "block_x is too high\n");
    X265_CHECK(blockRow < m_frame->m_lowres.maxBlocksInCol, "block_y is too high\n");

    uint32_t blockIdx = blockRow * m_frame->m_lowres.maxBlocksInRow + blockCol;

    /* scale up lowres mv */
    return lowresMvs[blockIdx] << 1;
}
/* Choose which of the two AMVP candidates to use as the motion search
 * predictor, by SAD of the luma prediction each one produces.
 *
 * Identical candidates short-circuit to index 0. When frame parallelism is
 * active, a candidate whose y component reaches (searchRange + 1) * 4 is given
 * COST_MAX instead of being evaluated. Ties go to index 0.
 *
 * Returns the index (0 or 1) of the chosen candidate. */
int Search::selectMVP(const CUData& cu, const PredictionUnit& pu, const MV amvp[AMVP_NUM_CANDS], int list, int ref)
{
    if (amvp[0] == amvp[1])
        return 0;

    Yuv& predBuf = m_rqt[cu.m_cuDepth[0]].tmpPredYuv;
    const int mvYLimit = (m_param->searchRange + 1) * 4;
    uint32_t costs[AMVP_NUM_CANDS];

    for (int cand = 0; cand < AMVP_NUM_CANDS; cand++)
    {
        MV mvCand = amvp[cand];

        // NOTE: skip mvCand if Y is > merange and -FN>1
        if (m_bFrameParallel && (mvCand.y >= mvYLimit))
        {
            costs[cand] = m_me.COST_MAX;
            continue;
        }

        /* predict from a clipped copy of the candidate and measure its SAD */
        cu.clipMv(mvCand);
        predInterLumaPixel(pu, predBuf, *m_slice->m_refReconPicList[list][ref], mvCand);
        costs[cand] = m_me.bufSAD(predBuf.getLumaAddr(pu.puAbsPartIdx), predBuf.m_size);
    }

    return costs[1] < costs[0] ? 1 : 0;
}
void Search::PME::processTasks(int workerThreadId)
{
#if DETAILED_CU_STATS
int fe = mode.cu.m_encData->m_frameEncoderID;
master.m_stats[fe].countPMETasks++;
ScopedElapsedTime pmeTime(master.m_stats[fe].pmeTime);
#endif
ProfileScopeEvent(pme);
master.processPME(*this, master.m_tld[workerThreadId].analysis);
}
/* Pull motion estimation jobs from 'pme' and run them on 'slave' until none
 * are left.
 *
 * Job ids are handed out under pme.m_lock by incrementing pme.m_jobAcquired.
 * Ids below pme.m_jobs.refCnt[0] index the list 0 reference table; the
 * remaining ids index the list 1 table. When 'slave' is a different Search
 * instance than this master, it is first pointed at the master's slice, frame,
 * params, lambda and source PU. Returns without touching 'slave' if no job
 * could be acquired. */
void Search::processPME(PME& pme, Search& slave)
{
    /* acquire a motion estimation job, else exit early */
    int jobId = -1;
    pme.m_lock.acquire();
    if (pme.m_jobTotal > pme.m_jobAcquired)
        jobId = pme.m_jobAcquired++;
    pme.m_lock.release();

    if (jobId < 0)
        return;

    /* Setup slave Search instance for ME for master's CU */
    if (&slave != this)
    {
        slave.m_slice = m_slice;
        slave.m_frame = m_frame;
        slave.m_param = m_param;
        slave.setLambdaFromQP(pme.mode.cu, m_rdCost.m_qp);
        slave.m_me.setSourcePU(*pme.mode.fencYuv, pme.pu.ctuAddr, pme.pu.cuAbsPartIdx, pme.pu.puAbsPartIdx, pme.pu.width, pme.pu.height);
    }

    /* Perform ME, repeat until no more work is available */
    while (jobId >= 0)
    {
        const int numL0Jobs = pme.m_jobs.refCnt[0];
        const int list = jobId < numL0Jobs ? 0 : 1;
        const int slot = list ? jobId - numL0Jobs : jobId;
        const int refIdx = pme.m_jobs.ref[list][slot];

        slave.singleMotionEstimation(*this, pme.mode, pme.pu, pme.puIdx, list, refIdx);

        /* try to grab the next job; -1 ends the loop */
        jobId = -1;
        pme.m_lock.acquire();
        if (pme.m_jobTotal > pme.m_jobAcquired)
            jobId = pme.m_jobAcquired++;
        pme.m_lock.release();
    }
}
/* Run one uni-directional motion search (one list, one reference) for a PU on
 * this Search instance and merge the result into the master's bestME table.
 *
 * master    - Search instance that owns m_listSelBits and the m_meLock guarding bestME
 * interMode - mode being searched; its amvpCand, interNeighbours and bestME are used
 * part      - PU index within the CU (selects interMode.bestME[part])
 * list, ref - reference list and reference index to search
 *
 * The bestME entry is replaced when the new cost is lower, or equal with a
 * smaller reference index. */
void Search::singleMotionEstimation(Search& master, Mode& interMode, const PredictionUnit& pu, int part, int list, int ref)
{
    /* signalling bits that do not depend on the MV: list selection, MVP index, reference index */
    uint32_t bits = master.m_listSelBits[list] + MVP_IDX_BITS;
    bits += getTUBits(ref, m_slice->m_numRefIdx[list]);

    // 12 mv candidates including lowresMV
    MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
    int numMvc = interMode.cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);

    const MV* amvp = interMode.amvpCand[list][ref];
    int mvpIdx = selectMVP(interMode.cu, pu, amvp, list, ref);
    MV mvp = amvp[mvpIdx];

    /* append the lookahead vector as an extra search candidate when it is non-zero */
    MV lowresMv = getLowresMV(interMode.cu, pu, list, ref);
    if (lowresMv.notZero())
        mvc[numMvc++] = lowresMv;

    MV mvmin, mvmax, outmv;
    setSearchRange(interMode.cu, mvp, m_param->searchRange, mvmin, mvmax);

    int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv);

    /* Get total cost of partition, but only include MV bit cost once */
    bits += m_me.bitcost(outmv);
    uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits);

    /* Refine MVP selection, updates: mvpIdx, bits, cost */
    mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);

    MotionData& best = interMode.bestME[part][list];

    /* tie goes to the smallest ref ID, just like --no-pme */
    ScopedLock _lock(master.m_meLock);
    const bool bImproves = cost < best.cost || (cost == best.cost && ref < best.ref);
    if (bImproves)
    {
        best.mv = outmv;
        best.mvp = mvp;
        best.mvpIdx = mvpIdx;
        best.ref = ref;
        best.cost = cost;
        best.bits = bits;
    }
}
/* Find the best inter prediction for each PU of the specified mode.
 *
 * interMode - mode under evaluation; its cu, predYuv, bestME, amvpCand and
 *             interNeighbours are written, and sa8dBits is increased by the
 *             estimated motion bits of all PUs
 * cuGeom    - geometry of the CU (depth selects the temporary buffers)
 * bChromaMC - forwarded to the final motionCompensation() of each PU
 * refMasks  - per-PU reference masks, indexed by puIdx. Bit 'ref' enables that
 *             list 0 reference; list 1 bits start at bit 16. A zero mask
 *             enables every reference.
 *
 * For every PU the function:
 *   1. evaluates merge candidates (only when the CU has more than one PU),
 *   2. runs uni-directional searches through one of three paths: analysis load
 *      (reference taken from bestME[].ref), distributed ME, or a local loop
 *      over all enabled references,
 *   3. for B slices, evaluates bi-prediction from the two uni-directional
 *      winners, and optionally from zero motion vectors,
 *   4. stores the cheapest of merge / bidir / L0 / L1 into the CU and motion
 *      compensates the PU into interMode.predYuv. */
void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t refMasks[2])
{
    ProfileCUScope(interMode.cu, motionEstimationElapsedTime, countMotionEstimate);

    CUData& cu = interMode.cu;
    Yuv* predYuv = &interMode.predYuv;

    // 12 mv candidates including lowresMV
    MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];

    const Slice *slice = m_slice;
    int numPart = cu.getNumPartInter(0);
    int numPredDir = slice->isInterP() ? 1 : 2;
    const int* numRefIdx = slice->m_numRefIdx;
    uint32_t lastMode = 0; /* choice made for the previous PU: 0 = L0, 1 = L1, 2 = bidir */
    int totalmebits = 0;
    MV mvzero(0, 0);
    Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;

    MergeData merge;
    memset(&merge, 0, sizeof(merge));

    for (int puIdx = 0; puIdx < numPart; puIdx++)
    {
        MotionData* bestME = interMode.bestME[puIdx];
        PredictionUnit pu(cu, cuGeom, puIdx);

        m_me.setSourcePU(*interMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height);

        /* find best cost merge candidate. note: 2Nx2N merge and bidir are handled as separate modes */
        uint32_t mrgCost = numPart == 1 ? MAX_UINT : mergeEstimation(cu, cuGeom, pu, puIdx, merge);

        /* reset the per-list best costs so the first evaluated candidate always wins */
        bestME[0].cost = MAX_UINT;
        bestME[1].cost = MAX_UINT;

        /* fill m_listSelBits[] for this partition, given the previous PU's choice */
        getBlkBits((PartSize)cu.m_partSize[0], slice->isInterP(), puIdx, lastMode, m_listSelBits);
        bool bDoUnidir = true;

        cu.getNeighbourMV(puIdx, pu.puAbsPartIdx, interMode.interNeighbours);

        /* Uni-directional prediction */
        if (m_param->analysisMode == X265_ANALYSIS_LOAD && bestME[0].ref >= 0)
        {
            /* analysis-load path: search only the reference already stored in
             * bestME[list].ref; the stored reference index is left unchanged */
            for (int list = 0; list < numPredDir; list++)
            {
                int ref = bestME[list].ref;
                uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;
                bits += getTUBits(ref, numRefIdx[list]);

                int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);

                const MV* amvp = interMode.amvpCand[list][ref];
                int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
                MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx];

                MV lmv = getLowresMV(cu, pu, list, ref);
                if (lmv.notZero())
                    mvc[numMvc++] = lmv;

                setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
                int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv);

                /* Get total cost of partition, but only include MV bit cost once */
                bits += m_me.bitcost(outmv);
                uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits);

                /* Refine MVP selection, updates: mvpIdx, bits, cost */
                mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);

                if (cost < bestME[list].cost)
                {
                    bestME[list].mv = outmv;
                    bestME[list].mvp = mvp;
                    bestME[list].mvpIdx = mvpIdx;
                    bestME[list].cost = cost;
                    bestME[list].bits = bits;
                }
            }
            bDoUnidir = false;
        }
        else if (m_param->bDistributeMotionEstimation)
        {
            PME pme(*this, interMode, cuGeom, pu, puIdx);
            pme.m_jobTotal = 0;
            pme.m_jobAcquired = 1; /* reserve L0-0 or L1-0 */

            /* build the job tables: one job per enabled reference of each list */
            uint32_t refMask = refMasks[puIdx] ? refMasks[puIdx] : (uint32_t)-1;
            for (int list = 0; list < numPredDir; list++)
            {
                int idx = 0;
                for (int ref = 0; ref < numRefIdx[list]; ref++)
                {
                    if (!(refMask & (1 << ref)))
                        continue;

                    pme.m_jobs.ref[list][idx++] = ref;
                    pme.m_jobTotal++;
                }
                pme.m_jobs.refCnt[list] = idx;

                /* the second list ref bits start at bit 16 */
                refMask >>= 16;
            }

            if (pme.m_jobTotal > 2)
            {
                pme.tryBondPeers(*m_frame->m_encData->m_jobProvider, pme.m_jobTotal - 1);

                /* help with the shared jobs (ids 1 and up) ... */
                processPME(pme, *this);

                /* ... then run the reserved job 0 on this thread. Job 0 is the
                 * first enabled L0 reference, or the first enabled L1 reference
                 * when the mask left no L0 reference; the list passed on must
                 * match the table the reference index was read from. */
                int list = pme.m_jobs.refCnt[0] ? 0 : 1;
                int ref = pme.m_jobs.ref[list][0];
                singleMotionEstimation(*this, interMode, pu, puIdx, list, ref); /* L0-0 or L1-0 */

                bDoUnidir = false;

                ProfileCUScopeNamed(pmeWaitScope, interMode.cu, pmeBlockTime, countPMEMasters);
                pme.waitForExit();
            }

            /* if no peer threads were bonded, fall back to doing unidirectional
             * searches ourselves without overhead of singleMotionEstimation() */
        }
        if (bDoUnidir)
        {
            uint32_t refMask = refMasks[puIdx] ? refMasks[puIdx] : (uint32_t)-1;

            for (int list = 0; list < numPredDir; list++)
            {
                for (int ref = 0; ref < numRefIdx[list]; ref++)
                {
                    ProfileCounter(interMode.cu, totalMotionReferences[cuGeom.depth]);

                    if (!(refMask & (1 << ref)))
                    {
                        ProfileCounter(interMode.cu, skippedMotionReferences[cuGeom.depth]);
                        continue;
                    }

                    uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;
                    bits += getTUBits(ref, numRefIdx[list]);

                    int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);

                    const MV* amvp = interMode.amvpCand[list][ref];
                    int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
                    MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx];

                    MV lmv = getLowresMV(cu, pu, list, ref);
                    if (lmv.notZero())
                        mvc[numMvc++] = lmv;

                    setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
                    int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv);

                    /* Get total cost of partition, but only include MV bit cost once */
                    bits += m_me.bitcost(outmv);
                    uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits);

                    /* Refine MVP selection, updates: mvpIdx, bits, cost */
                    mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);

                    /* strict comparison: on a tie the smaller reference index is kept */
                    if (cost < bestME[list].cost)
                    {
                        bestME[list].mv = outmv;
                        bestME[list].mvp = mvp;
                        bestME[list].mvpIdx = mvpIdx;
                        bestME[list].ref = ref;
                        bestME[list].cost = cost;
                        bestME[list].bits = bits;
                    }
                }
                /* the second list ref bits start at bit 16 */
                refMask >>= 16;
            }
        }

        /* Bi-directional prediction */
        MotionData bidir[2];
        uint32_t bidirCost = MAX_UINT;
        int bidirBits = 0;

        if (slice->isInterB() && !cu.isBipredRestriction() && /* biprediction is possible for this PU */
            cu.m_partSize[pu.puAbsPartIdx] != SIZE_2Nx2N && /* 2Nx2N biprediction is handled elsewhere */
            bestME[0].cost != MAX_UINT && bestME[1].cost != MAX_UINT)
        {
            bidir[0] = bestME[0];
            bidir[1] = bestME[1];

            int satdCost;

            if (m_me.bChromaSATD)
            {
                /* install both winners in the CU and score the full bi-prediction, chroma included */
                cu.m_mv[0][pu.puAbsPartIdx] = bidir[0].mv;
                cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref;
                cu.m_mv[1][pu.puAbsPartIdx] = bidir[1].mv;
                cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref;
                motionCompensation(cu, pu, tmpPredYuv, true, true);

                satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) +
                           m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx);
            }
            else
            {
                PicYuv* refPic0 = slice->m_refReconPicList[0][bestME[0].ref];
                PicYuv* refPic1 = slice->m_refReconPicList[1][bestME[1].ref];
                Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv;

                /* Generate reference subpels */
                predInterLumaPixel(pu, bidirYuv[0], *refPic0, bestME[0].mv);
                predInterLumaPixel(pu, bidirYuv[1], *refPic1, bestME[1].mv);

                /* average the two luma predictions into the start of tmpPredYuv's luma plane */
                primitives.pu[m_me.partEnum].pixelavg_pp(tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(pu.puAbsPartIdx), bidirYuv[0].m_size,
                                                         bidirYuv[1].getLumaAddr(pu.puAbsPartIdx), bidirYuv[1].m_size, 32);
                satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
            }

            /* both unidir bit counts include their own list-selection bits; swap those for the bidir selection bits */
            bidirBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
            bidirCost = satdCost + m_rdCost.getCost(bidirBits);

            /* zero MVs are only worth trying when at least one winner is non-zero */
            bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
            if (bTryZero)
            {
                /* Do not try zero MV if unidir motion predictors are beyond
                 * valid search area */
                MV mvmin, mvmax;
                int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight);
                setSearchRange(cu, mvzero, merange, mvmin, mvmax);
                mvmax.y += 2; // there is some pad for subpel refine
                mvmin <<= 2;
                mvmax <<= 2;

                bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
                bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
            }
            if (bTryZero)
            {
                /* coincident blocks of the two reference pictures */
                if (m_me.bChromaSATD)
                {
                    cu.m_mv[0][pu.puAbsPartIdx] = mvzero;
                    cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref;
                    cu.m_mv[1][pu.puAbsPartIdx] = mvzero;
                    cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref;
                    motionCompensation(cu, pu, tmpPredYuv, true, true);

                    satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) +
                               m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx);
                }
                else
                {
                    const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
                    const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
                    intptr_t refStride = slice->m_mref[0][0].lumaStride;

                    primitives.pu[m_me.partEnum].pixelavg_pp(tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
                    satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
                }

                /* re-price each list's bits with the MVD of the zero vector instead of the searched vector */
                MV mvp0 = bestME[0].mvp;
                int mvpIdx0 = bestME[0].mvpIdx;
                uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);

                MV mvp1 = bestME[1].mvp;
                int mvpIdx1 = bestME[1].mvpIdx;
                uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);

                uint32_t cost = satdCost + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);

                /* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */
                mvp0 = checkBestMVP(interMode.amvpCand[0][bestME[0].ref], mvzero, mvpIdx0, bits0, cost);
                mvp1 = checkBestMVP(interMode.amvpCand[1][bestME[1].ref], mvzero, mvpIdx1, bits1, cost);

                if (cost < bidirCost)
                {
                    bidir[0].mv = mvzero;
                    bidir[1].mv = mvzero;
                    bidir[0].mvp = mvp0;
                    bidir[1].mvp = mvp1;
                    bidir[0].mvpIdx = mvpIdx0;
                    bidir[1].mvpIdx = mvpIdx1;
                    bidirCost = cost;
                    bidirBits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
                }
            }
        }

        /* select best option and store into CU */
        if (mrgCost < bidirCost && mrgCost < bestME[0].cost && mrgCost < bestME[1].cost)
        {
            /* merge wins; lastMode is deliberately left at its previous value */
            cu.m_mergeFlag[pu.puAbsPartIdx] = true;
            cu.m_mvpIdx[0][pu.puAbsPartIdx] = merge.index; /* merge candidate ID is stored in L0 MVP idx */
            cu.setPUInterDir(merge.dir, pu.puAbsPartIdx, puIdx);
            cu.setPUMv(0, merge.mvField[0].mv, pu.puAbsPartIdx, puIdx);
            cu.setPURefIdx(0, merge.mvField[0].refIdx, pu.puAbsPartIdx, puIdx);
            cu.setPUMv(1, merge.mvField[1].mv, pu.puAbsPartIdx, puIdx);
            cu.setPURefIdx(1, merge.mvField[1].refIdx, pu.puAbsPartIdx, puIdx);

            totalmebits += merge.bits;
        }
        else if (bidirCost < bestME[0].cost && bidirCost < bestME[1].cost)
        {
            lastMode = 2;

            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
            cu.setPUInterDir(3, pu.puAbsPartIdx, puIdx);
            cu.setPUMv(0, bidir[0].mv, pu.puAbsPartIdx, puIdx);
            cu.setPURefIdx(0, bestME[0].ref, pu.puAbsPartIdx, puIdx);
            cu.m_mvd[0][pu.puAbsPartIdx] = bidir[0].mv - bidir[0].mvp;
            cu.m_mvpIdx[0][pu.puAbsPartIdx] = bidir[0].mvpIdx;

            cu.setPUMv(1, bidir[1].mv, pu.puAbsPartIdx, puIdx);
            cu.setPURefIdx(1, bestME[1].ref, pu.puAbsPartIdx, puIdx);
            cu.m_mvd[1][pu.puAbsPartIdx] = bidir[1].mv - bidir[1].mvp;
            cu.m_mvpIdx[1][pu.puAbsPartIdx] = bidir[1].mvpIdx;

            totalmebits += bidirBits;
        }
        else if (bestME[0].cost <= bestME[1].cost)
        {
            lastMode = 0;

            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
            cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx);
            cu.setPUMv(0, bestME[0].mv, pu.puAbsPartIdx, puIdx);
            cu.setPURefIdx(0, bestME[0].ref, pu.puAbsPartIdx, puIdx);
            cu.m_mvd[0][pu.puAbsPartIdx] = bestME[0].mv - bestME[0].mvp;
            cu.m_mvpIdx[0][pu.puAbsPartIdx] = bestME[0].mvpIdx;

            cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);
            cu.setPUMv(1, mvzero, pu.puAbsPartIdx, puIdx);

            totalmebits += bestME[0].bits;
        }
        else
        {
            lastMode = 1;

            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
            cu.setPUInterDir(2, pu.puAbsPartIdx, puIdx);
            cu.setPUMv(1, bestME[1].mv, pu.puAbsPartIdx, puIdx);
            cu.setPURefIdx(1, bestME[1].ref, pu.puAbsPartIdx, puIdx);
            cu.m_mvd[1][pu.puAbsPartIdx] = bestME[1].mv - bestME[1].mvp;
            cu.m_mvpIdx[1][pu.puAbsPartIdx] = bestME[1].mvpIdx;

            cu.setPURefIdx(0, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);
            cu.setPUMv(0, mvzero, pu.puAbsPartIdx, puIdx);

            totalmebits += bestME[1].bits;
        }

        /* produce the final prediction of this PU with the motion just stored in the CU */
        motionCompensation(cu, pu, *predYuv, true, bChromaMC);
    }

    X265_CHECK(interMode.ok(), "inter mode is not ok");
    interMode.sa8dBits += totalmebits;
}
/* Fill blockBit[0..2] with the bit estimates used for signalling list 0,
 * list 1 and bi-directional prediction of one partition.
 *
 * cuMode   - partition shape of the CU
 * bPSlice  - true for P slices
 * partIdx  - index of the partition within the CU (first table index)
 * lastMode - choice made for the previous partition (second table index)
 *
 * 2Nx2N and NxN use fixed values. The horizontally and vertically split
 * shapes use fixed values for P slices and a lookup table for B slices. An
 * unknown cuMode trips X265_CHECK and leaves blockBit untouched. */
void Search::getBlkBits(PartSize cuMode, bool bPSlice, int partIdx, uint32_t lastMode, uint32_t blockBit[3])
{
    /* B-slice tables, indexed [partIdx][lastMode][L0 / L1 / bidir] */
    static const uint32_t horzSplitBits[2][3][3] =
    {
        { { 0, 0, 3 }, { 0, 0, 0 }, { 0, 0, 0 } },
        { { 5, 7, 7 }, { 7, 5, 7 }, { 9 - 3, 9 - 3, 9 - 3 } }
    };
    static const uint32_t vertSplitBits[2][3][3] =
    {
        { { 0, 2, 3 }, { 0, 0, 0 }, { 0, 0, 0 } },
        { { 5, 7, 7 }, { 7 - 2, 7 - 2, 9 - 2 }, { 9 - 3, 9 - 3, 9 - 3 } }
    };

    const uint32_t (*table)[3][3] = NULL;

    switch (cuMode)
    {
    case SIZE_2Nx2N:
    case SIZE_NxN:
        blockBit[0] = bPSlice ? 1 : 3;
        blockBit[1] = 3;
        blockBit[2] = 5;
        return;

    case SIZE_2NxN:
    case SIZE_2NxnU:
    case SIZE_2NxnD:
        table = horzSplitBits;
        break;

    case SIZE_Nx2N:
    case SIZE_nLx2N:
    case SIZE_nRx2N:
        table = vertSplitBits;
        break;

    default:
        X265_CHECK(0, "getBlkBits: unknown cuMode\n");
        return;
    }

    if (bPSlice)
    {
        blockBit[0] = 3;
        blockBit[1] = 0;
        blockBit[2] = 0;
        return;
    }

    const uint32_t* row = table[partIdx][lastMode];
    for (int i = 0; i < 3; i++)
        blockBit[i] = row[i];
}
/* Check whether the other AMVP candidate would code 'mv' in fewer bits than
 * the currently selected one.
 *
 * When it does, mvpIdx is flipped, outBits is reduced by the saving and
 * outCost is re-priced for the new bit count. Otherwise nothing is modified.
 *
 * Returns a reference to the candidate selected on exit. */
const MV& Search::checkBestMVP(const MV* amvpCand, const MV& mv, int& mvpIdx, uint32_t& outBits, uint32_t& outCost) const
{
    const int altIdx = !mvpIdx;
    const int diffBits = m_me.bitcost(mv, amvpCand[altIdx]) - m_me.bitcost(mv, amvpCand[mvpIdx]);

    /* the current predictor is at least as cheap: keep it */
    if (diffBits >= 0)
        return amvpCand[mvpIdx];

    const uint32_t prevBits = outBits;
    mvpIdx = altIdx;
    outBits = prevBits + diffBits;

    /* swap the cost of the old bit count for the cost of the new one */
    outCost = (outCost - m_rdCost.getCost(prevBits)) + m_rdCost.getCost(outBits);

    return amvpCand[mvpIdx];
}
/* Compute the motion search window [mvmin, mvmax] around predictor 'mvp'.
 *
 * The window is mvp +/- (merange << 2), clipped by cu.clipMv() and to the
 * maximum signalled MV length, then shifted right by two. Finally both
 * vertical bounds are capped at m_refLagPixels for frame parallelism. */
void Search::setSearchRange(const CUData& cu, const MV& mvp, int merange, MV& mvmin, MV& mvmax) const
{
    const int reach = (int16_t)merange << 2;
    MV dist(reach, reach);

    mvmin = mvp - dist;
    mvmax = mvp + dist;

    cu.clipMv(mvmin);
    cu.clipMv(mvmax);

    /* Clip search range to signaled maximum MV length.
     * We do not support this VUI field being changed from the default */
    const int maxMvLen = (1 << 15) - 1;
    const int minMvLen = -maxMvLen;

    mvmin.x = X265_MAX(mvmin.x, minMvLen);
    mvmin.y = X265_MAX(mvmin.y, minMvLen);
    mvmax.x = X265_MIN(mvmax.x, maxMvLen);
    mvmax.y = X265_MIN(mvmax.y, maxMvLen);

    mvmin >>= 2;
    mvmax >>= 2;

    /* conditional clipping for frame parallelism */
    const int16_t lagLimit = (int16_t)m_refLagPixels;
    mvmin.y = X265_MIN(mvmin.y, lagLimit);
    mvmax.y = X265_MIN(mvmax.y, lagLimit);
}
/* Price an inter CU as SKIP: no residual, reconstruction equals prediction.
 *
 * Marks the CU as MODE_SKIP with cleared cbf, copies the prediction into the
 * reconstruction, measures SSE distortion against the source, counts the bits
 * of the skip flag and merge index (plus the transquant bypass flag when the
 * PPS enables it) and updates the mode cost. The entropy coder state after
 * coding is stored in interMode.contexts.
 *
 * Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */
void Search::encodeResAndCalcRdSkipCU(Mode& interMode)
{
    CUData& cu = interMode.cu;
    const Yuv& fenc = *interMode.fencYuv;
    Yuv& pred = interMode.predYuv;
    Yuv& recon = interMode.reconYuv;

    X265_CHECK(!cu.isIntra(0), "intra CU not expected\n");

    const uint32_t depth = cu.m_cuDepth[0];

    // No residual coding : SKIP mode
    cu.setPredModeSubParts(MODE_SKIP);
    cu.clearCbf();
    cu.setTUDepthSubParts(0, 0, depth);

    recon.copyFromYuv(pred);

    const int part = partitionFromLog2Size(cu.m_log2CUSize[0]);

    // Luma
    interMode.lumaDistortion = primitives.cu[part].sse_pp(fenc.m_buf[0], fenc.m_size, recon.m_buf[0], recon.m_size);

    // Chroma (absent for 4:0:0)
    interMode.chromaDistortion = 0;
    if (m_csp != X265_CSP_I400)
    {
        interMode.chromaDistortion = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[part].sse_pp(fenc.m_buf[1], fenc.m_csize, recon.m_buf[1], recon.m_csize));
        interMode.chromaDistortion += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[part].sse_pp(fenc.m_buf[2], fenc.m_csize, recon.m_buf[2], recon.m_csize));
    }
    interMode.distortion = interMode.lumaDistortion + interMode.chromaDistortion;

    /* count the signalling bits starting from the depth's saved entropy state */
    m_entropyCoder.load(m_rqt[depth].cur);
    m_entropyCoder.resetBits();
    if (m_slice->m_pps->bTransquantBypassEnabled)
        m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
    m_entropyCoder.codeSkipFlag(cu, 0);
    m_entropyCoder.codeMergeIndex(cu, 0);

    const uint32_t skipBits = m_entropyCoder.getNumberOfWrittenBits();
    interMode.mvBits = skipBits;
    interMode.coeffBits = 0;
    interMode.totalBits = interMode.mvBits;

    if (m_rdCost.m_psyRd)
        interMode.psyEnergy = m_rdCost.psyCost(part, fenc.m_buf[0], fenc.m_size, recon.m_buf[0], recon.m_size);
    interMode.resEnergy = primitives.cu[part].sse_pp(fenc.m_buf[0], fenc.m_size, pred.m_buf[0], pred.m_size);

    updateModeCost(interMode);
    m_entropyCoder.store(interMode.contexts);
}
/* Encode the residual of an inter CU and calculate its rate-distortion cost.
 *
 * interMode - mode whose prediction (predYuv) is already filled in; its cu,
 *             reconYuv, contexts and RD statistics are written
 * cuGeom    - geometry of the CU
 *
 * Steps:
 *   1. subtract prediction from source and estimate the residual quadtree,
 *   2. unless the CU is transquant-bypassed, compare against the cost of
 *      signalling no residual at all and clear the cbf when that is cheaper,
 *   3. count the signalling bits (as skip when the CU is a 2Nx2N merge without
 *      residual, otherwise as a regular inter CU with coefficients),
 *   4. build the reconstruction and recompute distortion from it.
 *
 * Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */
void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom)
{
    ProfileCUScope(interMode.cu, interRDOElapsedTime[cuGeom.depth], countInterRDO[cuGeom.depth]);

    CUData& cu = interMode.cu;
    Yuv* reconYuv = &interMode.reconYuv;
    Yuv* predYuv = &interMode.predYuv;
    uint32_t depth = cuGeom.depth;
    ShortYuv* resiYuv = &m_rqt[depth].tmpResiYuv;
    const Yuv* fencYuv = interMode.fencYuv;

    X265_CHECK(!cu.isIntra(0), "intra CU not expected\n");

    uint32_t log2CUSize = cuGeom.log2CUSize;
    int sizeIdx = log2CUSize - 2;

    resiYuv->subtract(*fencYuv, *predYuv, log2CUSize);

    uint32_t tuDepthRange[2];
    cu.getInterTUQtDepthRange(tuDepthRange, 0);

    m_entropyCoder.load(m_rqt[depth].cur);

    Cost costs;
    estimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange);

    uint32_t tqBypass = cu.m_tqBypass[0];
    if (!tqBypass)
    {
        /* distortion of using the prediction as-is (no residual). Each plane is
         * read with the stride of the buffer it belongs to. */
        sse_ret_t cbf0Dist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
        if (m_csp != X265_CSP_I400)
        {
            cbf0Dist += m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize));
            cbf0Dist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize));
        }

        /* Consider the RD cost of not signaling any residual */
        m_entropyCoder.load(m_rqt[depth].cur);
        m_entropyCoder.resetBits();
        m_entropyCoder.codeQtRootCbfZero();
        uint32_t cbf0Bits = m_entropyCoder.getNumberOfWrittenBits();

        uint64_t cbf0Cost;
        if (m_rdCost.m_psyRd)
        {
            uint32_t cbf0Energy = m_rdCost.psyCost(sizeIdx, fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
            cbf0Cost = m_rdCost.calcPsyRdCost(cbf0Dist, cbf0Bits, cbf0Energy);
        }
        else
            cbf0Cost = m_rdCost.calcRdCost(cbf0Dist, cbf0Bits);

        /* dropping the residual is cheaper than the estimated quadtree */
        if (cbf0Cost < costs.rdcost)
        {
            cu.clearCbf();
            cu.setTUDepthSubParts(0, 0, depth);
        }
    }

    if (cu.getQtRootCbf(0))
        saveResidualQTData(cu, *resiYuv, 0, 0);

    /* calculate signal bits for inter/merge/skip coded CU */
    m_entropyCoder.load(m_rqt[depth].cur);
    m_entropyCoder.resetBits();
    if (m_slice->m_pps->bTransquantBypassEnabled)
        m_entropyCoder.codeCUTransquantBypassFlag(tqBypass);

    uint32_t coeffBits, bits;
    if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
    {
        cu.setPredModeSubParts(MODE_SKIP);

        /* Merge/Skip */
        m_entropyCoder.codeSkipFlag(cu, 0);
        m_entropyCoder.codeMergeIndex(cu, 0);
        coeffBits = 0;
        bits = m_entropyCoder.getNumberOfWrittenBits();
    }
    else
    {
        m_entropyCoder.codeSkipFlag(cu, 0);
        m_entropyCoder.codePredMode(cu.m_predMode[0]);
        m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
        m_entropyCoder.codePredInfo(cu, 0);
        uint32_t mvBits = m_entropyCoder.getNumberOfWrittenBits();

        bool bCodeDQP = m_slice->m_pps->bUseDQP;
        m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
        bits = m_entropyCoder.getNumberOfWrittenBits();

        /* everything written after the prediction info is counted as coefficient bits */
        coeffBits = bits - mvBits;
    }

    m_entropyCoder.store(interMode.contexts);

    if (cu.getQtRootCbf(0))
        reconYuv->addClip(*predYuv, *resiYuv, log2CUSize);
    else
        reconYuv->copyFromYuv(*predYuv);

    // update with clipped distortion and cost (qp estimation loop uses unclipped values)
    sse_ret_t bestLumaDist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
    sse_ret_t bestChromaDist;
    if (m_csp != X265_CSP_I400)
    {
        bestChromaDist = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
        bestChromaDist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
    }
    else
    {
        bestChromaDist = 0;
    }

    if (m_rdCost.m_psyRd)
        interMode.psyEnergy = m_rdCost.psyCost(sizeIdx, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
    interMode.resEnergy = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
    interMode.totalBits = bits;
    interMode.lumaDistortion = bestLumaDist;
    interMode.chromaDistortion = bestChromaDist;
    interMode.distortion = bestLumaDist + bestChromaDist;
    interMode.coeffBits = coeffBits;
    interMode.mvBits = bits - coeffBits;

    updateModeCost(interMode);
    checkDQP(interMode, cuGeom);
}
/* Recursively transform and quantize the inter residual of a CU over its TU
 * quadtree, writing coefficients, reconstructed residual and cbf flags.
 *
 * mode       - mode being coded; mode.cu receives TU depth, transform-skip
 *              and cbf flags and the coefficients (cu.m_trCoeff)
 * cuGeom     - geometry of the CU
 * absPartIdx - partition index of the current TU within the CU
 * tuDepth    - depth of the current TU below the CU
 * depthRange - depthRange[0] / depthRange[1] are the smallest / largest
 *              log2 TU sizes allowed
 *
 * A TU is coded at its full size when log2TrSize <= depthRange[1], except that
 * a non-2Nx2N CU is always split at tuDepth 0 while splitting is still allowed.
 * Otherwise the TU is split in four and the children's cbf bits are OR-ed into
 * this depth's cbf bit.
 *
 * NOTE(review): unlike estimateResidualQT() in this file, chroma coding is not
 * disabled here for X265_CSP_I400 - confirm this path is never used for 4:0:0. */
void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2])
{
uint32_t depth = cuGeom.depth + tuDepth;
CUData& cu = mode.cu;
uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
bool bCheckFull = log2TrSize <= depthRange[1];
/* a CU with more than one PU must split at the root while a split is still permitted */
if (cu.m_partSize[0] != SIZE_2Nx2N && !tuDepth && log2TrSize > depthRange[0])
bCheckFull = false;
if (bCheckFull)
{
// code full block
uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
bool bCodeChroma = true;
uint32_t tuDepthC = tuDepth;
if (log2TrSizeC < 2)
{
/* chroma block would be smaller than 4x4: code chroma once, at log2 size 2 and
 * the parent TU depth, only for the first of the four luma blocks */
X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
log2TrSizeC = 2;
tuDepthC--;
bCodeChroma = !(absPartIdx & 3);
}
uint32_t absPartIdxStep = cuGeom.numPartitions >> tuDepthC * 2;
/* cbf bit belonging to this TU depth */
uint32_t setCbf = 1 << tuDepth;
uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
coeff_t* coeffCurY = cu.m_trCoeff[0] + coeffOffsetY;
uint32_t sizeIdx = log2TrSize - 2;
cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
const Yuv* fencYuv = mode.fencYuv;
int16_t* curResiY = resiYuv.getLumaAddr(absPartIdx);
uint32_t strideResiY = resiYuv.m_size;
const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
/* luma: the value returned by transformNxN is used as the significant-coefficient count */
uint32_t numSigY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);
if (numSigY)
{
/* some coefficients survived: rebuild the residual from them and flag the TU as coded */
m_quant.invtransformNxN(cu, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSigY);
cu.setCbfSubParts(setCbf, TEXT_LUMA, absPartIdx, depth);
}
else
{
/* nothing survived quantization: zero the residual block and clear the cbf */
primitives.cu[sizeIdx].blockfill_s(curResiY, strideResiY, 0);
cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, depth);
}
if (bCodeChroma)
{
uint32_t sizeIdxC = log2TrSizeC - 2;
uint32_t strideResiC = resiYuv.m_csize;
uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
coeff_t* coeffCurU = cu.m_trCoeff[1] + coeffOffsetC;
coeff_t* coeffCurV = cu.m_trCoeff[2] + coeffOffsetC;
/* 4:2:2 chroma is coded as two vertically stacked sub-TUs */
bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
do
{
uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
/* coefficient offset of the current sub-TU within the chroma TU */
uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
cu.setTransformSkipPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
cu.setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
/* Cb: same coded / not-coded handling as luma */
int16_t* curResiU = resiYuv.getCbAddr(absPartIdxC);
const pixel* fencCb = fencYuv->getCbAddr(absPartIdxC);
uint32_t numSigU = m_quant.transformNxN(cu, fencCb, fencYuv->m_csize, curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, absPartIdxC, false);
if (numSigU)
{
m_quant.invtransformNxN(cu, curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, false, false, numSigU);
cu.setCbfPartRange(setCbf, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
}
else
{
primitives.cu[sizeIdxC].blockfill_s(curResiU, strideResiC, 0);
cu.setCbfPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
}
/* Cr: same coded / not-coded handling as luma */
int16_t* curResiV = resiYuv.getCrAddr(absPartIdxC);
const pixel* fencCr = fencYuv->getCrAddr(absPartIdxC);
uint32_t numSigV = m_quant.transformNxN(cu, fencCr, fencYuv->m_csize, curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, absPartIdxC, false);
if (numSigV)
{
m_quant.invtransformNxN(cu, curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, false, false, numSigV);
cu.setCbfPartRange(setCbf, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
}
else
{
primitives.cu[sizeIdxC].blockfill_s(curResiV, strideResiC, 0);
cu.setCbfPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
}
}
while (tuIterator.isNextSection());
if (splitIntoSubTUs)
{
/* post-process the per-sub-TU chroma cbf flags set in the loop above */
offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
}
}
}
else
{
X265_CHECK(log2TrSize > depthRange[0], "residualTransformQuantInter recursion check failure\n");
/* number of partitions covered by each of the four child TUs */
uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
{
residualTransformQuantInter(mode, cuGeom, qPartIdx, tuDepth + 1, depthRange);
/* remember whether any child has coded coefficients, per component */
ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
}
/* propagate the combined child flags into this depth's cbf bit for every covered partition */
for (uint32_t i = 0; i < 4 * qNumParts; ++i)
{
cu.m_cbf[0][absPartIdx + i] |= ycbf << tuDepth;
cu.m_cbf[1][absPartIdx + i] |= ucbf << tuDepth;
cu.m_cbf[2][absPartIdx + i] |= vcbf << tuDepth;
}
}
}
/* Cost of coding a TU component with a zero cbf: the given distortion plus the
 * estimated bits of a cbf flag of 0 for 'compId' at 'tuDepth'. The psy-RD
 * formula, which also takes psyEnergy, is used when psy-RD is enabled. */
uint64_t Search::estimateNullCbfCost(uint32_t &dist, uint32_t &psyEnergy, uint32_t tuDepth, TextType compId)
{
    const uint32_t zeroCbfBits = m_entropyCoder.estimateCbfBits(0, compId, tuDepth);

    if (!m_rdCost.m_psyRd)
        return m_rdCost.calcRdCost(dist, zeroCbfBits);

    return m_rdCost.calcPsyRdCost(dist, zeroCbfBits, psyEnergy);
}
/* Estimate the rate-distortion cost of coding the residual of one transform unit (TU)
 * and recursively decide whether the TU is coded as a single block ("full") or split
 * into four sub-TUs ("split"). The cost of the winning choice is ADDED to outCosts.
 *
 * mode       - supplies the CU being analysed (mode.cu) and the source pixels (mode.fencYuv)
 * cuGeom     - geometry of the CU; depth, log2CUSize and numPartitions are read
 * absPartIdx - partition index of this TU inside the CU
 * tuDepth    - transform depth of this TU relative to the CU (0 = TU covers the CU)
 * resiYuv    - residual that is transformed and quantized
 * outCosts   - accumulator; distortion, rdcost, bits and energy are incremented
 * depthRange - depthRange[0]: a split is tried when log2TrSize is larger than this;
 *              depthRange[1]: the full block is tried when log2TrSize is not larger than this
 *
 * Side effects: updates the CU's TU depth, CBF and transform-skip flags for the covered
 * partitions, writes coefficients and reconstructed residual into m_rqt[qtLayer], and
 * saves/restores the entropy coder through m_rqt[depth].rqtRoot / rqtTest. */
void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& outCosts, const uint32_t depthRange[2])
{
CUData& cu = mode.cu;
// combined depth (CU depth + TU depth); indexes the m_rqt[] entropy snapshots and is
// passed to the cu.set*SubParts() calls below
uint32_t depth = cuGeom.depth + tuDepth;
uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
// a split is tried while the TU is larger than depthRange[0]; the full block is tried
// when the TU is no larger than depthRange[1]
bool bCheckSplit = log2TrSize > depthRange[0];
bool bCheckFull = log2TrSize <= depthRange[1];
// both alternatives are possible, so a subdivision flag has to be accounted for
bool bSplitPresentFlag = bCheckSplit && bCheckFull;
// for partition sizes other than 2Nx2N the full block is not tried at TU depth 0
// whenever a split is allowed
if (cu.m_partSize[0] != SIZE_2Nx2N && !tuDepth && bCheckSplit)
bCheckFull = false;
X265_CHECK(bCheckFull || bCheckSplit, "check-full or check-split must be set\n");
// chroma TU size. When it would drop below log2 size 2, chroma is coded at log2 size 2,
// one TU depth higher, and only by the sub-TU whose absPartIdx has its two low bits clear.
// No chroma is coded at all for X265_CSP_I400.
uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
bool bCodeChroma;
uint32_t tuDepthC = tuDepth;
if (m_csp != X265_CSP_I400) {
bCodeChroma = true;
if (log2TrSizeC < 2)
{
X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
log2TrSizeC = 2;
tuDepthC--;
bCodeChroma = !(absPartIdx & 3);
}
} else {
bCodeChroma = false;
}
// code full block
// fullCost.rdcost starts at MAX_INT64 so that a split always wins the comparison
// further down when the full block is not evaluated
Cost fullCost;
fullCost.rdcost = MAX_INT64;
// per-component, per-sub-TU results of the full-block trial
uint8_t cbfFlag[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
uint32_t numSig[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
uint32_t singleBits[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
uint32_t singleDist[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
uint32_t singlePsyEnergy[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
uint32_t bestTransformMode[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
uint64_t minCost[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { MAX_INT64, MAX_INT64 }, {MAX_INT64, MAX_INT64}, {MAX_INT64, MAX_INT64} };
// save the entropy coder state on entry; it is reloaded before each later trial
m_entropyCoder.store(m_rqt[depth].rqtRoot);
uint32_t trSize = 1 << log2TrSize;
// for 4:2:2 each chroma TU is processed as two vertically stacked sub-TUs
const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
// partition count handed to TURecurse as the span of the chroma TU (uses the chroma depth)
uint32_t absPartIdxStep = cuGeom.numPartitions >> tuDepthC * 2;
const Yuv* fencYuv = mode.fencYuv;
// code full block
if (bCheckFull)
{
uint32_t trSizeC = 1 << log2TrSizeC;
int partSize = partitionFromLog2Size(log2TrSize);
int partSizeC = partitionFromLog2Size(log2TrSizeC);
// m_rqt[] layer holding coefficients and residual for this TU size
const uint32_t qtLayer = log2TrSize - 2;
uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
coeff_t* coeffCurY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
// transform skip is only tried when enabled in the PPS, the CU is not in
// transquant-bypass mode, and the block is no larger than MAX_LOG2_TS_SIZE
bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0];
bool checkTransformSkipY = checkTransformSkip && log2TrSize <= MAX_LOG2_TS_SIZE;
bool checkTransformSkipC = checkTransformSkip && log2TrSizeC <= MAX_LOG2_TS_SIZE;
cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
if (m_bEnableRDOQ)
m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
// ---- luma, regular transform ----
const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
int16_t* resi = resiYuv.getLumaAddr(absPartIdx);
// the return value is stored as the count of significant coefficients; the final
// argument is false here and true in the transform-skip trial below
numSig[TEXT_LUMA][0] = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);
cbfFlag[TEXT_LUMA][0] = !!numSig[TEXT_LUMA][0];
m_entropyCoder.resetBits();
// the second condition is already implied by bSplitPresentFlag
if (bSplitPresentFlag && log2TrSize > depthRange[0])
m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
// temporarily holds the subdiv-flag bits so they can be subtracted from the
// coefficient bit count below; fullCost.bits is recomputed at the end of this branch
fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();
// Coding luma cbf flag has been removed from here. The context for cbf flag is different for each depth.
// So it is valid if we encode coefficients and then cbfs at least for analysis.
// m_entropyCoder.codeQtCbfLuma(cbfFlag[TEXT_LUMA][0], tuDepth);
if (cbfFlag[TEXT_LUMA][0])
m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
// running bit counter; the chroma loop derives its per-block bits as deltas from it
uint32_t singleBitsPrev = m_entropyCoder.getNumberOfWrittenBits();
singleBits[TEXT_LUMA][0] = singleBitsPrev - fullCost.bits;
X265_CHECK(log2TrSize <= 5, "log2TrSize is too large\n");
// distortion and psy energy used when the block is coded with no coefficients
// (measured on the residual alone)
uint32_t distY = primitives.cu[partSize].ssd_s(resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size);
uint32_t psyEnergyY = 0;
if (m_rdCost.m_psyRd)
psyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, (int16_t*)zeroShort, 0);
int16_t* curResiY = m_rqt[qtLayer].resiQtYuv.getLumaAddr(absPartIdx);
uint32_t strideResiY = m_rqt[qtLayer].resiQtYuv.m_size;
if (cbfFlag[TEXT_LUMA][0])
{
m_quant.invtransformNxN(cu, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSig[TEXT_LUMA][0]); //this is for inter mode only
// non-zero cost calculation for luma - This is an approximation
// finally we have to encode correct cbf after comparing with null cost
const uint32_t nonZeroDistY = primitives.cu[partSize].sse_ss(resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY);
uint32_t nzCbfBitsY = m_entropyCoder.estimateCbfBits(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
uint32_t nonZeroPsyEnergyY = 0; uint64_t singleCostY = 0;
if (m_rdCost.m_psyRd)
{
nonZeroPsyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY);
singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0], nonZeroPsyEnergyY);
}
else
singleCostY = m_rdCost.calcRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0]);
// with transquant bypass the coded coefficients are always kept (no zero-CBF comparison)
if (cu.m_tqBypass[0])
{
singleDist[TEXT_LUMA][0] = nonZeroDistY;
singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY;
}
else
{
// zero-cost calculation for luma. This is an approximation
// Initial cost calculation was also an approximation. First resetting the bit counter and then encoding zero cbf.
// Now encoding the zero cbf without writing into bitstream, keeping m_fracBits unchanged. The same is valid for chroma.
uint64_t nullCostY = estimateNullCbfCost(distY, psyEnergyY, tuDepth, TEXT_LUMA);
if (nullCostY < singleCostY)
{
// dropping every coefficient is cheaper: clear the CBF and zero the reconstructed residual
cbfFlag[TEXT_LUMA][0] = 0;
singleBits[TEXT_LUMA][0] = 0;
primitives.cu[partSize].blockfill_s(curResiY, strideResiY, 0);
#if CHECKED_BUILD || _DEBUG
uint32_t numCoeffY = 1 << (log2TrSize << 1);
memset(coeffCurY, 0, sizeof(coeff_t) * numCoeffY);
#endif
if (checkTransformSkipY)
minCost[TEXT_LUMA][0] = nullCostY;
singleDist[TEXT_LUMA][0] = distY;
singlePsyEnergy[TEXT_LUMA][0] = psyEnergyY;
}
else
{
if (checkTransformSkipY)
minCost[TEXT_LUMA][0] = singleCostY;
singleDist[TEXT_LUMA][0] = nonZeroDistY;
singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY;
}
}
}
else
{
// quantization produced no coefficients: the block is coded with a zero CBF
if (checkTransformSkipY)
minCost[TEXT_LUMA][0] = estimateNullCbfCost(distY, psyEnergyY, tuDepth, TEXT_LUMA);
primitives.cu[partSize].blockfill_s(curResiY, strideResiY, 0);
singleDist[TEXT_LUMA][0] = distY;
singlePsyEnergy[TEXT_LUMA][0] = psyEnergyY;
}
cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
// ---- chroma (U then V), regular transform; same scheme as luma ----
if (bCodeChroma)
{
uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize;
for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
// NOTE(review): psyEnergyC is never assigned after this initialisation, so every
// chroma null-cost estimate below uses a psy energy of 0 - confirm this is intended
uint32_t distC = 0, psyEnergyC = 0;
coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
// iterates once, or over two sub-TUs when splitIntoSubTUs is set
TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
do
{
uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
// coefficient offset of the current sub-TU inside this chroma TU's coefficient area
uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
// the bit estimates are refreshed for U only (skipped when chromaId is V)
if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false);
cbfFlag[chromaId][tuIterator.section] = !!numSig[chromaId][tuIterator.section];
if (cbfFlag[chromaId][tuIterator.section])
m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTUOffset, absPartIdxC, log2TrSizeC, (TextType)chromaId);
// bits of this block = growth of the bit counter since the previous block
uint32_t newBits = m_entropyCoder.getNumberOfWrittenBits();
singleBits[chromaId][tuIterator.section] = newBits - singleBitsPrev;
singleBitsPrev = newBits;
int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
// distortion when the block is coded with no coefficients, passed through scaleChromaDist()
distC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[log2TrSizeC - 2].ssd_s(resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize));
if (cbfFlag[chromaId][tuIterator.section])
{
m_quant.invtransformNxN(cu, curResiC, strideResiC, coeffCurC + subTUOffset,
log2TrSizeC, (TextType)chromaId, false, false, numSig[chromaId][tuIterator.section]);
// non-zero cost calculation for chroma, same as luma - This is an approximation
// finally we have to encode correct cbf after comparing with null cost
uint32_t dist = primitives.cu[partSizeC].sse_ss(resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
uint32_t nzCbfBitsC = m_entropyCoder.estimateCbfBits(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth);
uint32_t nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, dist);
uint32_t nonZeroPsyEnergyC = 0; uint64_t singleCostC = 0;
if (m_rdCost.m_psyRd)
{
nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section], nonZeroPsyEnergyC);
}
else
singleCostC = m_rdCost.calcRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section]);
if (cu.m_tqBypass[0])
{
singleDist[chromaId][tuIterator.section] = nonZeroDistC;
singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
}
else
{
//zero-cost calculation for chroma. This is an approximation
// NOTE(review): this call passes tuDepth, while the null-cost call in the
// zero-coefficient branch below passes tuDepthC - confirm which is intended
uint64_t nullCostC = estimateNullCbfCost(distC, psyEnergyC, tuDepth, (TextType)chromaId);
if (nullCostC < singleCostC)
{
cbfFlag[chromaId][tuIterator.section] = 0;
singleBits[chromaId][tuIterator.section] = 0;
primitives.cu[partSizeC].blockfill_s(curResiC, strideResiC, 0);
#if CHECKED_BUILD || _DEBUG
uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
memset(coeffCurC + subTUOffset, 0, sizeof(coeff_t) * numCoeffC);
#endif
if (checkTransformSkipC)
minCost[chromaId][tuIterator.section] = nullCostC;
singleDist[chromaId][tuIterator.section] = distC;
singlePsyEnergy[chromaId][tuIterator.section] = psyEnergyC;
}
else
{
if (checkTransformSkipC)
minCost[chromaId][tuIterator.section] = singleCostC;
singleDist[chromaId][tuIterator.section] = nonZeroDistC;
singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
}
}
}
else
{
if (checkTransformSkipC)
minCost[chromaId][tuIterator.section] = estimateNullCbfCost(distC, psyEnergyC, tuDepthC, (TextType)chromaId);
primitives.cu[partSizeC].blockfill_s(curResiC, strideResiC, 0);
singleDist[chromaId][tuIterator.section] = distC;
singlePsyEnergy[chromaId][tuIterator.section] = psyEnergyC;
}
cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
}
while (tuIterator.isNextSection());
}
}
// ---- luma, transform-skip trial: kept only if it yields coefficients and its cost
// is not higher than minCost from the regular transform ----
if (checkTransformSkipY)
{
uint32_t nonZeroDistY = 0;
uint32_t nonZeroPsyEnergyY = 0;
uint64_t singleCostY = MAX_INT64;
m_entropyCoder.load(m_rqt[depth].rqtRoot);
cu.setTransformSkipSubParts(1, TEXT_LUMA, absPartIdx, depth);
if (m_bEnableRDOQ)
m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
fenc = fencYuv->getLumaAddr(absPartIdx);
resi = resiYuv.getLumaAddr(absPartIdx);
// results go to the scratch buffers m_tsCoeff / m_tsResidual and are copied into
// the m_rqt layer only if transform skip wins
uint32_t numSigTSkipY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, m_tsCoeff, log2TrSize, TEXT_LUMA, absPartIdx, true);
if (numSigTSkipY)
{
m_entropyCoder.resetBits();
m_entropyCoder.codeQtCbfLuma(!!numSigTSkipY, tuDepth);
m_entropyCoder.codeCoeffNxN(cu, m_tsCoeff, absPartIdx, log2TrSize, TEXT_LUMA);
const uint32_t skipSingleBitsY = m_entropyCoder.getNumberOfWrittenBits();
m_quant.invtransformNxN(cu, m_tsResidual, trSize, m_tsCoeff, log2TrSize, TEXT_LUMA, false, true, numSigTSkipY);
nonZeroDistY = primitives.cu[partSize].sse_ss(resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, m_tsResidual, trSize);
if (m_rdCost.m_psyRd)
{
nonZeroPsyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, m_tsResidual, trSize);
singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, skipSingleBitsY, nonZeroPsyEnergyY);
}
else
singleCostY = m_rdCost.calcRdCost(nonZeroDistY, skipSingleBitsY);
}
if (!numSigTSkipY || minCost[TEXT_LUMA][0] < singleCostY)
cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
else
{
// transform skip wins: adopt its distortion, coefficients and residual.
// singleBits[TEXT_LUMA][0] is left at the value from the regular-transform trial.
singleDist[TEXT_LUMA][0] = nonZeroDistY;
singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY;
cbfFlag[TEXT_LUMA][0] = !!numSigTSkipY;
bestTransformMode[TEXT_LUMA][0] = 1;
uint32_t numCoeffY = 1 << (log2TrSize << 1);
memcpy(coeffCurY, m_tsCoeff, sizeof(coeff_t) * numCoeffY);
primitives.cu[partSize].copy_ss(curResiY, strideResiY, m_tsResidual, trSize);
}
cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
}
// ---- chroma, transform-skip trial (same decision rule, per component and sub-TU) ----
if (bCodeChroma && checkTransformSkipC)
{
uint32_t nonZeroDistC = 0, nonZeroPsyEnergyC = 0;
uint64_t singleCostC = MAX_INT64;
uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize;
uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
m_entropyCoder.load(m_rqt[depth].rqtRoot);
for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
do
{
uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
cu.setTransformSkipPartRange(1, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
uint32_t numSigTSkipC = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, m_tsCoeff, log2TrSizeC, (TextType)chromaId, absPartIdxC, true);
m_entropyCoder.resetBits();
// the bit count from the regular-transform trial is discarded here, whether or
// not transform skip ends up being chosen
singleBits[chromaId][tuIterator.section] = 0;
// NOTE(review): singleCostC, nonZeroDistC and nonZeroPsyEnergyC are declared outside
// both loops and are not reset per block; they are only read when numSigTSkipC is
// non-zero, in which case they were just written
if (numSigTSkipC)
{
m_entropyCoder.codeQtCbfChroma(!!numSigTSkipC, tuDepth);
m_entropyCoder.codeCoeffNxN(cu, m_tsCoeff, absPartIdxC, log2TrSizeC, (TextType)chromaId);
singleBits[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits();
m_quant.invtransformNxN(cu, m_tsResidual, trSizeC, m_tsCoeff,
log2TrSizeC, (TextType)chromaId, false, true, numSigTSkipC);
uint32_t dist = primitives.cu[partSizeC].sse_ss(resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, m_tsResidual, trSizeC);
nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, dist);
if (m_rdCost.m_psyRd)
{
nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, m_tsResidual, trSizeC);
singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section], nonZeroPsyEnergyC);
}
else
singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section]);
}
if (!numSigTSkipC || minCost[chromaId][tuIterator.section] < singleCostC)
cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
else
{
singleDist[chromaId][tuIterator.section] = nonZeroDistC;
singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
cbfFlag[chromaId][tuIterator.section] = !!numSigTSkipC;
bestTransformMode[chromaId][tuIterator.section] = 1;
uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
memcpy(coeffCurC + subTUOffset, m_tsCoeff, sizeof(coeff_t) * numCoeffC);
primitives.cu[partSizeC].copy_ss(curResiC, strideResiC, m_tsResidual, trSizeC);
}
cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
}
while (tuIterator.isNextSection());
}
}
// Here we were encoding cbfs and coefficients, after calculating distortion above.
// Now I am encoding only cbfs, since I have encoded coefficients above. I have just collected
// bits required for coefficients and added with number of cbf bits. As I tested the order does not
// make any difference. But bit confused whether I should load the original context as below.
m_entropyCoder.load(m_rqt[depth].rqtRoot);
m_entropyCoder.resetBits();
//Encode cbf flags
if (bCodeChroma)
{
if (!splitIntoSubTUs)
{
m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][0], tuDepth);
m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][0], tuDepth);
}
else
{
offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][0], tuDepth);
m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][1], tuDepth);
m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][0], tuDepth);
m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][1], tuDepth);
}
}
m_entropyCoder.codeQtCbfLuma(cbfFlag[TEXT_LUMA][0], tuDepth);
uint32_t cbfBits = m_entropyCoder.getNumberOfWrittenBits();
// total coefficient bits over luma and both chroma components (both sub-TU slots;
// unused slots are still 0)
uint32_t coeffBits = 0;
coeffBits = singleBits[TEXT_LUMA][0];
for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
{
coeffBits += singleBits[TEXT_CHROMA_U][subTUIndex];
coeffBits += singleBits[TEXT_CHROMA_V][subTUIndex];
}
// In split mode, we need only coeffBits. The reason is encoding chroma cbfs is different from luma.
// In case of chroma, if any one of the split block's cbf is 1, then we need to encode cbf 1, and then for
// four split block's individual cbf value. This is not known before analysis of four split blocks.
// For that reason, I am collecting individual coefficient bits only.
fullCost.bits = bSplitPresentFlag ? cbfBits + coeffBits : coeffBits;
fullCost.distortion += singleDist[TEXT_LUMA][0];
fullCost.energy += singlePsyEnergy[TEXT_LUMA][0];// need to check we need to add chroma also
for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
{
fullCost.distortion += singleDist[TEXT_CHROMA_U][subTUIndex];
fullCost.distortion += singleDist[TEXT_CHROMA_V][subTUIndex];
}
if (m_rdCost.m_psyRd)
fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
else
fullCost.rdcost = m_rdCost.calcRdCost(fullCost.distortion, fullCost.bits);
}
// code sub-blocks
if (bCheckSplit)
{
if (bCheckFull)
{
// keep the entropy state reached by the full-block trial in rqtTest so it can be
// restored if the full block wins, then restart the split trial from the entry state
m_entropyCoder.store(m_rqt[depth].rqtTest);
m_entropyCoder.load(m_rqt[depth].rqtRoot);
}
Cost splitCost;
if (bSplitPresentFlag && (log2TrSize <= depthRange[1] && log2TrSize > depthRange[0]))
{
// Subdiv flag can be encoded at the start of analysis of split blocks.
m_entropyCoder.resetBits();
m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
splitCost.bits = m_entropyCoder.getNumberOfWrittenBits();
}
// partitions covered by each of the four sub-TUs
uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
// recurse into the four sub-TUs; each call adds its cost to splitCost, and the
// children's CBFs at depth tuDepth + 1 are OR-ed together per component
for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
{
estimateResidualQT(mode, cuGeom, qPartIdx, tuDepth + 1, resiYuv, splitCost, depthRange);
ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
}
// set this depth's CBF bit on every covered partition when any child has coefficients
for (uint32_t i = 0; i < 4 * qNumParts; ++i)
{
cu.m_cbf[0][absPartIdx + i] |= ycbf << tuDepth;
cu.m_cbf[1][absPartIdx + i] |= ucbf << tuDepth;
cu.m_cbf[2][absPartIdx + i] |= vcbf << tuDepth;
}
// Here we were encoding cbfs and coefficients for splitted blocks. Since I have collected coefficient bits
// for each individual blocks, only encoding cbf values. As I mentioned encoding chroma cbfs is different then luma.
// But have one doubt that if coefficients are encoded in context at depth 2 (for example) and cbfs are encoded in context
// at depth 0 (for example).
m_entropyCoder.load(m_rqt[depth].rqtRoot);
m_entropyCoder.resetBits();
codeInterSubdivCbfQT(cu, absPartIdx, tuDepth, depthRange);
uint32_t splitCbfBits = m_entropyCoder.getNumberOfWrittenBits();
splitCost.bits += splitCbfBits;
if (m_rdCost.m_psyRd)
splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
else
splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);
// the split can only be chosen when it produced some coefficient or when there is
// no full-block alternative
if (ycbf || ucbf || vcbf || !bCheckFull)
{
if (splitCost.rdcost < fullCost.rdcost)
{
// split wins: the CU already holds the state written by the sub-TU calls
outCosts.distortion += splitCost.distortion;
outCosts.rdcost += splitCost.rdcost;
outCosts.bits += splitCost.bits;
outCosts.energy += splitCost.energy;
return;
}
else
// NOTE(review): the split lost, yet its energy is added to outCosts here and
// fullCost.energy is added again at the end of the function - confirm intended
outCosts.energy += splitCost.energy;
}
// full block wins: restore the transform-skip decisions of the full-block trial,
// which the sub-TU recursion has overwritten
cu.setTransformSkipSubParts(bestTransformMode[TEXT_LUMA][0], TEXT_LUMA, absPartIdx, depth);
if (bCodeChroma)
{
if (!splitIntoSubTUs)
{
cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx, depth);
cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx, depth);
}
else
{
uint32_t tuNumParts = absPartIdxStep >> 1;
cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx , tuNumParts);
cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][1], TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts);
cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx , tuNumParts);
cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][1], TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts);
}
}
X265_CHECK(bCheckFull, "check-full must be set\n");
// restore the entropy coder state saved after the full-block trial
m_entropyCoder.load(m_rqt[depth].rqtTest);
}
// full block is the result: write its TU depth and CBFs back to the CU and add its cost
cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
if (bCodeChroma)
{
if (!splitIntoSubTUs)
{
cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx, depth);
cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx, depth);
}
else
{
// 4:2:2: each half of the chroma TU gets the CBF of its own sub-TU, after
// offsetCBFs() has adjusted the two flags
uint32_t tuNumParts = absPartIdxStep >> 1;
offsetCBFs(cbfFlag[TEXT_CHROMA_U]);
offsetCBFs(cbfFlag[TEXT_CHROMA_V]);
cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx , tuNumParts);
cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][1] << tuDepth, TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts);
cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx , tuNumParts);
cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][1] << tuDepth, TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts);
}
}
outCosts.distortion += fullCost.distortion;
outCosts.rdcost += fullCost.rdcost;
outCosts.bits += fullCost.bits;
outCosts.energy += fullCost.energy;
}
/* Walk the transform quadtree of an inter CU from the node at (absPartIdx, tuDepth)
 * and pass its CBF flags to the entropy coder: chroma CBFs at every node whose chroma
 * block is at least 4x4, the luma CBF at leaf nodes only.
 * depthRange is only forwarded to the recursive calls. */
void Search::codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t tuDepth, const uint32_t depthRange[2])
{
    X265_CHECK(cu.isInter(absPartIdx), "codeInterSubdivCbfQT() with intra block\n");

    const uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
    const bool isLeaf = !(tuDepth < cu.m_tuDepth[absPartIdx]);

    if (log2TrSize - m_hChromaShift < 2)
    {
        /* chroma is too small to be coded at this depth; its CBF must mirror the parent's */
        X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1), "chroma CBF not matching\n");
        X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1), "chroma CBF not matching\n");
    }
    else
    {
        /* a chroma CBF is coded at the root, or below a parent whose CBF for that
         * component is set */
        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
        {
            if (!tuDepth || cu.getCbf(absPartIdx, (TextType)chromaId, tuDepth - 1))
                m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, (TextType)chromaId, tuDepth, isLeaf);
        }
    }

    if (isLeaf)
    {
        m_entropyCoder.codeQtCbfLuma(cu, absPartIdx, tuDepth);
        return;
    }

    /* interior node: descend into the four quadrants */
    const uint32_t childParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
    uint32_t childIdx = absPartIdx;
    for (uint32_t quadrant = 0; quadrant < 4; ++quadrant)
    {
        codeInterSubdivCbfQT(cu, childIdx, tuDepth + 1, depthRange);
        childIdx += childParts;
    }
}
/* Copy the residual and coefficients chosen for each leaf TU out of the per-size
 * m_rqt[] work buffers: residual samples go into resiYuv, coefficients into
 * cu.m_trCoeff[]. Recurses through the quadtree described by cu.m_tuDepth[]. */
void Search::saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t tuDepth)
{
    const uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;

    if (tuDepth < cu.m_tuDepth[absPartIdx])
    {
        /* interior node: handle the four quadrants one level down */
        const uint32_t childParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
        uint32_t childIdx = absPartIdx;
        for (uint32_t quadrant = 0; quadrant < 4; ++quadrant)
        {
            saveResidualQTData(cu, resiYuv, childIdx, tuDepth + 1);
            childIdx += childParts;
        }
        return;
    }

    /* leaf: m_rqt[] is indexed by TU size (log2 size 2 maps to layer 0) */
    const uint32_t qtLayer = log2TrSize - 2;

    /* chroma is absent for 4:0:0; when the chroma block would be smaller than 4x4 it
     * is stored as a 4x4 block by the sub-TU whose index has its two low bits clear */
    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
    bool bCodeChroma = (m_csp != X265_CSP_I400);
    if (bCodeChroma && log2TrSizeC < 2)
    {
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
        log2TrSizeC = 2;
        bCodeChroma = !(absPartIdx & 3);
    }

    /* luma residual and coefficients */
    m_rqt[qtLayer].resiQtYuv.copyPartToPartLuma(resiYuv, absPartIdx, log2TrSize);

    const uint32_t coeffOffsetY = absPartIdx << LOG2_UNIT_SIZE * 2;
    const uint32_t numCoeffY = 1 << (log2TrSize * 2);
    memcpy(cu.m_trCoeff[0] + coeffOffsetY, m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY, sizeof(coeff_t) * numCoeffY);

    if (!bCodeChroma)
        return;

    /* chroma residual and coefficients; 4:2:2 carries twice as many coefficients */
    m_rqt[qtLayer].resiQtYuv.copyPartToPartChroma(resiYuv, absPartIdx, log2TrSizeC + m_hChromaShift);

    const uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
    const uint32_t numCoeffC = 1 << (log2TrSizeC * 2 + (m_csp == X265_CSP_I422));
    for (int plane = 1; plane <= 2; plane++)
    {
        memcpy(cu.m_trCoeff[plane] + coeffOffsetC, m_rqt[qtLayer].coeffRQT[plane] + coeffOffsetC, sizeof(coeff_t) * numCoeffC);
    }
}
/* Queries the three most probable luma intra modes for the block at absPartIdx
 * (written to mpmModes) and stores in mpms a bitmap with one bit set per such mode.
 * Returns the bit cost of signalling a mode that is not one of the most probable modes. */
uint32_t Search::getIntraRemModeBits(CUData& cu, uint32_t absPartIdx, uint32_t mpmModes[3], uint64_t& mpms) const
{
    cu.getIntraDirLumaPredictor(absPartIdx, mpmModes);

    const uint64_t bit = 1;
    mpms = (bit << mpmModes[0]) | (bit << mpmModes[1]) | (bit << mpmModes[2]);

    return m_entropyCoder.bitsIntraModeNonMPM();
}
/* Maintain a top-N list of candidates: locate the most expensive entry among the
 * first maxCandCount slots and, if the new (mode, cost) pair is strictly cheaper,
 * overwrite that slot with it. Among equally expensive entries the first one is
 * the one replaced. */
void Search::updateCandList(uint32_t mode, uint64_t cost, int maxCandCount, uint32_t* candModeList, uint64_t* candCostList)
{
    uint32_t worstSlot = 0;
    uint64_t worstCost = 0;

    for (int slot = 0; slot < maxCandCount; slot++)
    {
        const uint64_t slotCost = candCostList[slot];
        if (slotCost <= worstCost)
            continue;

        worstCost = slotCost;
        worstSlot = slot;
    }

    /* nothing to do unless the newcomer beats the current worst entry */
    if (cost >= worstCost)
        return;

    candModeList[worstSlot] = mode;
    candCostList[worstSlot] = cost;
}
/* Account for delta-QP signalling on a CU. Only acts when the PPS enables DQP and the
 * CU depth does not exceed maxCuDQPDepth. A CU with coded residual is charged the
 * delta-QP bits (measured at rdLevel >= 3, assumed to be one bit otherwise); a CU
 * without residual has its QP reset to the reference QP instead. */
void Search::checkDQP(Mode& mode, const CUGeom& cuGeom)
{
    CUData& cu = mode.cu;

    const bool dqpApplies = cu.m_slice->m_pps->bUseDQP && cuGeom.depth <= cu.m_slice->m_pps->maxCuDQPDepth;
    if (!dqpApplies)
        return;

    if (!cu.getQtRootCbf(0))
    {
        cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth);
        return;
    }

    if (m_param->rdLevel <= 1)
    {
        /* SA8D-based decisions: one extra bit, cost recomputed from the SAD cost model */
        mode.sa8dBits++;
        mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits);
        return;
    }

    /* rdLevel 2 charges a single bit; rdLevel 3 and above measure the real cost */
    uint32_t dqpBits = 1;
    if (m_param->rdLevel >= 3)
    {
        mode.contexts.resetBits();
        mode.contexts.codeDeltaQP(cu, 0);
        dqpBits = mode.contexts.getNumberOfWrittenBits();
    }

    mode.mvBits += dqpBits;
    mode.totalBits += dqpBits;
    updateModeCost(mode);
}
/* Delta-QP accounting for a split prediction mode. Only acts when the CU sits exactly
 * at maxCuDQPDepth and the PPS enables DQP. If any partition of the CU has coded
 * residual, the delta-QP bits are charged and setQPSubCUs() is called with the
 * reference QP; otherwise the whole CU is reset to the reference QP. */
void Search::checkDQPForSplitPred(Mode& mode, const CUGeom& cuGeom)
{
    CUData& cu = mode.cu;

    const bool dqpApplies = (cuGeom.depth == cu.m_slice->m_pps->maxCuDQPDepth) && cu.m_slice->m_pps->bUseDQP;
    if (!dqpApplies)
        return;

    /* scan the partitions until one with a non-zero root CBF is found */
    bool hasResidual = false;
    for (uint32_t blkIdx = 0; !hasResidual && blkIdx < cuGeom.numPartitions; blkIdx++)
        hasResidual = !!cu.getQtRootCbf(blkIdx);

    if (!hasResidual)
    {
        /* No residual within this CU or subCU, so reset QP to RefQP */
        cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth);
        return;
    }

    if (m_param->rdLevel <= 1)
    {
        /* SA8D-based decisions: one extra bit, cost recomputed from the SAD cost model */
        mode.sa8dBits++;
        mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits);
    }
    else
    {
        /* rdLevel 2 charges a single bit; rdLevel 3 and above measure the real cost */
        uint32_t dqpBits = 1;
        if (m_param->rdLevel >= 3)
        {
            mode.contexts.resetBits();
            mode.contexts.codeDeltaQP(cu, 0);
            dqpBits = mode.contexts.getNumberOfWrittenBits();
        }

        mode.mvBits += dqpBits;
        mode.totalBits += dqpBits;
        updateModeCost(mode);
    }

    /* For all zero CBF sub-CUs, reset QP to RefQP (so that deltaQP is not signalled).
     * When the non-zero CBF sub-CU is found, stop */
    cu.setQPSubCUs(cu.getRefQP(0), 0, cuGeom.depth);
}