3557 lines
148 KiB
C++
3557 lines
148 KiB
C++
/*****************************************************************************
|
|
* Copyright (C) 2013 x265 project
|
|
*
|
|
* Authors: Steve Borho <steve@borho.org>
|
|
*
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; if not, write to the Free Software
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
|
*
|
|
* This program is also available under a commercial proprietary license.
|
|
* For more information, contact us at license @ x265.com.
|
|
*****************************************************************************/
|
|
|
|
#include "common.h"
|
|
#include "primitives.h"
|
|
#include "picyuv.h"
|
|
#include "cudata.h"
|
|
|
|
#include "search.h"
|
|
#include "entropy.h"
|
|
#include "rdcost.h"
|
|
|
|
#include "analysis.h" // TLD
|
|
#include "framedata.h"
|
|
|
|
using namespace X265_NS;
|
|
|
|
#if _MSC_VER
|
|
#pragma warning(disable: 4800) // 'uint8_t' : forcing value to bool 'true' or 'false' (performance warning)
|
|
#pragma warning(disable: 4244) // '=' : conversion from 'int' to 'uint8_t', possible loss of data)
|
|
#pragma warning(disable: 4127) // conditional expression is constant
|
|
#endif
|
|
|
|
/* NOTE(review): presumably the number of bits charged for signalling an MVP candidate
 * index; it is not referenced in the visible part of this file - confirm at its use sites */
#define MVP_IDX_BITS 1
|
|
|
|
/* Definition of the static table Search::zeroShort: MAX_CU_SIZE int16_t entries, all zero.
 * NOTE(review): ALIGN_VAR_32 presumably requests 32-byte alignment - confirm against the macro. */
ALIGN_VAR_32(const int16_t, Search::zeroShort[MAX_CU_SIZE]) = { 0 };
|
|
|
|
/* Construct an idle Search object: the m_rqt table is zero-filled and every pointer
 * that initSearch() later allocates or assigns starts out NULL, so the destructor is
 * safe to run even if initSearch() was never called. */
Search::Search()
{
    memset(m_rqt, 0, sizeof(m_rqt));

    /* three-entry scratch pointer tables */
    for (int plane = 0; plane < 3; plane++)
    {
        m_qtTempCbf[plane] = NULL;
        m_qtTempTransformSkipFlag[plane] = NULL;
    }

    /* encoder context, attached later */
    m_param = NULL;
    m_slice = NULL;
    m_frame = NULL;

    /* no working buffers allocated yet */
    m_numLayers = 0;
    m_intraPred = NULL;
    m_intraPredAngs = NULL;
    m_fencScaled = NULL;
    m_fencTransposed = NULL;
    m_tsCoeff = NULL;
    m_tsResidual = NULL;
    m_tsRecon = NULL;
}
|
|
|
|
bool Search::initSearch(const x265_param& param, ScalingList& scalingList)
|
|
{
|
|
uint32_t maxLog2CUSize = g_log2Size[param.maxCUSize];
|
|
m_param = ¶m;
|
|
m_bEnableRDOQ = !!param.rdoqLevel;
|
|
m_bFrameParallel = param.frameNumThreads > 1;
|
|
m_numLayers = g_log2Size[param.maxCUSize] - 2;
|
|
|
|
m_rdCost.setPsyRdScale(param.psyRd);
|
|
m_me.init(param.searchMethod, param.subpelRefine, param.internalCsp);
|
|
|
|
bool ok = m_quant.init(param.rdoqLevel, param.psyRdoq, scalingList, m_entropyCoder);
|
|
if (m_param->noiseReductionIntra || m_param->noiseReductionInter || m_param->rc.vbvBufferSize)
|
|
ok &= m_quant.allocNoiseReduction(param);
|
|
|
|
ok &= Predict::allocBuffers(param.internalCsp); /* sets m_hChromaShift & m_vChromaShift */
|
|
|
|
/* When frame parallelism is active, only 'refLagPixels' of reference frames will be guaranteed
|
|
* available for motion reference. See refLagRows in FrameEncoder::compressCTURows() */
|
|
m_refLagPixels = m_bFrameParallel ? param.searchRange : param.sourceHeight;
|
|
|
|
uint32_t sizeL = 1 << (maxLog2CUSize * 2);
|
|
uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
|
|
uint32_t numPartitions = 1 << (maxLog2CUSize - LOG2_UNIT_SIZE) * 2;
|
|
|
|
/* these are indexed by qtLayer (log2size - 2) so nominally 0=4x4, 1=8x8, 2=16x16, 3=32x32
|
|
* the coeffRQT and reconQtYuv are allocated to the max CU size at every depth. The parts
|
|
* which are reconstructed at each depth are valid. At the end, the transform depth table
|
|
* is walked and the coeff and recon at the correct depths are collected */
|
|
for (uint32_t i = 0; i <= m_numLayers; i++)
|
|
{
|
|
CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL + sizeC * 2);
|
|
m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[0] + sizeL;
|
|
m_rqt[i].coeffRQT[2] = m_rqt[i].coeffRQT[0] + sizeL + sizeC;
|
|
ok &= m_rqt[i].reconQtYuv.create(g_maxCUSize, param.internalCsp);
|
|
ok &= m_rqt[i].resiQtYuv.create(g_maxCUSize, param.internalCsp);
|
|
}
|
|
|
|
/* the rest of these buffers are indexed per-depth */
|
|
for (uint32_t i = 0; i <= g_maxCUDepth; i++)
|
|
{
|
|
int cuSize = g_maxCUSize >> i;
|
|
ok &= m_rqt[i].tmpResiYuv.create(cuSize, param.internalCsp);
|
|
ok &= m_rqt[i].tmpPredYuv.create(cuSize, param.internalCsp);
|
|
ok &= m_rqt[i].bidirPredYuv[0].create(cuSize, param.internalCsp);
|
|
ok &= m_rqt[i].bidirPredYuv[1].create(cuSize, param.internalCsp);
|
|
}
|
|
|
|
CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions * 3);
|
|
m_qtTempCbf[1] = m_qtTempCbf[0] + numPartitions;
|
|
m_qtTempCbf[2] = m_qtTempCbf[0] + numPartitions * 2;
|
|
CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions * 3);
|
|
m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[0] + numPartitions;
|
|
m_qtTempTransformSkipFlag[2] = m_qtTempTransformSkipFlag[0] + numPartitions * 2;
|
|
|
|
CHECKED_MALLOC(m_intraPred, pixel, (32 * 32) * (33 + 3));
|
|
m_fencScaled = m_intraPred + 32 * 32;
|
|
m_fencTransposed = m_fencScaled + 32 * 32;
|
|
m_intraPredAngs = m_fencTransposed + 32 * 32;
|
|
|
|
CHECKED_MALLOC(m_tsCoeff, coeff_t, MAX_TS_SIZE * MAX_TS_SIZE);
|
|
CHECKED_MALLOC(m_tsResidual, int16_t, MAX_TS_SIZE * MAX_TS_SIZE);
|
|
CHECKED_MALLOC(m_tsRecon, pixel, MAX_TS_SIZE * MAX_TS_SIZE);
|
|
|
|
return ok;
|
|
|
|
fail:
|
|
return false;
|
|
}
|
|
|
|
/* Release every buffer obtained by initSearch(). The loop bounds mirror the allocation
 * loops there: quad-tree layer buffers first, then the per-depth temporaries, then the
 * standalone scratch allocations. */
Search::~Search()
{
    for (uint32_t layer = 0; layer <= m_numLayers; layer++)
    {
        /* coeffRQT[1] and coeffRQT[2] point into the coeffRQT[0] allocation */
        X265_FREE(m_rqt[layer].coeffRQT[0]);
        m_rqt[layer].reconQtYuv.destroy();
        m_rqt[layer].resiQtYuv.destroy();
    }

    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
    {
        m_rqt[depth].tmpResiYuv.destroy();
        m_rqt[depth].tmpPredYuv.destroy();
        m_rqt[depth].bidirPredYuv[0].destroy();
        m_rqt[depth].bidirPredYuv[1].destroy();
    }

    /* only index 0 of each table owns memory; m_intraPred also backs
     * m_fencScaled, m_fencTransposed and m_intraPredAngs */
    X265_FREE(m_qtTempCbf[0]);
    X265_FREE(m_qtTempTransformSkipFlag[0]);
    X265_FREE(m_intraPred);
    X265_FREE(m_tsCoeff);
    X265_FREE(m_tsResidual);
    X265_FREE(m_tsRecon);
}
|
|
|
|
int Search::setLambdaFromQP(const CUData& ctu, int qp)
|
|
{
|
|
X265_CHECK(qp >= QP_MIN && qp <= QP_MAX_MAX, "QP used for lambda is out of range\n");
|
|
|
|
m_me.setQP(qp);
|
|
m_rdCost.setQP(*m_slice, qp);
|
|
|
|
int quantQP = x265_clip3(QP_MIN, QP_MAX_SPEC, qp);
|
|
m_quant.setQPforQuant(ctu, quantQP);
|
|
return quantQP;
|
|
}
|
|
|
|
#if CHECKED_BUILD || _DEBUG
|
|
void Search::invalidateContexts(int fromDepth)
|
|
{
|
|
/* catch reads without previous writes */
|
|
for (int d = fromDepth; d < NUM_FULL_DEPTH; d++)
|
|
{
|
|
m_rqt[d].cur.markInvalid();
|
|
m_rqt[d].rqtTemp.markInvalid();
|
|
m_rqt[d].rqtRoot.markInvalid();
|
|
m_rqt[d].rqtTest.markInvalid();
|
|
}
|
|
}
|
|
#else
|
|
void Search::invalidateContexts(int) {}
|
|
#endif
|
|
|
|
void Search::codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx)
|
|
{
|
|
uint32_t subdiv = tuDepth < cu.m_tuDepth[absPartIdx];
|
|
uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
|
|
|
|
if (!(log2TrSize - m_hChromaShift < 2))
|
|
{
|
|
if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1))
|
|
m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !subdiv);
|
|
if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1))
|
|
m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !subdiv);
|
|
}
|
|
|
|
if (subdiv)
|
|
{
|
|
uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
|
|
for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
|
|
codeSubdivCbfQTChroma(cu, tuDepth + 1, absPartIdx);
|
|
}
|
|
}
|
|
|
|
void Search::codeCoeffQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx, TextType ttype)
|
|
{
|
|
if (!cu.getCbf(absPartIdx, ttype, tuDepth))
|
|
return;
|
|
|
|
uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
|
|
|
|
if (tuDepth < cu.m_tuDepth[absPartIdx])
|
|
{
|
|
uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
|
|
for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
|
|
codeCoeffQTChroma(cu, tuDepth + 1, absPartIdx, ttype);
|
|
|
|
return;
|
|
}
|
|
|
|
uint32_t tuDepthC = tuDepth;
|
|
uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
|
|
|
|
if (log2TrSizeC < 2)
|
|
{
|
|
X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
|
|
if (absPartIdx & 3)
|
|
return;
|
|
log2TrSizeC = 2;
|
|
tuDepthC--;
|
|
}
|
|
|
|
uint32_t qtLayer = log2TrSize - 2;
|
|
|
|
if (m_csp != X265_CSP_I422)
|
|
{
|
|
uint32_t shift = (m_csp == X265_CSP_I420) ? 2 : 0;
|
|
uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - shift);
|
|
coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset;
|
|
m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype);
|
|
}
|
|
else
|
|
{
|
|
uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - 1);
|
|
coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset;
|
|
uint32_t subTUSize = 1 << (log2TrSizeC * 2);
|
|
uint32_t tuNumParts = 2 << ((log2TrSizeC - LOG2_UNIT_SIZE) * 2);
|
|
if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
|
|
m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype);
|
|
if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1))
|
|
m_entropyCoder.codeCoeffNxN(cu, coeff + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, ttype);
|
|
}
|
|
}
|
|
|
|
/* Recursive RD evaluation of the intra luma transform quad-tree node at (tuDepth, absPartIdx).
 *
 * Depending on depthRange and bAllowSplit, the node is coded as one full TU, as four
 * sub-TUs (recursively, or via codeIntraLumaTSkip() when transform skip is to be checked),
 * or both, in which case the cheaper RD cost wins. The winning cost is ADDED to outCost.
 *
 *   mode        - mode under evaluation (CU data, source and prediction buffers)
 *   cuGeom      - geometry of the CU
 *   tuDepth     - transform depth relative to the CU
 *   absPartIdx  - partition index of this TU within the CU
 *   bAllowSplit - permits trying a split even when the full-size TU is allowed
 *   outCost     - accumulator for rdcost / distortion / bits / energy
 *   depthRange  - [0] smallest, [1] largest allowed log2 TU size
 *
 * Side effects: updates the cu TU depth, CBF and transform-skip tables, the
 * m_rqt[] coefficient/recon scratch buffers, m_entropyCoder state, and (when the full TU
 * wins) the luma plane of m_frame->m_reconPic. */
void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& outCost, const uint32_t depthRange[2])
{
    CUData& cu = mode.cu;
    const uint32_t fullDepth  = cuGeom.depth + tuDepth;
    const uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
    const uint32_t qtLayer    = log2TrSize - 2;
    const uint32_t sizeIdx    = log2TrSize - 2;

    bool bTryFull  = log2TrSize <= depthRange[1];
    bool bTrySplit = (log2TrSize > depthRange[0]) && (bAllowSplit || !bTryFull);

    /* If maximum RD penalty, force splits at TU size 32x32 if SPS allows TUs of 16x16 */
    if (m_param->rdPenalty == 2 && m_slice->m_sliceType != I_SLICE && log2TrSize == 5 && depthRange[0] <= 4)
    {
        bTryFull = false;
        bTrySplit = true;
    }

    Cost fullCost;
    uint32_t fullCbf = 0;

    pixel* qtRecon = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
    const uint32_t qtReconStride = m_rqt[qtLayer].reconQtYuv.m_size;

    if (!bTryFull)
    {
        /* full-size TU not permitted: any split result will beat this */
        fullCost.rdcost = MAX_INT64;
    }
    else
    {
        /* remember the incoming entropy state so the split attempt can start from it */
        if (bTrySplit)
            m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);

        const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx);
        pixel* pred = mode.predYuv.getLumaAddr(absPartIdx);
        int16_t* resi = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
        const uint32_t stride = mode.fencYuv->m_size;

        /* set up the neighbour availability pattern for this TU */
        uint32_t intraDir = cu.m_lumaIntraDir[absPartIdx];
        IntraNeighbors neighbors;
        initIntraNeighbors(cu, absPartIdx, tuDepth, true, &neighbors);
        initAdiPattern(cu, cuGeom, absPartIdx, neighbors, intraDir);

        /* generate the prediction */
        predIntraLumaAng(intraDir, pred, stride, log2TrSize);

        cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
        cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);

        coeff_t* coeffY = m_rqt[qtLayer].coeffRQT[0] + (absPartIdx << (LOG2_UNIT_SIZE * 2));

        if (m_bEnableRDOQ)
            m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);

        primitives.cu[sizeIdx].calcresidual(fenc, pred, resi, stride);

        const uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, resi, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
        if (!numSig)
        {
            /* no coded residual, recon = pred */
            primitives.cu[sizeIdx].copy_pp(qtRecon, qtReconStride, pred, stride);
        }
        else
        {
            m_quant.invtransformNxN(cu, resi, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
            primitives.cu[sizeIdx].add_ps(qtRecon, qtReconStride, pred, resi, stride, stride);
        }

        fullCbf = (numSig ? 1 : 0) << tuDepth;
        cu.setCbfSubParts(fullCbf, TEXT_LUMA, absPartIdx, fullDepth);
        fullCost.distortion = primitives.cu[sizeIdx].sse_pp(qtRecon, qtReconStride, fenc, stride);

        /* count the bits: CU-level syntax is charged to the first partition only */
        m_entropyCoder.resetBits();
        if (!absPartIdx)
        {
            if (!cu.m_slice->isIntra())
            {
                if (cu.m_slice->m_pps->bTransquantBypassEnabled)
                    m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
                m_entropyCoder.codeSkipFlag(cu, 0);
                m_entropyCoder.codePredMode(cu.m_predMode[0]);
            }

            m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
        }

        if (cu.m_partSize[0] == SIZE_2Nx2N)
        {
            if (!absPartIdx)
                m_entropyCoder.codeIntraDirLumaAng(cu, 0, false);
        }
        else
        {
            /* four prediction units: at depth 0 code all four directions, deeper down
             * code a direction only at the first partition of each quarter */
            const uint32_t puNumParts = cuGeom.numPartitions >> 2;
            if (!tuDepth)
            {
                for (uint32_t pu = 0; pu < 4; ++pu)
                    m_entropyCoder.codeIntraDirLumaAng(cu, pu * puNumParts, false);
            }
            else if (!(absPartIdx & (puNumParts - 1)))
                m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
        }

        if (log2TrSize != depthRange[0])
            m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);

        m_entropyCoder.codeQtCbfLuma(numSig != 0, tuDepth);

        if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
            m_entropyCoder.codeCoeffNxN(cu, coeffY, absPartIdx, log2TrSize, TEXT_LUMA);

        fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();

        /* rdPenalty: quadruple the bit cost of 32x32 intra TUs outside I slices */
        if (m_param->rdPenalty && log2TrSize == 5 && m_slice->m_sliceType != I_SLICE)
            fullCost.bits *= 4;

        if (m_rdCost.m_psyRd)
        {
            fullCost.energy = m_rdCost.psyCost(sizeIdx, fenc, stride, qtRecon, qtReconStride);
            fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
        }
        else
            fullCost.rdcost = m_rdCost.calcRdCost(fullCost.distortion, fullCost.bits);
    }

    if (bTrySplit)
    {
        if (bTryFull)
        {
            m_entropyCoder.store(m_rqt[fullDepth].rqtTest); // keep the state reached by the full TU encode
            m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);  // restart the split encode from the original state
        }

        /* code split block */
        const uint32_t qNumParts = 1 << ((log2TrSize - 1 - LOG2_UNIT_SIZE) * 2);

        int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && (log2TrSize - 1) <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
        if (m_param->bEnableTSkipFast)
            checkTransformSkip &= (cu.m_partSize[0] != SIZE_2Nx2N);

        Cost splitCost;
        uint32_t childCbf = 0;
        uint32_t childIdx = absPartIdx;
        for (uint32_t quadrant = 0; quadrant < 4; ++quadrant, childIdx += qNumParts)
        {
            if (checkTransformSkip)
                codeIntraLumaTSkip(mode, cuGeom, tuDepth + 1, childIdx, splitCost);
            else
                codeIntraLumaQT(mode, cuGeom, tuDepth + 1, childIdx, bAllowSplit, splitCost, depthRange);

            childCbf |= cu.getCbf(childIdx, TEXT_LUMA, tuDepth + 1);
        }

        /* raise this depth's CBF bit on every covered partition if any child has residual */
        for (uint32_t part = 0; part < 4 * qNumParts; part++)
            cu.m_cbf[0][absPartIdx + part] |= (childCbf << tuDepth);

        if (bTryFull && log2TrSize != depthRange[0])
        {
            /* If we could have coded this TU depth, include cost of subdiv flag */
            m_entropyCoder.resetBits();
            m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
            splitCost.bits += m_entropyCoder.getNumberOfWrittenBits();

            if (m_rdCost.m_psyRd)
                splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
            else
                splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);
        }

        if (splitCost.rdcost < fullCost.rdcost)
        {
            /* split wins: the children already left cu, recon and entropy state in place */
            outCost.rdcost     += splitCost.rdcost;
            outCost.distortion += splitCost.distortion;
            outCost.bits       += splitCost.bits;
            outCost.energy     += splitCost.energy;
            return;
        }

        /* full TU wins: recover its entropy state */
        m_entropyCoder.load(m_rqt[fullDepth].rqtTest);

        /* and undo the TU depth / CBF / tskip values written by the split attempt */
        cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
        cu.setCbfSubParts(fullCbf, TEXT_LUMA, absPartIdx, fullDepth);
        cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
    }

    /* full TU won: publish its reconstruction for the intra prediction of following blocks */
    PicYuv* reconPic = m_frame->m_reconPic;
    pixel* picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
    intptr_t picStride = reconPic->m_stride;
    primitives.cu[sizeIdx].copy_pp(picReconY, picStride, qtRecon, qtReconStride);

    outCost.rdcost     += fullCost.rdcost;
    outCost.distortion += fullCost.distortion;
    outCost.bits       += fullCost.bits;
    outCost.energy     += fullCost.energy;
}
|
|
|
|
/* Code one intra luma TU twice - first with the regular transform, then with transform
 * skip - and keep whichever has the lower RD cost. The winner's cost is ADDED to outCost.
 *
 *   mode       - mode under evaluation (CU data, source and prediction buffers)
 *   cuGeom     - geometry of the CU
 *   tuDepth    - transform depth relative to the CU
 *   absPartIdx - partition index of this TU within the CU
 *   outCost    - accumulator for rdcost / distortion / bits / energy
 *
 * The transform-skip trial works in the m_tsCoeff / m_tsRecon scratch buffers and is
 * copied into the quad-tree buffers only if it wins. A transform-skip trial that yields
 * no significant coefficients is discarded. On return the chosen reconstruction has
 * been written to the luma plane of m_frame->m_reconPic. */
void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost)
{
    const uint32_t fullDepth  = cuGeom.depth + tuDepth;
    const uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
    const uint32_t tuSize     = 1 << log2TrSize;

    X265_CHECK(tuSize <= MAX_TS_SIZE, "transform skip is only possible at 4x4 TUs\n");

    CUData& cu = mode.cu;

    Cost bestCost;
    bestCost.rdcost = MAX_INT64;
    int bestTSkip = 0;
    uint32_t bestCbf = 0;

    const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx);
    pixel* pred = mode.predYuv.getLumaAddr(absPartIdx);
    int16_t* resi = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
    const uint32_t stride = mode.fencYuv->m_size;
    const uint32_t sizeIdx = log2TrSize - 2;

    /* set up the neighbour availability pattern for this TU */
    uint32_t intraDir = cu.m_lumaIntraDir[absPartIdx];
    IntraNeighbors neighbors;
    initIntraNeighbors(cu, absPartIdx, tuDepth, true, &neighbors);
    initAdiPattern(cu, cuGeom, absPartIdx, neighbors, intraDir);

    /* generate the prediction */
    predIntraLumaAng(intraDir, pred, stride, log2TrSize);

    cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);

    const uint32_t qtLayer = log2TrSize - 2;
    coeff_t* coeffY = m_rqt[qtLayer].coeffRQT[0] + (absPartIdx << (LOG2_UNIT_SIZE * 2));
    pixel* qtRecon = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
    const uint32_t qtReconStride = m_rqt[qtLayer].reconQtYuv.m_size;

    /* both trials must start from the same entropy coder state */
    m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);

    if (m_bEnableRDOQ)
        m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);

    /* pass 0 = regular transform, pass 1 = transform skip */
    int checkTransformSkip = 1;
    for (int useTSkip = 0; useTSkip <= checkTransformSkip; useTSkip++)
    {
        uint64_t trialCost;
        uint32_t trialEnergy = 0;

        /* the tskip pass writes to scratch so the regular result is not clobbered */
        coeff_t* coeff = (useTSkip ? m_tsCoeff : coeffY);
        pixel* trialRecon = (useTSkip ? m_tsRecon : qtRecon);
        uint32_t trialReconStride = (useTSkip ? MAX_TS_SIZE : qtReconStride);

        primitives.cu[sizeIdx].calcresidual(fenc, pred, resi, stride);

        const uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, resi, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, useTSkip);
        if (numSig)
        {
            m_quant.invtransformNxN(cu, resi, stride, coeff, log2TrSize, TEXT_LUMA, true, useTSkip, numSig);
            primitives.cu[sizeIdx].add_ps(trialRecon, trialReconStride, pred, resi, stride, stride);
        }
        else if (useTSkip)
        {
            /* do not allow tskip if CBF=0, pretend we did not try tskip */
            checkTransformSkip = 0;
            break;
        }
        else
        {
            /* no residual coded, recon = pred */
            primitives.cu[sizeIdx].copy_pp(trialRecon, trialReconStride, pred, stride);
        }

        sse_ret_t trialDist = primitives.cu[sizeIdx].sse_pp(trialRecon, trialReconStride, fenc, stride);

        cu.setTransformSkipSubParts(useTSkip, TEXT_LUMA, absPartIdx, fullDepth);
        cu.setCbfSubParts((numSig ? 1 : 0) << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);

        /* rewind the entropy coder before the second pass */
        if (useTSkip)
            m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);

        /* count the bits: CU-level syntax is charged to the first partition only */
        m_entropyCoder.resetBits();
        if (!absPartIdx)
        {
            if (!cu.m_slice->isIntra())
            {
                if (cu.m_slice->m_pps->bTransquantBypassEnabled)
                    m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
                m_entropyCoder.codeSkipFlag(cu, 0);
                m_entropyCoder.codePredMode(cu.m_predMode[0]);
            }

            m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
        }

        if (cu.m_partSize[0] == SIZE_2Nx2N)
        {
            if (!absPartIdx)
                m_entropyCoder.codeIntraDirLumaAng(cu, 0, false);
        }
        else
        {
            const uint32_t puNumParts = cuGeom.numPartitions >> 2;
            if (!tuDepth)
            {
                for (uint32_t pu = 0; pu < 4; ++pu)
                    m_entropyCoder.codeIntraDirLumaAng(cu, pu * puNumParts, false);
            }
            else if (!(absPartIdx & (puNumParts - 1)))
                m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
        }

        m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);

        m_entropyCoder.codeQtCbfLuma(numSig != 0, tuDepth);

        if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
            m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSize, TEXT_LUMA);

        uint32_t trialBits = m_entropyCoder.getNumberOfWrittenBits();

        /* keep the state after the regular encode in case it ends up winning */
        if (!useTSkip)
            m_entropyCoder.store(m_rqt[fullDepth].rqtTemp);

        if (m_rdCost.m_psyRd)
        {
            trialEnergy = m_rdCost.psyCost(sizeIdx, fenc, stride, trialRecon, trialReconStride);
            trialCost = m_rdCost.calcPsyRdCost(trialDist, trialBits, trialEnergy);
        }
        else
            trialCost = m_rdCost.calcRdCost(trialDist, trialBits);

        if (trialCost < bestCost.rdcost)
        {
            bestTSkip = useTSkip;
            bestCbf = (numSig != 0);
            bestCost.rdcost = trialCost;
            bestCost.distortion = trialDist;
            bestCost.bits = trialBits;
            bestCost.energy = trialEnergy;
        }
    }

    if (bestTSkip)
    {
        /* tskip won: move its coefficients and recon from scratch into the QT buffers */
        memcpy(coeffY, m_tsCoeff, sizeof(coeff_t) << (log2TrSize * 2));
        primitives.cu[sizeIdx].copy_pp(qtRecon, qtReconStride, m_tsRecon, tuSize);
    }
    else if (checkTransformSkip)
    {
        /* tskip was fully evaluated but lost: restore what the regular pass produced */
        cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
        cu.setCbfSubParts(bestCbf << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
        m_entropyCoder.load(m_rqt[fullDepth].rqtTemp);
    }

    /* publish the reconstruction for the intra prediction of following blocks */
    PicYuv* reconPic = m_frame->m_reconPic;
    pixel* picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
    intptr_t picStride = reconPic->m_stride;
    primitives.cu[sizeIdx].copy_pp(picReconY, picStride, qtRecon, qtReconStride);

    outCost.rdcost     += bestCost.rdcost;
    outCost.distortion += bestCost.distortion;
    outCost.bits       += bestCost.bits;
    outCost.energy     += bestCost.energy;
}
|
|
|
|
/* fast luma intra residual generation. Only perform the minimum number of TU splits required by the CU size */
|
|
void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2])
|
|
{
|
|
CUData& cu = mode.cu;
|
|
uint32_t fullDepth = cuGeom.depth + tuDepth;
|
|
uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
|
|
bool bCheckFull = log2TrSize <= depthRange[1];
|
|
|
|
X265_CHECK(m_slice->m_sliceType != I_SLICE, "residualTransformQuantIntra not intended for I slices\n");
|
|
|
|
/* we still respect rdPenalty == 2, we can forbid 32x32 intra TU. rdPenalty = 1 is impossible
|
|
* since we are not measuring RD cost */
|
|
if (m_param->rdPenalty == 2 && log2TrSize == 5 && depthRange[0] <= 4)
|
|
bCheckFull = false;
|
|
|
|
if (bCheckFull)
|
|
{
|
|
const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx);
|
|
pixel* pred = mode.predYuv.getLumaAddr(absPartIdx);
|
|
int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
|
|
uint32_t stride = mode.fencYuv->m_size;
|
|
|
|
// init availability pattern
|
|
uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
|
|
IntraNeighbors intraNeighbors;
|
|
initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors);
|
|
initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode);
|
|
|
|
// get prediction signal
|
|
predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
|
|
|
|
X265_CHECK(!cu.m_transformSkip[TEXT_LUMA][absPartIdx], "unexpected tskip flag in residualTransformQuantIntra\n");
|
|
cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
|
|
|
|
uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
|
|
coeff_t* coeffY = cu.m_trCoeff[0] + coeffOffsetY;
|
|
|
|
uint32_t sizeIdx = log2TrSize - 2;
|
|
primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride);
|
|
|
|
PicYuv* reconPic = m_frame->m_reconPic;
|
|
pixel* picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
|
|
intptr_t picStride = reconPic->m_stride;
|
|
|
|
uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
|
|
if (numSig)
|
|
{
|
|
m_quant.invtransformNxN(cu, residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
|
|
primitives.cu[sizeIdx].add_ps(picReconY, picStride, pred, residual, stride, stride);
|
|
cu.setCbfSubParts(1 << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
|
|
}
|
|
else
|
|
{
|
|
primitives.cu[sizeIdx].copy_pp(picReconY, picStride, pred, stride);
|
|
cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
X265_CHECK(log2TrSize > depthRange[0], "intra luma split state failure\n");
|
|
|
|
/* code split block */
|
|
uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
|
|
uint32_t cbf = 0;
|
|
for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
|
|
{
|
|
residualTransformQuantIntra(mode, cuGeom, qPartIdx, tuDepth + 1, depthRange);
|
|
cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
|
|
}
|
|
for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
|
|
cu.m_cbf[0][absPartIdx + offs] |= (cbf << tuDepth);
|
|
}
|
|
}
|
|
|
|
/* Walk the transform depth table of cu and, for every leaf TU, copy its luma
 * coefficients from the m_rqt scratch layer of matching size into cu.m_trCoeff[0] and
 * its luma reconstruction into reconYuv.
 *
 *   cu         - CU whose m_tuDepth[] selects the leaves and whose m_trCoeff[0] is filled
 *   reconYuv   - destination of the reconstructed luma samples
 *   tuDepth    - current transform depth relative to the CU
 *   absPartIdx - partition index of the node */
void Search::extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t tuDepth, uint32_t absPartIdx)
{
    const uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;

    if (tuDepth != cu.m_tuDepth[absPartIdx])
    {
        /* interior node: descend into the four quadrants */
        const uint32_t qNumParts = 1 << ((log2TrSize - 1 - LOG2_UNIT_SIZE) * 2);
        uint32_t childIdx = absPartIdx;
        for (uint32_t quadrant = 0; quadrant < 4; ++quadrant)
        {
            extractIntraResultQT(cu, reconYuv, tuDepth + 1, childIdx);
            childIdx += qNumParts;
        }
        return;
    }

    const uint32_t qtLayer = log2TrSize - 2;

    /* transform coefficients: same offset in source and destination */
    const uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
    const coeff_t* src = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
    coeff_t* dst = cu.m_trCoeff[0] + coeffOffsetY;
    memcpy(dst, src, sizeof(coeff_t) << (log2TrSize * 2));

    /* reconstruction */
    m_rqt[qtLayer].reconQtYuv.copyPartToPartLuma(reconYuv, absPartIdx, log2TrSize);
}
|
|
|
|
/* In-place update of a pair of sub-TU CBF values: each value is shifted up by one bit
 * and the OR of the two original values is merged into it. The result is truncated to
 * uint8_t, as with the array element type. */
inline void offsetCBFs(uint8_t subTUCBF[2])
{
    const uint8_t merged = static_cast<uint8_t>(subTUCBF[0] | subTUCBF[1]);

    for (int half = 0; half < 2; half++)
        subTUCBF[half] = static_cast<uint8_t>((subTUCBF[half] << 1) | merged);
}
|
|
|
|
/* 4:2:2 post-TU split processing */
|
|
void Search::offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t tuDepth, uint32_t absPartIdx)
|
|
{
|
|
uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
|
|
|
|
if (log2TrSize == 2)
|
|
{
|
|
X265_CHECK(m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
|
|
++log2TrSize;
|
|
}
|
|
|
|
uint32_t tuNumParts = 1 << ((log2TrSize - LOG2_UNIT_SIZE) * 2 - 1);
|
|
|
|
// move the CBFs down a level and set the parent CBF
|
|
uint8_t subTUCBF[2];
|
|
subTUCBF[0] = cu.getCbf(absPartIdx , ttype, tuDepth);
|
|
subTUCBF[1] = cu.getCbf(absPartIdx+ tuNumParts, ttype, tuDepth);
|
|
offsetCBFs(subTUCBF);
|
|
|
|
cu.setCbfPartRange(subTUCBF[0] << tuDepth, ttype, absPartIdx , tuNumParts);
|
|
cu.setCbfPartRange(subTUCBF[1] << tuDepth, ttype, absPartIdx + tuNumParts, tuNumParts);
|
|
}
|
|
|
|
/* returns distortion */
|
|
uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, uint32_t& psyEnergy)
|
|
{
|
|
CUData& cu = mode.cu;
|
|
uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
|
|
|
|
if (tuDepth < cu.m_tuDepth[absPartIdx])
|
|
{
|
|
uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
|
|
uint32_t outDist = 0, splitCbfU = 0, splitCbfV = 0;
|
|
for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
|
|
{
|
|
outDist += codeIntraChromaQt(mode, cuGeom, tuDepth + 1, qPartIdx, psyEnergy);
|
|
splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
|
|
splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
|
|
}
|
|
for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
|
|
{
|
|
cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << tuDepth);
|
|
cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << tuDepth);
|
|
}
|
|
|
|
return outDist;
|
|
}
|
|
|
|
uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
|
|
uint32_t tuDepthC = tuDepth;
|
|
if (log2TrSizeC < 2)
|
|
{
|
|
X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
|
|
if (absPartIdx & 3)
|
|
return 0;
|
|
log2TrSizeC = 2;
|
|
tuDepthC--;
|
|
}
|
|
|
|
if (m_bEnableRDOQ)
|
|
m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
|
|
|
|
bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && log2TrSizeC <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
|
|
checkTransformSkip &= !m_param->bEnableTSkipFast || (log2TrSize <= MAX_LOG2_TS_SIZE && cu.m_transformSkip[TEXT_LUMA][absPartIdx]);
|
|
if (checkTransformSkip)
|
|
return codeIntraChromaTSkip(mode, cuGeom, tuDepth, tuDepthC, absPartIdx, psyEnergy);
|
|
|
|
ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
|
|
uint32_t qtLayer = log2TrSize - 2;
|
|
uint32_t stride = mode.fencYuv->m_csize;
|
|
const uint32_t sizeIdxC = log2TrSizeC - 2;
|
|
sse_ret_t outDist = 0;
|
|
|
|
uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
|
|
const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
|
|
|
|
TURecurse tuIterator(splitType, curPartNum, absPartIdx);
|
|
do
|
|
{
|
|
uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
|
|
|
|
IntraNeighbors intraNeighbors;
|
|
initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors);
|
|
|
|
for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
|
|
{
|
|
TextType ttype = (TextType)chromaId;
|
|
|
|
const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
|
|
pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
|
|
int16_t* residual = resiYuv.getChromaAddr(chromaId, absPartIdxC);
|
|
uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
|
|
coeff_t* coeffC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
|
|
pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
|
|
uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
|
|
PicYuv* reconPic = m_frame->m_reconPic;
|
|
pixel* picReconC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
|
|
intptr_t picStride = reconPic->m_strideC;
|
|
|
|
uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
|
|
if (chromaPredMode == DM_CHROMA_IDX)
|
|
chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
|
|
if (m_csp == X265_CSP_I422)
|
|
chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
|
|
|
|
// init availability pattern
|
|
initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);
|
|
|
|
// get prediction signal
|
|
predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC);
|
|
cu.setTransformSkipPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
|
|
|
|
primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride);
|
|
uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
|
|
if (numSig)
|
|
{
|
|
m_quant.invtransformNxN(cu, residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
|
|
primitives.cu[sizeIdxC].add_ps(reconQt, reconQtStride, pred, residual, stride, stride);
|
|
cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
|
|
}
|
|
else
|
|
{
|
|
// no coded residual, recon = pred
|
|
primitives.cu[sizeIdxC].copy_pp(reconQt, reconQtStride, pred, stride);
|
|
cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
|
|
}
|
|
|
|
outDist += m_rdCost.scaleChromaDist(chromaId, primitives.cu[sizeIdxC].sse_pp(reconQt, reconQtStride, fenc, stride));
|
|
|
|
if (m_rdCost.m_psyRd)
|
|
psyEnergy += m_rdCost.psyCost(sizeIdxC, fenc, stride, reconQt, reconQtStride);
|
|
|
|
primitives.cu[sizeIdxC].copy_pp(picReconC, picStride, reconQt, reconQtStride);
|
|
}
|
|
}
|
|
while (tuIterator.isNextSection());
|
|
|
|
if (splitType == VERTICAL_SPLIT)
|
|
{
|
|
offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
|
|
offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
|
|
}
|
|
|
|
return outDist;
|
|
}
|
|
|
|
/* returns distortion */
|
|
uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, uint32_t& psyEnergy)
|
|
{
|
|
CUData& cu = mode.cu;
|
|
uint32_t fullDepth = cuGeom.depth + tuDepth;
|
|
uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
|
|
const uint32_t log2TrSizeC = 2;
|
|
uint32_t qtLayer = log2TrSize - 2;
|
|
uint32_t outDist = 0;
|
|
|
|
/* At the TU layers above this one, no RDO is performed, only distortion is being measured,
|
|
* so the entropy coder is not very accurate. The best we can do is return it in the same
|
|
* condition as it arrived, and to do all bit estimates from the same state. */
|
|
m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);
|
|
|
|
uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
|
|
const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
|
|
|
|
TURecurse tuIterator(splitType, curPartNum, absPartIdx);
|
|
do
|
|
{
|
|
uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
|
|
|
|
IntraNeighbors intraNeighbors;
|
|
initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors);
|
|
|
|
for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
|
|
{
|
|
TextType ttype = (TextType)chromaId;
|
|
|
|
const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
|
|
pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
|
|
int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC);
|
|
uint32_t stride = mode.fencYuv->m_csize;
|
|
const uint32_t sizeIdxC = log2TrSizeC - 2;
|
|
|
|
uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
|
|
coeff_t* coeffC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
|
|
pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
|
|
uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
|
|
|
|
// init availability pattern
|
|
initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);
|
|
|
|
uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
|
|
if (chromaPredMode == DM_CHROMA_IDX)
|
|
chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
|
|
if (m_csp == X265_CSP_I422)
|
|
chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
|
|
|
|
// get prediction signal
|
|
predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC);
|
|
|
|
uint64_t bCost = MAX_INT64;
|
|
uint32_t bDist = 0;
|
|
uint32_t bCbf = 0;
|
|
uint32_t bEnergy = 0;
|
|
int bTSkip = 0;
|
|
|
|
int checkTransformSkip = 1;
|
|
for (int useTSkip = 0; useTSkip <= checkTransformSkip; useTSkip++)
|
|
{
|
|
coeff_t* coeff = (useTSkip ? m_tsCoeff : coeffC);
|
|
pixel* recon = (useTSkip ? m_tsRecon : reconQt);
|
|
uint32_t reconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);
|
|
|
|
primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride);
|
|
|
|
uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, useTSkip);
|
|
if (numSig)
|
|
{
|
|
m_quant.invtransformNxN(cu, residual, stride, coeff, log2TrSizeC, ttype, true, useTSkip, numSig);
|
|
primitives.cu[sizeIdxC].add_ps(recon, reconStride, pred, residual, stride, stride);
|
|
cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
|
|
}
|
|
else if (useTSkip)
|
|
{
|
|
checkTransformSkip = 0;
|
|
break;
|
|
}
|
|
else
|
|
{
|
|
primitives.cu[sizeIdxC].copy_pp(recon, reconStride, pred, stride);
|
|
cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
|
|
}
|
|
sse_ret_t tmpDist = primitives.cu[sizeIdxC].sse_pp(recon, reconStride, fenc, stride);
|
|
tmpDist = m_rdCost.scaleChromaDist(chromaId, tmpDist);
|
|
|
|
cu.setTransformSkipPartRange(useTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
|
|
|
|
uint32_t tmpBits = 0, tmpEnergy = 0;
|
|
if (numSig)
|
|
{
|
|
m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
|
|
m_entropyCoder.resetBits();
|
|
m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdxC, log2TrSizeC, (TextType)chromaId);
|
|
tmpBits = m_entropyCoder.getNumberOfWrittenBits();
|
|
}
|
|
|
|
uint64_t tmpCost;
|
|
if (m_rdCost.m_psyRd)
|
|
{
|
|
tmpEnergy = m_rdCost.psyCost(sizeIdxC, fenc, stride, reconQt, reconQtStride);
|
|
tmpCost = m_rdCost.calcPsyRdCost(tmpDist, tmpBits, tmpEnergy);
|
|
}
|
|
else
|
|
tmpCost = m_rdCost.calcRdCost(tmpDist, tmpBits);
|
|
|
|
if (tmpCost < bCost)
|
|
{
|
|
bCost = tmpCost;
|
|
bDist = tmpDist;
|
|
bTSkip = useTSkip;
|
|
bCbf = !!numSig;
|
|
bEnergy = tmpEnergy;
|
|
}
|
|
}
|
|
|
|
if (bTSkip)
|
|
{
|
|
memcpy(coeffC, m_tsCoeff, sizeof(coeff_t) << (log2TrSizeC * 2));
|
|
primitives.cu[sizeIdxC].copy_pp(reconQt, reconQtStride, m_tsRecon, MAX_TS_SIZE);
|
|
}
|
|
|
|
cu.setCbfPartRange(bCbf << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
|
|
cu.setTransformSkipPartRange(bTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
|
|
|
|
PicYuv* reconPic = m_frame->m_reconPic;
|
|
pixel* reconPicC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
|
|
intptr_t picStride = reconPic->m_strideC;
|
|
primitives.cu[sizeIdxC].copy_pp(reconPicC, picStride, reconQt, reconQtStride);
|
|
|
|
outDist += bDist;
|
|
psyEnergy += bEnergy;
|
|
}
|
|
}
|
|
while (tuIterator.isNextSection());
|
|
|
|
if (splitType == VERTICAL_SPLIT)
|
|
{
|
|
offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
|
|
offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
|
|
}
|
|
|
|
m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
|
|
return outDist;
|
|
}
|
|
|
|
/* Walks the TU tree below (absPartIdx, tuDepth) and, for each leaf, copies the
 * chroma coefficients from the RQT layer buffers into cu.m_trCoeff and the
 * chroma reconstruction from the RQT layer into reconYuv. A node is a leaf
 * when it sits at the TU depth recorded in cu.m_tuDepth or when its chroma
 * block has reached log2 size 2 */
void Search::extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t tuDepth)
{
    const uint32_t lumaTuDepth = cu.m_tuDepth[absPartIdx];
    const uint32_t log2Size = cu.m_log2CUSize[0] - tuDepth;
    const uint32_t log2SizeC = log2Size - m_hChromaShift;

    const bool bLeaf = (lumaTuDepth == tuDepth) || (log2SizeC == 2);
    if (!bLeaf)
    {
        /* descend into the four quadrants */
        const uint32_t quarterParts = 1 << (log2Size - 1 - LOG2_UNIT_SIZE) * 2;
        uint32_t childIdx = absPartIdx;
        for (uint32_t q = 0; q < 4; q++)
        {
            extractIntraResultChromaQT(cu, reconYuv, childIdx, tuDepth + 1);
            childIdx += quarterParts;
        }
        return;
    }

    /* 4:2:2 doubles the coefficient count per chroma block */
    const uint32_t coeffCount = 1 << (log2SizeC * 2 + (m_csp == X265_CSP_I422));
    const uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
    const uint32_t layer = log2Size - 2 - (lumaTuDepth - tuDepth);

    /* transform coefficients, planes 1 (U) and 2 (V) */
    for (uint32_t plane = 1; plane <= 2; plane++)
    {
        const coeff_t* from = m_rqt[layer].coeffRQT[plane] + coeffOffset;
        coeff_t* to = cu.m_trCoeff[plane] + coeffOffset;
        memcpy(to, from, sizeof(coeff_t) * coeffCount);
    }

    /* reconstruction */
    m_rqt[layer].reconQtYuv.copyPartToPartChroma(reconYuv, absPartIdx, log2SizeC + m_hChromaShift);
}
|
|
|
|
/* Intra chroma prediction and residual coding along the TU tree recorded in
 * cu.m_tuDepth, without any RD decisions or transform skip. Coefficients are
 * written straight into cu.m_trCoeff and the reconstruction straight into the
 * frame's recon picture; chroma CBFs are set for each block and merged
 * upwards for split nodes */
void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth)
{
    CUData& cu = mode.cu;
    const uint32_t log2Size = cu.m_log2CUSize[absPartIdx] - tuDepth;

    if (tuDepth < cu.m_tuDepth[absPartIdx])
    {
        /* split node: recurse into the quadrants and collect their CBFs */
        const uint32_t quarterParts = 1 << (log2Size - 1 - LOG2_UNIT_SIZE) * 2;
        uint32_t cbfU = 0;
        uint32_t cbfV = 0;
        uint32_t childIdx = absPartIdx;
        for (uint32_t q = 0; q < 4; q++)
        {
            residualQTIntraChroma(mode, cuGeom, childIdx, tuDepth + 1);
            cbfU |= cu.getCbf(childIdx, TEXT_CHROMA_U, tuDepth + 1);
            cbfV |= cu.getCbf(childIdx, TEXT_CHROMA_V, tuDepth + 1);
            childIdx += quarterParts;
        }

        /* flag this depth for every partition covered by the node */
        const uint32_t partEnd = absPartIdx + 4 * quarterParts;
        for (uint32_t part = absPartIdx; part < partEnd; part++)
        {
            cu.m_cbf[1][part] |= (cbfU << tuDepth);
            cu.m_cbf[2][part] |= (cbfV << tuDepth);
        }

        return;
    }

    uint32_t log2SizeC = log2Size - m_hChromaShift;
    uint32_t chromaDepth = tuDepth;
    if (log2SizeC < 2)
    {
        X265_CHECK(log2Size == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");

        /* chroma cannot go below 4x4: the first of the four luma blocks codes
         * chroma one depth up, the other three have nothing to do */
        if (absPartIdx & 3)
            return;
        log2SizeC = 2;
        chromaDepth--;
    }

    ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
    const uint32_t stride = mode.fencYuv->m_csize;
    const uint32_t sizeIdxC = log2SizeC - 2;

    PicYuv* reconPic = m_frame->m_reconPic;
    const intptr_t picStride = reconPic->m_strideC;

    const uint32_t partCount = cuGeom.numPartitions >> chromaDepth * 2;
    const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;

    TURecurse tuIterator(splitType, partCount, absPartIdx);
    do
    {
        const uint32_t partIdxC = tuIterator.absPartIdxTURelCU;

        IntraNeighbors intraNeighbors;
        initIntraNeighbors(cu, partIdxC, chromaDepth, false, &intraNeighbors);

        /* these depend only on the section, not on the plane */
        const uint32_t coeffOffset = partIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));

        uint32_t predMode = cu.m_chromaIntraDir[partIdxC];
        if (predMode == DM_CHROMA_IDX)
            predMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? partIdxC : 0];
        if (m_csp == X265_CSP_I422)
            predMode = g_chroma422IntraAngleMappingTable[predMode];

        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
        {
            const TextType ttype = (TextType)chromaId;

            const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, partIdxC);
            pixel* pred = mode.predYuv.getChromaAddr(chromaId, partIdxC);
            int16_t* residual = resiYuv.getChromaAddr(chromaId, partIdxC);
            coeff_t* coeffs = cu.m_trCoeff[ttype] + coeffOffset;
            pixel* picRecon = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + partIdxC);

            /* build the reference samples, then the prediction */
            initAdiPatternChroma(cu, cuGeom, partIdxC, intraNeighbors, chromaId);
            predIntraChromaAng(predMode, pred, stride, log2SizeC);

            X265_CHECK(!cu.m_transformSkip[ttype][0], "transform skip not supported at low RD levels\n");

            primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride);
            const uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffs, log2SizeC, ttype, partIdxC, false);
            if (!numSig)
            {
                /* nothing coded, the reconstruction is the prediction */
                primitives.cu[sizeIdxC].copy_pp(picRecon, picStride, pred, stride);
                cu.setCbfPartRange(0, ttype, partIdxC, tuIterator.absPartIdxStep);
            }
            else
            {
                m_quant.invtransformNxN(cu, residual, stride, coeffs, log2SizeC, ttype, true, false, numSig);
                primitives.cu[sizeIdxC].add_ps(picRecon, picStride, pred, residual, stride, stride);
                cu.setCbfPartRange(1 << tuDepth, ttype, partIdxC, tuIterator.absPartIdxStep);
            }
        }
    }
    while (tuIterator.isNextSection());

    if (splitType == VERTICAL_SPLIT)
    {
        offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
        offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
    }
}
|
|
|
|
/* Evaluates the CU as intra with the given partition size: estimates luma
 * (and, unless the color space is 4:0:0, chroma) intra prediction, counts the
 * bits of the CU syntax and fills in the distortion, bit and energy fields of
 * intraMode before updating its cost.
 *
 *   sharedModes / sharedChromaModes - forwarded unchanged to estIntraPredQT /
 *                                     estIntraPredChromaQT */
void Search::checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize, uint8_t* sharedModes, uint8_t* sharedChromaModes)
{
    CUData& cu = intraMode.cu;

    cu.setPartSizeSubParts(partSize);
    cu.setPredModeSubParts(MODE_INTRA);

    uint32_t depthRange[2];
    cu.getIntraTUQtDepthRange(depthRange, 0);

    /* distortion */
    intraMode.initCosts();
    intraMode.lumaDistortion += estIntraPredQT(intraMode, cuGeom, depthRange, sharedModes);
    if (m_csp != X265_CSP_I400)
        intraMode.chromaDistortion += estIntraPredChromaQT(intraMode, cuGeom, sharedChromaModes);
    intraMode.distortion += intraMode.lumaDistortion + intraMode.chromaDistortion;

    /* CU header bits; skip flag and pred mode are only coded outside I slices */
    m_entropyCoder.resetBits();
    if (m_slice->m_pps->bTransquantBypassEnabled)
        m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);

    if (!m_slice->isIntra())
    {
        m_entropyCoder.codeSkipFlag(cu, 0);
        m_entropyCoder.codePredMode(cu.m_predMode[0]);
    }

    m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
    m_entropyCoder.codePredInfo(cu, 0);
    intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits();

    /* coefficient bits are whatever is written beyond the header */
    bool bCodeDQP = m_slice->m_pps->bUseDQP;
    m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, depthRange);
    m_entropyCoder.store(intraMode.contexts);
    intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
    intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits;

    /* luma energy terms over the whole CU */
    const Yuv& fenc = *intraMode.fencYuv;
    const uint32_t cuSizeIdx = cuGeom.log2CUSize - 2;
    if (m_rdCost.m_psyRd)
        intraMode.psyEnergy = m_rdCost.psyCost(cuSizeIdx, fenc.m_buf[0], fenc.m_size, intraMode.reconYuv.m_buf[0], intraMode.reconYuv.m_size);
    intraMode.resEnergy = primitives.cu[cuSizeIdx].sse_pp(fenc.m_buf[0], fenc.m_size, intraMode.predYuv.m_buf[0], intraMode.predYuv.m_size);

    updateModeCost(intraMode);
    checkDQP(intraMode, cuGeom);
}
|
|
|
|
/* Note that this function does not save the best intra prediction, it must
|
|
* be generated later. It records the best mode in the cu */
|
|
void Search::checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
|
|
{
|
|
ProfileCUScope(intraMode.cu, intraAnalysisElapsedTime, countIntraAnalysis);
|
|
|
|
CUData& cu = intraMode.cu;
|
|
uint32_t depth = cuGeom.depth;
|
|
|
|
cu.setPartSizeSubParts(SIZE_2Nx2N);
|
|
cu.setPredModeSubParts(MODE_INTRA);
|
|
|
|
const uint32_t initTuDepth = 0;
|
|
uint32_t log2TrSize = cuGeom.log2CUSize - initTuDepth;
|
|
uint32_t tuSize = 1 << log2TrSize;
|
|
const uint32_t absPartIdx = 0;
|
|
|
|
// Reference sample smoothing
|
|
IntraNeighbors intraNeighbors;
|
|
initIntraNeighbors(cu, absPartIdx, initTuDepth, true, &intraNeighbors);
|
|
initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, ALL_IDX);
|
|
|
|
const pixel* fenc = intraMode.fencYuv->m_buf[0];
|
|
uint32_t stride = intraMode.fencYuv->m_size;
|
|
|
|
int sad, bsad;
|
|
uint32_t bits, bbits, mode, bmode;
|
|
uint64_t cost, bcost;
|
|
|
|
// 33 Angle modes once
|
|
int scaleTuSize = tuSize;
|
|
int scaleStride = stride;
|
|
int costShift = 0;
|
|
int sizeIdx = log2TrSize - 2;
|
|
|
|
if (tuSize > 32)
|
|
{
|
|
// CU is 64x64, we scale to 32x32 and adjust required parameters
|
|
primitives.scale2D_64to32(m_fencScaled, fenc, stride);
|
|
fenc = m_fencScaled;
|
|
|
|
pixel nScale[129];
|
|
intraNeighbourBuf[1][0] = intraNeighbourBuf[0][0];
|
|
primitives.scale1D_128to64(nScale + 1, intraNeighbourBuf[0] + 1);
|
|
|
|
// we do not estimate filtering for downscaled samples
|
|
memcpy(&intraNeighbourBuf[0][1], &nScale[1], 2 * 64 * sizeof(pixel)); // Top & Left pixels
|
|
memcpy(&intraNeighbourBuf[1][1], &nScale[1], 2 * 64 * sizeof(pixel));
|
|
|
|
scaleTuSize = 32;
|
|
scaleStride = 32;
|
|
costShift = 2;
|
|
sizeIdx = 5 - 2; // log2(scaleTuSize) - 2
|
|
}
|
|
|
|
pixelcmp_t sa8d = primitives.cu[sizeIdx].sa8d;
|
|
int predsize = scaleTuSize * scaleTuSize;
|
|
|
|
m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);
|
|
|
|
/* there are three cost tiers for intra modes:
|
|
* pred[0] - mode probable, least cost
|
|
* pred[1], pred[2] - less probable, slightly more cost
|
|
* non-mpm modes - all cost the same (rbits) */
|
|
uint64_t mpms;
|
|
uint32_t mpmModes[3];
|
|
uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, mpmModes, mpms);
|
|
|
|
// DC
|
|
primitives.cu[sizeIdx].intra_pred[DC_IDX](m_intraPredAngs, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16));
|
|
bsad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleStride) << costShift;
|
|
bmode = mode = DC_IDX;
|
|
bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
|
|
bcost = m_rdCost.calcRdSADCost(bsad, bbits);
|
|
|
|
// PLANAR
|
|
pixel* planar = intraNeighbourBuf[0];
|
|
if (tuSize & (8 | 16 | 32))
|
|
planar = intraNeighbourBuf[1];
|
|
|
|
primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](m_intraPredAngs, scaleStride, planar, 0, 0);
|
|
sad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleStride) << costShift;
|
|
mode = PLANAR_IDX;
|
|
bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
|
|
cost = m_rdCost.calcRdSADCost(sad, bits);
|
|
COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
|
|
|
|
bool allangs = true;
|
|
if (primitives.cu[sizeIdx].intra_pred_allangs)
|
|
{
|
|
primitives.cu[sizeIdx].transpose(m_fencTransposed, fenc, scaleStride);
|
|
primitives.cu[sizeIdx].intra_pred_allangs(m_intraPredAngs, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16));
|
|
}
|
|
else
|
|
allangs = false;
|
|
|
|
#define TRY_ANGLE(angle) \
|
|
if (allangs) { \
|
|
if (angle < 18) \
|
|
sad = sa8d(m_fencTransposed, scaleTuSize, &m_intraPredAngs[(angle - 2) * predsize], scaleTuSize) << costShift; \
|
|
else \
|
|
sad = sa8d(fenc, scaleStride, &m_intraPredAngs[(angle - 2) * predsize], scaleTuSize) << costShift; \
|
|
bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, angle) : rbits; \
|
|
cost = m_rdCost.calcRdSADCost(sad, bits); \
|
|
} else { \
|
|
int filter = !!(g_intraFilterFlags[angle] & scaleTuSize); \
|
|
primitives.cu[sizeIdx].intra_pred[angle](m_intraPredAngs, scaleTuSize, intraNeighbourBuf[filter], angle, scaleTuSize <= 16); \
|
|
sad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleTuSize) << costShift; \
|
|
bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, angle) : rbits; \
|
|
cost = m_rdCost.calcRdSADCost(sad, bits); \
|
|
}
|
|
|
|
if (m_param->bEnableFastIntra)
|
|
{
|
|
int asad = 0;
|
|
uint32_t lowmode, highmode, amode = 5, abits = 0;
|
|
uint64_t acost = MAX_INT64;
|
|
|
|
/* pick the best angle, sampling at distance of 5 */
|
|
for (mode = 5; mode < 35; mode += 5)
|
|
{
|
|
TRY_ANGLE(mode);
|
|
COPY4_IF_LT(acost, cost, amode, mode, asad, sad, abits, bits);
|
|
}
|
|
|
|
/* refine best angle at distance 2, then distance 1 */
|
|
for (uint32_t dist = 2; dist >= 1; dist--)
|
|
{
|
|
lowmode = amode - dist;
|
|
highmode = amode + dist;
|
|
|
|
X265_CHECK(lowmode >= 2 && lowmode <= 34, "low intra mode out of range\n");
|
|
TRY_ANGLE(lowmode);
|
|
COPY4_IF_LT(acost, cost, amode, lowmode, asad, sad, abits, bits);
|
|
|
|
X265_CHECK(highmode >= 2 && highmode <= 34, "high intra mode out of range\n");
|
|
TRY_ANGLE(highmode);
|
|
COPY4_IF_LT(acost, cost, amode, highmode, asad, sad, abits, bits);
|
|
}
|
|
|
|
if (amode == 33)
|
|
{
|
|
TRY_ANGLE(34);
|
|
COPY4_IF_LT(acost, cost, amode, 34, asad, sad, abits, bits);
|
|
}
|
|
|
|
COPY4_IF_LT(bcost, acost, bmode, amode, bsad, asad, bbits, abits);
|
|
}
|
|
else // calculate and search all intra prediction angles for lowest cost
|
|
{
|
|
for (mode = 2; mode < 35; mode++)
|
|
{
|
|
TRY_ANGLE(mode);
|
|
COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
|
|
}
|
|
}
|
|
|
|
cu.setLumaIntraDirSubParts((uint8_t)bmode, absPartIdx, depth + initTuDepth);
|
|
intraMode.initCosts();
|
|
intraMode.totalBits = bbits;
|
|
intraMode.distortion = bsad;
|
|
intraMode.sa8dCost = bcost;
|
|
intraMode.sa8dBits = bbits;
|
|
X265_CHECK(intraMode.ok(), "intra mode is not ok");
|
|
}
|
|
|
|
/* Performs the RD coding of a 2Nx2N intra CU inside a non-intra slice, using
 * the luma direction already recorded in the cu: codes luma with
 * codeIntraLumaQT, extracts the result into the mode's recon buffer,
 * estimates chroma (skipped for 4:0:0, which has no chroma planes, the same
 * way checkIntra does), counts the CU syntax bits and fills in the
 * distortion, bit and energy fields of intraMode before updating its cost */
void Search::encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
{
    ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]);

    CUData& cu = intraMode.cu;
    Yuv* reconYuv = &intraMode.reconYuv;

    X265_CHECK(cu.m_partSize[0] == SIZE_2Nx2N, "encodeIntraInInter does not expect NxN intra\n");
    X265_CHECK(!m_slice->isIntra(), "encodeIntraInInter does not expect to be used in I slices\n");

    uint32_t tuDepthRange[2];
    cu.getIntraTUQtDepthRange(tuDepthRange, 0);

    m_entropyCoder.load(m_rqt[cuGeom.depth].cur);

    Cost icosts;
    codeIntraLumaQT(intraMode, cuGeom, 0, 0, false, icosts, tuDepthRange);
    extractIntraResultQT(cu, *reconYuv, 0, 0);

    intraMode.lumaDistortion = icosts.distortion;
    /* monochrome pictures have no chroma to estimate */
    if (m_csp != X265_CSP_I400)
        intraMode.chromaDistortion = estIntraPredChromaQT(intraMode, cuGeom, NULL);
    else
        intraMode.chromaDistortion = 0;
    intraMode.distortion = intraMode.lumaDistortion + intraMode.chromaDistortion;

    /* CU header bits */
    m_entropyCoder.resetBits();
    if (m_slice->m_pps->bTransquantBypassEnabled)
        m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
    m_entropyCoder.codeSkipFlag(cu, 0);
    m_entropyCoder.codePredMode(cu.m_predMode[0]);
    m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
    m_entropyCoder.codePredInfo(cu, 0);
    intraMode.mvBits += m_entropyCoder.getNumberOfWrittenBits();

    /* coefficient bits are whatever is written beyond the header */
    bool bCodeDQP = m_slice->m_pps->bUseDQP;
    m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);

    intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
    intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits;
    if (m_rdCost.m_psyRd)
    {
        const Yuv* fencYuv = intraMode.fencYuv;
        intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
    }
    intraMode.resEnergy = primitives.cu[cuGeom.log2CUSize - 2].sse_pp(intraMode.fencYuv->m_buf[0], intraMode.fencYuv->m_size, intraMode.predYuv.m_buf[0], intraMode.predYuv.m_size);
    m_entropyCoder.store(intraMode.contexts);
    updateModeCost(intraMode);
    checkDQP(intraMode, cuGeom);
}
|
|
|
|
/* Chooses and codes the luma intra direction of every PU of the CU (one PU
 * for 2Nx2N, four otherwise).
 *
 * Unless sharedModes supplies the direction for each PU, all 35 modes are
 * ranked by an sa8d based cost, a short list of candidates is measured with
 * RDO without TU splits, and the winner is recorded. The chosen mode is then
 * coded once more (with TU splits allowed, or through the transform-skip
 * path when that is enabled) and its result is extracted into the mode's
 * recon buffer.
 *
 * returns the summed luma distortion of all PUs */
uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2], uint8_t* sharedModes)
{
    CUData& cu = intraMode.cu;
    Yuv* reconYuv = &intraMode.reconYuv;
    Yuv* predYuv = &intraMode.predYuv;
    const Yuv* fencYuv = intraMode.fencYuv;

    const bool bSplitPU = cu.m_partSize[0] != SIZE_2Nx2N;
    const uint32_t depth = cuGeom.depth;
    const uint32_t initTuDepth = bSplitPU;
    const uint32_t numPU = 1 << (2 * initTuDepth);
    const uint32_t log2TrSize = cuGeom.log2CUSize - initTuDepth;
    const uint32_t tuSize = 1 << log2TrSize;
    const uint32_t qNumParts = cuGeom.numPartitions >> 2;
    uint32_t sizeIdx = log2TrSize - 2; /* replaced by the 32x32 index when the source is downscaled */
    uint32_t absPartIdx = 0;
    uint32_t totalDistortion = 0;

    const bool bTrySkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0] && bSplitPU;

    // loop over partitions
    for (uint32_t puIdx = 0; puIdx < numPU; puIdx++, absPartIdx += qNumParts)
    {
        uint32_t bmode = 0;

        if (sharedModes)
            bmode = sharedModes[puIdx];
        else
        {
            uint64_t candCostList[MAX_RD_INTRA_MODES];
            uint32_t rdModeList[MAX_RD_INTRA_MODES];
            uint64_t bcost;
            int maxCandCount = 2 + m_param->rdLevel + ((depth + initTuDepth) >> 1);

            {
                ProfileCUScope(intraMode.cu, intraAnalysisElapsedTime, countIntraAnalysis);

                // Reference sample smoothing
                IntraNeighbors intraNeighbors;
                initIntraNeighbors(cu, absPartIdx, initTuDepth, true, &intraNeighbors);
                initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, ALL_IDX);

                // determine set of modes to be tested (using prediction signal only)
                const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
                uint32_t stride = predYuv->m_size;

                int scaleTuSize = tuSize;
                int scaleStride = stride;
                int costShift = 0;

                if (tuSize > 32)
                {
                    // origin is 64x64, we scale to 32x32 and setup required parameters
                    primitives.scale2D_64to32(m_fencScaled, fenc, stride);
                    fenc = m_fencScaled;

                    pixel nScale[129];
                    intraNeighbourBuf[1][0] = intraNeighbourBuf[0][0];
                    primitives.scale1D_128to64(nScale + 1, intraNeighbourBuf[0] + 1);

                    memcpy(&intraNeighbourBuf[0][1], &nScale[1], 2 * 64 * sizeof(pixel));
                    memcpy(&intraNeighbourBuf[1][1], &nScale[1], 2 * 64 * sizeof(pixel));

                    scaleTuSize = 32;
                    scaleStride = 32;
                    costShift = 2;
                    sizeIdx = 5 - 2; // log2(scaleTuSize) - 2
                }

                m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);

                /* there are three cost tiers for intra modes:
                 *  pred[0]          - mode probable, least cost
                 *  pred[1], pred[2] - less probable, slightly more cost
                 *  non-mpm modes    - all cost the same (rbits) */
                uint64_t mpms;
                uint32_t mpmModes[3];
                uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, mpmModes, mpms);

                pixelcmp_t sa8d = primitives.cu[sizeIdx].sa8d;
                uint64_t modeCosts[35];

                // DC
                primitives.cu[sizeIdx].intra_pred[DC_IDX](m_intraPred, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16));
                uint32_t bits = (mpms & ((uint64_t)1 << DC_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, DC_IDX) : rbits;
                uint32_t sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift;
                modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits);

                // PLANAR, predicted from the second neighbor buffer for sizes 8..32
                pixel* planar = (tuSize >= 8 && tuSize <= 32) ? intraNeighbourBuf[1] : intraNeighbourBuf[0];

                primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](m_intraPred, scaleStride, planar, 0, 0);
                bits = (mpms & ((uint64_t)1 << PLANAR_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, PLANAR_IDX) : rbits;
                sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift;
                modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits);
                COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]);

                /* angular predictions: when an all-angles primitive exists every
                 * prediction is generated up front and angles below 18 are compared
                 * against the transposed source, otherwise each angle is predicted
                 * on its own */
                const bool bAllAngs = !!primitives.cu[sizeIdx].intra_pred_allangs;
                const int predSize = scaleTuSize * scaleTuSize;
                if (bAllAngs)
                {
                    primitives.cu[sizeIdx].transpose(m_fencTransposed, fenc, scaleStride);
                    primitives.cu[sizeIdx].intra_pred_allangs(m_intraPredAngs, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16));
                }
                for (int mode = 2; mode < 35; mode++)
                {
                    bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
                    if (bAllAngs)
                    {
                        if (mode < 18)
                            sad = sa8d(m_fencTransposed, scaleTuSize, &m_intraPredAngs[(mode - 2) * predSize], scaleTuSize) << costShift;
                        else
                            sad = sa8d(fenc, scaleStride, &m_intraPredAngs[(mode - 2) * predSize], scaleTuSize) << costShift;
                    }
                    else
                    {
                        int filter = !!(g_intraFilterFlags[mode] & scaleTuSize);
                        primitives.cu[sizeIdx].intra_pred[mode](m_intraPred, scaleTuSize, intraNeighbourBuf[filter], mode, scaleTuSize <= 16);
                        sad = sa8d(fenc, scaleStride, m_intraPred, scaleTuSize) << costShift;
                    }
                    modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
                    COPY1_IF_LT(bcost, modeCosts[mode]);
                }

                /* Find the top maxCandCount candidate modes with cost within 12.5% of
                 * best or among the most probable modes. maxCandCount is derived from
                 * the rdLevel and depth. In general we want to try more modes at
                 * slower RD levels and at higher depths */
                for (int i = 0; i < maxCandCount; i++)
                    candCostList[i] = MAX_INT64;

                uint64_t paddedBcost = bcost + (bcost >> 3); // 1.125 * bcost
                for (int mode = 0; mode < 35; mode++)
                {
                    const bool bIsMpm = !!(mpms & ((uint64_t)1 << mode));
                    if (modeCosts[mode] < paddedBcost || bIsMpm)
                        updateCandList(mode, modeCosts[mode], maxCandCount, rdModeList, candCostList);
                }
            }

            /* measure best candidates using simple RDO (no TU splits) */
            bcost = MAX_INT64;
            for (int i = 0; i < maxCandCount; i++)
            {
                /* unused slots still hold the initial cost and end the list */
                if (candCostList[i] == MAX_INT64)
                    break;

                ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]);

                m_entropyCoder.load(m_rqt[depth].cur);
                cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTuDepth);

                Cost icosts;
                if (bTrySkip)
                    codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
                else
                    codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, false, icosts, depthRange);
                COPY2_IF_LT(bcost, icosts.rdcost, bmode, rdModeList[i]);
            }
        }

        ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]);

        /* remeasure best mode, allowing TU splits */
        cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTuDepth);
        m_entropyCoder.load(m_rqt[depth].cur);

        Cost icosts;
        if (bTrySkip)
            codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
        else
            codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, true, icosts, depthRange);
        totalDistortion += icosts.distortion;

        extractIntraResultQT(cu, *reconYuv, initTuDepth, absPartIdx);

        // set reconstruction for next intra prediction blocks
        if (puIdx != numPU - 1)
        {
            /* This has important implications for parallelism and RDO. It is writing intermediate results into the
             * output recon picture, so it cannot proceed in parallel with anything else when doing INTRA_NXN. Also
             * it is not updating m_rdContexts[depth].cur for the later PUs which I suspect is slightly wrong. I think
             * that the contexts should be tracked through each PU */
            PicYuv* reconPic = m_frame->m_reconPic;
            pixel* dst = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
            uint32_t dststride = reconPic->m_stride;
            const pixel* src = reconYuv->getLumaAddr(absPartIdx);
            uint32_t srcstride = reconYuv->m_size;
            primitives.cu[log2TrSize - 2].copy_pp(dst, dststride, src, srcstride);
        }
    }

    if (numPU > 1)
    {
        /* merge the depth-1 luma CBFs of the four PUs into every partition */
        uint32_t combCbfY = 0;
        for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
            combCbfY |= cu.getCbf(qPartIdx, TEXT_LUMA, 1);

        const uint32_t partCount = 4 * qNumParts;
        for (uint32_t part = 0; part < partCount; part++)
            cu.m_cbf[0][part] |= combCbfY;
    }

    // TODO: remove this
    m_entropyCoder.load(m_rqt[depth].cur);

    return totalDistortion;
}
|
|
|
|
/* Picks the chroma intra direction for the CU by sa8d alone: every mode
 * returned by cu.getAllowedChromaDir is predicted for both chroma planes,
 * the sa8d of the prediction against the source is summed over U and V, and
 * the mode with the lowest sum is recorded in the cu for the whole CU.
 *
 * When the chroma block is larger than 32 the measurement is done on a
 * block of log2 size 5 and the cost is shifted left by 2 */
void Search::getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom)
{
    CUData& cu = intraMode.cu;
    const Yuv* fencYuv = intraMode.fencYuv;
    Yuv* predYuv = &intraMode.predYuv;

    /* each buffer is addressed with the stride of the Yuv that owns it */
    const uint32_t fencStride = fencYuv->m_csize;
    const uint32_t predStride = predYuv->m_csize;

    uint32_t bestMode = 0;
    uint64_t bestCost = MAX_INT64;
    uint32_t modeList[NUM_CHROMA_MODE];

    uint32_t log2TrSizeC = cu.m_log2CUSize[0] - m_hChromaShift;
    uint32_t tuSize = 1 << log2TrSizeC;
    uint32_t tuDepth = 0;
    int32_t costShift = 0;

    if (tuSize > 32)
    {
        tuDepth = 1;
        costShift = 2;
        log2TrSizeC = 5;
    }

    IntraNeighbors intraNeighbors;
    initIntraNeighbors(cu, 0, tuDepth, false, &intraNeighbors);
    cu.getAllowedChromaDir(0, modeList);

    // check chroma modes
    for (uint32_t mode = 0; mode < NUM_CHROMA_MODE; mode++)
    {
        /* resolve "derived from luma" and the 4:2:2 angle remapping */
        uint32_t chromaPredMode = modeList[mode];
        if (chromaPredMode == DM_CHROMA_IDX)
            chromaPredMode = cu.m_lumaIntraDir[0];
        if (m_csp == X265_CSP_I422)
            chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];

        uint64_t cost = 0;
        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
        {
            const pixel* fenc = fencYuv->m_buf[chromaId];
            pixel* pred = predYuv->m_buf[chromaId];
            Predict::initAdiPatternChroma(cu, cuGeom, 0, intraNeighbors, chromaId);
            // get prediction signal
            predIntraChromaAng(chromaPredMode, pred, predStride, log2TrSizeC);
            cost += primitives.cu[log2TrSizeC - 2].sa8d(fenc, fencStride, pred, predStride) << costShift;
        }

        if (cost < bestCost)
        {
            bestCost = cost;
            /* record the list entry, not the resolved/remapped direction */
            bestMode = modeList[mode];
        }
    }

    cu.setChromIntraDirSubParts(bestMode, 0, cuGeom.depth);
}
|
|
|
|
/* Run the chroma intra mode decision for a CU.  For every chroma TU section
 * each candidate mode is coded with codeIntraChromaQt(), its bits are measured
 * with the entropy coder and the candidate with the lowest RD cost (psy-RD when
 * enabled) is kept.
 *
 * intraMode         - mode under evaluation; its CU and reconYuv are updated
 * cuGeom            - geometry of the CU
 * sharedChromaModes - when non-NULL and the CU is not quad split, only
 *                     sharedChromaModes[0] is evaluated
 *
 * Returns the sum of the winning candidates' distortion over all sections. */
uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom, uint8_t* sharedChromaModes)
{
    CUData& cu = intraMode.cu;
    Yuv& reconYuv = intraMode.reconYuv;

    const uint32_t depth = cuGeom.depth;
    /* 1 when chroma is processed as four sections (non-2Nx2N CU in 4:4:4), else 0 */
    const uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N && m_csp == X265_CSP_I444;
    const uint32_t log2TrSize = cuGeom.log2CUSize - initTuDepth;
    const uint32_t absPartStep = cuGeom.numPartitions;
    const int sizeIdx = partitionFromLog2Size(log2TrSize);

    uint32_t distSum = 0;

    TURecurse tuIterator((initTuDepth == 0) ? DONT_SPLIT : QUAD_SPLIT, absPartStep, 0);

    do
    {
        const uint32_t partIdxC = tuIterator.absPartIdxTURelCU;

        /* build the candidate list for this section */
        uint32_t candModes[NUM_CHROMA_MODE];
        uint32_t firstCand = 0;
        uint32_t endCand = NUM_CHROMA_MODE;

        if (sharedChromaModes && !initTuDepth)
        {
            for (uint32_t i = 0; i < NUM_CHROMA_MODE; i++)
                candModes[i] = sharedChromaModes[0];
            endCand = 1;
        }
        else
            cu.getAllowedChromaDir(partIdxC, candModes);

        uint32_t winnerMode = 0;
        uint32_t winnerDist = 0;
        uint64_t winnerCost = MAX_INT64;

        /* try every candidate */
        for (uint32_t i = firstCand; i < endCand; i++)
        {
            /* start each candidate from the saved context models */
            m_entropyCoder.load(m_rqt[depth].cur);

            cu.setChromIntraDirSubParts(candModes[i], partIdxC, depth + initTuDepth);
            uint32_t psyEnergy = 0;
            uint32_t dist = codeIntraChromaQt(intraMode, cuGeom, initTuDepth, partIdxC, psyEnergy);

            if (m_slice->m_pps->bTransformSkipEnabled)
                m_entropyCoder.load(m_rqt[depth].cur);

            m_entropyCoder.resetBits();

            /* signal the chroma prediction mode, once per CU or once per quadrant */
            bool bCodeDir;
            if (cu.m_partSize[0] == SIZE_2Nx2N || m_csp != X265_CSP_I444)
                bCodeDir = !partIdxC;
            else
            {
                uint32_t quadParts = cuGeom.numPartitions >> 2;
                bCodeDir = !(partIdxC & (quadParts - 1));
            }
            if (bCodeDir)
                m_entropyCoder.codeIntraDirChroma(cu, partIdxC, candModes);

            codeSubdivCbfQTChroma(cu, initTuDepth, partIdxC);
            codeCoeffQTChroma(cu, initTuDepth, partIdxC, TEXT_CHROMA_U);
            codeCoeffQTChroma(cu, initTuDepth, partIdxC, TEXT_CHROMA_V);
            uint32_t bits = m_entropyCoder.getNumberOfWrittenBits();

            uint64_t cost;
            if (m_rdCost.m_psyRd)
                cost = m_rdCost.calcPsyRdCost(dist, bits, psyEnergy);
            else
                cost = m_rdCost.calcRdCost(dist, bits);

            if (cost >= winnerCost)
                continue;

            /* new best: remember it and stash recon, cbf and transform-skip flags */
            winnerCost = cost;
            winnerDist = dist;
            winnerMode = candModes[i];
            extractIntraResultChromaQT(cu, reconYuv, partIdxC, initTuDepth);
            for (int plane = 1; plane <= 2; plane++)
            {
                memcpy(m_qtTempCbf[plane], cu.m_cbf[plane] + partIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
            }
            for (int plane = 1; plane <= 2; plane++)
            {
                memcpy(m_qtTempTransformSkipFlag[plane], cu.m_transformSkip[plane] + partIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
            }
        }

        if (!tuIterator.isLastSection())
        {
            /* copy the winning Cb and Cr reconstruction into the recon picture */
            const uint32_t zorder = cuGeom.absPartIdx + partIdxC;
            PicYuv* reconPic = m_frame->m_reconPic;
            const uint32_t picStride = reconPic->m_strideC;

            pixel* cbDst = reconPic->getCbAddr(cu.m_cuAddr, zorder);
            const pixel* cbSrc = reconYuv.getCbAddr(partIdxC);
            primitives.chroma[m_csp].cu[sizeIdx].copy_pp(cbDst, picStride, cbSrc, reconYuv.m_csize);

            pixel* crDst = reconPic->getCrAddr(cu.m_cuAddr, zorder);
            const pixel* crSrc = reconYuv.getCrAddr(partIdxC);
            primitives.chroma[m_csp].cu[sizeIdx].copy_pp(crDst, picStride, crSrc, reconYuv.m_csize);
        }

        /* put the winner's flags and mode back into the CU */
        for (int plane = 1; plane <= 2; plane++)
        {
            memcpy(cu.m_cbf[plane] + partIdxC, m_qtTempCbf[plane], tuIterator.absPartIdxStep * sizeof(uint8_t));
        }
        for (int plane = 1; plane <= 2; plane++)
        {
            memcpy(cu.m_transformSkip[plane] + partIdxC, m_qtTempTransformSkipFlag[plane], tuIterator.absPartIdxStep * sizeof(uint8_t));
        }
        cu.setChromIntraDirSubParts(winnerMode, partIdxC, depth + initTuDepth);
        distSum += winnerDist;
    }
    while (tuIterator.isNextSection());

    if (initTuDepth != 0)
    {
        /* OR the four sections' depth-1 cbf values into every partition of the CU */
        const uint32_t quadParts = tuIterator.absPartIdxStep;
        uint32_t anyCbfU = 0;
        uint32_t anyCbfV = 0;

        for (uint32_t q = 0; q < 4; ++q)
        {
            anyCbfU |= cu.getCbf(q * quadParts, TEXT_CHROMA_U, 1);
            anyCbfV |= cu.getCbf(q * quadParts, TEXT_CHROMA_V, 1);
        }

        for (uint32_t part = 0; part < 4 * quadParts; part++)
        {
            cu.m_cbf[1][part] |= anyCbfU;
            cu.m_cbf[2][part] |= anyCbfV;
        }
    }

    /* TODO: remove this */
    m_entropyCoder.load(m_rqt[depth].cur);
    return distSum;
}
|
|
|
|
/* estimation of best merge coding of an inter PU (2Nx2N merge PUs are evaluated as their own mode) */
|
|
uint32_t Search::mergeEstimation(CUData& cu, const CUGeom& cuGeom, const PredictionUnit& pu, int puIdx, MergeData& m)
|
|
{
|
|
X265_CHECK(cu.m_partSize[0] != SIZE_2Nx2N, "mergeEstimation() called for 2Nx2N\n");
|
|
|
|
MVField candMvField[MRG_MAX_NUM_CANDS][2];
|
|
uint8_t candDir[MRG_MAX_NUM_CANDS];
|
|
uint32_t numMergeCand = cu.getInterMergeCandidates(pu.puAbsPartIdx, puIdx, candMvField, candDir);
|
|
|
|
if (cu.isBipredRestriction())
|
|
{
|
|
/* do not allow bidir merge candidates if PU is smaller than 8x8, drop L1 reference */
|
|
for (uint32_t mergeCand = 0; mergeCand < numMergeCand; ++mergeCand)
|
|
{
|
|
if (candDir[mergeCand] == 3)
|
|
{
|
|
candDir[mergeCand] = 1;
|
|
candMvField[mergeCand][1].refIdx = REF_NOT_VALID;
|
|
}
|
|
}
|
|
}
|
|
|
|
Yuv& tempYuv = m_rqt[cuGeom.depth].tmpPredYuv;
|
|
|
|
uint32_t outCost = MAX_UINT;
|
|
for (uint32_t mergeCand = 0; mergeCand < numMergeCand; ++mergeCand)
|
|
{
|
|
/* Prevent TMVP candidates from using unavailable reference pixels */
|
|
if (m_bFrameParallel &&
|
|
(candMvField[mergeCand][0].mv.y >= (m_param->searchRange + 1) * 4 ||
|
|
candMvField[mergeCand][1].mv.y >= (m_param->searchRange + 1) * 4))
|
|
continue;
|
|
|
|
cu.m_mv[0][pu.puAbsPartIdx] = candMvField[mergeCand][0].mv;
|
|
cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)candMvField[mergeCand][0].refIdx;
|
|
cu.m_mv[1][pu.puAbsPartIdx] = candMvField[mergeCand][1].mv;
|
|
cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)candMvField[mergeCand][1].refIdx;
|
|
|
|
motionCompensation(cu, pu, tempYuv, true, m_me.bChromaSATD);
|
|
|
|
uint32_t costCand = m_me.bufSATD(tempYuv.getLumaAddr(pu.puAbsPartIdx), tempYuv.m_size);
|
|
if (m_me.bChromaSATD)
|
|
costCand += m_me.bufChromaSATD(tempYuv, pu.puAbsPartIdx);
|
|
|
|
uint32_t bitsCand = getTUBits(mergeCand, numMergeCand);
|
|
costCand = costCand + m_rdCost.getCost(bitsCand);
|
|
if (costCand < outCost)
|
|
{
|
|
outCost = costCand;
|
|
m.bits = bitsCand;
|
|
m.index = mergeCand;
|
|
}
|
|
}
|
|
|
|
m.mvField[0] = candMvField[m.index][0];
|
|
m.mvField[1] = candMvField[m.index][1];
|
|
m.dir = candDir[m.index];
|
|
|
|
return outCost;
|
|
}
|
|
|
|
/* find the lowres motion vector from lookahead in middle of current PU */
|
|
MV Search::getLowresMV(const CUData& cu, const PredictionUnit& pu, int list, int ref)
|
|
{
|
|
int diffPoc = abs(m_slice->m_poc - m_slice->m_refPOCList[list][ref]);
|
|
if (diffPoc > m_param->bframes + 1)
|
|
/* poc difference is out of range for lookahead */
|
|
return 0;
|
|
|
|
MV* mvs = m_frame->m_lowres.lowresMvs[list][diffPoc - 1];
|
|
if (mvs[0].x == 0x7FFF)
|
|
/* this motion search was not estimated by lookahead */
|
|
return 0;
|
|
|
|
uint32_t block_x = (cu.m_cuPelX + g_zscanToPelX[pu.puAbsPartIdx] + pu.width / 2) >> 4;
|
|
uint32_t block_y = (cu.m_cuPelY + g_zscanToPelY[pu.puAbsPartIdx] + pu.height / 2) >> 4;
|
|
uint32_t idx = block_y * m_frame->m_lowres.maxBlocksInRow + block_x;
|
|
|
|
X265_CHECK(block_x < m_frame->m_lowres.maxBlocksInRow, "block_x is too high\n");
|
|
X265_CHECK(block_y < m_frame->m_lowres.maxBlocksInCol, "block_y is too high\n");
|
|
|
|
return mvs[idx] << 1; /* scale up lowres mv */
|
|
}
|
|
|
|
/* Pick between the two AMVP candidates which is the best one to use as
|
|
* MVP for the motion search, based on SAD cost */
|
|
int Search::selectMVP(const CUData& cu, const PredictionUnit& pu, const MV amvp[AMVP_NUM_CANDS], int list, int ref)
|
|
{
|
|
if (amvp[0] == amvp[1])
|
|
return 0;
|
|
|
|
Yuv& tmpPredYuv = m_rqt[cu.m_cuDepth[0]].tmpPredYuv;
|
|
uint32_t costs[AMVP_NUM_CANDS];
|
|
|
|
for (int i = 0; i < AMVP_NUM_CANDS; i++)
|
|
{
|
|
MV mvCand = amvp[i];
|
|
|
|
// NOTE: skip mvCand if Y is > merange and -FN>1
|
|
if (m_bFrameParallel && (mvCand.y >= (m_param->searchRange + 1) * 4))
|
|
costs[i] = m_me.COST_MAX;
|
|
else
|
|
{
|
|
cu.clipMv(mvCand);
|
|
predInterLumaPixel(pu, tmpPredYuv, *m_slice->m_refReconPicList[list][ref], mvCand);
|
|
costs[i] = m_me.bufSAD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size);
|
|
}
|
|
}
|
|
|
|
return costs[0] <= costs[1] ? 0 : 1;
|
|
}
|
|
|
|
void Search::PME::processTasks(int workerThreadId)
|
|
{
|
|
#if DETAILED_CU_STATS
|
|
int fe = mode.cu.m_encData->m_frameEncoderID;
|
|
master.m_stats[fe].countPMETasks++;
|
|
ScopedElapsedTime pmeTime(master.m_stats[fe].pmeTime);
|
|
#endif
|
|
ProfileScopeEvent(pme);
|
|
master.processPME(*this, master.m_tld[workerThreadId].analysis);
|
|
}
|
|
|
|
/* Pull motion estimation jobs from pme until none remain and run each one on
 * slave.  Job ids below refCnt[0] address list 0 references; the remaining
 * ids address list 1.  Job acquisition is serialised with pme.m_lock.
 *
 * pme   - shared job description (references per list, counters, lock)
 * slave - Search instance performing the work; configured from this (the
 *         master) when it is a different object */
void Search::processPME(PME& pme, Search& slave)
{
    /* acquire a motion estimation job, else exit early */
    pme.m_lock.acquire();
    const bool bGotJob = pme.m_jobTotal > pme.m_jobAcquired;
    int meId = bGotJob ? pme.m_jobAcquired++ : -1;
    pme.m_lock.release();

    if (!bGotJob)
        return;

    /* Setup slave Search instance for ME for master's CU */
    if (&slave != this)
    {
        slave.m_slice = m_slice;
        slave.m_frame = m_frame;
        slave.m_param = m_param;
        slave.setLambdaFromQP(pme.mode.cu, m_rdCost.m_qp);
        slave.m_me.setSourcePU(*pme.mode.fencYuv, pme.pu.ctuAddr, pme.pu.cuAbsPartIdx, pme.pu.puAbsPartIdx, pme.pu.width, pme.pu.height);
    }

    /* Perform ME, repeat until no more work is available */
    do
    {
        const int numL0Jobs = pme.m_jobs.refCnt[0];
        const int list = meId < numL0Jobs ? 0 : 1;
        const int refIdx = list ? pme.m_jobs.ref[1][meId - numL0Jobs] /* L1 */
                                : pme.m_jobs.ref[0][meId];            /* L0 */

        slave.singleMotionEstimation(*this, pme.mode, pme.pu, pme.puIdx, list, refIdx);

        /* try to take the next job */
        pme.m_lock.acquire();
        if (pme.m_jobTotal > pme.m_jobAcquired)
            meId = pme.m_jobAcquired++;
        else
            meId = -1;
        pme.m_lock.release();
    }
    while (meId >= 0);
}
|
|
|
|
/* Run one unidirectional motion search (a single list/ref pair) for a PU and
 * merge the result into interMode.bestME[part][list].  The comparison and
 * update are done while holding master.m_meLock.
 *
 * master    - Search instance owning the list selection bits and the lock
 * interMode - mode being searched; supplies neighbours, AMVP storage, bestME
 * pu        - prediction unit being searched
 * part      - PU index within the CU
 * list/ref  - reference list and reference index to search */
void Search::singleMotionEstimation(Search& master, Mode& interMode, const PredictionUnit& pu, int part, int list, int ref)
{
    /* fixed signalling overhead: list selection, MVP index, reference index */
    uint32_t bits = master.m_listSelBits[list] + MVP_IDX_BITS;
    bits += getTUBits(ref, m_slice->m_numRefIdx[list]);

    MotionData* bestME = interMode.bestME[part];

    // 12 mv candidates including lowresMV
    MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
    int numMvc = interMode.cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);

    const MV* amvp = interMode.amvpCand[list][ref];
    int mvpIdx = selectMVP(interMode.cu, pu, amvp, list, ref);
    MV mvp = amvp[mvpIdx];

    /* add the lookahead vector to the candidate list when it is non-zero */
    MV lowresMv = getLowresMV(interMode.cu, pu, list, ref);
    if (lowresMv.notZero())
        mvc[numMvc++] = lowresMv;

    MV mvmin, mvmax;
    setSearchRange(interMode.cu, mvp, m_param->searchRange, mvmin, mvmax);

    MV outmv;
    int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv);

    /* Get total cost of partition, but only include MV bit cost once */
    bits += m_me.bitcost(outmv);
    uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits);

    /* Refine MVP selection, updates: mvpIdx, bits, cost */
    mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);

    /* tie goes to the smallest ref ID, just like --no-pme */
    ScopedLock _lock(master.m_meLock);
    MotionData& best = bestME[list];
    const bool bCheaper = cost < best.cost;
    const bool bTieLowerRef = cost == best.cost && ref < best.ref;
    if (bCheaper || bTieLowerRef)
    {
        best.mv = outmv;
        best.mvp = mvp;
        best.mvpIdx = mvpIdx;
        best.ref = ref;
        best.cost = cost;
        best.bits = bits;
    }
}
|
|
|
|
/* find the best inter prediction for each PU of specified mode */
|
|
void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t refMasks[2])
|
|
{
|
|
ProfileCUScope(interMode.cu, motionEstimationElapsedTime, countMotionEstimate);
|
|
|
|
CUData& cu = interMode.cu;
|
|
Yuv* predYuv = &interMode.predYuv;
|
|
|
|
// 12 mv candidates including lowresMV
|
|
MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
|
|
|
|
const Slice *slice = m_slice;
|
|
int numPart = cu.getNumPartInter(0);
|
|
int numPredDir = slice->isInterP() ? 1 : 2;
|
|
const int* numRefIdx = slice->m_numRefIdx;
|
|
uint32_t lastMode = 0;
|
|
int totalmebits = 0;
|
|
MV mvzero(0, 0);
|
|
Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
|
|
|
|
MergeData merge;
|
|
memset(&merge, 0, sizeof(merge));
|
|
|
|
for (int puIdx = 0; puIdx < numPart; puIdx++)
|
|
{
|
|
MotionData* bestME = interMode.bestME[puIdx];
|
|
PredictionUnit pu(cu, cuGeom, puIdx);
|
|
|
|
m_me.setSourcePU(*interMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height);
|
|
|
|
/* find best cost merge candidate. note: 2Nx2N merge and bidir are handled as separate modes */
|
|
uint32_t mrgCost = numPart == 1 ? MAX_UINT : mergeEstimation(cu, cuGeom, pu, puIdx, merge);
|
|
|
|
bestME[0].cost = MAX_UINT;
|
|
bestME[1].cost = MAX_UINT;
|
|
|
|
getBlkBits((PartSize)cu.m_partSize[0], slice->isInterP(), puIdx, lastMode, m_listSelBits);
|
|
bool bDoUnidir = true;
|
|
|
|
cu.getNeighbourMV(puIdx, pu.puAbsPartIdx, interMode.interNeighbours);
|
|
|
|
/* Uni-directional prediction */
|
|
if (m_param->analysisMode == X265_ANALYSIS_LOAD && bestME[0].ref >= 0)
|
|
{
|
|
for (int list = 0; list < numPredDir; list++)
|
|
{
|
|
int ref = bestME[list].ref;
|
|
uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;
|
|
bits += getTUBits(ref, numRefIdx[list]);
|
|
|
|
int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);
|
|
|
|
const MV* amvp = interMode.amvpCand[list][ref];
|
|
int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
|
|
MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx];
|
|
|
|
MV lmv = getLowresMV(cu, pu, list, ref);
|
|
if (lmv.notZero())
|
|
mvc[numMvc++] = lmv;
|
|
|
|
setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
|
|
int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv);
|
|
|
|
/* Get total cost of partition, but only include MV bit cost once */
|
|
bits += m_me.bitcost(outmv);
|
|
uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits);
|
|
|
|
/* Refine MVP selection, updates: mvpIdx, bits, cost */
|
|
mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
|
|
|
|
if (cost < bestME[list].cost)
|
|
{
|
|
bestME[list].mv = outmv;
|
|
bestME[list].mvp = mvp;
|
|
bestME[list].mvpIdx = mvpIdx;
|
|
bestME[list].cost = cost;
|
|
bestME[list].bits = bits;
|
|
}
|
|
}
|
|
bDoUnidir = false;
|
|
}
|
|
else if (m_param->bDistributeMotionEstimation)
|
|
{
|
|
PME pme(*this, interMode, cuGeom, pu, puIdx);
|
|
pme.m_jobTotal = 0;
|
|
pme.m_jobAcquired = 1; /* reserve L0-0 or L1-0 */
|
|
|
|
uint32_t refMask = refMasks[puIdx] ? refMasks[puIdx] : (uint32_t)-1;
|
|
for (int list = 0; list < numPredDir; list++)
|
|
{
|
|
int idx = 0;
|
|
for (int ref = 0; ref < numRefIdx[list]; ref++)
|
|
{
|
|
if (!(refMask & (1 << ref)))
|
|
continue;
|
|
|
|
pme.m_jobs.ref[list][idx++] = ref;
|
|
pme.m_jobTotal++;
|
|
}
|
|
pme.m_jobs.refCnt[list] = idx;
|
|
|
|
/* the second list ref bits start at bit 16 */
|
|
refMask >>= 16;
|
|
}
|
|
|
|
if (pme.m_jobTotal > 2)
|
|
{
|
|
pme.tryBondPeers(*m_frame->m_encData->m_jobProvider, pme.m_jobTotal - 1);
|
|
|
|
processPME(pme, *this);
|
|
|
|
int ref = pme.m_jobs.refCnt[0] ? pme.m_jobs.ref[0][0] : pme.m_jobs.ref[1][0];
|
|
singleMotionEstimation(*this, interMode, pu, puIdx, 0, ref); /* L0-0 or L1-0 */
|
|
|
|
bDoUnidir = false;
|
|
|
|
ProfileCUScopeNamed(pmeWaitScope, interMode.cu, pmeBlockTime, countPMEMasters);
|
|
pme.waitForExit();
|
|
}
|
|
|
|
/* if no peer threads were bonded, fall back to doing unidirectional
|
|
* searches ourselves without overhead of singleMotionEstimation() */
|
|
}
|
|
if (bDoUnidir)
|
|
{
|
|
uint32_t refMask = refMasks[puIdx] ? refMasks[puIdx] : (uint32_t)-1;
|
|
|
|
for (int list = 0; list < numPredDir; list++)
|
|
{
|
|
for (int ref = 0; ref < numRefIdx[list]; ref++)
|
|
{
|
|
ProfileCounter(interMode.cu, totalMotionReferences[cuGeom.depth]);
|
|
|
|
if (!(refMask & (1 << ref)))
|
|
{
|
|
ProfileCounter(interMode.cu, skippedMotionReferences[cuGeom.depth]);
|
|
continue;
|
|
}
|
|
|
|
uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;
|
|
bits += getTUBits(ref, numRefIdx[list]);
|
|
|
|
int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);
|
|
|
|
const MV* amvp = interMode.amvpCand[list][ref];
|
|
int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
|
|
MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx];
|
|
|
|
MV lmv = getLowresMV(cu, pu, list, ref);
|
|
if (lmv.notZero())
|
|
mvc[numMvc++] = lmv;
|
|
|
|
setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
|
|
int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv);
|
|
|
|
/* Get total cost of partition, but only include MV bit cost once */
|
|
bits += m_me.bitcost(outmv);
|
|
uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits);
|
|
|
|
/* Refine MVP selection, updates: mvpIdx, bits, cost */
|
|
mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
|
|
|
|
if (cost < bestME[list].cost)
|
|
{
|
|
bestME[list].mv = outmv;
|
|
bestME[list].mvp = mvp;
|
|
bestME[list].mvpIdx = mvpIdx;
|
|
bestME[list].ref = ref;
|
|
bestME[list].cost = cost;
|
|
bestME[list].bits = bits;
|
|
}
|
|
}
|
|
/* the second list ref bits start at bit 16 */
|
|
refMask >>= 16;
|
|
}
|
|
}
|
|
|
|
/* Bi-directional prediction */
|
|
MotionData bidir[2];
|
|
uint32_t bidirCost = MAX_UINT;
|
|
int bidirBits = 0;
|
|
|
|
if (slice->isInterB() && !cu.isBipredRestriction() && /* biprediction is possible for this PU */
|
|
cu.m_partSize[pu.puAbsPartIdx] != SIZE_2Nx2N && /* 2Nx2N biprediction is handled elsewhere */
|
|
bestME[0].cost != MAX_UINT && bestME[1].cost != MAX_UINT)
|
|
{
|
|
bidir[0] = bestME[0];
|
|
bidir[1] = bestME[1];
|
|
|
|
int satdCost;
|
|
|
|
if (m_me.bChromaSATD)
|
|
{
|
|
cu.m_mv[0][pu.puAbsPartIdx] = bidir[0].mv;
|
|
cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref;
|
|
cu.m_mv[1][pu.puAbsPartIdx] = bidir[1].mv;
|
|
cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref;
|
|
motionCompensation(cu, pu, tmpPredYuv, true, true);
|
|
|
|
satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) +
|
|
m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx);
|
|
}
|
|
else
|
|
{
|
|
PicYuv* refPic0 = slice->m_refReconPicList[0][bestME[0].ref];
|
|
PicYuv* refPic1 = slice->m_refReconPicList[1][bestME[1].ref];
|
|
Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv;
|
|
|
|
/* Generate reference subpels */
|
|
predInterLumaPixel(pu, bidirYuv[0], *refPic0, bestME[0].mv);
|
|
predInterLumaPixel(pu, bidirYuv[1], *refPic1, bestME[1].mv);
|
|
|
|
primitives.pu[m_me.partEnum].pixelavg_pp(tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(pu.puAbsPartIdx), bidirYuv[0].m_size,
|
|
bidirYuv[1].getLumaAddr(pu.puAbsPartIdx), bidirYuv[1].m_size, 32);
|
|
satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
|
|
}
|
|
|
|
bidirBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
|
|
bidirCost = satdCost + m_rdCost.getCost(bidirBits);
|
|
|
|
bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
|
|
if (bTryZero)
|
|
{
|
|
/* Do not try zero MV if unidir motion predictors are beyond
|
|
* valid search area */
|
|
MV mvmin, mvmax;
|
|
int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight);
|
|
setSearchRange(cu, mvzero, merange, mvmin, mvmax);
|
|
mvmax.y += 2; // there is some pad for subpel refine
|
|
mvmin <<= 2;
|
|
mvmax <<= 2;
|
|
|
|
bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
|
|
bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
|
|
}
|
|
if (bTryZero)
|
|
{
|
|
/* coincident blocks of the two reference pictures */
|
|
if (m_me.bChromaSATD)
|
|
{
|
|
cu.m_mv[0][pu.puAbsPartIdx] = mvzero;
|
|
cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref;
|
|
cu.m_mv[1][pu.puAbsPartIdx] = mvzero;
|
|
cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref;
|
|
motionCompensation(cu, pu, tmpPredYuv, true, true);
|
|
|
|
satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) +
|
|
m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx);
|
|
}
|
|
else
|
|
{
|
|
const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
|
|
const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
|
|
intptr_t refStride = slice->m_mref[0][0].lumaStride;
|
|
|
|
primitives.pu[m_me.partEnum].pixelavg_pp(tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
|
|
satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
|
|
}
|
|
|
|
MV mvp0 = bestME[0].mvp;
|
|
int mvpIdx0 = bestME[0].mvpIdx;
|
|
uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
|
|
|
|
MV mvp1 = bestME[1].mvp;
|
|
int mvpIdx1 = bestME[1].mvpIdx;
|
|
uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
|
|
|
|
uint32_t cost = satdCost + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
|
|
|
|
/* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */
|
|
mvp0 = checkBestMVP(interMode.amvpCand[0][bestME[0].ref], mvzero, mvpIdx0, bits0, cost);
|
|
mvp1 = checkBestMVP(interMode.amvpCand[1][bestME[1].ref], mvzero, mvpIdx1, bits1, cost);
|
|
|
|
if (cost < bidirCost)
|
|
{
|
|
bidir[0].mv = mvzero;
|
|
bidir[1].mv = mvzero;
|
|
bidir[0].mvp = mvp0;
|
|
bidir[1].mvp = mvp1;
|
|
bidir[0].mvpIdx = mvpIdx0;
|
|
bidir[1].mvpIdx = mvpIdx1;
|
|
bidirCost = cost;
|
|
bidirBits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
|
|
}
|
|
}
|
|
}
|
|
|
|
/* select best option and store into CU */
|
|
if (mrgCost < bidirCost && mrgCost < bestME[0].cost && mrgCost < bestME[1].cost)
|
|
{
|
|
cu.m_mergeFlag[pu.puAbsPartIdx] = true;
|
|
cu.m_mvpIdx[0][pu.puAbsPartIdx] = merge.index; /* merge candidate ID is stored in L0 MVP idx */
|
|
cu.setPUInterDir(merge.dir, pu.puAbsPartIdx, puIdx);
|
|
cu.setPUMv(0, merge.mvField[0].mv, pu.puAbsPartIdx, puIdx);
|
|
cu.setPURefIdx(0, merge.mvField[0].refIdx, pu.puAbsPartIdx, puIdx);
|
|
cu.setPUMv(1, merge.mvField[1].mv, pu.puAbsPartIdx, puIdx);
|
|
cu.setPURefIdx(1, merge.mvField[1].refIdx, pu.puAbsPartIdx, puIdx);
|
|
|
|
totalmebits += merge.bits;
|
|
}
|
|
else if (bidirCost < bestME[0].cost && bidirCost < bestME[1].cost)
|
|
{
|
|
lastMode = 2;
|
|
|
|
cu.m_mergeFlag[pu.puAbsPartIdx] = false;
|
|
cu.setPUInterDir(3, pu.puAbsPartIdx, puIdx);
|
|
cu.setPUMv(0, bidir[0].mv, pu.puAbsPartIdx, puIdx);
|
|
cu.setPURefIdx(0, bestME[0].ref, pu.puAbsPartIdx, puIdx);
|
|
cu.m_mvd[0][pu.puAbsPartIdx] = bidir[0].mv - bidir[0].mvp;
|
|
cu.m_mvpIdx[0][pu.puAbsPartIdx] = bidir[0].mvpIdx;
|
|
|
|
cu.setPUMv(1, bidir[1].mv, pu.puAbsPartIdx, puIdx);
|
|
cu.setPURefIdx(1, bestME[1].ref, pu.puAbsPartIdx, puIdx);
|
|
cu.m_mvd[1][pu.puAbsPartIdx] = bidir[1].mv - bidir[1].mvp;
|
|
cu.m_mvpIdx[1][pu.puAbsPartIdx] = bidir[1].mvpIdx;
|
|
|
|
totalmebits += bidirBits;
|
|
}
|
|
else if (bestME[0].cost <= bestME[1].cost)
|
|
{
|
|
lastMode = 0;
|
|
|
|
cu.m_mergeFlag[pu.puAbsPartIdx] = false;
|
|
cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx);
|
|
cu.setPUMv(0, bestME[0].mv, pu.puAbsPartIdx, puIdx);
|
|
cu.setPURefIdx(0, bestME[0].ref, pu.puAbsPartIdx, puIdx);
|
|
cu.m_mvd[0][pu.puAbsPartIdx] = bestME[0].mv - bestME[0].mvp;
|
|
cu.m_mvpIdx[0][pu.puAbsPartIdx] = bestME[0].mvpIdx;
|
|
|
|
cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);
|
|
cu.setPUMv(1, mvzero, pu.puAbsPartIdx, puIdx);
|
|
|
|
totalmebits += bestME[0].bits;
|
|
}
|
|
else
|
|
{
|
|
lastMode = 1;
|
|
|
|
cu.m_mergeFlag[pu.puAbsPartIdx] = false;
|
|
cu.setPUInterDir(2, pu.puAbsPartIdx, puIdx);
|
|
cu.setPUMv(1, bestME[1].mv, pu.puAbsPartIdx, puIdx);
|
|
cu.setPURefIdx(1, bestME[1].ref, pu.puAbsPartIdx, puIdx);
|
|
cu.m_mvd[1][pu.puAbsPartIdx] = bestME[1].mv - bestME[1].mvp;
|
|
cu.m_mvpIdx[1][pu.puAbsPartIdx] = bestME[1].mvpIdx;
|
|
|
|
cu.setPURefIdx(0, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);
|
|
cu.setPUMv(0, mvzero, pu.puAbsPartIdx, puIdx);
|
|
|
|
totalmebits += bestME[1].bits;
|
|
}
|
|
|
|
motionCompensation(cu, pu, *predYuv, true, bChromaMC);
|
|
}
|
|
X265_CHECK(interMode.ok(), "inter mode is not ok");
|
|
interMode.sa8dBits += totalmebits;
|
|
}
|
|
|
|
/* Fill blockBit[0..2] with the bit estimates used for selecting list 0,
 * list 1 and bidirectional prediction respectively.
 *
 * cuMode   - partition size of the CU
 * bPSlice  - true for P slices
 * partIdx  - PU index (row of the lookup table for two-PU partitions)
 * lastMode - prediction direction chosen for the previous PU (0, 1 or 2)
 * blockBit - output array of three entries */
void Search::getBlkBits(PartSize cuMode, bool bPSlice, int partIdx, uint32_t lastMode, uint32_t blockBit[3])
{
    /* tables indexed [partIdx][lastMode][direction] */
    static const uint32_t horzSplitBits[2][3][3] =
    {
        { { 0, 0, 3 }, { 0, 0, 0 }, { 0, 0, 0 } },
        { { 5, 7, 7 }, { 7, 5, 7 }, { 9 - 3, 9 - 3, 9 - 3 } }
    };
    static const uint32_t vertSplitBits[2][3][3] =
    {
        { { 0, 2, 3 }, { 0, 0, 0 }, { 0, 0, 0 } },
        { { 5, 7, 7 }, { 7 - 2, 7 - 2, 9 - 2 }, { 9 - 3, 9 - 3, 9 - 3 } }
    };

    const uint32_t (*table)[3][3] = horzSplitBits;

    switch (cuMode)
    {
    case SIZE_2Nx2N:
    case SIZE_NxN:
        blockBit[0] = bPSlice ? 1 : 3;
        blockBit[1] = 3;
        blockBit[2] = 5;
        return;

    case SIZE_2NxN:
    case SIZE_2NxnU:
    case SIZE_2NxnD:
        table = horzSplitBits;
        break;

    case SIZE_Nx2N:
    case SIZE_nLx2N:
    case SIZE_nRx2N:
        table = vertSplitBits;
        break;

    default:
        X265_CHECK(0, "getBlkBits: unknown cuMode\n");
        return;
    }

    /* two-PU partitions: P slices use fixed values, B slices use the table */
    if (bPSlice)
    {
        blockBit[0] = 3;
        blockBit[1] = 0;
        blockBit[2] = 0;
    }
    else
        memcpy(blockBit, table[partIdx][lastMode], 3 * sizeof(uint32_t));
}
|
|
|
|
/* Check if using an alternative MVP would result in a smaller MVD + signal bits */
|
|
const MV& Search::checkBestMVP(const MV* amvpCand, const MV& mv, int& mvpIdx, uint32_t& outBits, uint32_t& outCost) const
|
|
{
|
|
int diffBits = m_me.bitcost(mv, amvpCand[!mvpIdx]) - m_me.bitcost(mv, amvpCand[mvpIdx]);
|
|
if (diffBits < 0)
|
|
{
|
|
mvpIdx = !mvpIdx;
|
|
uint32_t origOutBits = outBits;
|
|
outBits = origOutBits + diffBits;
|
|
outCost = (outCost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(outBits);
|
|
}
|
|
return amvpCand[mvpIdx];
|
|
}
|
|
|
|
void Search::setSearchRange(const CUData& cu, const MV& mvp, int merange, MV& mvmin, MV& mvmax) const
|
|
{
|
|
MV dist((int16_t)merange << 2, (int16_t)merange << 2);
|
|
mvmin = mvp - dist;
|
|
mvmax = mvp + dist;
|
|
|
|
cu.clipMv(mvmin);
|
|
cu.clipMv(mvmax);
|
|
|
|
/* Clip search range to signaled maximum MV length.
|
|
* We do not support this VUI field being changed from the default */
|
|
const int maxMvLen = (1 << 15) - 1;
|
|
mvmin.x = X265_MAX(mvmin.x, -maxMvLen);
|
|
mvmin.y = X265_MAX(mvmin.y, -maxMvLen);
|
|
mvmax.x = X265_MIN(mvmax.x, maxMvLen);
|
|
mvmax.y = X265_MIN(mvmax.y, maxMvLen);
|
|
|
|
mvmin >>= 2;
|
|
mvmax >>= 2;
|
|
|
|
/* conditional clipping for frame parallelism */
|
|
mvmin.y = X265_MIN(mvmin.y, (int16_t)m_refLagPixels);
|
|
mvmax.y = X265_MIN(mvmax.y, (int16_t)m_refLagPixels);
|
|
}
|
|
|
|
/* Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */
|
|
void Search::encodeResAndCalcRdSkipCU(Mode& interMode)
|
|
{
|
|
CUData& cu = interMode.cu;
|
|
Yuv* reconYuv = &interMode.reconYuv;
|
|
const Yuv* fencYuv = interMode.fencYuv;
|
|
Yuv* predYuv = &interMode.predYuv;
|
|
X265_CHECK(!cu.isIntra(0), "intra CU not expected\n");
|
|
uint32_t depth = cu.m_cuDepth[0];
|
|
|
|
// No residual coding : SKIP mode
|
|
|
|
cu.setPredModeSubParts(MODE_SKIP);
|
|
cu.clearCbf();
|
|
cu.setTUDepthSubParts(0, 0, depth);
|
|
|
|
reconYuv->copyFromYuv(interMode.predYuv);
|
|
|
|
// Luma
|
|
int part = partitionFromLog2Size(cu.m_log2CUSize[0]);
|
|
interMode.lumaDistortion = primitives.cu[part].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
|
|
if (m_csp != X265_CSP_I400) {
|
|
// Chroma
|
|
interMode.chromaDistortion = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[part].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
|
|
interMode.chromaDistortion += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[part].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
|
|
} else {
|
|
interMode.chromaDistortion = 0;
|
|
}
|
|
interMode.distortion = interMode.lumaDistortion + interMode.chromaDistortion;
|
|
|
|
m_entropyCoder.load(m_rqt[depth].cur);
|
|
m_entropyCoder.resetBits();
|
|
if (m_slice->m_pps->bTransquantBypassEnabled)
|
|
m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
|
|
m_entropyCoder.codeSkipFlag(cu, 0);
|
|
m_entropyCoder.codeMergeIndex(cu, 0);
|
|
|
|
interMode.mvBits = m_entropyCoder.getNumberOfWrittenBits();
|
|
interMode.coeffBits = 0;
|
|
interMode.totalBits = interMode.mvBits;
|
|
if (m_rdCost.m_psyRd)
|
|
interMode.psyEnergy = m_rdCost.psyCost(part, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
|
|
interMode.resEnergy = primitives.cu[part].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
|
|
updateModeCost(interMode);
|
|
m_entropyCoder.store(interMode.contexts);
|
|
}
|
|
|
|
/* encode residual and calculate rate-distortion for a CU block.
|
|
* Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */
|
|
void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom)
|
|
{
|
|
ProfileCUScope(interMode.cu, interRDOElapsedTime[cuGeom.depth], countInterRDO[cuGeom.depth]);
|
|
|
|
CUData& cu = interMode.cu;
|
|
Yuv* reconYuv = &interMode.reconYuv;
|
|
Yuv* predYuv = &interMode.predYuv;
|
|
uint32_t depth = cuGeom.depth;
|
|
ShortYuv* resiYuv = &m_rqt[depth].tmpResiYuv;
|
|
const Yuv* fencYuv = interMode.fencYuv;
|
|
|
|
X265_CHECK(!cu.isIntra(0), "intra CU not expected\n");
|
|
|
|
uint32_t log2CUSize = cuGeom.log2CUSize;
|
|
int sizeIdx = log2CUSize - 2;
|
|
|
|
resiYuv->subtract(*fencYuv, *predYuv, log2CUSize);
|
|
|
|
uint32_t tuDepthRange[2];
|
|
cu.getInterTUQtDepthRange(tuDepthRange, 0);
|
|
|
|
m_entropyCoder.load(m_rqt[depth].cur);
|
|
|
|
Cost costs;
|
|
estimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange);
|
|
|
|
uint32_t tqBypass = cu.m_tqBypass[0];
|
|
if (!tqBypass)
|
|
{
|
|
sse_ret_t cbf0Dist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
|
|
if (m_csp != X265_CSP_I400) {
|
|
cbf0Dist += m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize));
|
|
cbf0Dist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize));
|
|
}
|
|
|
|
/* Consider the RD cost of not signaling any residual */
|
|
m_entropyCoder.load(m_rqt[depth].cur);
|
|
m_entropyCoder.resetBits();
|
|
m_entropyCoder.codeQtRootCbfZero();
|
|
uint32_t cbf0Bits = m_entropyCoder.getNumberOfWrittenBits();
|
|
|
|
uint64_t cbf0Cost;
|
|
uint32_t cbf0Energy;
|
|
if (m_rdCost.m_psyRd)
|
|
{
|
|
cbf0Energy = m_rdCost.psyCost(log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
|
|
cbf0Cost = m_rdCost.calcPsyRdCost(cbf0Dist, cbf0Bits, cbf0Energy);
|
|
}
|
|
else
|
|
cbf0Cost = m_rdCost.calcRdCost(cbf0Dist, cbf0Bits);
|
|
|
|
if (cbf0Cost < costs.rdcost)
|
|
{
|
|
cu.clearCbf();
|
|
cu.setTUDepthSubParts(0, 0, depth);
|
|
}
|
|
}
|
|
|
|
if (cu.getQtRootCbf(0))
|
|
saveResidualQTData(cu, *resiYuv, 0, 0);
|
|
|
|
/* calculate signal bits for inter/merge/skip coded CU */
|
|
m_entropyCoder.load(m_rqt[depth].cur);
|
|
|
|
m_entropyCoder.resetBits();
|
|
if (m_slice->m_pps->bTransquantBypassEnabled)
|
|
m_entropyCoder.codeCUTransquantBypassFlag(tqBypass);
|
|
|
|
uint32_t coeffBits, bits;
|
|
if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
|
|
{
|
|
cu.setPredModeSubParts(MODE_SKIP);
|
|
|
|
/* Merge/Skip */
|
|
m_entropyCoder.codeSkipFlag(cu, 0);
|
|
m_entropyCoder.codeMergeIndex(cu, 0);
|
|
coeffBits = 0;
|
|
bits = m_entropyCoder.getNumberOfWrittenBits();
|
|
}
|
|
else
|
|
{
|
|
m_entropyCoder.codeSkipFlag(cu, 0);
|
|
m_entropyCoder.codePredMode(cu.m_predMode[0]);
|
|
m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
|
|
m_entropyCoder.codePredInfo(cu, 0);
|
|
uint32_t mvBits = m_entropyCoder.getNumberOfWrittenBits();
|
|
|
|
bool bCodeDQP = m_slice->m_pps->bUseDQP;
|
|
m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
|
|
bits = m_entropyCoder.getNumberOfWrittenBits();
|
|
|
|
coeffBits = bits - mvBits;
|
|
}
|
|
|
|
m_entropyCoder.store(interMode.contexts);
|
|
|
|
if (cu.getQtRootCbf(0))
|
|
reconYuv->addClip(*predYuv, *resiYuv, log2CUSize);
|
|
else
|
|
reconYuv->copyFromYuv(*predYuv);
|
|
|
|
// update with clipped distortion and cost (qp estimation loop uses unclipped values)
|
|
sse_ret_t bestLumaDist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
|
|
sse_ret_t bestChromaDist;
|
|
if (m_csp != X265_CSP_I400) {
|
|
bestChromaDist = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
|
|
bestChromaDist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
|
|
} else {
|
|
bestChromaDist = 0;
|
|
}
|
|
if (m_rdCost.m_psyRd)
|
|
interMode.psyEnergy = m_rdCost.psyCost(sizeIdx, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
|
|
interMode.resEnergy = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
|
|
interMode.totalBits = bits;
|
|
interMode.lumaDistortion = bestLumaDist;
|
|
interMode.chromaDistortion = bestChromaDist;
|
|
interMode.distortion = bestLumaDist + bestChromaDist;
|
|
interMode.coeffBits = coeffBits;
|
|
interMode.mvBits = bits - coeffBits;
|
|
updateModeCost(interMode);
|
|
checkDQP(interMode, cuGeom);
|
|
}
|
|
|
|
/* Transform, quantize and reconstruct the inter residual of one TU quad-tree node.
 *
 * mode        mode whose CU receives the coefficients, cbf, TU depth and transform-skip flags
 * cuGeom      geometry of the enclosing CU (depth, log2CUSize, numPartitions)
 * absPartIdx  partition index of this TU relative to the CU
 * tuDepth     depth of this TU below the CU root
 * depthRange  [0] is the log2 TU size above which splitting is allowed,
 *             [1] is the largest log2 TU size that may be coded whole
 *
 * The residual is read from and written back to m_rqt[cuGeom.depth].tmpResiYuv. When the
 * TU may not be coded whole, the function recurses into its four quadrants and merges
 * their cbf bits into this depth. Chroma is skipped entirely for 4:0:0 (X265_CSP_I400),
 * matching estimateResidualQT and encodeResAndCalcRdInterCU. */
void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2])
{
    uint32_t depth = cuGeom.depth + tuDepth;
    CUData& cu = mode.cu;
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;

    bool bCheckFull = log2TrSize <= depthRange[1];
    /* non-2Nx2N partitions are forced to split at the root while splitting is still allowed */
    if (cu.m_partSize[0] != SIZE_2Nx2N && !tuDepth && log2TrSize > depthRange[0])
        bCheckFull = false;

    if (bCheckFull)
    {
        // code full block
        uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
        bool bCodeChroma = false;
        uint32_t tuDepthC = tuDepth;
        if (m_csp != X265_CSP_I400)
        {
            bCodeChroma = true;
            if (log2TrSizeC < 2)
            {
                /* chroma is never coded below log2 size 2: it is coded once, one level up,
                 * with the first of the four luma blocks (absPartIdx multiple of 4) */
                X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
                log2TrSizeC = 2;
                tuDepthC--;
                bCodeChroma = !(absPartIdx & 3);
            }
        }

        uint32_t absPartIdxStep = cuGeom.numPartitions >> (tuDepthC * 2);
        uint32_t setCbf = 1 << tuDepth;

        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
        coeff_t* coeffCurY = cu.m_trCoeff[0] + coeffOffsetY;

        uint32_t sizeIdx = log2TrSize - 2;

        cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
        cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);

        ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
        const Yuv* fencYuv = mode.fencYuv;

        int16_t* curResiY = resiYuv.getLumaAddr(absPartIdx);
        uint32_t strideResiY = resiYuv.m_size;

        const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
        uint32_t numSigY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);

        if (numSigY)
        {
            /* coefficients survived: rebuild the residual in place and flag the block as coded */
            m_quant.invtransformNxN(cu, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSigY);
            cu.setCbfSubParts(setCbf, TEXT_LUMA, absPartIdx, depth);
        }
        else
        {
            /* nothing to code: zero the residual block and clear its cbf */
            primitives.cu[sizeIdx].blockfill_s(curResiY, strideResiY, 0);
            cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, depth);
        }

        if (bCodeChroma)
        {
            uint32_t sizeIdxC = log2TrSizeC - 2;
            uint32_t strideResiC = resiYuv.m_csize;

            uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
            coeff_t* coeffCurU = cu.m_trCoeff[1] + coeffOffsetC;
            coeff_t* coeffCurV = cu.m_trCoeff[2] + coeffOffsetC;
            /* 4:2:2 chroma is iterated as two sections (VERTICAL_SPLIT) */
            bool splitIntoSubTUs = (m_csp == X265_CSP_I422);

            TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
            do
            {
                uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
                uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);

                cu.setTransformSkipPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
                cu.setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);

                /* Cb: same transform / reconstruct-or-zero sequence as luma */
                int16_t* curResiU = resiYuv.getCbAddr(absPartIdxC);
                const pixel* fencCb = fencYuv->getCbAddr(absPartIdxC);
                uint32_t numSigU = m_quant.transformNxN(cu, fencCb, fencYuv->m_csize, curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, absPartIdxC, false);
                if (numSigU)
                {
                    m_quant.invtransformNxN(cu, curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, false, false, numSigU);
                    cu.setCbfPartRange(setCbf, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
                }
                else
                {
                    primitives.cu[sizeIdxC].blockfill_s(curResiU, strideResiC, 0);
                    cu.setCbfPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
                }

                /* Cr */
                int16_t* curResiV = resiYuv.getCrAddr(absPartIdxC);
                const pixel* fencCr = fencYuv->getCrAddr(absPartIdxC);
                uint32_t numSigV = m_quant.transformNxN(cu, fencCr, fencYuv->m_csize, curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, absPartIdxC, false);
                if (numSigV)
                {
                    m_quant.invtransformNxN(cu, curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, false, false, numSigV);
                    cu.setCbfPartRange(setCbf, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
                }
                else
                {
                    primitives.cu[sizeIdxC].blockfill_s(curResiV, strideResiC, 0);
                    cu.setCbfPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
                }
            }
            while (tuIterator.isNextSection());

            if (splitIntoSubTUs)
            {
                offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
                offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
            }
        }
    }
    else
    {
        X265_CHECK(log2TrSize > depthRange[0], "residualTransformQuantInter recursion check failure\n");

        /* number of partitions covered by each of the four quadrants */
        uint32_t qNumParts = 1 << ((log2TrSize - 1 - LOG2_UNIT_SIZE) * 2);
        uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
        {
            residualTransformQuantInter(mode, cuGeom, qPartIdx, tuDepth + 1, depthRange);
            ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
            ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
            vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
        }
        /* propagate "any child coded" into this depth's cbf bit for every covered partition */
        for (uint32_t i = 0; i < 4 * qNumParts; ++i)
        {
            cu.m_cbf[0][absPartIdx + i] |= ycbf << tuDepth;
            cu.m_cbf[1][absPartIdx + i] |= ucbf << tuDepth;
            cu.m_cbf[2][absPartIdx + i] |= vcbf << tuDepth;
        }
    }
}
|
|
|
|
uint64_t Search::estimateNullCbfCost(uint32_t &dist, uint32_t &psyEnergy, uint32_t tuDepth, TextType compId)
|
|
{
|
|
uint32_t nullBits = m_entropyCoder.estimateCbfBits(0, compId, tuDepth);
|
|
|
|
if (m_rdCost.m_psyRd)
|
|
return m_rdCost.calcPsyRdCost(dist, nullBits, psyEnergy);
|
|
else
|
|
return m_rdCost.calcRdCost(dist, nullBits);
|
|
}
|
|
|
|
void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& outCosts, const uint32_t depthRange[2])
|
|
{
|
|
CUData& cu = mode.cu;
|
|
uint32_t depth = cuGeom.depth + tuDepth;
|
|
uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
|
|
|
|
bool bCheckSplit = log2TrSize > depthRange[0];
|
|
bool bCheckFull = log2TrSize <= depthRange[1];
|
|
bool bSplitPresentFlag = bCheckSplit && bCheckFull;
|
|
|
|
if (cu.m_partSize[0] != SIZE_2Nx2N && !tuDepth && bCheckSplit)
|
|
bCheckFull = false;
|
|
|
|
X265_CHECK(bCheckFull || bCheckSplit, "check-full or check-split must be set\n");
|
|
|
|
uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
|
|
bool bCodeChroma;
|
|
uint32_t tuDepthC = tuDepth;
|
|
|
|
if (m_csp != X265_CSP_I400) {
|
|
bCodeChroma = true;
|
|
if (log2TrSizeC < 2)
|
|
{
|
|
X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
|
|
log2TrSizeC = 2;
|
|
tuDepthC--;
|
|
bCodeChroma = !(absPartIdx & 3);
|
|
}
|
|
} else {
|
|
bCodeChroma = false;
|
|
}
|
|
|
|
// code full block
|
|
Cost fullCost;
|
|
fullCost.rdcost = MAX_INT64;
|
|
|
|
uint8_t cbfFlag[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
|
|
uint32_t numSig[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
|
|
uint32_t singleBits[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
|
|
uint32_t singleDist[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
|
|
uint32_t singlePsyEnergy[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
|
|
uint32_t bestTransformMode[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
|
|
uint64_t minCost[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { MAX_INT64, MAX_INT64 }, {MAX_INT64, MAX_INT64}, {MAX_INT64, MAX_INT64} };
|
|
|
|
m_entropyCoder.store(m_rqt[depth].rqtRoot);
|
|
|
|
uint32_t trSize = 1 << log2TrSize;
|
|
const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
|
|
uint32_t absPartIdxStep = cuGeom.numPartitions >> tuDepthC * 2;
|
|
const Yuv* fencYuv = mode.fencYuv;
|
|
|
|
// code full block
|
|
if (bCheckFull)
|
|
{
|
|
uint32_t trSizeC = 1 << log2TrSizeC;
|
|
int partSize = partitionFromLog2Size(log2TrSize);
|
|
int partSizeC = partitionFromLog2Size(log2TrSizeC);
|
|
const uint32_t qtLayer = log2TrSize - 2;
|
|
uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
|
|
coeff_t* coeffCurY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
|
|
|
|
bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0];
|
|
bool checkTransformSkipY = checkTransformSkip && log2TrSize <= MAX_LOG2_TS_SIZE;
|
|
bool checkTransformSkipC = checkTransformSkip && log2TrSizeC <= MAX_LOG2_TS_SIZE;
|
|
|
|
cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
|
|
cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
|
|
|
|
if (m_bEnableRDOQ)
|
|
m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
|
|
|
|
const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
|
|
int16_t* resi = resiYuv.getLumaAddr(absPartIdx);
|
|
numSig[TEXT_LUMA][0] = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);
|
|
cbfFlag[TEXT_LUMA][0] = !!numSig[TEXT_LUMA][0];
|
|
|
|
m_entropyCoder.resetBits();
|
|
|
|
if (bSplitPresentFlag && log2TrSize > depthRange[0])
|
|
m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
|
|
fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();
|
|
|
|
// Coding luma cbf flag has been removed from here. The context for cbf flag is different for each depth.
|
|
// So it is valid if we encode coefficients and then cbfs at least for analysis.
|
|
// m_entropyCoder.codeQtCbfLuma(cbfFlag[TEXT_LUMA][0], tuDepth);
|
|
if (cbfFlag[TEXT_LUMA][0])
|
|
m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
|
|
|
|
uint32_t singleBitsPrev = m_entropyCoder.getNumberOfWrittenBits();
|
|
singleBits[TEXT_LUMA][0] = singleBitsPrev - fullCost.bits;
|
|
|
|
X265_CHECK(log2TrSize <= 5, "log2TrSize is too large\n");
|
|
uint32_t distY = primitives.cu[partSize].ssd_s(resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size);
|
|
uint32_t psyEnergyY = 0;
|
|
if (m_rdCost.m_psyRd)
|
|
psyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, (int16_t*)zeroShort, 0);
|
|
|
|
int16_t* curResiY = m_rqt[qtLayer].resiQtYuv.getLumaAddr(absPartIdx);
|
|
uint32_t strideResiY = m_rqt[qtLayer].resiQtYuv.m_size;
|
|
|
|
if (cbfFlag[TEXT_LUMA][0])
|
|
{
|
|
m_quant.invtransformNxN(cu, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSig[TEXT_LUMA][0]); //this is for inter mode only
|
|
|
|
// non-zero cost calculation for luma - This is an approximation
|
|
// finally we have to encode correct cbf after comparing with null cost
|
|
const uint32_t nonZeroDistY = primitives.cu[partSize].sse_ss(resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY);
|
|
uint32_t nzCbfBitsY = m_entropyCoder.estimateCbfBits(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
|
|
uint32_t nonZeroPsyEnergyY = 0; uint64_t singleCostY = 0;
|
|
if (m_rdCost.m_psyRd)
|
|
{
|
|
nonZeroPsyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY);
|
|
singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0], nonZeroPsyEnergyY);
|
|
}
|
|
else
|
|
singleCostY = m_rdCost.calcRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0]);
|
|
|
|
if (cu.m_tqBypass[0])
|
|
{
|
|
singleDist[TEXT_LUMA][0] = nonZeroDistY;
|
|
singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY;
|
|
}
|
|
else
|
|
{
|
|
// zero-cost calculation for luma. This is an approximation
|
|
// Initial cost calculation was also an approximation. First resetting the bit counter and then encoding zero cbf.
|
|
// Now encoding the zero cbf without writing into bitstream, keeping m_fracBits unchanged. The same is valid for chroma.
|
|
uint64_t nullCostY = estimateNullCbfCost(distY, psyEnergyY, tuDepth, TEXT_LUMA);
|
|
|
|
if (nullCostY < singleCostY)
|
|
{
|
|
cbfFlag[TEXT_LUMA][0] = 0;
|
|
singleBits[TEXT_LUMA][0] = 0;
|
|
primitives.cu[partSize].blockfill_s(curResiY, strideResiY, 0);
|
|
#if CHECKED_BUILD || _DEBUG
|
|
uint32_t numCoeffY = 1 << (log2TrSize << 1);
|
|
memset(coeffCurY, 0, sizeof(coeff_t) * numCoeffY);
|
|
#endif
|
|
if (checkTransformSkipY)
|
|
minCost[TEXT_LUMA][0] = nullCostY;
|
|
singleDist[TEXT_LUMA][0] = distY;
|
|
singlePsyEnergy[TEXT_LUMA][0] = psyEnergyY;
|
|
}
|
|
else
|
|
{
|
|
if (checkTransformSkipY)
|
|
minCost[TEXT_LUMA][0] = singleCostY;
|
|
singleDist[TEXT_LUMA][0] = nonZeroDistY;
|
|
singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY;
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if (checkTransformSkipY)
|
|
minCost[TEXT_LUMA][0] = estimateNullCbfCost(distY, psyEnergyY, tuDepth, TEXT_LUMA);
|
|
primitives.cu[partSize].blockfill_s(curResiY, strideResiY, 0);
|
|
singleDist[TEXT_LUMA][0] = distY;
|
|
singlePsyEnergy[TEXT_LUMA][0] = psyEnergyY;
|
|
}
|
|
|
|
cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
|
|
|
|
if (bCodeChroma)
|
|
{
|
|
uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
|
|
uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize;
|
|
for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
|
|
{
|
|
uint32_t distC = 0, psyEnergyC = 0;
|
|
coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
|
|
TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
|
|
|
|
do
|
|
{
|
|
uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
|
|
uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
|
|
|
|
cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
|
|
|
|
if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
|
|
m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
|
|
|
|
fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
|
|
resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
|
|
numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false);
|
|
cbfFlag[chromaId][tuIterator.section] = !!numSig[chromaId][tuIterator.section];
|
|
|
|
if (cbfFlag[chromaId][tuIterator.section])
|
|
m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTUOffset, absPartIdxC, log2TrSizeC, (TextType)chromaId);
|
|
uint32_t newBits = m_entropyCoder.getNumberOfWrittenBits();
|
|
singleBits[chromaId][tuIterator.section] = newBits - singleBitsPrev;
|
|
singleBitsPrev = newBits;
|
|
|
|
int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
|
|
distC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[log2TrSizeC - 2].ssd_s(resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize));
|
|
|
|
if (cbfFlag[chromaId][tuIterator.section])
|
|
{
|
|
m_quant.invtransformNxN(cu, curResiC, strideResiC, coeffCurC + subTUOffset,
|
|
log2TrSizeC, (TextType)chromaId, false, false, numSig[chromaId][tuIterator.section]);
|
|
|
|
// non-zero cost calculation for chroma, same as luma - This is an approximation
|
|
// finally we have to encode correct cbf after comparing with null cost
|
|
uint32_t dist = primitives.cu[partSizeC].sse_ss(resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
|
|
uint32_t nzCbfBitsC = m_entropyCoder.estimateCbfBits(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth);
|
|
uint32_t nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, dist);
|
|
uint32_t nonZeroPsyEnergyC = 0; uint64_t singleCostC = 0;
|
|
if (m_rdCost.m_psyRd)
|
|
{
|
|
nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
|
|
singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section], nonZeroPsyEnergyC);
|
|
}
|
|
else
|
|
singleCostC = m_rdCost.calcRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section]);
|
|
|
|
if (cu.m_tqBypass[0])
|
|
{
|
|
singleDist[chromaId][tuIterator.section] = nonZeroDistC;
|
|
singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
|
|
}
|
|
else
|
|
{
|
|
//zero-cost calculation for chroma. This is an approximation
|
|
uint64_t nullCostC = estimateNullCbfCost(distC, psyEnergyC, tuDepth, (TextType)chromaId);
|
|
|
|
if (nullCostC < singleCostC)
|
|
{
|
|
cbfFlag[chromaId][tuIterator.section] = 0;
|
|
singleBits[chromaId][tuIterator.section] = 0;
|
|
primitives.cu[partSizeC].blockfill_s(curResiC, strideResiC, 0);
|
|
#if CHECKED_BUILD || _DEBUG
|
|
uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
|
|
memset(coeffCurC + subTUOffset, 0, sizeof(coeff_t) * numCoeffC);
|
|
#endif
|
|
if (checkTransformSkipC)
|
|
minCost[chromaId][tuIterator.section] = nullCostC;
|
|
singleDist[chromaId][tuIterator.section] = distC;
|
|
singlePsyEnergy[chromaId][tuIterator.section] = psyEnergyC;
|
|
}
|
|
else
|
|
{
|
|
if (checkTransformSkipC)
|
|
minCost[chromaId][tuIterator.section] = singleCostC;
|
|
singleDist[chromaId][tuIterator.section] = nonZeroDistC;
|
|
singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if (checkTransformSkipC)
|
|
minCost[chromaId][tuIterator.section] = estimateNullCbfCost(distC, psyEnergyC, tuDepthC, (TextType)chromaId);
|
|
primitives.cu[partSizeC].blockfill_s(curResiC, strideResiC, 0);
|
|
singleDist[chromaId][tuIterator.section] = distC;
|
|
singlePsyEnergy[chromaId][tuIterator.section] = psyEnergyC;
|
|
}
|
|
|
|
cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
|
|
}
|
|
while (tuIterator.isNextSection());
|
|
}
|
|
}
|
|
|
|
if (checkTransformSkipY)
|
|
{
|
|
uint32_t nonZeroDistY = 0;
|
|
uint32_t nonZeroPsyEnergyY = 0;
|
|
uint64_t singleCostY = MAX_INT64;
|
|
|
|
m_entropyCoder.load(m_rqt[depth].rqtRoot);
|
|
|
|
cu.setTransformSkipSubParts(1, TEXT_LUMA, absPartIdx, depth);
|
|
|
|
if (m_bEnableRDOQ)
|
|
m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
|
|
|
|
fenc = fencYuv->getLumaAddr(absPartIdx);
|
|
resi = resiYuv.getLumaAddr(absPartIdx);
|
|
uint32_t numSigTSkipY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, m_tsCoeff, log2TrSize, TEXT_LUMA, absPartIdx, true);
|
|
|
|
if (numSigTSkipY)
|
|
{
|
|
m_entropyCoder.resetBits();
|
|
m_entropyCoder.codeQtCbfLuma(!!numSigTSkipY, tuDepth);
|
|
m_entropyCoder.codeCoeffNxN(cu, m_tsCoeff, absPartIdx, log2TrSize, TEXT_LUMA);
|
|
const uint32_t skipSingleBitsY = m_entropyCoder.getNumberOfWrittenBits();
|
|
|
|
m_quant.invtransformNxN(cu, m_tsResidual, trSize, m_tsCoeff, log2TrSize, TEXT_LUMA, false, true, numSigTSkipY);
|
|
|
|
nonZeroDistY = primitives.cu[partSize].sse_ss(resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, m_tsResidual, trSize);
|
|
|
|
if (m_rdCost.m_psyRd)
|
|
{
|
|
nonZeroPsyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, m_tsResidual, trSize);
|
|
singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, skipSingleBitsY, nonZeroPsyEnergyY);
|
|
}
|
|
else
|
|
singleCostY = m_rdCost.calcRdCost(nonZeroDistY, skipSingleBitsY);
|
|
}
|
|
|
|
if (!numSigTSkipY || minCost[TEXT_LUMA][0] < singleCostY)
|
|
cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
|
|
else
|
|
{
|
|
singleDist[TEXT_LUMA][0] = nonZeroDistY;
|
|
singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY;
|
|
cbfFlag[TEXT_LUMA][0] = !!numSigTSkipY;
|
|
bestTransformMode[TEXT_LUMA][0] = 1;
|
|
uint32_t numCoeffY = 1 << (log2TrSize << 1);
|
|
memcpy(coeffCurY, m_tsCoeff, sizeof(coeff_t) * numCoeffY);
|
|
primitives.cu[partSize].copy_ss(curResiY, strideResiY, m_tsResidual, trSize);
|
|
}
|
|
|
|
cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
|
|
}
|
|
|
|
if (bCodeChroma && checkTransformSkipC)
|
|
{
|
|
uint32_t nonZeroDistC = 0, nonZeroPsyEnergyC = 0;
|
|
uint64_t singleCostC = MAX_INT64;
|
|
uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize;
|
|
uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
|
|
|
|
m_entropyCoder.load(m_rqt[depth].rqtRoot);
|
|
|
|
for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
|
|
{
|
|
coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
|
|
TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
|
|
|
|
do
|
|
{
|
|
uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
|
|
uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
|
|
|
|
int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
|
|
|
|
cu.setTransformSkipPartRange(1, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
|
|
|
|
if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
|
|
m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
|
|
|
|
fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
|
|
resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
|
|
uint32_t numSigTSkipC = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, m_tsCoeff, log2TrSizeC, (TextType)chromaId, absPartIdxC, true);
|
|
|
|
m_entropyCoder.resetBits();
|
|
singleBits[chromaId][tuIterator.section] = 0;
|
|
|
|
if (numSigTSkipC)
|
|
{
|
|
m_entropyCoder.codeQtCbfChroma(!!numSigTSkipC, tuDepth);
|
|
m_entropyCoder.codeCoeffNxN(cu, m_tsCoeff, absPartIdxC, log2TrSizeC, (TextType)chromaId);
|
|
singleBits[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits();
|
|
|
|
m_quant.invtransformNxN(cu, m_tsResidual, trSizeC, m_tsCoeff,
|
|
log2TrSizeC, (TextType)chromaId, false, true, numSigTSkipC);
|
|
uint32_t dist = primitives.cu[partSizeC].sse_ss(resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, m_tsResidual, trSizeC);
|
|
nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, dist);
|
|
if (m_rdCost.m_psyRd)
|
|
{
|
|
nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, m_tsResidual, trSizeC);
|
|
singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section], nonZeroPsyEnergyC);
|
|
}
|
|
else
|
|
singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section]);
|
|
}
|
|
|
|
if (!numSigTSkipC || minCost[chromaId][tuIterator.section] < singleCostC)
|
|
cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
|
|
else
|
|
{
|
|
singleDist[chromaId][tuIterator.section] = nonZeroDistC;
|
|
singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
|
|
cbfFlag[chromaId][tuIterator.section] = !!numSigTSkipC;
|
|
bestTransformMode[chromaId][tuIterator.section] = 1;
|
|
uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
|
|
memcpy(coeffCurC + subTUOffset, m_tsCoeff, sizeof(coeff_t) * numCoeffC);
|
|
primitives.cu[partSizeC].copy_ss(curResiC, strideResiC, m_tsResidual, trSizeC);
|
|
}
|
|
|
|
cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
|
|
}
|
|
while (tuIterator.isNextSection());
|
|
}
|
|
}
|
|
|
|
// Here we were encoding cbfs and coefficients, after calculating distortion above.
|
|
// Now I am encoding only cbfs, since I have encoded coefficients above. I have just collected
|
|
// bits required for coefficients and added with number of cbf bits. As I tested the order does not
|
|
// make any difference. But bit confused whether I should load the original context as below.
|
|
m_entropyCoder.load(m_rqt[depth].rqtRoot);
|
|
m_entropyCoder.resetBits();
|
|
|
|
//Encode cbf flags
|
|
if (bCodeChroma)
|
|
{
|
|
if (!splitIntoSubTUs)
|
|
{
|
|
m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][0], tuDepth);
|
|
m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][0], tuDepth);
|
|
}
|
|
else
|
|
{
|
|
offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
|
|
offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
|
|
m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][0], tuDepth);
|
|
m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][1], tuDepth);
|
|
m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][0], tuDepth);
|
|
m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][1], tuDepth);
|
|
}
|
|
}
|
|
|
|
m_entropyCoder.codeQtCbfLuma(cbfFlag[TEXT_LUMA][0], tuDepth);
|
|
|
|
uint32_t cbfBits = m_entropyCoder.getNumberOfWrittenBits();
|
|
|
|
uint32_t coeffBits = 0;
|
|
coeffBits = singleBits[TEXT_LUMA][0];
|
|
for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
|
|
{
|
|
coeffBits += singleBits[TEXT_CHROMA_U][subTUIndex];
|
|
coeffBits += singleBits[TEXT_CHROMA_V][subTUIndex];
|
|
}
|
|
|
|
// In split mode, we need only coeffBits. The reason is encoding chroma cbfs is different from luma.
|
|
// In case of chroma, if any one of the split block's cbf is 1, then we need to encode cbf 1, and then for
|
|
// four split block's individual cbf value. This is not known before analysis of four split blocks.
|
|
// For that reason, I am collecting individual coefficient bits only.
|
|
fullCost.bits = bSplitPresentFlag ? cbfBits + coeffBits : coeffBits;
|
|
|
|
fullCost.distortion += singleDist[TEXT_LUMA][0];
|
|
fullCost.energy += singlePsyEnergy[TEXT_LUMA][0];// need to check we need to add chroma also
|
|
for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
|
|
{
|
|
fullCost.distortion += singleDist[TEXT_CHROMA_U][subTUIndex];
|
|
fullCost.distortion += singleDist[TEXT_CHROMA_V][subTUIndex];
|
|
}
|
|
|
|
if (m_rdCost.m_psyRd)
|
|
fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
|
|
else
|
|
fullCost.rdcost = m_rdCost.calcRdCost(fullCost.distortion, fullCost.bits);
|
|
}
|
|
|
|
// code sub-blocks
|
|
if (bCheckSplit)
|
|
{
|
|
if (bCheckFull)
|
|
{
|
|
m_entropyCoder.store(m_rqt[depth].rqtTest);
|
|
m_entropyCoder.load(m_rqt[depth].rqtRoot);
|
|
}
|
|
|
|
Cost splitCost;
|
|
if (bSplitPresentFlag && (log2TrSize <= depthRange[1] && log2TrSize > depthRange[0]))
|
|
{
|
|
// Subdiv flag can be encoded at the start of analysis of split blocks.
|
|
m_entropyCoder.resetBits();
|
|
m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
|
|
splitCost.bits = m_entropyCoder.getNumberOfWrittenBits();
|
|
}
|
|
|
|
uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
|
|
uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
|
|
for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
|
|
{
|
|
estimateResidualQT(mode, cuGeom, qPartIdx, tuDepth + 1, resiYuv, splitCost, depthRange);
|
|
ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
|
|
ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
|
|
vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
|
|
}
|
|
for (uint32_t i = 0; i < 4 * qNumParts; ++i)
|
|
{
|
|
cu.m_cbf[0][absPartIdx + i] |= ycbf << tuDepth;
|
|
cu.m_cbf[1][absPartIdx + i] |= ucbf << tuDepth;
|
|
cu.m_cbf[2][absPartIdx + i] |= vcbf << tuDepth;
|
|
}
|
|
|
|
// Here we were encoding cbfs and coefficients for split blocks. Since I have collected coefficient bits
|
|
// for each individual block, only encoding cbf values. As I mentioned, encoding chroma cbfs is different than luma.
|
|
// But have one doubt that if coefficients are encoded in context at depth 2 (for example) and cbfs are encoded in context
|
|
// at depth 0 (for example).
|
|
m_entropyCoder.load(m_rqt[depth].rqtRoot);
|
|
m_entropyCoder.resetBits();
|
|
|
|
codeInterSubdivCbfQT(cu, absPartIdx, tuDepth, depthRange);
|
|
uint32_t splitCbfBits = m_entropyCoder.getNumberOfWrittenBits();
|
|
splitCost.bits += splitCbfBits;
|
|
|
|
if (m_rdCost.m_psyRd)
|
|
splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
|
|
else
|
|
splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);
|
|
|
|
if (ycbf || ucbf || vcbf || !bCheckFull)
|
|
{
|
|
if (splitCost.rdcost < fullCost.rdcost)
|
|
{
|
|
outCosts.distortion += splitCost.distortion;
|
|
outCosts.rdcost += splitCost.rdcost;
|
|
outCosts.bits += splitCost.bits;
|
|
outCosts.energy += splitCost.energy;
|
|
return;
|
|
}
|
|
else
|
|
outCosts.energy += splitCost.energy;
|
|
}
|
|
|
|
cu.setTransformSkipSubParts(bestTransformMode[TEXT_LUMA][0], TEXT_LUMA, absPartIdx, depth);
|
|
if (bCodeChroma)
|
|
{
|
|
if (!splitIntoSubTUs)
|
|
{
|
|
cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx, depth);
|
|
cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx, depth);
|
|
}
|
|
else
|
|
{
|
|
uint32_t tuNumParts = absPartIdxStep >> 1;
|
|
cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx , tuNumParts);
|
|
cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][1], TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts);
|
|
cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx , tuNumParts);
|
|
cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][1], TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts);
|
|
}
|
|
}
|
|
X265_CHECK(bCheckFull, "check-full must be set\n");
|
|
m_entropyCoder.load(m_rqt[depth].rqtTest);
|
|
}
|
|
|
|
cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
|
|
cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
|
|
|
|
if (bCodeChroma)
|
|
{
|
|
if (!splitIntoSubTUs)
|
|
{
|
|
cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx, depth);
|
|
cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx, depth);
|
|
}
|
|
else
|
|
{
|
|
uint32_t tuNumParts = absPartIdxStep >> 1;
|
|
|
|
offsetCBFs(cbfFlag[TEXT_CHROMA_U]);
|
|
offsetCBFs(cbfFlag[TEXT_CHROMA_V]);
|
|
cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx , tuNumParts);
|
|
cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][1] << tuDepth, TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts);
|
|
cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx , tuNumParts);
|
|
cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][1] << tuDepth, TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts);
|
|
}
|
|
}
|
|
|
|
outCosts.distortion += fullCost.distortion;
|
|
outCosts.rdcost += fullCost.rdcost;
|
|
outCosts.bits += fullCost.bits;
|
|
outCosts.energy += fullCost.energy;
|
|
}
|
|
|
|
/* Codes the CBF flags of the residual quad-tree rooted at (absPartIdx, tuDepth)
 * through m_entropyCoder. Chroma CBFs are coded at every level whose chroma
 * transform is at least 4x4 (log2 size >= 2); the luma CBF is coded only at
 * leaf TUs. Wherever cu.m_tuDepth says the tree goes deeper than tuDepth the
 * function recurses into the four quadrants. depthRange is only forwarded to
 * the recursive calls. */
void Search::codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t tuDepth, const uint32_t depthRange[2])
{
    X265_CHECK(cu.isInter(absPartIdx), "codeInterSubdivCbfQT() with intra block\n");

    const uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
    const bool bLeaf = tuDepth >= cu.m_tuDepth[absPartIdx];

    if (log2TrSize - m_hChromaShift >= 2)
    {
        /* A chroma CBF is coded at the root, or below a parent whose CBF for
         * that plane is set. The short-circuit keeps tuDepth - 1 from being
         * evaluated when tuDepth is zero. */
        if (tuDepth == 0 || cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1))
            m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, bLeaf);

        if (tuDepth == 0 || cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1))
            m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, bLeaf);
    }
    else
    {
        /* Chroma is not coded at this level; its CBF must equal the parent's */
        X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1), "chroma CBF not matching\n");
        X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1), "chroma CBF not matching\n");
    }

    if (bLeaf)
    {
        m_entropyCoder.codeQtCbfLuma(cu, absPartIdx, tuDepth);
        return;
    }

    /* Number of partition units covered by each of the four quadrants */
    const uint32_t quadParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
    for (uint32_t quad = 0; quad < 4; ++quad)
        codeInterSubdivCbfQT(cu, absPartIdx + quad * quadParts, tuDepth + 1, depthRange);
}
|
|
|
|
/* Copies the residual samples and transform coefficients selected for the TU
 * at (absPartIdx, tuDepth) out of the per-layer m_rqt scratch storage: the
 * residual goes into resiYuv and the coefficients into cu.m_trCoeff. When
 * cu.m_tuDepth indicates a deeper split, the four quadrants are processed
 * recursively instead. */
void Search::saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t tuDepth)
{
    const uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;

    if (tuDepth < cu.m_tuDepth[absPartIdx])
    {
        const uint32_t quadParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
        for (uint32_t quad = 0; quad < 4; ++quad)
            saveResidualQTData(cu, resiYuv, absPartIdx + quad * quadParts, tuDepth + 1);
        return;
    }

    /* m_rqt is indexed by log2 transform size minus two */
    const uint32_t qtLayer = log2TrSize - 2;

    /* No chroma at all for 4:0:0. When the chroma transform would drop below
     * log2 size 2 it is clamped to 2 and chroma is handled only for the
     * partition whose index is a multiple of four. */
    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
    bool bCodeChroma = (m_csp != X265_CSP_I400);
    if (bCodeChroma && log2TrSizeC < 2)
    {
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
        log2TrSizeC = 2;
        bCodeChroma = (absPartIdx & 3) == 0;
    }

    /* luma residual and coefficients */
    m_rqt[qtLayer].resiQtYuv.copyPartToPartLuma(resiYuv, absPartIdx, log2TrSize);

    const uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
    const uint32_t numCoeffY = 1 << (log2TrSize * 2);
    memcpy(cu.m_trCoeff[0] + coeffOffsetY, m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY, sizeof(coeff_t) * numCoeffY);

    if (!bCodeChroma)
        return;

    /* chroma residual and coefficients (U then V) */
    m_rqt[qtLayer].resiQtYuv.copyPartToPartChroma(resiYuv, absPartIdx, log2TrSizeC + m_hChromaShift);

    const uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
    /* 4:2:2 doubles the chroma coefficient count */
    const uint32_t numCoeffC = 1 << (log2TrSizeC * 2 + (m_csp == X265_CSP_I422));

    for (int plane = 1; plane <= 2; plane++)
    {
        coeff_t* dst = cu.m_trCoeff[plane] + coeffOffsetC;
        coeff_t* src = m_rqt[qtLayer].coeffRQT[plane] + coeffOffsetC;
        memcpy(dst, src, sizeof(coeff_t) * numCoeffC);
    }
}
|
|
|
|
/* returns the number of bits required to signal a non-most-probable mode.
|
|
* on return mpms contains bitmap of most probable modes */
|
|
uint32_t Search::getIntraRemModeBits(CUData& cu, uint32_t absPartIdx, uint32_t mpmModes[3], uint64_t& mpms) const
|
|
{
|
|
cu.getIntraDirLumaPredictor(absPartIdx, mpmModes);
|
|
|
|
mpms = 0;
|
|
for (int i = 0; i < 3; ++i)
|
|
mpms |= ((uint64_t)1 << mpmModes[i]);
|
|
|
|
return m_entropyCoder.bitsIntraModeNonMPM();
|
|
}
|
|
|
|
/* swap the current mode/cost with the mode with the highest cost in the
|
|
* current candidate list, if its cost is better (maintain a top N list) */
|
|
void Search::updateCandList(uint32_t mode, uint64_t cost, int maxCandCount, uint32_t* candModeList, uint64_t* candCostList)
|
|
{
|
|
uint32_t maxIndex = 0;
|
|
uint64_t maxValue = 0;
|
|
|
|
for (int i = 0; i < maxCandCount; i++)
|
|
{
|
|
if (maxValue < candCostList[i])
|
|
{
|
|
maxValue = candCostList[i];
|
|
maxIndex = i;
|
|
}
|
|
}
|
|
|
|
if (cost < maxValue)
|
|
{
|
|
candCostList[maxIndex] = cost;
|
|
candModeList[maxIndex] = mode;
|
|
}
|
|
}
|
|
|
|
/* Accounts for delta-QP signalling in the cost of a mode. Nothing happens
 * unless the PPS has bUseDQP set and the CU depth is no deeper than
 * maxCuDQPDepth. If the CU has no coded residual (root CBF of partition 0 is
 * zero), its QP is reset to the reference QP. Otherwise the delta-QP bits are
 * added to the mode's bit counts and the cost is refreshed; how the bits are
 * obtained depends on m_param->rdLevel. */
void Search::checkDQP(Mode& mode, const CUGeom& cuGeom)
{
    CUData& cu = mode.cu;

    if (!cu.m_slice->m_pps->bUseDQP || cuGeom.depth > cu.m_slice->m_pps->maxCuDQPDepth)
        return;

    if (!cu.getQtRootCbf(0))
    {
        cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth);
        return;
    }

    const int rdLevel = m_param->rdLevel;
    if (rdLevel >= 3)
    {
        /* measure the real cost of coding the delta QP */
        mode.contexts.resetBits();
        mode.contexts.codeDeltaQP(cu, 0);
        const uint32_t dqpBits = mode.contexts.getNumberOfWrittenBits();
        mode.mvBits += dqpBits;
        mode.totalBits += dqpBits;
        updateModeCost(mode);
    }
    else if (rdLevel <= 1)
    {
        /* SA8D based decisions: charge a single bit */
        mode.sa8dBits++;
        mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits);
    }
    else
    {
        /* rdLevel 2: charge a single bit against the RD cost */
        mode.mvBits++;
        mode.totalBits++;
        updateModeCost(mode);
    }
}
|
|
|
|
/* Delta-QP accounting for a split prediction. Acts only when the CU depth is
 * exactly the PPS maxCuDQPDepth and bUseDQP is set. The partitions of the CU
 * are scanned for a non-zero root CBF; if one is found the delta-QP bits are
 * added to the mode's cost (by a method that depends on m_param->rdLevel) and
 * setQPSubCUs() is called with the reference QP. If none is found, the whole
 * CU is reset to the reference QP. */
void Search::checkDQPForSplitPred(Mode& mode, const CUGeom& cuGeom)
{
    CUData& cu = mode.cu;

    if (!((cuGeom.depth == cu.m_slice->m_pps->maxCuDQPDepth) && cu.m_slice->m_pps->bUseDQP))
        return;

    /* Does any partition of this CU carry coded residual? Stop at the first hit. */
    bool hasResidual = false;
    for (uint32_t partIdx = 0; !hasResidual && partIdx < cuGeom.numPartitions; partIdx++)
    {
        if (cu.getQtRootCbf(partIdx))
            hasResidual = true;
    }

    if (!hasResidual)
    {
        /* No residual within this CU or subCU, so reset QP to RefQP */
        cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth);
        return;
    }

    const int rdLevel = m_param->rdLevel;
    if (rdLevel >= 3)
    {
        /* measure the real cost of coding the delta QP */
        mode.contexts.resetBits();
        mode.contexts.codeDeltaQP(cu, 0);
        const uint32_t dqpBits = mode.contexts.getNumberOfWrittenBits();
        mode.mvBits += dqpBits;
        mode.totalBits += dqpBits;
        updateModeCost(mode);
    }
    else if (rdLevel <= 1)
    {
        /* SA8D based decisions: charge a single bit */
        mode.sa8dBits++;
        mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits);
    }
    else
    {
        /* rdLevel 2: charge a single bit against the RD cost */
        mode.mvBits++;
        mode.totalBits++;
        updateModeCost(mode);
    }

    /* For all zero CBF sub-CUs, reset QP to RefQP (so that deltaQP is not signalled).
     * When the non-zero CBF sub-CU is found, stop */
    cu.setQPSubCUs(cu.getRefQP(0), 0, cuGeom.depth);
}
|