libbpg/x265/source/encoder/search.h
2015-10-27 11:46:00 +01:00

468 lines
19 KiB
C++

/*****************************************************************************
* Copyright (C) 2013 x265 project
*
* Authors: Steve Borho <steve@borho.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at license @ x265.com.
*****************************************************************************/
#ifndef X265_SEARCH_H
#define X265_SEARCH_H
#include "common.h"
#include "predict.h"
#include "quant.h"
#include "bitcost.h"
#include "framedata.h"
#include "yuv.h"
#include "threadpool.h"
#include "rdcost.h"
#include "entropy.h"
#include "motion.h"
#if DETAILED_CU_STATS
/* Profiling helpers, active only when DETAILED_CU_STATS is enabled. Each one
 * indexes m_stats by the frame encoder ID reached through cu.m_encData, so they
 * can only be used where an m_stats array is in scope.
 *
 * ProfileCUScopeNamed increments the counter field `count` and then declares a
 * ScopedElapsedTime local called `name`, constructed from the accumulator field
 * `acc` (presumably it adds the elapsed time to `acc` when it leaves scope --
 * confirm against ScopedElapsedTime). It expands to TWO statements, so it must
 * not be used as the unbraced body of an if/else/loop. */
#define ProfileCUScopeNamed(name, cu, acc, count) \
m_stats[cu.m_encData->m_frameEncoderID].count++; \
ScopedElapsedTime name(m_stats[cu.m_encData->m_frameEncoderID].acc)
/* Same as ProfileCUScopeNamed with the timer local always named timedScope, so
 * it can appear at most once per scope. */
#define ProfileCUScope(cu, acc, count) ProfileCUScopeNamed(timedScope, cu, acc, count)
/* Counter-only variant; note that the expansion already ends with ';'. */
#define ProfileCounter(cu, count) m_stats[cu.m_encData->m_frameEncoderID].count++;
#else
/* Profiling disabled: all three helpers expand to nothing. */
#define ProfileCUScopeNamed(name, cu, acc, count)
#define ProfileCUScope(cu, acc, count)
#define ProfileCounter(cu, count)
#endif
namespace X265_NS {
// private namespace
/* Forward declarations.
 * NOTE(review): Entropy is held by value by structs later in this header, so its
 * full definition must already come from an include (presumably entropy.h); this
 * declaration looks redundant -- confirm before removing. */
class Entropy;
/* Only referenced through a pointer in this header (Search::m_tld). */
struct ThreadLocalData;
/* All the CABAC contexts that Analysis needs to keep track of at each depth,
 * plus the temp buffers for residual, coeff, and recon that are used during
 * residual quad-tree (RQT) depth recursion. Search keeps an array of these,
 * m_rqt[NUM_FULL_DEPTH]. */
struct RQTData
{
Entropy cur; /* starting context for current CU */
/* these are indexed by qtLayer (log2size - 2) so nominally 0=4x4, 1=8x8, 2=16x16, 3=32x32.
 * The coeffRQT and reconQtYuv buffers are allocated to the max CU size at every depth;
 * only the parts which are reconstructed at each depth are valid. At the end, the
 * transform depth table is walked and the coeff and recon at the final split depths
 * are collected */
Entropy rqtRoot; /* residual quad-tree start context */
Entropy rqtTemp; /* residual quad-tree temp context */
Entropy rqtTest; /* residual quad-tree test context */
/* NOTE(review): the three entries are presumably one pointer per colour plane
 * (Y, Cb, Cr) -- confirm against the allocation code */
coeff_t* coeffRQT[3]; /* coeff storage for entire CTU for each RQT layer */
Yuv reconQtYuv; /* recon storage for entire CTU for each RQT layer (intra) */
ShortYuv resiQtYuv; /* residual storage for entire CTU for each RQT layer (inter) */
/* per-depth temp buffers for inter prediction */
ShortYuv tmpResiYuv;
Yuv tmpPredYuv;
/* NOTE(review): presumably one prediction buffer per reference list for
 * bidirectional prediction -- confirm against the users of this field */
Yuv bidirPredYuv[2];
};
/* Plain record of one motion search result. Mode stores these in
 * bestME[MAX_INTER_PARTS][2]. The field meanings below are inferred from the
 * names and should be confirmed against the code that fills them in. */
struct MotionData
{
MV mv; // presumably the selected motion vector
MV mvp; // presumably the motion vector predictor used for mv
int mvpIdx; // presumably the index of mvp within the candidate list
int ref; // presumably the reference picture index
uint32_t cost; // presumably the search cost associated with mv
int bits; // presumably the bit estimate associated with mv
};
/* Bundles a CUData with its source, prediction and reconstruction Yuv buffers,
 * its entropy contexts, its motion search results and the cost / distortion /
 * bit totals that are accumulated for it */
struct Mode
{
    CUData     cu;
    const Yuv* fencYuv;
    Yuv        predYuv;
    Yuv        reconYuv;
    Entropy    contexts;

    enum { MAX_INTER_PARTS = 2 };

    MotionData bestME[MAX_INTER_PARTS][2];
    MV         amvpCand[2][MAX_NUM_REF][AMVP_NUM_CANDS];

    /* neighbour MVs of the current partition: five spatial candidates followed
     * by the temporal candidate */
    InterNeighbourMV interNeighbours[6];

    uint64_t   rdCost;           /* total of partition (psy) RD costs: sse(fenc, recon) + lambda2 * bits */
    uint64_t   sa8dCost;         /* total of partition sa8d costs: sa8d(fenc, pred) + lambda * bits */
    uint32_t   sa8dBits;         /* signal bits that went into sa8dCost */
    uint32_t   psyEnergy;        /* total of partition psycho-visual energy differences */
    sse_ret_t  resEnergy;        /* total of partition residual energy after motion prediction */
    sse_ret_t  lumaDistortion;
    sse_ret_t  chromaDistortion;
    sse_ret_t  distortion;       /* total of partition SSE distortion */
    uint32_t   totalBits;        /* total of partition bits (mv + coeff) */
    uint32_t   mvBits;           /* mv bits + ref + block type (or intra mode) */
    uint32_t   coeffBits;        /* texture bits (DCT coefficients) */

    /* zero every cost, distortion and bit total */
    void initCosts()
    {
        rdCost = sa8dCost = 0;
        resEnergy = lumaDistortion = chromaDistortion = distortion = 0;
        sa8dBits = psyEnergy = totalBits = mvBits = coeffBits = 0;
    }

    /* fill every total with a half-range sentinel so that re-use of a mode
     * that was never initialized is caught by ok() */
    void invalidate()
    {
        rdCost = sa8dCost = UINT64_MAX / 2;
        sa8dBits = psyEnergy = totalBits = mvBits = coeffBits = MAX_UINT / 2;

        /* the sentinel for the sse_ret_t totals depends on the bit depth */
#if X265_DEPTH <= 10
        resEnergy = lumaDistortion = chromaDistortion = distortion = MAX_UINT / 2;
#else
        resEnergy = lumaDistortion = chromaDistortion = distortion = UINT64_MAX / 2;
#endif
    }

    /* true only when no total has reached the sentinel range written by
     * invalidate() */
    bool ok() const
    {
        if (rdCost >= UINT64_MAX / 2 || sa8dCost >= UINT64_MAX / 2)
            return false;

        if (sa8dBits >= MAX_UINT / 2 || psyEnergy >= MAX_UINT / 2)
            return false;

#if X265_DEPTH <= 10
        if (resEnergy >= MAX_UINT / 2 || lumaDistortion >= MAX_UINT / 2 ||
            chromaDistortion >= MAX_UINT / 2 || distortion >= MAX_UINT / 2)
            return false;
#else
        if (resEnergy >= UINT64_MAX / 2 || lumaDistortion >= UINT64_MAX / 2 ||
            chromaDistortion >= UINT64_MAX / 2 || distortion >= UINT64_MAX / 2)
            return false;
#endif

        return totalBits < MAX_UINT / 2 &&
               mvBits < MAX_UINT / 2 &&
               coeffBits < MAX_UINT / 2;
    }

    /* add each total of subMode onto the matching total of this mode;
     * subMode is checked with ok() first */
    void addSubCosts(const Mode& subMode)
    {
        X265_CHECK(subMode.ok(), "sub-mode not initialized");

        /* 64-bit cost totals */
        rdCost   += subMode.rdCost;
        sa8dCost += subMode.sa8dCost;

        /* distortion and energy totals */
        distortion       += subMode.distortion;
        lumaDistortion   += subMode.lumaDistortion;
        chromaDistortion += subMode.chromaDistortion;
        resEnergy        += subMode.resEnergy;
        psyEnergy        += subMode.psyEnergy;

        /* bit totals */
        totalBits += subMode.totalBits;
        mvBits    += subMode.mvBits;
        coeffBits += subMode.coeffBits;
        sa8dBits  += subMode.sa8dBits;
    }
};
#if DETAILED_CU_STATS
/* Counters and timers meant for performance debugging only. Nothing here guards
 * against dynamic range overflow, so keep encodes short if the accuracy of the
 * elapsed times and counts matters. This profiling is orthogonal to PPA/VTune
 * and can be switched on independently of either of them */
struct CUStats
{
    /* elapsed worker time */
    int64_t  intraRDOElapsedTime[NUM_CU_DEPTH]; /* intra RDO, per CU depth */
    int64_t  interRDOElapsedTime[NUM_CU_DEPTH]; /* inter RDO, per CU depth */
    int64_t  intraAnalysisElapsedTime;          /* intra sa8d analysis */
    int64_t  motionEstimationElapsedTime;       /* predInterSearch() */
    int64_t  loopFilterElapsedTime;             /* deblock, SAO and PSNR/SSIM */
    int64_t  pmeTime;                           /* processing ME slave jobs */
    int64_t  pmeBlockTime;                      /* blocked waiting for pme batch completion */
    int64_t  pmodeTime;                         /* processing pmode slave jobs */
    int64_t  pmodeBlockTime;                    /* blocked waiting for pmode batch completion */
    int64_t  weightAnalyzeTime;                 /* analyzing reference weights */
    int64_t  totalCTUTime;                      /* compressCTU (includes pmode master) */

    /* per CU depth counters */
    uint32_t skippedMotionReferences[NUM_CU_DEPTH];
    uint32_t totalMotionReferences[NUM_CU_DEPTH];
    uint32_t skippedIntraCU[NUM_CU_DEPTH];
    uint32_t totalIntraCU[NUM_CU_DEPTH];
    uint64_t countIntraRDO[NUM_CU_DEPTH];
    uint64_t countInterRDO[NUM_CU_DEPTH];

    /* event counters */
    uint64_t countIntraAnalysis;
    uint64_t countMotionEstimate;
    uint64_t countLoopFilter;
    uint64_t countPMETasks;
    uint64_t countPMEMasters;
    uint64_t countPModeTasks;
    uint64_t countPModeMasters;
    uint64_t countWeightAnalyze;
    uint64_t totalCTUs;

    CUStats()
    {
        clear();
    }

    /* zero every timer and counter in one pass */
    void clear()
    {
        memset(this, 0, sizeof(CUStats));
    }

    /* add every field of other onto this object, then reset other to zero.
     * The per-depth arrays are summed for indices 0 .. g_maxCUDepth inclusive */
    void accumulate(CUStats& other)
    {
        for (uint32_t depth = 0; depth <= g_maxCUDepth; ++depth)
        {
            /* per-depth timers */
            intraRDOElapsedTime[depth] += other.intraRDOElapsedTime[depth];
            interRDOElapsedTime[depth] += other.interRDOElapsedTime[depth];

            /* per-depth counters */
            countIntraRDO[depth]           += other.countIntraRDO[depth];
            countInterRDO[depth]           += other.countInterRDO[depth];
            skippedMotionReferences[depth] += other.skippedMotionReferences[depth];
            totalMotionReferences[depth]   += other.totalMotionReferences[depth];
            skippedIntraCU[depth]          += other.skippedIntraCU[depth];
            totalIntraCU[depth]            += other.totalIntraCU[depth];
        }

        /* timers */
        intraAnalysisElapsedTime    += other.intraAnalysisElapsedTime;
        motionEstimationElapsedTime += other.motionEstimationElapsedTime;
        loopFilterElapsedTime       += other.loopFilterElapsedTime;
        pmeTime                     += other.pmeTime;
        pmeBlockTime                += other.pmeBlockTime;
        pmodeTime                   += other.pmodeTime;
        pmodeBlockTime              += other.pmodeBlockTime;
        weightAnalyzeTime           += other.weightAnalyzeTime;
        totalCTUTime                += other.totalCTUTime;

        /* event counters */
        countIntraAnalysis  += other.countIntraAnalysis;
        countMotionEstimate += other.countMotionEstimate;
        countLoopFilter     += other.countLoopFilter;
        countPMETasks       += other.countPMETasks;
        countPMEMasters     += other.countPMEMasters;
        countPModeTasks     += other.countPModeTasks;
        countPModeMasters   += other.countPModeMasters;
        countWeightAnalyze  += other.countWeightAnalyze;
        totalCTUs           += other.totalCTUs;

        other.clear();
    }
};
#endif
/* Returns idx bits, plus one extra bit whenever idx is below the last index
 * (numIdx - 1). For 0 <= idx < numIdx this equals the length of a
 * truncated-unary code for idx. */
inline int getTUBits(int idx, int numIdx)
{
    const bool belowLast = idx < numIdx - 1;

    if (belowLast)
        return idx + 1;

    return idx;
}
/* Search derives from Predict and declares the state shared by the intra and
 * inter searches (motion estimator, quantizer, RD cost calculator, entropy
 * coder, per-depth RQT scratch data) together with the search and residual
 * coding entry points. Apart from the small inline constructors and
 * updateModeCost(), the member functions are only declared in this header. */
class Search : public Predict
{
public:
// NOTE(review): presumably an all-zero table -- confirm at its definition
static const int16_t zeroShort[MAX_CU_SIZE];
MotionEstimate m_me;
Quant m_quant;
RDCost m_rdCost;
const x265_param* m_param;
Frame* m_frame;
const Slice* m_slice;
Entropy m_entropyCoder;
RQTData m_rqt[NUM_FULL_DEPTH];
// NOTE(review): the three entries are presumably one buffer per colour plane -- confirm
uint8_t* m_qtTempCbf[3];
uint8_t* m_qtTempTransformSkipFlag[3];
pixel* m_fencScaled; /* 32x32 buffer for down-scaled version of 64x64 CU fenc */
pixel* m_fencTransposed; /* 32x32 buffer for transposed copy of fenc */
pixel* m_intraPred; /* 32x32 buffer for individual intra predictions */
pixel* m_intraPredAngs; /* allocation for 33 consecutive (all angular) 32x32 intra predictions */
coeff_t* m_tsCoeff; /* transform skip coeff 32x32 */
int16_t* m_tsResidual; /* transform skip residual 32x32 */
pixel* m_tsRecon; /* transform skip reconstructed pixels 32x32 */
bool m_bFrameParallel;
bool m_bEnableRDOQ;
uint32_t m_numLayers;
uint32_t m_refLagPixels;
#if DETAILED_CU_STATS
/* Accumulate CU statistics separately for each frame encoder */
CUStats m_stats[X265_MAX_FRAME_THREADS];
#endif
Search();
~Search();
bool initSearch(const x265_param& param, ScalingList& scalingList);
int setLambdaFromQP(const CUData& ctu, int qp); /* returns real quant QP in valid spec range */
// mark temp RD entropy contexts as uninitialized; useful for finding loads without stores
void invalidateContexts(int fromDepth);
// full RD search of intra modes. if sharedModes is not NULL, it directly uses them
void checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize, uint8_t* sharedModes, uint8_t* sharedChromaModes);
// select best intra mode using only sa8d costs, cannot measure NxN intra
void checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom);
// encode luma mode selected by checkIntraInInter, then pick and encode a chroma mode
void encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom);
// estimate inter prediction (non-skip)
void predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t masks[2]);
// encode residual and compute rd-cost for inter mode
void encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom);
void encodeResAndCalcRdSkipCU(Mode& interMode);
// encode residual without rd-cost
void residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2]);
void residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2]);
void residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth);
// pick best chroma mode from available using just sa8d costs
void getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom);
/* update CBF flags and QP values to be internally consistent */
void checkDQP(Mode& mode, const CUGeom& cuGeom);
void checkDQPForSplitPred(Mode& mode, const CUGeom& cuGeom);
MV getLowresMV(const CUData& cu, const PredictionUnit& pu, int list, int ref);
/* Task group (derived from BondedTaskGroup) describing the motion estimation
 * jobs of one prediction unit. It only stores references to the objects passed
 * to its constructor, so those objects must outlive the PME instance. */
class PME : public BondedTaskGroup
{
public:
Search& master;
Mode& mode;
const CUGeom& cuGeom;
const PredictionUnit& pu;
int puIdx;
/* per-list job description: ref[list][] holds up to MAX_NUM_REF entries and
 * refCnt[list] is presumably the number of entries in use -- confirm against
 * the code that fills m_jobs */
struct {
int ref[2][MAX_NUM_REF];
int refCnt[2];
} m_jobs;
PME(Search& s, Mode& m, const CUGeom& g, const PredictionUnit& u, int p) : master(s), mode(m), cuGeom(g), pu(u), puIdx(p) {}
void processTasks(int workerThreadId);
protected:
/* NOTE(review): declared protected, presumably never defined, to forbid
 * assignment (the class holds reference members). It returns PME by value
 * rather than the conventional PME& -- confirm that is intentional. */
PME operator=(const PME&);
};
void processPME(PME& pme, Search& slave);
void singleMotionEstimation(Search& master, Mode& interMode, const PredictionUnit& pu, int part, int list, int ref);
protected:
/* motion estimation distribution */
ThreadLocalData* m_tld;
// NOTE(review): presumably the bit costs of the three list selections (L0, L1, bi) -- confirm
uint32_t m_listSelBits[3];
Lock m_meLock;
void saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t tuDepth);
// RDO search of luma intra modes; result is fully encoded luma. luma distortion is returned
uint32_t estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2], uint8_t* sharedModes);
// RDO select best chroma mode from luma; result is fully encoded chroma. chroma distortion is returned
uint32_t estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom, uint8_t* sharedChromaModes);
void codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx);
void codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t tuDepth, const uint32_t depthRange[2]);
void codeCoeffQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx, TextType ttype);
/* cost totals passed by reference through the quad-tree coding functions below;
 * the constructor zeroes all four fields */
struct Cost
{
uint64_t rdcost;
uint32_t bits;
sse_ret_t distortion;
uint32_t energy;
Cost() { rdcost = 0; bits = 0; distortion = 0; energy = 0; }
};
/* NOTE(review): dist is uint32_t here (and the estIntraPred* functions above return
 * uint32_t distortion) while Cost::distortion is sse_ret_t -- confirm that no
 * truncation can occur in builds where sse_ret_t is wider than 32 bits */
uint64_t estimateNullCbfCost(uint32_t &dist, uint32_t &psyEnergy, uint32_t tuDepth, TextType compId);
void estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& costs, const uint32_t depthRange[2]);
// generate prediction, generate residual and recon. if bAllowSplit, find optimal RQT splits
void codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& costs, const uint32_t depthRange[2]);
void codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& costs);
void extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t tuDepth, uint32_t absPartIdx);
// generate chroma prediction, generate residual and recon
uint32_t codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, uint32_t& psyEnergy);
uint32_t codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, uint32_t& psyEnergy);
void extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t tuDepth);
// reshuffle CBF flags after coding a pair of 4:2:2 chroma blocks
void offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t tuDepth, uint32_t absPartIdx);
/* output of mergeEstimation, best merge candidate */
struct MergeData
{
MVField mvField[2];
uint32_t dir;
uint32_t index;
uint32_t bits;
};
/* inter/ME helper functions */
int selectMVP(const CUData& cu, const PredictionUnit& pu, const MV amvp[AMVP_NUM_CANDS], int list, int ref);
const MV& checkBestMVP(const MV amvpCand[2], const MV& mv, int& mvpIdx, uint32_t& outBits, uint32_t& outCost) const;
void setSearchRange(const CUData& cu, const MV& mvp, int merange, MV& mvmin, MV& mvmax) const;
uint32_t mergeEstimation(CUData& cu, const CUGeom& cuGeom, const PredictionUnit& pu, int puIdx, MergeData& m);
static void getBlkBits(PartSize cuMode, bool bPSlice, int puIdx, uint32_t lastMode, uint32_t blockBit[3]);
/* intra helper functions */
enum { MAX_RD_INTRA_MODES = 16 };
static void updateCandList(uint32_t mode, uint64_t cost, int maxCandCount, uint32_t* candModeList, uint64_t* candCostList);
// get most probable luma modes for CU part, and bit cost of all non mpm modes
uint32_t getIntraRemModeBits(CUData & cu, uint32_t absPartIdx, uint32_t mpmModes[3], uint64_t& mpms) const;
// recompute m.rdCost from m.distortion and m.totalBits; when m_rdCost.m_psyRd is non-zero
// the psy variant is used and m.psyEnergy is passed as well
void updateModeCost(Mode& m) const { m.rdCost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(m.distortion, m.totalBits, m.psyEnergy) : m_rdCost.calcRdCost(m.distortion, m.totalBits); }
};
}
#endif // ifndef X265_SEARCH_H