libbpg/x265/source/encoder/sao.cpp
2015-10-27 11:46:00 +01:00

1709 lines
55 KiB
C++

/*****************************************************************************
* Copyright (C) 2013 x265 project
*
* Authors: Steve Borho <steve@borho.org>
* Min Chen <chenm003@163.com>
* Praveen Kumar Tiwari <praveen@multicorewareinc.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at license @ x265.com.
*****************************************************************************/
#include "common.h"
#include "frame.h"
#include "framedata.h"
#include "picyuv.h"
#include "sao.h"
namespace {
inline int32_t roundIBDI(int32_t num, int32_t den)
{
return num >= 0 ? ((num * 2 + den) / (den * 2)) : -((-num * 2 + den) / (den * 2));
}
/* get the sign of input variable (TODO: this is a dup, make common) */
inline int8_t signOf(int x)
{
return (x >> 31) | ((int)((((uint32_t)-x)) >> 31));
}
inline int signOf2(const int a, const int b)
{
// NOTE: don't reorder below compare, both ICL, VC, GCC optimize strong depends on order!
int r = 0;
if (a < b)
r = -1;
if (a > b)
r = 1;
return r;
}
inline int64_t estSaoDist(int32_t count, int offset, int32_t offsetOrg)
{
return (count * offset - offsetOrg * 2) * offset;
}
} // end anonymous namespace
namespace X265_NS {
const uint32_t SAO::s_eoTable[NUM_EDGETYPE] =
{
1, // 0
2, // 1
0, // 2
3, // 3
4 // 4
};
SAO::SAO()
{
m_count = NULL;
m_offset = NULL;
m_offsetOrg = NULL;
m_countPreDblk = NULL;
m_offsetOrgPreDblk = NULL;
m_refDepth = 0;
m_lumaLambda = 0;
m_chromaLambda = 0;
m_param = NULL;
m_clipTable = NULL;
m_clipTableBase = NULL;
m_tmpU1[0] = NULL;
m_tmpU1[1] = NULL;
m_tmpU1[2] = NULL;
m_tmpU2[0] = NULL;
m_tmpU2[1] = NULL;
m_tmpU2[2] = NULL;
m_tmpL1 = NULL;
m_tmpL2 = NULL;
m_depthSaoRate[0][0] = 0;
m_depthSaoRate[0][1] = 0;
m_depthSaoRate[0][2] = 0;
m_depthSaoRate[0][3] = 0;
m_depthSaoRate[1][0] = 0;
m_depthSaoRate[1][1] = 0;
m_depthSaoRate[1][2] = 0;
m_depthSaoRate[1][3] = 0;
}
bool SAO::create(x265_param* param)
{
m_param = param;
if (param->internalCsp != X265_CSP_I400) {
m_hChromaShift = CHROMA_H_SHIFT(param->internalCsp);
m_vChromaShift = CHROMA_V_SHIFT(param->internalCsp);
m_numPlanes = 3;
} else {
m_numPlanes = 1;
}
m_numCuInWidth = (m_param->sourceWidth + g_maxCUSize - 1) / g_maxCUSize;
m_numCuInHeight = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize;
const pixel maxY = (1 << X265_DEPTH) - 1;
const pixel rangeExt = maxY >> 1;
int numCtu = m_numCuInWidth * m_numCuInHeight;
CHECKED_MALLOC(m_clipTableBase, pixel, maxY + 2 * rangeExt);
CHECKED_MALLOC(m_tmpL1, pixel, g_maxCUSize + 1);
CHECKED_MALLOC(m_tmpL2, pixel, g_maxCUSize + 1);
for (int i = 0; i < 3; i++)
{
// SAO asm code will read 1 pixel before and after, so pad by 2
CHECKED_MALLOC(m_tmpU1[i], pixel, m_param->sourceWidth + 2);
m_tmpU1[i] += 1;
CHECKED_MALLOC(m_tmpU2[i], pixel, m_param->sourceWidth + 2);
m_tmpU2[i] += 1;
}
CHECKED_MALLOC(m_count, PerClass, NUM_PLANE);
CHECKED_MALLOC(m_offset, PerClass, NUM_PLANE);
CHECKED_MALLOC(m_offsetOrg, PerClass, NUM_PLANE);
CHECKED_MALLOC(m_countPreDblk, PerPlane, numCtu);
CHECKED_MALLOC(m_offsetOrgPreDblk, PerPlane, numCtu);
m_clipTable = &(m_clipTableBase[rangeExt]);
for (int i = 0; i < rangeExt; i++)
m_clipTableBase[i] = 0;
for (int i = 0; i < maxY; i++)
m_clipTable[i] = (pixel)i;
for (int i = maxY; i < maxY + rangeExt; i++)
m_clipTable[i] = maxY;
return true;
fail:
return false;
}
void SAO::destroy()
{
X265_FREE(m_clipTableBase);
X265_FREE(m_tmpL1);
X265_FREE(m_tmpL2);
for (int i = 0; i < 3; i++)
{
if (m_tmpU1[i]) X265_FREE(m_tmpU1[i] - 1);
if (m_tmpU2[i]) X265_FREE(m_tmpU2[i] - 1);
}
X265_FREE(m_count);
X265_FREE(m_offset);
X265_FREE(m_offsetOrg);
X265_FREE(m_countPreDblk);
X265_FREE(m_offsetOrgPreDblk);
}
/* allocate memory for SAO parameters */
void SAO::allocSaoParam(SAOParam* saoParam) const
{
saoParam->numCuInWidth = m_numCuInWidth;
saoParam->ctuParam[0] = new SaoCtuParam[m_numCuInHeight * m_numCuInWidth];
saoParam->ctuParam[1] = new SaoCtuParam[m_numCuInHeight * m_numCuInWidth];
saoParam->ctuParam[2] = new SaoCtuParam[m_numCuInHeight * m_numCuInWidth];
}
void SAO::startSlice(Frame* frame, Entropy& initState, int qp)
{
Slice* slice = frame->m_encData->m_slice;
int qpCb = qp;
if (m_param->internalCsp == X265_CSP_I420)
qpCb = x265_clip3(QP_MIN, QP_MAX_MAX, (int)g_chromaScale[qp + slice->m_pps->chromaQpOffset[0]]);
else
qpCb = X265_MIN(qp + slice->m_pps->chromaQpOffset[0], QP_MAX_SPEC);
m_lumaLambda = x265_lambda2_tab[qp];
m_chromaLambda = x265_lambda2_tab[qpCb]; // Use Cb QP for SAO chroma
m_frame = frame;
switch (slice->m_sliceType)
{
case I_SLICE:
m_refDepth = 0;
break;
case P_SLICE:
m_refDepth = 1;
break;
case B_SLICE:
m_refDepth = 2 + !IS_REFERENCED(frame);
break;
}
resetStats();
m_entropyCoder.load(initState);
m_rdContexts.next.load(initState);
m_rdContexts.cur.load(initState);
SAOParam* saoParam = frame->m_encData->m_saoParam;
if (!saoParam)
{
saoParam = new SAOParam;
allocSaoParam(saoParam);
frame->m_encData->m_saoParam = saoParam;
}
saoParam->bSaoFlag[0] = true;
saoParam->bSaoFlag[1] = (m_numPlanes > 1);
m_numNoSao[0] = 0; // Luma
m_numNoSao[1] = 0; // Chroma
// NOTE: Allow SAO automatic turn-off only when frame parallelism is disabled.
if (m_param->frameNumThreads == 1)
{
if (m_refDepth > 0 && m_depthSaoRate[0][m_refDepth - 1] > SAO_ENCODING_RATE)
saoParam->bSaoFlag[0] = false;
if (m_refDepth > 0 && m_depthSaoRate[1][m_refDepth - 1] > SAO_ENCODING_RATE_CHROMA)
saoParam->bSaoFlag[1] = false;
}
}
// CTU-based SAO process without slice granularity
void SAO::processSaoCu(int addr, int typeIdx, int plane)
{
int x, y;
PicYuv* reconPic = m_frame->m_reconPic;
pixel* rec = reconPic->getPlaneAddr(plane, addr);
intptr_t stride = plane ? reconPic->m_strideC : reconPic->m_stride;
uint32_t picWidth = m_param->sourceWidth;
uint32_t picHeight = m_param->sourceHeight;
const CUData* cu = m_frame->m_encData->getPicCTU(addr);
int ctuWidth = g_maxCUSize;
int ctuHeight = g_maxCUSize;
uint32_t lpelx = cu->m_cuPelX;
uint32_t tpely = cu->m_cuPelY;
if (plane)
{
picWidth >>= m_hChromaShift;
picHeight >>= m_vChromaShift;
ctuWidth >>= m_hChromaShift;
ctuHeight >>= m_vChromaShift;
lpelx >>= m_hChromaShift;
tpely >>= m_vChromaShift;
}
uint32_t rpelx = x265_min(lpelx + ctuWidth, picWidth);
uint32_t bpely = x265_min(tpely + ctuHeight, picHeight);
ctuWidth = rpelx - lpelx;
ctuHeight = bpely - tpely;
int startX;
int startY;
int endX;
int endY;
pixel* tmpL;
pixel* tmpU;
int8_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1, signLeft1[2];
int8_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1;
memset(_upBuff1 + MAX_CU_SIZE, 0, 2 * sizeof(int8_t)); /* avoid valgrind uninit warnings */
{
const pixel* recR = &rec[ctuWidth - 1];
for (int i = 0; i < ctuHeight + 1; i++)
{
m_tmpL2[i] = *recR;
recR += stride;
}
tmpL = m_tmpL1;
tmpU = &(m_tmpU1[plane][lpelx]);
}
switch (typeIdx)
{
case SAO_EO_0: // dir: -
{
pixel firstPxl = 0, lastPxl = 0, row1FirstPxl = 0, row1LastPxl = 0;
startX = !lpelx;
endX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
if (ctuWidth & 15)
{
for (y = 0; y < ctuHeight; y++)
{
int signLeft = signOf(rec[startX] - tmpL[y]);
for (x = startX; x < endX; x++)
{
int signRight = signOf(rec[x] - rec[x + 1]);
int edgeType = signRight + signLeft + 2;
signLeft = -signRight;
rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
}
rec += stride;
}
}
else
{
for (y = 0; y < ctuHeight; y += 2)
{
signLeft1[0] = signOf(rec[startX] - tmpL[y]);
signLeft1[1] = signOf(rec[stride + startX] - tmpL[y + 1]);
if (!lpelx)
{
firstPxl = rec[0];
row1FirstPxl = rec[stride];
}
if (rpelx == picWidth)
{
lastPxl = rec[ctuWidth - 1];
row1LastPxl = rec[stride + ctuWidth - 1];
}
primitives.saoCuOrgE0(rec, m_offsetEo, ctuWidth, signLeft1, stride);
if (!lpelx)
{
rec[0] = firstPxl;
rec[stride] = row1FirstPxl;
}
if (rpelx == picWidth)
{
rec[ctuWidth - 1] = lastPxl;
rec[stride + ctuWidth - 1] = row1LastPxl;
}
rec += 2 * stride;
}
}
break;
}
case SAO_EO_1: // dir: |
{
startY = !tpely;
endY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
if (!tpely)
rec += stride;
if (ctuWidth & 15)
{
for (x = 0; x < ctuWidth; x++)
upBuff1[x] = signOf(rec[x] - tmpU[x]);
for (y = startY; y < endY; y++)
{
for (x = 0; x < ctuWidth; x++)
{
int8_t signDown = signOf(rec[x] - rec[x + stride]);
int edgeType = signDown + upBuff1[x] + 2;
upBuff1[x] = -signDown;
rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
}
rec += stride;
}
}
else
{
primitives.sign(upBuff1, rec, tmpU, ctuWidth);
int diff = (endY - startY) % 2;
for (y = startY; y < endY - diff; y += 2)
{
primitives.saoCuOrgE1_2Rows(rec, upBuff1, m_offsetEo, stride, ctuWidth);
rec += 2 * stride;
}
if (diff & 1)
primitives.saoCuOrgE1(rec, upBuff1, m_offsetEo, stride, ctuWidth);
}
break;
}
case SAO_EO_2: // dir: 135
{
startX = !lpelx;
endX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
startY = !tpely;
endY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
if (!tpely)
rec += stride;
if (!(ctuWidth & 15))
{
int8_t firstSign, lastSign;
if (!lpelx)
firstSign = upBuff1[0];
if (rpelx == picWidth)
lastSign = upBuff1[ctuWidth - 1];
primitives.sign(upBuff1, rec, &tmpU[- 1], ctuWidth);
if (!lpelx)
upBuff1[0] = firstSign;
if (rpelx == picWidth)
upBuff1[ctuWidth - 1] = lastSign;
}
else
{
for (x = startX; x < endX; x++)
upBuff1[x] = signOf(rec[x] - tmpU[x - 1]);
}
if (ctuWidth & 15)
{
for (y = startY; y < endY; y++)
{
upBufft[startX] = signOf(rec[stride + startX] - tmpL[y]);
for (x = startX; x < endX; x++)
{
int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
int edgeType = signDown + upBuff1[x] + 2;
upBufft[x + 1] = -signDown;
rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
}
std::swap(upBuff1, upBufft);
rec += stride;
}
}
else
{
for (y = startY; y < endY; y++)
{
int8_t iSignDown2 = signOf(rec[stride + startX] - tmpL[y]);
primitives.saoCuOrgE2[endX > 16](rec + startX, upBufft + startX, upBuff1 + startX, m_offsetEo, endX - startX, stride);
upBufft[startX] = iSignDown2;
std::swap(upBuff1, upBufft);
rec += stride;
}
}
break;
}
case SAO_EO_3: // dir: 45
{
startX = !lpelx;
endX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
startY = !tpely;
endY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
if (!tpely)
rec += stride;
if (ctuWidth & 15)
{
for (x = startX - 1; x < endX; x++)
upBuff1[x] = signOf(rec[x] - tmpU[x + 1]);
for (y = startY; y < endY; y++)
{
x = startX;
int8_t signDown = signOf(rec[x] - tmpL[y + 1]);
int edgeType = signDown + upBuff1[x] + 2;
upBuff1[x - 1] = -signDown;
rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
for (x = startX + 1; x < endX; x++)
{
signDown = signOf(rec[x] - rec[x + stride - 1]);
edgeType = signDown + upBuff1[x] + 2;
upBuff1[x - 1] = -signDown;
rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
}
upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
rec += stride;
}
}
else
{
int8_t firstSign, lastSign;
if (lpelx)
firstSign = signOf(rec[-1] - tmpU[0]);
if (rpelx == picWidth)
lastSign = upBuff1[ctuWidth - 1];
primitives.sign(upBuff1, rec, &tmpU[1], ctuWidth);
if (lpelx)
upBuff1[-1] = firstSign;
if (rpelx == picWidth)
upBuff1[ctuWidth - 1] = lastSign;
for (y = startY; y < endY; y++)
{
x = startX;
int8_t signDown = signOf(rec[x] - tmpL[y + 1]);
int edgeType = signDown + upBuff1[x] + 2;
upBuff1[x - 1] = -signDown;
rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
primitives.saoCuOrgE3[endX > 16](rec, upBuff1, m_offsetEo, stride - 1, startX, endX);
upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
rec += stride;
}
}
break;
}
case SAO_BO:
{
const int8_t* offsetBo = m_offsetBo;
if (ctuWidth & 15)
{
#define SAO_BO_BITS 5
const int boShift = X265_DEPTH - SAO_BO_BITS;
for (y = 0; y < ctuHeight; y++)
{
for (x = 0; x < ctuWidth; x++)
{
int val = rec[x] + offsetBo[rec[x] >> boShift];
if (val < 0)
val = 0;
else if (val > ((1 << X265_DEPTH) - 1))
val = ((1 << X265_DEPTH) - 1);
rec[x] = (pixel)val;
}
rec += stride;
}
}
else
{
primitives.saoCuOrgB0(rec, offsetBo, ctuWidth, ctuHeight, stride);
}
break;
}
default: break;
}
// if (iSaoType!=SAO_BO_0 || iSaoType!=SAO_BO_1)
std::swap(m_tmpL1, m_tmpL2);
}
/* Process SAO all units */
void SAO::processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int plane)
{
PicYuv* reconPic = m_frame->m_reconPic;
intptr_t stride = plane ? reconPic->m_strideC : reconPic->m_stride;
uint32_t picWidth = m_param->sourceWidth;
int ctuWidth = g_maxCUSize;
int ctuHeight = g_maxCUSize;
if (plane)
{
picWidth >>= m_hChromaShift;
ctuWidth >>= m_hChromaShift;
ctuHeight >>= m_vChromaShift;
}
if (!idxY)
{
pixel* rec = reconPic->m_picOrg[plane];
memcpy(m_tmpU1[plane], rec, sizeof(pixel) * picWidth);
}
int addr = idxY * m_numCuInWidth;
pixel* rec = plane ? reconPic->getChromaAddr(plane, addr) : reconPic->getLumaAddr(addr);
for (int i = 0; i < ctuHeight + 1; i++)
{
m_tmpL1[i] = rec[0];
rec += stride;
}
rec -= (stride << 1);
memcpy(m_tmpU2[plane], rec, sizeof(pixel) * picWidth);
for (int idxX = 0; idxX < m_numCuInWidth; idxX++)
{
addr = idxY * m_numCuInWidth + idxX;
bool mergeLeftFlag = ctuParam[addr].mergeMode == SAO_MERGE_LEFT;
int typeIdx = ctuParam[addr].typeIdx;
if (typeIdx >= 0)
{
if (!mergeLeftFlag)
{
if (typeIdx == SAO_BO)
{
memset(m_offsetBo, 0, sizeof(m_offsetBo));
for (int i = 0; i < SAO_NUM_OFFSET; i++)
m_offsetBo[((ctuParam[addr].bandPos + i) & (SAO_NUM_BO_CLASSES - 1))] = (int8_t)(ctuParam[addr].offset[i] << SAO_BIT_INC);
}
else // if (typeIdx == SAO_EO_0 || typeIdx == SAO_EO_1 || typeIdx == SAO_EO_2 || typeIdx == SAO_EO_3)
{
int offset[NUM_EDGETYPE];
offset[0] = 0;
for (int i = 0; i < SAO_NUM_OFFSET; i++)
offset[i + 1] = ctuParam[addr].offset[i] << SAO_BIT_INC;
for (int edgeType = 0; edgeType < NUM_EDGETYPE; edgeType++)
m_offsetEo[edgeType] = (int8_t)offset[s_eoTable[edgeType]];
}
}
processSaoCu(addr, typeIdx, plane);
}
else if (idxX != (m_numCuInWidth - 1))
{
rec = plane ? reconPic->getChromaAddr(plane, addr) : reconPic->getLumaAddr(addr);
for (int i = 0; i < ctuHeight + 1; i++)
{
m_tmpL1[i] = rec[ctuWidth - 1];
rec += stride;
}
}
}
std::swap(m_tmpU1[plane], m_tmpU2[plane]);
}
void SAO::resetSaoUnit(SaoCtuParam* saoUnit)
{
saoUnit->mergeMode = SAO_MERGE_NONE;
saoUnit->typeIdx = -1;
saoUnit->bandPos = 0;
for (int i = 0; i < SAO_NUM_OFFSET; i++)
saoUnit->offset[i] = 0;
}
void SAO::copySaoUnit(SaoCtuParam* saoUnitDst, const SaoCtuParam* saoUnitSrc)
{
saoUnitDst->mergeMode = saoUnitSrc->mergeMode;
saoUnitDst->typeIdx = saoUnitSrc->typeIdx;
saoUnitDst->bandPos = saoUnitSrc->bandPos;
for (int i = 0; i < SAO_NUM_OFFSET; i++)
saoUnitDst->offset[i] = saoUnitSrc->offset[i];
}
/* Calculate SAO statistics for current CTU without non-crossing slice */
void SAO::calcSaoStatsCu(int addr, int plane)
{
const PicYuv* reconPic = m_frame->m_reconPic;
const CUData* cu = m_frame->m_encData->getPicCTU(addr);
const pixel* fenc0 = m_frame->m_fencPic->getPlaneAddr(plane, addr);
const pixel* rec0 = reconPic->getPlaneAddr(plane, addr);
const pixel* fenc;
const pixel* rec;
intptr_t stride = plane ? reconPic->m_strideC : reconPic->m_stride;
uint32_t picWidth = m_param->sourceWidth;
uint32_t picHeight = m_param->sourceHeight;
int ctuWidth = g_maxCUSize;
int ctuHeight = g_maxCUSize;
uint32_t lpelx = cu->m_cuPelX;
uint32_t tpely = cu->m_cuPelY;
if (plane)
{
picWidth >>= m_hChromaShift;
picHeight >>= m_vChromaShift;
ctuWidth >>= m_hChromaShift;
ctuHeight >>= m_vChromaShift;
lpelx >>= m_hChromaShift;
tpely >>= m_vChromaShift;
}
uint32_t rpelx = x265_min(lpelx + ctuWidth, picWidth);
uint32_t bpely = x265_min(tpely + ctuHeight, picHeight);
ctuWidth = rpelx - lpelx;
ctuHeight = bpely - tpely;
int startX;
int startY;
int endX;
int endY;
int skipB = plane ? 2 : 4;
int skipR = plane ? 3 : 5;
int8_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1;
int8_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1;
// SAO_BO:
{
if (m_param->bSaoNonDeblocked)
{
skipB = plane ? 1 : 3;
skipR = plane ? 2 : 4;
}
endX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR;
endY = (bpely == picHeight) ? ctuHeight : ctuHeight - skipB;
primitives.saoCuStatsBO(fenc0, rec0, stride, endX, endY, m_offsetOrg[plane][SAO_BO], m_count[plane][SAO_BO]);
}
{
// SAO_EO_0: // dir: -
{
if (m_param->bSaoNonDeblocked)
{
skipB = plane ? 1 : 3;
skipR = plane ? 3 : 5;
}
startX = !lpelx;
endX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR;
primitives.saoCuStatsE0(fenc0 + startX, rec0 + startX, stride, endX - startX, ctuHeight - skipB, m_offsetOrg[plane][SAO_EO_0], m_count[plane][SAO_EO_0]);
}
// SAO_EO_1: // dir: |
{
if (m_param->bSaoNonDeblocked)
{
skipB = plane ? 2 : 4;
skipR = plane ? 2 : 4;
}
fenc = fenc0;
rec = rec0;
startY = !tpely;
endX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR;
endY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB;
if (!tpely)
{
fenc += stride;
rec += stride;
}
primitives.sign(upBuff1, rec, &rec[- stride], ctuWidth);
primitives.saoCuStatsE1(fenc0 + startY * stride, rec0 + startY * stride, stride, upBuff1, endX, endY - startY, m_offsetOrg[plane][SAO_EO_1], m_count[plane][SAO_EO_1]);
}
// SAO_EO_2: // dir: 135
{
if (m_param->bSaoNonDeblocked)
{
skipB = plane ? 2 : 4;
skipR = plane ? 3 : 5;
}
fenc = fenc0;
rec = rec0;
startX = !lpelx;
endX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR;
startY = !tpely;
endY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB;
if (!tpely)
{
fenc += stride;
rec += stride;
}
primitives.sign(&upBuff1[startX], &rec[startX], &rec[startX - stride - 1], (endX - startX));
primitives.saoCuStatsE2(fenc0 + startX + startY * stride, rec0 + startX + startY * stride, stride, upBuff1 + startX, upBufft + startX, endX - startX, endY - startY, m_offsetOrg[plane][SAO_EO_2], m_count[plane][SAO_EO_2]);
}
// SAO_EO_3: // dir: 45
{
if (m_param->bSaoNonDeblocked)
{
skipB = plane ? 2 : 4;
skipR = plane ? 3 : 5;
}
fenc = fenc0;
rec = rec0;
startX = !lpelx;
endX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR;
startY = !tpely;
endY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB;
if (!tpely)
{
fenc += stride;
rec += stride;
}
primitives.sign(&upBuff1[startX - 1], &rec[startX - 1], &rec[startX - 1 - stride + 1], (endX - startX + 1));
primitives.saoCuStatsE3(fenc0 + startX + startY * stride, rec0 + startX + startY * stride, stride, upBuff1 + startX, endX - startX, endY - startY, m_offsetOrg[plane][SAO_EO_3], m_count[plane][SAO_EO_3]);
}
}
}
void SAO::calcSaoStatsCu_BeforeDblk(Frame* frame, int idxX, int idxY)
{
int addr = idxX + m_numCuInWidth * idxY;
int x, y;
const CUData* cu = frame->m_encData->getPicCTU(addr);
const PicYuv* reconPic = m_frame->m_reconPic;
const pixel* fenc;
const pixel* rec;
intptr_t stride = reconPic->m_stride;
uint32_t picWidth = m_param->sourceWidth;
uint32_t picHeight = m_param->sourceHeight;
int ctuWidth = g_maxCUSize;
int ctuHeight = g_maxCUSize;
uint32_t lpelx = cu->m_cuPelX;
uint32_t tpely = cu->m_cuPelY;
uint32_t rpelx = x265_min(lpelx + ctuWidth, picWidth);
uint32_t bpely = x265_min(tpely + ctuHeight, picHeight);
ctuWidth = rpelx - lpelx;
ctuHeight = bpely - tpely;
int startX;
int startY;
int endX;
int endY;
int firstX, firstY;
int32_t* stats;
int32_t* count;
int skipB, skipR;
int32_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1;
int32_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1;
const int boShift = X265_DEPTH - SAO_BO_BITS;
memset(m_countPreDblk[addr], 0, sizeof(PerPlane));
memset(m_offsetOrgPreDblk[addr], 0, sizeof(PerPlane));
for (int plane = 0; plane < NUM_PLANE; plane++)
{
if (plane == 1)
{
stride = reconPic->m_strideC;
picWidth >>= m_hChromaShift;
picHeight >>= m_vChromaShift;
ctuWidth >>= m_hChromaShift;
ctuHeight >>= m_vChromaShift;
lpelx >>= m_hChromaShift;
tpely >>= m_vChromaShift;
rpelx >>= m_hChromaShift;
bpely >>= m_vChromaShift;
}
// SAO_BO:
skipB = plane ? 1 : 3;
skipR = plane ? 2 : 4;
stats = m_offsetOrgPreDblk[addr][plane][SAO_BO];
count = m_countPreDblk[addr][plane][SAO_BO];
const pixel* fenc0 = m_frame->m_fencPic->getPlaneAddr(plane, addr);
const pixel* rec0 = reconPic->getPlaneAddr(plane, addr);
fenc = fenc0;
rec = rec0;
startX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR;
startY = (bpely == picHeight) ? ctuHeight : ctuHeight - skipB;
for (y = 0; y < ctuHeight; y++)
{
for (x = (y < startY ? startX : 0); x < ctuWidth; x++)
{
int classIdx = 1 + (rec[x] >> boShift);
stats[classIdx] += (fenc[x] - rec[x]);
count[classIdx]++;
}
fenc += stride;
rec += stride;
}
// SAO_EO_0: // dir: -
{
skipB = plane ? 1 : 3;
skipR = plane ? 3 : 5;
stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_0];
count = m_countPreDblk[addr][plane][SAO_EO_0];
fenc = fenc0;
rec = rec0;
startX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR;
startY = (bpely == picHeight) ? ctuHeight : ctuHeight - skipB;
firstX = !lpelx;
// endX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
endX = ctuWidth - 1; // not refer right CTU
for (y = 0; y < ctuHeight; y++)
{
x = (y < startY ? startX : firstX);
int signLeft = signOf(rec[x] - rec[x - 1]);
for (; x < endX; x++)
{
int signRight = signOf(rec[x] - rec[x + 1]);
int edgeType = signRight + signLeft + 2;
signLeft = -signRight;
stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]);
count[s_eoTable[edgeType]]++;
}
fenc += stride;
rec += stride;
}
}
// SAO_EO_1: // dir: |
{
skipB = plane ? 2 : 4;
skipR = plane ? 2 : 4;
stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_1];
count = m_countPreDblk[addr][plane][SAO_EO_1];
fenc = fenc0;
rec = rec0;
startX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR;
startY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB;
firstY = !tpely;
// endY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
endY = ctuHeight - 1; // not refer below CTU
if (!tpely)
{
fenc += stride;
rec += stride;
}
for (x = startX; x < ctuWidth; x++)
upBuff1[x] = signOf(rec[x] - rec[x - stride]);
for (y = firstY; y < endY; y++)
{
for (x = (y < startY - 1 ? startX : 0); x < ctuWidth; x++)
{
int signDown = signOf(rec[x] - rec[x + stride]);
int edgeType = signDown + upBuff1[x] + 2;
upBuff1[x] = -signDown;
if (x < startX && y < startY)
continue;
stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]);
count[s_eoTable[edgeType]]++;
}
fenc += stride;
rec += stride;
}
}
// SAO_EO_2: // dir: 135
{
skipB = plane ? 2 : 4;
skipR = plane ? 3 : 5;
stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_2];
count = m_countPreDblk[addr][plane][SAO_EO_2];
fenc = fenc0;
rec = rec0;
startX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR;
startY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB;
firstX = !lpelx;
firstY = !tpely;
// endX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
// endY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
endX = ctuWidth - 1; // not refer right CTU
endY = ctuHeight - 1; // not refer below CTU
if (!tpely)
{
fenc += stride;
rec += stride;
}
for (x = startX; x < endX; x++)
upBuff1[x] = signOf(rec[x] - rec[x - stride - 1]);
for (y = firstY; y < endY; y++)
{
x = (y < startY - 1 ? startX : firstX);
upBufft[x] = signOf(rec[x + stride] - rec[x - 1]);
for (; x < endX; x++)
{
int signDown = signOf(rec[x] - rec[x + stride + 1]);
int edgeType = signDown + upBuff1[x] + 2;
upBufft[x + 1] = -signDown;
if (x < startX && y < startY)
continue;
stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]);
count[s_eoTable[edgeType]]++;
}
std::swap(upBuff1, upBufft);
rec += stride;
fenc += stride;
}
}
// SAO_EO_3: // dir: 45
{
skipB = plane ? 2 : 4;
skipR = plane ? 3 : 5;
stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_3];
count = m_countPreDblk[addr][plane][SAO_EO_3];
fenc = fenc0;
rec = rec0;
startX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR;
startY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB;
firstX = !lpelx;
firstY = !tpely;
// endX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
// endY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
endX = ctuWidth - 1; // not refer right CTU
endY = ctuHeight - 1; // not refer below CTU
if (!tpely)
{
fenc += stride;
rec += stride;
}
for (x = startX - 1; x < endX; x++)
upBuff1[x] = signOf(rec[x] - rec[x - stride + 1]);
for (y = firstY; y < endY; y++)
{
for (x = (y < startY - 1 ? startX : firstX); x < endX; x++)
{
int signDown = signOf(rec[x] - rec[x + stride - 1]);
int edgeType = signDown + upBuff1[x] + 2;
upBuff1[x - 1] = -signDown;
if (x < startX && y < startY)
continue;
stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]);
count[s_eoTable[edgeType]]++;
}
upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
rec += stride;
fenc += stride;
}
}
}
}
/* reset offset statistics */
void SAO::resetStats()
{
memset(m_count, 0, sizeof(PerClass) * NUM_PLANE);
memset(m_offset, 0, sizeof(PerClass) * NUM_PLANE);
memset(m_offsetOrg, 0, sizeof(PerClass) * NUM_PLANE);
}
void SAO::rdoSaoUnitRowEnd(const SAOParam* saoParam, int numctus)
{
if (!saoParam->bSaoFlag[0])
m_depthSaoRate[0][m_refDepth] = 1.0;
else
m_depthSaoRate[0][m_refDepth] = m_numNoSao[0] / ((double)numctus);
if (!saoParam->bSaoFlag[1])
m_depthSaoRate[1][m_refDepth] = 1.0;
else
m_depthSaoRate[1][m_refDepth] = m_numNoSao[1] / ((double)numctus);
}
void SAO::rdoSaoUnitRow(SAOParam* saoParam, int idxY)
{
SaoCtuParam mergeSaoParam[NUM_MERGE_MODE][2];
double mergeDist[NUM_MERGE_MODE];
bool allowMerge[2]; // left, up
allowMerge[1] = (idxY > 0);
for (int idxX = 0; idxX < m_numCuInWidth; idxX++)
{
int addr = idxX + idxY * m_numCuInWidth;
int addrUp = idxY ? addr - m_numCuInWidth : -1;
int addrLeft = idxX ? addr - 1 : -1;
allowMerge[0] = (idxX > 0);
m_entropyCoder.load(m_rdContexts.cur);
if (allowMerge[0])
m_entropyCoder.codeSaoMerge(0);
if (allowMerge[1])
m_entropyCoder.codeSaoMerge(0);
m_entropyCoder.store(m_rdContexts.temp);
// reset stats Y, Cb, Cr
for (int plane = 0; plane < m_numPlanes; plane++)
{
for (int j = 0; j < MAX_NUM_SAO_TYPE; j++)
{
for (int k = 0; k < MAX_NUM_SAO_CLASS; k++)
{
m_offset[plane][j][k] = 0;
if (m_param->bSaoNonDeblocked)
{
m_count[plane][j][k] = m_countPreDblk[addr][plane][j][k];
m_offsetOrg[plane][j][k] = m_offsetOrgPreDblk[addr][plane][j][k];
}
else
{
m_count[plane][j][k] = 0;
m_offsetOrg[plane][j][k] = 0;
}
}
}
saoParam->ctuParam[plane][addr].mergeMode = SAO_MERGE_NONE;
saoParam->ctuParam[plane][addr].typeIdx = -1;
saoParam->ctuParam[plane][addr].bandPos = 0;
if (saoParam->bSaoFlag[plane > 0])
calcSaoStatsCu(addr, plane);
}
saoComponentParamDist(saoParam, addr, addrUp, addrLeft, &mergeSaoParam[0][0], mergeDist);
if (m_numPlanes > 1) {
sao2ChromaParamDist(saoParam, addr, addrUp, addrLeft, mergeSaoParam, mergeDist);
}
if (saoParam->bSaoFlag[0] || saoParam->bSaoFlag[1])
{
// Cost of new SAO_params
m_entropyCoder.load(m_rdContexts.cur);
m_entropyCoder.resetBits();
if (allowMerge[0])
m_entropyCoder.codeSaoMerge(0);
if (allowMerge[1])
m_entropyCoder.codeSaoMerge(0);
for (int plane = 0; plane < m_numPlanes; plane++)
{
if (saoParam->bSaoFlag[plane > 0])
m_entropyCoder.codeSaoOffset(saoParam->ctuParam[plane][addr], plane);
}
uint32_t rate = m_entropyCoder.getNumberOfWrittenBits();
double bestCost = mergeDist[0] + (double)rate;
m_entropyCoder.store(m_rdContexts.temp);
// Cost of Merge
for (int mergeIdx = 0; mergeIdx < 2; ++mergeIdx)
{
if (!allowMerge[mergeIdx])
continue;
m_entropyCoder.load(m_rdContexts.cur);
m_entropyCoder.resetBits();
if (allowMerge[0])
m_entropyCoder.codeSaoMerge(1 - mergeIdx);
if (allowMerge[1] && (mergeIdx == 1))
m_entropyCoder.codeSaoMerge(1);
rate = m_entropyCoder.getNumberOfWrittenBits();
double mergeCost = mergeDist[mergeIdx + 1] + (double)rate;
if (mergeCost < bestCost)
{
SaoMergeMode mergeMode = mergeIdx ? SAO_MERGE_UP : SAO_MERGE_LEFT;
bestCost = mergeCost;
m_entropyCoder.store(m_rdContexts.temp);
for (int plane = 0; plane < m_numPlanes; plane++)
{
mergeSaoParam[plane][mergeIdx].mergeMode = mergeMode;
if (saoParam->bSaoFlag[plane > 0])
copySaoUnit(&saoParam->ctuParam[plane][addr], &mergeSaoParam[plane][mergeIdx]);
}
}
}
if (saoParam->ctuParam[0][addr].typeIdx < 0)
m_numNoSao[0]++;
if (saoParam->ctuParam[1][addr].typeIdx < 0)
m_numNoSao[1]++;
m_entropyCoder.load(m_rdContexts.temp);
m_entropyCoder.store(m_rdContexts.cur);
}
}
}
/** rate distortion optimization of SAO unit */
inline int64_t SAO::estSaoTypeDist(int plane, int typeIdx, double lambda, int32_t* currentDistortionTableBo, double* currentRdCostTableBo)
{
int64_t estDist = 0;
for (int classIdx = 1; classIdx < ((typeIdx < SAO_BO) ? SAO_EO_LEN + 1 : SAO_NUM_BO_CLASSES + 1); classIdx++)
{
int32_t count = m_count[plane][typeIdx][classIdx];
int32_t& offsetOrg = m_offsetOrg[plane][typeIdx][classIdx];
int32_t& offsetOut = m_offset[plane][typeIdx][classIdx];
if (typeIdx == SAO_BO)
{
currentDistortionTableBo[classIdx - 1] = 0;
currentRdCostTableBo[classIdx - 1] = lambda;
}
if (count)
{
int offset = roundIBDI(offsetOrg << (X265_DEPTH - 8), count);
offset = x265_clip3(-OFFSET_THRESH + 1, OFFSET_THRESH - 1, offset);
if (typeIdx < SAO_BO)
{
if (classIdx < 3)
offset = X265_MAX(offset, 0);
else
offset = X265_MIN(offset, 0);
}
offsetOut = estIterOffset(typeIdx, classIdx, lambda, offset, count, offsetOrg, currentDistortionTableBo, currentRdCostTableBo);
}
else
{
offsetOrg = 0;
offsetOut = 0;
}
if (typeIdx != SAO_BO)
estDist += estSaoDist(count, (int)offsetOut << SAO_BIT_INC, offsetOrg);
}
return estDist;
}
inline int SAO::estIterOffset(int typeIdx, int classIdx, double lambda, int offset, int32_t count, int32_t offsetOrg, int32_t* currentDistortionTableBo, double* currentRdCostTableBo)
{
int offsetOut = 0;
// Assuming sending quantized value 0 results in zero offset and sending the value zero needs 1 bit. entropy coder can be used to measure the exact rate here.
double tempMinCost = lambda;
while (offset != 0)
{
// Calculate the bits required for signalling the offset
int tempRate = (typeIdx == SAO_BO) ? (abs(offset) + 2) : (abs(offset) + 1);
if (abs(offset) == OFFSET_THRESH - 1)
tempRate--;
// Do the dequntization before distorion calculation
int tempOffset = offset << SAO_BIT_INC;
int64_t tempDist = estSaoDist(count, tempOffset, offsetOrg);
double tempCost = ((double)tempDist + lambda * (double)tempRate);
if (tempCost < tempMinCost)
{
tempMinCost = tempCost;
offsetOut = offset;
if (typeIdx == SAO_BO)
{
currentDistortionTableBo[classIdx - 1] = (int)tempDist;
currentRdCostTableBo[classIdx - 1] = tempCost;
}
}
offset = (offset > 0) ? (offset - 1) : (offset + 1);
}
return offsetOut;
}
void SAO::saoComponentParamDist(SAOParam* saoParam, int addr, int addrUp, int addrLeft, SaoCtuParam* mergeSaoParam, double* mergeDist)
{
int64_t bestDist = 0;
SaoCtuParam* lclCtuParam = &saoParam->ctuParam[0][addr];
double bestRDCostTableBo = MAX_DOUBLE;
int bestClassTableBo = 0;
int currentDistortionTableBo[MAX_NUM_SAO_CLASS];
double currentRdCostTableBo[MAX_NUM_SAO_CLASS];
resetSaoUnit(lclCtuParam);
m_entropyCoder.load(m_rdContexts.temp);
m_entropyCoder.resetBits();
m_entropyCoder.codeSaoOffset(*lclCtuParam, 0);
double dCostPartBest = m_entropyCoder.getNumberOfWrittenBits() * m_lumaLambda;
for (int typeIdx = 0; typeIdx < MAX_NUM_SAO_TYPE; typeIdx++)
{
int64_t estDist = estSaoTypeDist(0, typeIdx, m_lumaLambda, currentDistortionTableBo, currentRdCostTableBo);
if (typeIdx == SAO_BO)
{
// Estimate Best Position
for (int i = 0; i < SAO_NUM_BO_CLASSES - SAO_BO_LEN + 1; i++)
{
double currentRDCost = 0.0;
for (int j = i; j < i + SAO_BO_LEN; j++)
currentRDCost += currentRdCostTableBo[j];
if (currentRDCost < bestRDCostTableBo)
{
bestRDCostTableBo = currentRDCost;
bestClassTableBo = i;
}
}
// Re code all Offsets
// Code Center
estDist = 0;
for (int classIdx = bestClassTableBo; classIdx < bestClassTableBo + SAO_BO_LEN; classIdx++)
estDist += currentDistortionTableBo[classIdx];
}
SaoCtuParam ctuParamRdo;
ctuParamRdo.mergeMode = SAO_MERGE_NONE;
ctuParamRdo.typeIdx = typeIdx;
ctuParamRdo.bandPos = (typeIdx == SAO_BO) ? bestClassTableBo : 0;
for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
ctuParamRdo.offset[classIdx] = (int)m_offset[0][typeIdx][classIdx + ctuParamRdo.bandPos + 1];
m_entropyCoder.load(m_rdContexts.temp);
m_entropyCoder.resetBits();
m_entropyCoder.codeSaoOffset(ctuParamRdo, 0);
uint32_t estRate = m_entropyCoder.getNumberOfWrittenBits();
double cost = (double)estDist + m_lumaLambda * (double)estRate;
if (cost < dCostPartBest)
{
dCostPartBest = cost;
copySaoUnit(lclCtuParam, &ctuParamRdo);
bestDist = estDist;
}
}
mergeDist[0] = ((double)bestDist / m_lumaLambda);
m_entropyCoder.load(m_rdContexts.temp);
m_entropyCoder.codeSaoOffset(*lclCtuParam, 0);
m_entropyCoder.store(m_rdContexts.temp);
// merge left or merge up
for (int mergeIdx = 0; mergeIdx < 2; mergeIdx++)
{
SaoCtuParam* mergeSrcParam = NULL;
if (addrLeft >= 0 && mergeIdx == 0)
mergeSrcParam = &(saoParam->ctuParam[0][addrLeft]);
else if (addrUp >= 0 && mergeIdx == 1)
mergeSrcParam = &(saoParam->ctuParam[0][addrUp]);
if (mergeSrcParam)
{
int64_t estDist = 0;
int typeIdx = mergeSrcParam->typeIdx;
if (typeIdx >= 0)
{
int bandPos = (typeIdx == SAO_BO) ? mergeSrcParam->bandPos : 0;
for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
{
int mergeOffset = mergeSrcParam->offset[classIdx];
estDist += estSaoDist(m_count[0][typeIdx][classIdx + bandPos + 1], mergeOffset, m_offsetOrg[0][typeIdx][classIdx + bandPos + 1]);
}
}
copySaoUnit(&mergeSaoParam[mergeIdx], mergeSrcParam);
mergeSaoParam[mergeIdx].mergeMode = mergeIdx ? SAO_MERGE_UP : SAO_MERGE_LEFT;
mergeDist[mergeIdx + 1] = ((double)estDist / m_lumaLambda);
}
else
resetSaoUnit(&mergeSaoParam[mergeIdx]);
}
}
void SAO::sao2ChromaParamDist(SAOParam* saoParam, int addr, int addrUp, int addrLeft, SaoCtuParam mergeSaoParam[][2], double* mergeDist)
{
int64_t bestDist = 0;
SaoCtuParam* lclCtuParam[2] = { &saoParam->ctuParam[1][addr], &saoParam->ctuParam[2][addr] };
double currentRdCostTableBo[MAX_NUM_SAO_CLASS];
int bestClassTableBo[2] = { 0, 0 };
int currentDistortionTableBo[MAX_NUM_SAO_CLASS];
resetSaoUnit(lclCtuParam[0]);
resetSaoUnit(lclCtuParam[1]);
m_entropyCoder.load(m_rdContexts.temp);
m_entropyCoder.resetBits();
m_entropyCoder.codeSaoOffset(*lclCtuParam[0], 1);
m_entropyCoder.codeSaoOffset(*lclCtuParam[1], 2);
double costPartBest = m_entropyCoder.getNumberOfWrittenBits() * m_chromaLambda;
for (int typeIdx = 0; typeIdx < MAX_NUM_SAO_TYPE; typeIdx++)
{
int64_t estDist[2];
if (typeIdx == SAO_BO)
{
// Estimate Best Position
for (int compIdx = 0; compIdx < 2; compIdx++)
{
double bestRDCostTableBo = MAX_DOUBLE;
estDist[compIdx] = estSaoTypeDist(compIdx + 1, typeIdx, m_chromaLambda, currentDistortionTableBo, currentRdCostTableBo);
for (int i = 0; i < SAO_NUM_BO_CLASSES - SAO_BO_LEN + 1; i++)
{
double currentRDCost = 0.0;
for (int j = i; j < i + SAO_BO_LEN; j++)
currentRDCost += currentRdCostTableBo[j];
if (currentRDCost < bestRDCostTableBo)
{
bestRDCostTableBo = currentRDCost;
bestClassTableBo[compIdx] = i;
}
}
// Re code all Offsets
// Code Center
estDist[compIdx] = 0;
for (int classIdx = bestClassTableBo[compIdx]; classIdx < bestClassTableBo[compIdx] + SAO_BO_LEN; classIdx++)
estDist[compIdx] += currentDistortionTableBo[classIdx];
}
}
else
{
estDist[0] = estSaoTypeDist(1, typeIdx, m_chromaLambda, currentDistortionTableBo, currentRdCostTableBo);
estDist[1] = estSaoTypeDist(2, typeIdx, m_chromaLambda, currentDistortionTableBo, currentRdCostTableBo);
}
m_entropyCoder.load(m_rdContexts.temp);
m_entropyCoder.resetBits();
SaoCtuParam ctuParamRdo[2];
for (int compIdx = 0; compIdx < 2; compIdx++)
{
ctuParamRdo[compIdx].mergeMode = SAO_MERGE_NONE;
ctuParamRdo[compIdx].typeIdx = typeIdx;
ctuParamRdo[compIdx].bandPos = (typeIdx == SAO_BO) ? bestClassTableBo[compIdx] : 0;
for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
ctuParamRdo[compIdx].offset[classIdx] = (int)m_offset[compIdx + 1][typeIdx][classIdx + ctuParamRdo[compIdx].bandPos + 1];
m_entropyCoder.codeSaoOffset(ctuParamRdo[compIdx], compIdx + 1);
}
uint32_t estRate = m_entropyCoder.getNumberOfWrittenBits();
double cost = (double)(estDist[0] + estDist[1]) + m_chromaLambda * (double)estRate;
if (cost < costPartBest)
{
costPartBest = cost;
copySaoUnit(lclCtuParam[0], &ctuParamRdo[0]);
copySaoUnit(lclCtuParam[1], &ctuParamRdo[1]);
bestDist = (estDist[0] + estDist[1]);
}
}
mergeDist[0] += ((double)bestDist / m_chromaLambda);
m_entropyCoder.load(m_rdContexts.temp);
m_entropyCoder.codeSaoOffset(*lclCtuParam[0], 1);
m_entropyCoder.codeSaoOffset(*lclCtuParam[1], 2);
m_entropyCoder.store(m_rdContexts.temp);
// merge left or merge up
for (int mergeIdx = 0; mergeIdx < 2; mergeIdx++)
{
for (int compIdx = 0; compIdx < 2; compIdx++)
{
int plane = compIdx + 1;
SaoCtuParam* mergeSrcParam = NULL;
if (addrLeft >= 0 && mergeIdx == 0)
mergeSrcParam = &(saoParam->ctuParam[plane][addrLeft]);
else if (addrUp >= 0 && mergeIdx == 1)
mergeSrcParam = &(saoParam->ctuParam[plane][addrUp]);
if (mergeSrcParam)
{
int64_t estDist = 0;
int typeIdx = mergeSrcParam->typeIdx;
if (typeIdx >= 0)
{
int bandPos = (typeIdx == SAO_BO) ? mergeSrcParam->bandPos : 0;
for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
{
int mergeOffset = mergeSrcParam->offset[classIdx];
estDist += estSaoDist(m_count[plane][typeIdx][classIdx + bandPos + 1], mergeOffset, m_offsetOrg[plane][typeIdx][classIdx + bandPos + 1]);
}
}
copySaoUnit(&mergeSaoParam[plane][mergeIdx], mergeSrcParam);
mergeSaoParam[plane][mergeIdx].mergeMode = mergeIdx ? SAO_MERGE_UP : SAO_MERGE_LEFT;
mergeDist[mergeIdx + 1] += ((double)estDist / m_chromaLambda);
}
else
resetSaoUnit(&mergeSaoParam[plane][mergeIdx]);
}
}
}
// NOTE: must put in namespace X265_NS since we need class SAO
void saoCuStatsBO_c(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
{
int x, y;
const int boShift = X265_DEPTH - SAO_BO_BITS;
for (y = 0; y < endY; y++)
{
for (x = 0; x < endX; x++)
{
int classIdx = 1 + (rec[x] >> boShift);
stats[classIdx] += (fenc[x] - rec[x]);
count[classIdx]++;
}
fenc += stride;
rec += stride;
}
}
void saoCuStatsE0_c(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
{
int x, y;
int32_t tmp_stats[SAO::NUM_EDGETYPE];
int32_t tmp_count[SAO::NUM_EDGETYPE];
memset(tmp_stats, 0, sizeof(tmp_stats));
memset(tmp_count, 0, sizeof(tmp_count));
for (y = 0; y < endY; y++)
{
int signLeft = signOf(rec[0] - rec[-1]);
for (x = 0; x < endX; x++)
{
int signRight = signOf2(rec[x], rec[x + 1]);
X265_CHECK(signRight == signOf(rec[x] - rec[x + 1]), "signDown check failure\n");
uint32_t edgeType = signRight + signLeft + 2;
signLeft = -signRight;
X265_CHECK(edgeType <= 4, "edgeType check failure\n");
tmp_stats[edgeType] += (fenc[x] - rec[x]);
tmp_count[edgeType]++;
}
fenc += stride;
rec += stride;
}
for (x = 0; x < SAO::NUM_EDGETYPE; x++)
{
stats[SAO::s_eoTable[x]] += tmp_stats[x];
count[SAO::s_eoTable[x]] += tmp_count[x];
}
}
void saoCuStatsE1_c(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
{
X265_CHECK(endX <= MAX_CU_SIZE, "endX check failure\n");
X265_CHECK(endY <= MAX_CU_SIZE, "endY check failure\n");
int x, y;
int32_t tmp_stats[SAO::NUM_EDGETYPE];
int32_t tmp_count[SAO::NUM_EDGETYPE];
memset(tmp_stats, 0, sizeof(tmp_stats));
memset(tmp_count, 0, sizeof(tmp_count));
for (y = 0; y < endY; y++)
{
for (x = 0; x < endX; x++)
{
int signDown = signOf2(rec[x], rec[x + stride]);
X265_CHECK(signDown == signOf(rec[x] - rec[x + stride]), "signDown check failure\n");
uint32_t edgeType = signDown + upBuff1[x] + 2;
upBuff1[x] = (int8_t)(-signDown);
tmp_stats[edgeType] += (fenc[x] - rec[x]);
tmp_count[edgeType]++;
}
fenc += stride;
rec += stride;
}
for (x = 0; x < SAO::NUM_EDGETYPE; x++)
{
stats[SAO::s_eoTable[x]] += tmp_stats[x];
count[SAO::s_eoTable[x]] += tmp_count[x];
}
}
void saoCuStatsE2_c(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count)
{
X265_CHECK(endX < MAX_CU_SIZE, "endX check failure\n");
X265_CHECK(endY < MAX_CU_SIZE, "endY check failure\n");
int x, y;
int32_t tmp_stats[SAO::NUM_EDGETYPE];
int32_t tmp_count[SAO::NUM_EDGETYPE];
memset(tmp_stats, 0, sizeof(tmp_stats));
memset(tmp_count, 0, sizeof(tmp_count));
for (y = 0; y < endY; y++)
{
upBufft[0] = signOf(rec[stride] - rec[-1]);
for (x = 0; x < endX; x++)
{
int signDown = signOf2(rec[x], rec[x + stride + 1]);
X265_CHECK(signDown == signOf(rec[x] - rec[x + stride + 1]), "signDown check failure\n");
uint32_t edgeType = signDown + upBuff1[x] + 2;
upBufft[x + 1] = (int8_t)(-signDown);
tmp_stats[edgeType] += (fenc[x] - rec[x]);
tmp_count[edgeType]++;
}
std::swap(upBuff1, upBufft);
rec += stride;
fenc += stride;
}
for (x = 0; x < SAO::NUM_EDGETYPE; x++)
{
stats[SAO::s_eoTable[x]] += tmp_stats[x];
count[SAO::s_eoTable[x]] += tmp_count[x];
}
}
void saoCuStatsE3_c(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
{
X265_CHECK(endX < MAX_CU_SIZE, "endX check failure\n");
X265_CHECK(endY < MAX_CU_SIZE, "endY check failure\n");
int x, y;
int32_t tmp_stats[SAO::NUM_EDGETYPE];
int32_t tmp_count[SAO::NUM_EDGETYPE];
memset(tmp_stats, 0, sizeof(tmp_stats));
memset(tmp_count, 0, sizeof(tmp_count));
for (y = 0; y < endY; y++)
{
for (x = 0; x < endX; x++)
{
int signDown = signOf2(rec[x], rec[x + stride - 1]);
X265_CHECK(signDown == signOf(rec[x] - rec[x + stride - 1]), "signDown check failure\n");
X265_CHECK(abs(upBuff1[x]) <= 1, "upBuffer1 check failure\n");
uint32_t edgeType = signDown + upBuff1[x] + 2;
upBuff1[x - 1] = (int8_t)(-signDown);
tmp_stats[edgeType] += (fenc[x] - rec[x]);
tmp_count[edgeType]++;
}
upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
rec += stride;
fenc += stride;
}
for (x = 0; x < SAO::NUM_EDGETYPE; x++)
{
stats[SAO::s_eoTable[x]] += tmp_stats[x];
count[SAO::s_eoTable[x]] += tmp_count[x];
}
}
void setupSaoPrimitives_c(EncoderPrimitives &p)
{
// TODO: move other sao functions to here
p.saoCuStatsBO = saoCuStatsBO_c;
p.saoCuStatsE0 = saoCuStatsE0_c;
p.saoCuStatsE1 = saoCuStatsE1_c;
p.saoCuStatsE2 = saoCuStatsE2_c;
p.saoCuStatsE3 = saoCuStatsE3_c;
}
}