1 /*****************************************************************************
2  * Copyright (C) 2013-2020 MulticoreWare, Inc
3  *
4  * Authors: Steve Borho <steve@borho.org>
5  *          Min Chen <chenm003@163.com>
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20  *
21  * This program is also available under a commercial proprietary license.
22  * For more information, contact us at license @ x265.com.
23  *****************************************************************************/
24 
25 #include "common.h"
26 #include "primitives.h"
27 #include "quant.h"
28 #include "framedata.h"
29 #include "entropy.h"
30 #include "yuv.h"
31 #include "cudata.h"
32 #include "contexts.h"
33 
34 using namespace X265_NS;
35 
/* Transfers the sign of y onto x: evaluates to x when y is non-negative and
 * to -x when y is negative. (y >> 31) is used as an all-zeros / all-ones mask,
 * so this assumes y is a 32-bit signed integer and that right shift of a
 * negative value is arithmetic. Both arguments are fully parenthesized so
 * that expression arguments (e.g. a | b) are not re-associated with the
 * surrounding ^ and >> operators. Note that y is evaluated twice. */
#define SIGN(x, y) (((x) ^ ((y) >> 31)) - ((y) >> 31))
37 
38 namespace {
39 
/* Per coefficient group rate-distortion bookkeeping */
struct coeffGroupRDStats
{
    /* indicates coeff other than pos 0 are coded */
    int     nnzBeforePos0;

    /* distortion and level cost of coded coefficients */
    int64_t codedLevelAndDist;

    /* uncoded distortion cost of coded coefficients */
    int64_t uncodedDist;

    /* cost of signaling significant coeff bitmap */
    int64_t sigCost;

    /* cost of signaling sig coeff bit of coeff 0 */
    int64_t sigCost0;
};
48 
/* Returns the smaller of x and y.
 *
 * Implemented as a plain comparison. The previous branch-free form computed
 * (x - y), which overflows for operands that are far apart (undefined
 * behaviour, and a wrong result in practice), and relied on arithmetic right
 * shift of a negative value. Compilers emit a conditional move for this form,
 * so nothing is lost by writing it directly. */
inline int fastMin(int x, int y)
{
    return (x < y) ? x : y;
}
53 
getICRate(uint32_t absLevel,int32_t diffLevel,const int * greaterOneBits,const int * levelAbsBits,const uint32_t absGoRice,const uint32_t maxVlc,const uint32_t c1c2Rate)54 inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, const uint32_t absGoRice, const uint32_t maxVlc, const uint32_t c1c2Rate)
55 {
56     X265_CHECK(absGoRice <= 4, "absGoRice check failure\n");
57     if (!absLevel)
58     {
59         X265_CHECK(diffLevel < 0, "diffLevel check failure\n");
60         return 0;
61     }
62     int rate = 0;
63 
64     if (diffLevel < 0)
65     {
66         X265_CHECK(absLevel <= 2, "absLevel check failure\n");
67         rate += greaterOneBits[(absLevel == 2)];
68 
69         if (absLevel == 2)
70             rate += levelAbsBits[0];
71     }
72     else
73     {
74         uint32_t symbol = diffLevel;
75         bool expGolomb = (symbol > maxVlc);
76 
77         if (expGolomb)
78         {
79             absLevel = symbol - maxVlc;
80 
81             // NOTE: mapping to x86 hardware instruction BSR
82             unsigned long size;
83             CLZ(size, absLevel);
84             int egs = size * 2 + 1;
85 
86             rate += egs << 15;
87 
88             // NOTE: in here, expGolomb=true means (symbol >= maxVlc + 1)
89             X265_CHECK(fastMin(symbol, (maxVlc + 1)) == (int)maxVlc + 1, "min check failure\n");
90             symbol = maxVlc + 1;
91         }
92 
93         uint32_t prefLen = (symbol >> absGoRice) + 1;
94         uint32_t numBins = fastMin(prefLen + absGoRice, 8 /* g_goRicePrefixLen[absGoRice] + absGoRice */);
95 
96         rate += numBins << 15;
97         rate += c1c2Rate;
98     }
99     return rate;
100 }
101 
#if CHECKED_BUILD || _DEBUG
/* Rate of a level that is coded with the greater-one / level-abs flags only
 * (absLevel is expected to be 0, 1 or 2). Only compiled into checked and
 * debug builds. */
inline int getICRateNegDiff(uint32_t absLevel, const int* greaterOneBits, const int* levelAbsBits)
{
    X265_CHECK(absLevel <= 2, "absLevel check failure\n");

    switch (absLevel)
    {
    case 0:
        return 0;
    case 2:
        return greaterOneBits[1] + levelAbsBits[0];
    default:
        return greaterOneBits[0];
    }
}
#endif
117 
getICRateLessVlc(uint32_t absLevel,int32_t diffLevel,const uint32_t absGoRice)118 inline int getICRateLessVlc(uint32_t absLevel, int32_t diffLevel, const uint32_t absGoRice)
119 {
120     X265_CHECK(absGoRice <= 4, "absGoRice check failure\n");
121     if (!absLevel)
122     {
123         X265_CHECK(diffLevel < 0, "diffLevel check failure\n");
124         return 0;
125     }
126     int rate;
127 
128     uint32_t symbol = diffLevel;
129     uint32_t prefLen = (symbol >> absGoRice) + 1;
130     uint32_t numBins = fastMin(prefLen + absGoRice, 8 /* g_goRicePrefixLen[absGoRice] + absGoRice */);
131 
132     rate = numBins << 15;
133 
134     return rate;
135 }
136 
137 /* Calculates the cost for specific absolute transform level */
getICRateCost(uint32_t absLevel,int32_t diffLevel,const int * greaterOneBits,const int * levelAbsBits,uint32_t absGoRice,const uint32_t c1c2Rate)138 inline uint32_t getICRateCost(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, uint32_t absGoRice, const uint32_t c1c2Rate)
139 {
140     X265_CHECK(absLevel, "absLevel should not be zero\n");
141 
142     if (diffLevel < 0)
143     {
144         X265_CHECK((absLevel == 1) || (absLevel == 2), "absLevel range check failure\n");
145 
146         uint32_t rate = greaterOneBits[(absLevel == 2)];
147         if (absLevel == 2)
148             rate += levelAbsBits[0];
149         return rate;
150     }
151     else
152     {
153         uint32_t rate;
154         uint32_t symbol = diffLevel;
155         if ((symbol >> absGoRice) < COEF_REMAIN_BIN_REDUCTION)
156         {
157             uint32_t length = symbol >> absGoRice;
158             rate = (length + 1 + absGoRice) << 15;
159         }
160         else
161         {
162             uint32_t length = 0;
163             symbol = (symbol >> absGoRice) - COEF_REMAIN_BIN_REDUCTION;
164             if (symbol)
165             {
166                 unsigned long idx;
167                 CLZ(idx, symbol + 1);
168                 length = idx;
169             }
170 
171             rate = (COEF_REMAIN_BIN_REDUCTION + length + absGoRice + 1 + length) << 15;
172         }
173         rate += c1c2Rate;
174         return rate;
175     }
176 }
177 
178 }
179 
180 Quant::rdoQuant_t Quant::rdoQuant_func[NUM_CU_DEPTH] = {&Quant::rdoQuant<2>, &Quant::rdoQuant<3>, &Quant::rdoQuant<4>, &Quant::rdoQuant<5>};
181 
Quant()182 Quant::Quant()
183 {
184     m_resiDctCoeff = NULL;
185     m_fencDctCoeff = NULL;
186     m_fencShortBuf = NULL;
187     m_frameNr      = NULL;
188     m_nr           = NULL;
189 }
190 
init(double psyScale,const ScalingList & scalingList,Entropy & entropy)191 bool Quant::init(double psyScale, const ScalingList& scalingList, Entropy& entropy)
192 {
193     m_entropyCoder = &entropy;
194     m_psyRdoqScale = (int32_t)(psyScale * 256.0);
195     X265_CHECK((psyScale * 256.0) < (double)MAX_INT, "psyScale value too large\n");
196     m_scalingList  = &scalingList;
197     m_resiDctCoeff = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE * 2);
198     m_fencDctCoeff = m_resiDctCoeff + (MAX_TR_SIZE * MAX_TR_SIZE);
199     m_fencShortBuf = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE);
200 
201     return m_resiDctCoeff && m_fencShortBuf;
202 }
203 
allocNoiseReduction(const x265_param & param)204 bool Quant::allocNoiseReduction(const x265_param& param)
205 {
206     m_frameNr = X265_MALLOC(NoiseReduction, param.frameNumThreads);
207     if (m_frameNr)
208         memset(m_frameNr, 0, sizeof(NoiseReduction) * param.frameNumThreads);
209     else
210         return false;
211     return true;
212 }
213 
~Quant()214 Quant::~Quant()
215 {
216     X265_FREE(m_frameNr);
217     X265_FREE(m_resiDctCoeff);
218     X265_FREE(m_fencShortBuf);
219 }
220 
setQPforQuant(const CUData & ctu,int qp)221 void Quant::setQPforQuant(const CUData& ctu, int qp)
222 {
223     m_nr = m_frameNr ? &m_frameNr[ctu.m_encData->m_frameEncoderID] : NULL;
224     m_qpParam[TEXT_LUMA].setQpParam(qp + QP_BD_OFFSET);
225     m_rdoqLevel = ctu.m_encData->m_param->rdoqLevel;
226     if (ctu.m_chromaFormat != X265_CSP_I400)
227     {
228         setChromaQP(qp + ctu.m_slice->m_pps->chromaQpOffset[0] + ctu.m_slice->m_chromaQpOffset[0], TEXT_CHROMA_U, ctu.m_chromaFormat);
229         setChromaQP(qp + ctu.m_slice->m_pps->chromaQpOffset[1] + ctu.m_slice->m_chromaQpOffset[1], TEXT_CHROMA_V, ctu.m_chromaFormat);
230     }
231 }
232 
setChromaQP(int qpin,TextType ttype,int chFmt)233 void Quant::setChromaQP(int qpin, TextType ttype, int chFmt)
234 {
235     int qp = x265_clip3(-QP_BD_OFFSET, 57, qpin);
236     if (qp >= 30)
237     {
238         if (chFmt == X265_CSP_I420)
239             qp = g_chromaScale[qp];
240         else
241             qp = X265_MIN(qp, QP_MAX_SPEC);
242     }
243     m_qpParam[ttype].setQpParam(qp + QP_BD_OFFSET);
244 }
245 
246 /* To minimize the distortion only. No rate is considered */
signBitHidingHDQ(int16_t * coeff,int32_t * deltaU,uint32_t numSig,const TUEntropyCodingParameters & codeParams,uint32_t log2TrSize)247 uint32_t Quant::signBitHidingHDQ(int16_t* coeff, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codeParams, uint32_t log2TrSize)
248 {
249     uint32_t trSize = 1 << log2TrSize;
250     const uint16_t* scan = codeParams.scan;
251 
252     uint8_t coeffNum[MLS_GRP_NUM];      // value range[0, 16]
253     uint16_t coeffSign[MLS_GRP_NUM];    // bit mask map for non-zero coeff sign
254     uint16_t coeffFlag[MLS_GRP_NUM];    // bit mask map for non-zero coeff
255 
256 #if CHECKED_BUILD || _DEBUG
257     // clean output buffer, the asm version of scanPosLast Never output anything after latest non-zero coeff group
258     memset(coeffNum, 0, sizeof(coeffNum));
259     memset(coeffSign, 0, sizeof(coeffNum));
260     memset(coeffFlag, 0, sizeof(coeffNum));
261 #endif
262     const int lastScanPos = primitives.scanPosLast(codeParams.scan, coeff, coeffSign, coeffFlag, coeffNum, numSig, g_scan4x4[codeParams.scanType], trSize);
263     const int cgLastScanPos = (lastScanPos >> LOG2_SCAN_SET_SIZE);
264     unsigned long tmp;
265 
266     // first CG need specially processing
267     const uint32_t correctOffset = 0x0F & (lastScanPos ^ 0xF);
268     coeffFlag[cgLastScanPos] <<= correctOffset;
269 
270     for (int cg = cgLastScanPos; cg >= 0; cg--)
271     {
272         int cgStartPos = cg << LOG2_SCAN_SET_SIZE;
273         int n;
274 
275 #if CHECKED_BUILD || _DEBUG
276         for (n = SCAN_SET_SIZE - 1; n >= 0; --n)
277             if (coeff[scan[n + cgStartPos]])
278                 break;
279         int lastNZPosInCG0 = n;
280 #endif
281 
282         if (coeffNum[cg] == 0)
283         {
284             X265_CHECK(lastNZPosInCG0 < 0, "all zero block check failure\n");
285             continue;
286         }
287 
288 #if CHECKED_BUILD || _DEBUG
289         for (n = 0;; n++)
290             if (coeff[scan[n + cgStartPos]])
291                 break;
292 
293         int firstNZPosInCG0 = n;
294 #endif
295 
296         CLZ(tmp, coeffFlag[cg]);
297         const int firstNZPosInCG = (15 ^ tmp);
298 
299         CTZ(tmp, coeffFlag[cg]);
300         const int lastNZPosInCG = (15 ^ tmp);
301 
302         X265_CHECK(firstNZPosInCG0 == firstNZPosInCG, "firstNZPosInCG0 check failure\n");
303         X265_CHECK(lastNZPosInCG0 == lastNZPosInCG, "lastNZPosInCG0 check failure\n");
304 
305         if (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD)
306         {
307             uint32_t signbit = coeff[scan[cgStartPos + firstNZPosInCG]] > 0 ? 0 : 1;
308             uint32_t absSum = 0;
309 
310             for (n = firstNZPosInCG; n <= lastNZPosInCG; n++)
311                 absSum += coeff[scan[n + cgStartPos]];
312 
313             if (signbit != (absSum & 0x1)) // compare signbit with sum_parity
314             {
315                 int minCostInc = MAX_INT,  minPos = -1, curCost = MAX_INT;
316                 int32_t finalChange = 0, curChange = 0;
317                 uint32_t cgFlags = coeffFlag[cg];
318                 if (cg == cgLastScanPos)
319                     cgFlags >>= correctOffset;
320 
321                 for (n = (cg == cgLastScanPos ? lastNZPosInCG : SCAN_SET_SIZE - 1); n >= 0; --n)
322                 {
323                     uint32_t blkPos = scan[n + cgStartPos];
324                     X265_CHECK(!!coeff[blkPos] == !!(cgFlags & 1), "non zero coeff check failure\n");
325 
326                     if (cgFlags & 1)
327                     {
328                         if (deltaU[blkPos] > 0)
329                         {
330                             curCost = -deltaU[blkPos];
331                             curChange = 1;
332                         }
333                         else
334                         {
335                             if ((cgFlags == 1) && (abs(coeff[blkPos]) == 1))
336                             {
337                                 X265_CHECK(n == firstNZPosInCG, "firstNZPosInCG position check failure\n");
338                                 curCost = MAX_INT;
339                             }
340                             else
341                             {
342                                 curCost = deltaU[blkPos];
343                                 curChange = -1;
344                             }
345                         }
346                     }
347                     else
348                     {
349                         if (cgFlags == 0)
350                         {
351                             X265_CHECK(n < firstNZPosInCG, "firstNZPosInCG position check failure\n");
352                             uint32_t thisSignBit = m_resiDctCoeff[blkPos] >= 0 ? 0 : 1;
353                             if (thisSignBit != signbit)
354                                 curCost = MAX_INT;
355                             else
356                             {
357                                 curCost = -deltaU[blkPos];
358                                 curChange = 1;
359                             }
360                         }
361                         else
362                         {
363                             curCost = -deltaU[blkPos];
364                             curChange = 1;
365                         }
366                     }
367 
368                     if (curCost < minCostInc)
369                     {
370                         minCostInc = curCost;
371                         finalChange = curChange;
372                         minPos = blkPos;
373                     }
374                     cgFlags>>=1;
375                 }
376 
377                 /* do not allow change to violate coeff clamp */
378                 if (coeff[minPos] == 32767 || coeff[minPos] == -32768)
379                     finalChange = -1;
380 
381                 if (!coeff[minPos])
382                     numSig++;
383                 else if (finalChange == -1 && abs(coeff[minPos]) == 1)
384                     numSig--;
385 
386                 {
387                     const int16_t sigMask = ((int16_t)m_resiDctCoeff[minPos]) >> 15;
388                     coeff[minPos] += ((int16_t)finalChange ^ sigMask) - sigMask;
389                 }
390             }
391         }
392     }
393 
394     return numSig;
395 }
396 
transformNxN(const CUData & cu,const pixel * fenc,uint32_t fencStride,const int16_t * residual,uint32_t resiStride,coeff_t * coeff,uint32_t log2TrSize,TextType ttype,uint32_t absPartIdx,bool useTransformSkip)397 uint32_t Quant::transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t resiStride,
398                              coeff_t* coeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip)
399 {
400     const uint32_t sizeIdx = log2TrSize - 2;
401 
402     if (cu.m_tqBypass[0])
403     {
404         X265_CHECK(log2TrSize >= 2 && log2TrSize <= 5, "Block size mistake!\n");
405         return primitives.cu[sizeIdx].copy_cnt(coeff, residual, resiStride);
406     }
407 
408     bool isLuma  = ttype == TEXT_LUMA;
409     bool usePsy  = m_psyRdoqScale && isLuma && !useTransformSkip;
410     int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; // Represents scaling through forward transform
411 
412     X265_CHECK((cu.m_slice->m_sps->quadtreeTULog2MaxSize >= log2TrSize), "transform size too large\n");
413     if (useTransformSkip)
414     {
415 #if X265_DEPTH <= 10
416         X265_CHECK(transformShift >= 0, "invalid transformShift\n");
417         primitives.cu[sizeIdx].cpy2Dto1D_shl(m_resiDctCoeff, residual, resiStride, transformShift);
418 #else
419         if (transformShift >= 0)
420             primitives.cu[sizeIdx].cpy2Dto1D_shl(m_resiDctCoeff, residual, resiStride, transformShift);
421         else
422             primitives.cu[sizeIdx].cpy2Dto1D_shr(m_resiDctCoeff, residual, resiStride, -transformShift);
423 #endif
424     }
425     else
426     {
427         bool isIntra = cu.isIntra(absPartIdx);
428 
429         if (!sizeIdx && isLuma && isIntra)
430             primitives.dst4x4(residual, m_resiDctCoeff, resiStride);
431         else
432             primitives.cu[sizeIdx].dct(residual, m_resiDctCoeff, resiStride);
433 
434         /* NOTE: if RDOQ is disabled globally, psy-rdoq is also disabled, so
435          * there is no risk of performing this DCT unnecessarily */
436         if (usePsy)
437         {
438             int trSize = 1 << log2TrSize;
439             /* perform DCT on source pixels for psy-rdoq */
440             primitives.cu[sizeIdx].copy_ps(m_fencShortBuf, trSize, fenc, fencStride);
441             primitives.cu[sizeIdx].dct(m_fencShortBuf, m_fencDctCoeff, trSize);
442         }
443 
444         if (m_nr && m_nr->offset)
445         {
446             /* denoise is not applied to intra residual, so DST can be ignored */
447             int cat = sizeIdx + 4 * !isLuma + 8 * !isIntra;
448             int numCoeff = 1 << (log2TrSize * 2);
449             primitives.denoiseDct(m_resiDctCoeff, m_nr->residualSum[cat], m_nr->offset[cat], numCoeff);
450             m_nr->count[cat]++;
451         }
452     }
453 
454     if (m_rdoqLevel)
455         return (this->*rdoQuant_func[log2TrSize - 2])(cu, coeff, ttype, absPartIdx, usePsy);
456     else
457     {
458         int deltaU[32 * 32];
459 
460         int scalingListType = (cu.isIntra(absPartIdx) ? 0 : 3) + ttype;
461         int rem = m_qpParam[ttype].rem;
462         int per = m_qpParam[ttype].per;
463         const int32_t* quantCoeff = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem];
464 
465         int qbits = QUANT_SHIFT + per + transformShift;
466         int add = (cu.m_slice->m_sliceType == I_SLICE ? 171 : 85) << (qbits - 9);
467         int numCoeff = 1 << (log2TrSize * 2);
468 
469         uint32_t numSig = primitives.quant(m_resiDctCoeff, quantCoeff, deltaU, coeff, qbits, add, numCoeff);
470 
471         if (numSig >= 2 && cu.m_slice->m_pps->bSignHideEnabled)
472         {
473             TUEntropyCodingParameters codeParams;
474             cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, isLuma);
475             return signBitHidingHDQ(coeff, deltaU, numSig, codeParams, log2TrSize);
476         }
477         else
478             return numSig;
479     }
480 }
481 
ssimDistortion(const CUData & cu,const pixel * fenc,uint32_t fStride,const pixel * recon,intptr_t rstride,uint32_t log2TrSize,TextType ttype,uint32_t absPartIdx)482 uint64_t Quant::ssimDistortion(const CUData& cu, const pixel* fenc, uint32_t fStride, const pixel* recon, intptr_t rstride, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx)
483 {
484     static const int ssim_c1 = (int)(.01 * .01 * PIXEL_MAX * PIXEL_MAX * 64 + .5); // 416
485     static const int ssim_c2 = (int)(.03 * .03 * PIXEL_MAX * PIXEL_MAX * 64 * 63 + .5); // 235963
486     int shift = (X265_DEPTH - 8);
487 
488     int trSize = 1 << log2TrSize;
489     uint64_t ssDc = 0, ssBlock = 0, ssAc = 0;
490 
491     // Calculation of (X(0) - Y(0)) * (X(0) - Y(0)), DC
492     ssDc = 0;
493     for (int y = 0; y < trSize; y += 4)
494     {
495         for (int x = 0; x < trSize; x += 4)
496         {
497             int temp = fenc[y * fStride + x] - recon[y * rstride + x]; // copy of residual coeff
498             ssDc += temp * temp;
499         }
500     }
501 
502     // Calculation of (X(k) - Y(k)) * (X(k) - Y(k)), AC
503     ssBlock = 0;
504     uint64_t ac_k = 0;
505     primitives.cu[log2TrSize - 2].ssimDist(fenc, fStride, recon, rstride, &ssBlock, shift, &ac_k);
506     ssAc = ssBlock - ssDc;
507 
508     // 1. Calculation of fdc'
509     // Calculate numerator of dc normalization factor
510     uint64_t fDc_num = 0;
511 
512     // 2. Calculate dc component
513     uint64_t dc_k = 0;
514     for (int block_yy = 0; block_yy < trSize; block_yy += 4)
515     {
516         for (int block_xx = 0; block_xx < trSize; block_xx += 4)
517         {
518             uint32_t temp = fenc[block_yy * fStride + block_xx] >> shift;
519             dc_k += temp * temp;
520         }
521     }
522 
523     fDc_num = (2 * dc_k)  + (trSize * trSize * ssim_c1); // 16 pixels -> for each 4x4 block
524     fDc_num /= ((trSize >> 2) * (trSize >> 2));
525 
526     // 1. Calculation of fac'
527     // Calculate numerator of ac normalization factor
528     uint64_t fAc_num = 0;
529 
530     // 2. Calculate ac component
531     ac_k -= dc_k;
532 
533     double s = 1 + 0.005 * cu.m_qp[absPartIdx];
534 
535     fAc_num = ac_k + uint64_t(s * ac_k) + ssim_c2;
536     fAc_num /= ((trSize >> 2) * (trSize >> 2));
537 
538     // Calculate dc and ac normalization factor
539     uint64_t ssim_distortion = ((ssDc * cu.m_fDc_den[ttype]) / fDc_num) + ((ssAc * cu.m_fAc_den[ttype]) / fAc_num);
540     return ssim_distortion;
541 }
542 
invtransformNxN(const CUData & cu,int16_t * residual,uint32_t resiStride,const coeff_t * coeff,uint32_t log2TrSize,TextType ttype,bool bIntra,bool useTransformSkip,uint32_t numSig)543 void Quant::invtransformNxN(const CUData& cu, int16_t* residual, uint32_t resiStride, const coeff_t* coeff,
544                             uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig)
545 {
546     const uint32_t sizeIdx = log2TrSize - 2;
547     if (cu.m_tqBypass[0])
548     {
549         primitives.cu[sizeIdx].cpy1Dto2D_shl[resiStride % 64 == 0](residual, coeff, resiStride, 0);
550         return;
551     }
552     // Values need to pass as input parameter in dequant
553     int rem = m_qpParam[ttype].rem;
554     int per = m_qpParam[ttype].per;
555     int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
556     int shift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift;
557     int numCoeff = 1 << (log2TrSize * 2);
558 
559     if (m_scalingList->m_bEnabled)
560     {
561         int scalingListType = (bIntra ? 0 : 3) + ttype;
562         const int32_t* dequantCoef = m_scalingList->m_dequantCoef[sizeIdx][scalingListType][rem];
563         primitives.dequant_scaling(coeff, dequantCoef, m_resiDctCoeff, numCoeff, per, shift);
564     }
565     else
566     {
567         int scale = m_scalingList->s_invQuantScales[rem] << per;
568         primitives.dequant_normal(coeff, m_resiDctCoeff, numCoeff, scale, shift);
569     }
570 
571     if (useTransformSkip)
572     {
573 #if X265_DEPTH <= 10
574         X265_CHECK(transformShift > 0, "invalid transformShift\n");
575         primitives.cu[sizeIdx].cpy1Dto2D_shr(residual, m_resiDctCoeff, resiStride, transformShift);
576 #else
577         if (transformShift > 0)
578             primitives.cu[sizeIdx].cpy1Dto2D_shr(residual, m_resiDctCoeff, resiStride, transformShift);
579         else
580             primitives.cu[sizeIdx].cpy1Dto2D_shl[resiStride % 64 == 0](residual, m_resiDctCoeff, resiStride, -transformShift);
581 #endif
582     }
583     else
584     {
585         int useDST = !sizeIdx && ttype == TEXT_LUMA && bIntra;
586         X265_CHECK((int)numSig == primitives.cu[log2TrSize - 2].count_nonzero(coeff), "numSig differ\n");
587         // DC only
588         if (numSig == 1 && coeff[0] != 0 && !useDST)
589         {
590             const int shift_1st = 7 - 6;
591             const int add_1st = 1 << (shift_1st - 1);
592             const int shift_2nd = 12 - (X265_DEPTH - 8) - 3;
593             const int add_2nd = 1 << (shift_2nd - 1);
594 
595             int dc_val = (((m_resiDctCoeff[0] * (64 >> 6) + add_1st) >> shift_1st) * (64 >> 3) + add_2nd) >> shift_2nd;
596             primitives.cu[sizeIdx].blockfill_s[resiStride % 64 == 0](residual, resiStride, (int16_t)dc_val);
597             return;
598         }
599 
600         if (useDST)
601             primitives.idst4x4(m_resiDctCoeff, residual, resiStride);
602         else
603             primitives.cu[sizeIdx].idct(m_resiDctCoeff, residual, resiStride);
604     }
605 }
606 
607 /* Rate distortion optimized quantization for entropy coding engines using
608  * probability models like CABAC */
609 template<uint32_t log2TrSize>
rdoQuant(const CUData & cu,int16_t * dstCoeff,TextType ttype,uint32_t absPartIdx,bool usePsy)610 uint32_t Quant::rdoQuant(const CUData& cu, int16_t* dstCoeff, TextType ttype, uint32_t absPartIdx, bool usePsy)
611 {
612     const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
613     int scalingListType = (cu.isIntra(absPartIdx) ? 0 : 3) + ttype;
614     const uint32_t usePsyMask = usePsy ? -1 : 0;
615 
616     X265_CHECK(scalingListType < 6, "scaling list type out of range\n");
617 
618     int rem = m_qpParam[ttype].rem;
619     int per = m_qpParam[ttype].per;
620     int qbits = QUANT_SHIFT + per + transformShift; /* Right shift of non-RDOQ quantizer level = (coeff*Q + offset)>>q_bits */
621     int add = (1 << (qbits - 1));
622     const int32_t* qCoef = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem];
623 
624     const int numCoeff = 1 << (log2TrSize * 2);
625     uint32_t numSig = primitives.nquant(m_resiDctCoeff, qCoef, dstCoeff, qbits, add, numCoeff);
626     X265_CHECK((int)numSig == primitives.cu[log2TrSize - 2].count_nonzero(dstCoeff), "numSig differ\n");
627     if (!numSig)
628         return 0;
629     const uint32_t trSize = 1 << log2TrSize;
630     int64_t lambda2 = m_qpParam[ttype].lambda2;
631     int64_t psyScale = ((int64_t)m_psyRdoqScale * m_qpParam[ttype].lambda);
632     /* unquant constants for measuring distortion. Scaling list quant coefficients have a (1 << 4)
633      * scale applied that must be removed during unquant. Note that in real dequant there is clipping
634      * at several stages. We skip the clipping for simplicity when measuring RD cost */
635     const int32_t* unquantScale = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem];
636     int unquantShift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift + (m_scalingList->m_bEnabled ? 4 : 0);
637     int unquantRound = (unquantShift > per) ? 1 << (unquantShift - per - 1) : 0;
638     const int scaleBits = SCALE_BITS - 2 * transformShift;
639 
640 #define UNQUANT(lvl)    (((lvl) * (unquantScale[blkPos] << per) + unquantRound) >> unquantShift)
641 #define SIGCOST(bits)   ((lambda2 * (bits)) >> 8)
642 #define RDCOST(d, bits) ((((int64_t)d * d) << scaleBits) + SIGCOST(bits))
643 #define PSYVALUE(rec)   ((psyScale * (rec)) >> X265_MAX(0, (2 * transformShift + 1)))
644 
645     int64_t costCoeff[trSize * trSize];   /* d*d + lambda * bits */
646     int64_t costUncoded[trSize * trSize]; /* d*d + lambda * 0    */
647     int64_t costSig[trSize * trSize];     /* lambda * bits       */
648 
649     int rateIncUp[trSize * trSize];      /* signal overhead of increasing level */
650     int rateIncDown[trSize * trSize];    /* signal overhead of decreasing level */
651     int sigRateDelta[trSize * trSize];   /* signal difference between zero and non-zero */
652 
653     int64_t costCoeffGroupSig[MLS_GRP_NUM]; /* lambda * bits of group coding cost */
654     uint64_t sigCoeffGroupFlag64 = 0;
655 
656     const uint32_t cgSize = (1 << MLS_CG_SIZE); /* 4x4 num coef = 16 */
657     bool bIsLuma = ttype == TEXT_LUMA;
658 
659     /* total rate distortion cost of transform block, as CBF=0 */
660     int64_t totalUncodedCost = 0;
661 
662     /* Total rate distortion cost of this transform block, counting te distortion of uncoded blocks,
663      * the distortion and signal cost of coded blocks, and the coding cost of significant
664      * coefficient and coefficient group bitmaps */
665     int64_t totalRdCost = 0;
666 
667     TUEntropyCodingParameters codeParams;
668     cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, bIsLuma);
669     const uint32_t log2TrSizeCG = log2TrSize - 2;
670     const uint32_t cgNum = 1 << (log2TrSizeCG * 2);
671     const uint32_t cgStride = (trSize >> MLS_CG_LOG2_SIZE);
672 
673     uint8_t coeffNum[MLS_GRP_NUM];      // value range[0, 16]
674     uint16_t coeffSign[MLS_GRP_NUM];    // bit mask map for non-zero coeff sign
675     uint16_t coeffFlag[MLS_GRP_NUM];    // bit mask map for non-zero coeff
676 
677 #if CHECKED_BUILD || _DEBUG
678     // clean output buffer, the asm version of scanPosLast Never output anything after latest non-zero coeff group
679     memset(coeffNum, 0, sizeof(coeffNum));
680     memset(coeffSign, 0, sizeof(coeffNum));
681     memset(coeffFlag, 0, sizeof(coeffNum));
682 #endif
683     const int lastScanPos = primitives.scanPosLast(codeParams.scan, dstCoeff, coeffSign, coeffFlag, coeffNum, numSig, g_scan4x4[codeParams.scanType], trSize);
684     const int cgLastScanPos = (lastScanPos >> LOG2_SCAN_SET_SIZE);
685 
686 
687     /* TODO: update bit estimates if dirty */
688     EstBitsSbac& estBitsSbac = m_entropyCoder->m_estBitsSbac;
689 
690     uint32_t scanPos = 0;
691     uint32_t c1 = 1;
692 
693     // process trail all zero Coeff Group
694 
695     /* coefficients after lastNZ have no distortion signal cost */
696     const int zeroCG = cgNum - 1 - cgLastScanPos;
697     memset(&costCoeff[(cgLastScanPos + 1) << MLS_CG_SIZE], 0, zeroCG * MLS_CG_BLK_SIZE * sizeof(int64_t));
698     memset(&costSig[(cgLastScanPos + 1) << MLS_CG_SIZE], 0, zeroCG * MLS_CG_BLK_SIZE * sizeof(int64_t));
699 
700     /* sum zero coeff (uncodec) cost */
701 
702     // TODO: does we need these cost?
703     if (usePsyMask)
704     {
705         for (int cgScanPos = cgLastScanPos + 1; cgScanPos < (int)cgNum ; cgScanPos++)
706         {
707             X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n");
708             uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
709             uint32_t blkPos      = codeParams.scan[scanPosBase];
710 #if X265_ARCH_X86
711             bool enable512 = detect512();
712             if (enable512)
713                 primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
714             else
715             {
716                 primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff,  costUncoded, &totalUncodedCost, &totalRdCost,blkPos);
717                 primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
718             }
719 #else
720             primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
721             primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
722 #endif
723         }
724     }
725     else
726     {
727         // non-psy path
728         for (int cgScanPos = cgLastScanPos + 1; cgScanPos < (int)cgNum ; cgScanPos++)
729         {
730             X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n");
731             uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
732             uint32_t blkPos      = codeParams.scan[scanPosBase];
733             primitives.cu[log2TrSize - 2].nonPsyRdoQuant(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
734         }
735     }
736     static const uint8_t table_cnt[5][SCAN_SET_SIZE] =
737     {
738         // patternSigCtx = 0
739         {
740             2, 1, 1, 0,
741             1, 1, 0, 0,
742             1, 0, 0, 0,
743             0, 0, 0, 0,
744         },
745         // patternSigCtx = 1
746         {
747             2, 2, 2, 2,
748             1, 1, 1, 1,
749             0, 0, 0, 0,
750             0, 0, 0, 0,
751         },
752         // patternSigCtx = 2
753         {
754             2, 1, 0, 0,
755             2, 1, 0, 0,
756             2, 1, 0, 0,
757             2, 1, 0, 0,
758         },
759         // patternSigCtx = 3
760         {
761             2, 2, 2, 2,
762             2, 2, 2, 2,
763             2, 2, 2, 2,
764             2, 2, 2, 2,
765         },
766         // 4x4
767         {
768             0, 1, 4, 5,
769             2, 3, 4, 5,
770             6, 6, 8, 8,
771             7, 7, 8, 8
772         }
773     };
774 
775     /* iterate over coding groups in reverse scan order */
776     for (int cgScanPos = cgLastScanPos; cgScanPos >= 0; cgScanPos--)
777     {
778         uint32_t ctxSet = (cgScanPos && bIsLuma) ? 2 : 0;
779         const uint32_t cgBlkPos = codeParams.scanCG[cgScanPos];
780         const uint32_t cgPosY   = cgBlkPos >> log2TrSizeCG;
781         const uint32_t cgPosX   = cgBlkPos & ((1 << log2TrSizeCG) - 1);
782         const uint64_t cgBlkPosMask = ((uint64_t)1 << cgBlkPos);
783         const int patternSigCtx = calcPatternSigCtx(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);
784         const int ctxSigOffset = codeParams.firstSignificanceMapContext + (cgScanPos && bIsLuma ? 3 : 0);
785 
786         if (c1 == 0)
787             ctxSet++;
788         c1 = 1;
789 
790         if (cgScanPos && (coeffNum[cgScanPos] == 0))
791         {
            // TODO: do we need the zero-coeff cost?
793             const uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
794             uint32_t blkPos = codeParams.scan[scanPosBase];
795             if (usePsyMask)
796             {
797 #if X265_ARCH_X86
798                 bool enable512 = detect512();
799                 if (enable512)
800                     primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
801                 else
802                 {
803                     primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
804                     primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
805                 }
806 #else
807                 primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
808                 primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
809 #endif
810                 blkPos = codeParams.scan[scanPosBase];
811                 for (int y = 0; y < MLS_CG_SIZE; y++)
812                 {
813                     for (int x = 0; x < MLS_CG_SIZE; x++)
814                     {
815                         const uint32_t scanPosOffset =  y * MLS_CG_SIZE + x;
816                         const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset;
817                         X265_CHECK(trSize > 4, "trSize check failure\n");
818                         X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n");
819 
820                         costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
821                         costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x];
822                         sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
823                     }
824                     blkPos += trSize;
825                 }
826             }
827             else
828             {
829                 // non-psy path
830                 primitives.cu[log2TrSize - 2].nonPsyRdoQuant(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
831                 blkPos = codeParams.scan[scanPosBase];
832                 for (int y = 0; y < MLS_CG_SIZE; y++)
833                 {
834                     for (int x = 0; x < MLS_CG_SIZE; x++)
835                     {
836                         const uint32_t scanPosOffset =  y * MLS_CG_SIZE + x;
837                         const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset;
838                         X265_CHECK(trSize > 4, "trSize check failure\n");
839                         X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n");
840 
841                         costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
842                         costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x];
843                         sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
844                     }
845                     blkPos += trSize;
846                 }
847             }
848 
849             /* there were no coded coefficients in this coefficient group */
850             {
851                 uint32_t ctxSig = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);
852                 costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[ctxSig][0]);
853                 totalRdCost += costCoeffGroupSig[cgScanPos];  /* add cost of 0 bit in significant CG bitmap */
854             }
855             continue;
856         }
857 
858         coeffGroupRDStats cgRdStats;
859         memset(&cgRdStats, 0, sizeof(coeffGroupRDStats));
860 
861         uint32_t subFlagMask = coeffFlag[cgScanPos];
862         int    c2            = 0;
863         uint32_t goRiceParam = 0;
864         uint32_t levelThreshold = 3;
865         uint32_t c1Idx       = 0;
866         uint32_t c2Idx       = 0;
867         /* iterate over coefficients in each group in reverse scan order */
868         for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--)
869         {
870             scanPos              = (cgScanPos << MLS_CG_SIZE) + scanPosinCG;
871             uint32_t blkPos      = codeParams.scan[scanPos];
872             uint32_t maxAbsLevel = dstCoeff[blkPos];                  /* abs(quantized coeff) */
873             int signCoef         = m_resiDctCoeff[blkPos];            /* pre-quantization DCT coeff */
874             int predictedCoef    = m_fencDctCoeff[blkPos] - signCoef; /* predicted DCT = source DCT - residual DCT*/
875 
876             /* RDOQ measures distortion as the squared difference between the unquantized coded level
877              * and the original DCT coefficient. The result is shifted scaleBits to account for the
878              * FIX15 nature of the CABAC cost tables minus the forward transform scale */
879 
880             /* cost of not coding this coefficient (all distortion, no signal bits) */
881             costUncoded[blkPos] = ((int64_t)signCoef * signCoef) << scaleBits;
882             X265_CHECK((!!scanPos ^ !!blkPos) == 0, "failed on (blkPos=0 && scanPos!=0)\n");
883             if (usePsyMask & scanPos)
884                 /* when no residual coefficient is coded, predicted coef == recon coef */
885                 costUncoded[blkPos] -= PSYVALUE(predictedCoef);
886 
887             totalUncodedCost += costUncoded[blkPos];
888 
889             // coefficient level estimation
890             const int* greaterOneBits = estBitsSbac.greaterOneBits[4 * ctxSet + c1];
891             //const uint32_t ctxSig = (blkPos == 0) ? 0 : table_cnt[(trSize == 4) ? 4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset;
892             static const uint64_t table_cnt64[4] = {0x0000000100110112ULL, 0x0000000011112222ULL, 0x0012001200120012ULL, 0x2222222222222222ULL};
893             uint64_t ctxCnt = (trSize == 4) ? 0x8877886654325410ULL : table_cnt64[patternSigCtx];
894             const uint32_t ctxSig = (blkPos == 0) ? 0 : ((ctxCnt >> (4 * g_scan4x4[codeParams.scanType][scanPosinCG])) & 0xF) + ctxSigOffset;
            // NOTE: the above is equivalent to 'table_cnt[(trSize == 4) ? 4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset'
896             X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, blkPos, bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n");
897 
            // the last non-zero coeff has not been reached yet (reverse scan order)
899             if (scanPos > (uint32_t)lastScanPos)
900             {
901                 /* coefficients after lastNZ have no distortion signal cost */
902                 costCoeff[scanPos] = 0;
903                 costSig[scanPos] = 0;
904 
905                 /* No non-zero coefficient yet found, but this does not mean
906                  * there is no uncoded-cost for this coefficient. Pre-
907                  * quantization the coefficient may have been non-zero */
908                 totalRdCost += costUncoded[blkPos];
909             }
910             else if (!(subFlagMask & 1))
911             {
912                 // fast zero coeff path
913                 /* set default costs to uncoded costs */
914                 costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
915                 costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos];
916                 sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
917                 totalRdCost += costCoeff[scanPos];
918                 rateIncUp[blkPos] = greaterOneBits[0];
919 
920                 subFlagMask >>= 1;
921             }
922             else
923             {
924                 subFlagMask >>= 1;
925 
926                 const uint32_t c1c2idx = ((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)) + (((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1) * 2;
927                 const uint32_t baseLevel = ((uint32_t)0xD9 >> (c1c2idx * 2)) & 3;  // {1, 2, 1, 3}
928 
929                 X265_CHECK(!!((int)c1Idx < C1FLAG_NUMBER) == (int)((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)), "scan validation 1\n");
930                 X265_CHECK(!!(c2Idx == 0) == ((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1, "scan validation 2\n");
931                 X265_CHECK((int)baseLevel == ((c1Idx < C1FLAG_NUMBER) ? (2 + (c2Idx == 0)) : 1), "scan validation 3\n");
932                 X265_CHECK(c1c2idx <= 3, "c1c2Idx check failure\n");
933 
934                 // coefficient level estimation
935                 const int* levelAbsBits = estBitsSbac.levelAbsBits[ctxSet + c2];
936                 const uint32_t c1c2Rate = ((c1c2idx & 1) ?  greaterOneBits[1] : 0) + ((c1c2idx == 3) ? levelAbsBits[1] : 0);
937 
938                 uint32_t level = 0;
939                 uint32_t sigCoefBits = 0;
940                 costCoeff[scanPos] = MAX_INT64;
941 
942                 if ((int)scanPos == lastScanPos)
943                     sigRateDelta[blkPos] = 0;
944                 else
945                 {
946                     if (maxAbsLevel < 3)
947                     {
948                         /* set default costs to uncoded costs */
949                         costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
950                         costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos];
951                     }
952                     sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
953                     sigCoefBits = estBitsSbac.significantBits[1][ctxSig];
954                 }
955 
956                 const uint32_t unQuantLevel = (maxAbsLevel * (unquantScale[blkPos] << per) + unquantRound);
957                 // NOTE: X265_MAX(maxAbsLevel - 1, 1) ==> (X>=2 -> X-1), (X<2 -> 1)  | (0 < X < 2 ==> X=1)
958                 if (maxAbsLevel == 1)
959                 {
960                     uint32_t levelBits = (c1c2idx & 1) ? greaterOneBits[0] + IEP_RATE : ((1 + goRiceParam) << 15) + IEP_RATE;
961                     X265_CHECK(levelBits == getICRateCost(1, 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE, "levelBits mistake\n");
962 
963                     int unquantAbsLevel = unQuantLevel >> unquantShift;
964                     X265_CHECK(UNQUANT(1) == unquantAbsLevel, "DQuant check failed\n");
965                     int d = abs(signCoef) - unquantAbsLevel;
966                     int64_t curCost = RDCOST(d, sigCoefBits + levelBits);
967 
968                     /* Psy RDOQ: bias in favor of higher AC coefficients in the reconstructed frame */
969                     if (usePsyMask & scanPos)
970                     {
971                         int reconCoef = abs(unquantAbsLevel + SIGN(predictedCoef, signCoef));
972                         curCost -= PSYVALUE(reconCoef);
973                     }
974 
975                     if (curCost < costCoeff[scanPos])
976                     {
977                         level = 1;
978                         costCoeff[scanPos] = curCost;
979                         costSig[scanPos] = SIGCOST(sigCoefBits);
980                     }
981                 }
982                 else if (maxAbsLevel)
983                 {
984                     uint32_t levelBits0 = getICRateCost(maxAbsLevel,     maxAbsLevel     - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE;
985                     uint32_t levelBits1 = getICRateCost(maxAbsLevel - 1, maxAbsLevel - 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE;
986 
987                     const uint32_t preDQuantLevelDiff = (unquantScale[blkPos] << per);
988 
989                     const int unquantAbsLevel0 = unQuantLevel >> unquantShift;
990                     X265_CHECK(UNQUANT(maxAbsLevel) == (uint32_t)unquantAbsLevel0, "DQuant check failed\n");
991                     int d0 = abs(signCoef) - unquantAbsLevel0;
992                     int64_t curCost0 = RDCOST(d0, sigCoefBits + levelBits0);
993 
994                     const int unquantAbsLevel1 = (unQuantLevel - preDQuantLevelDiff) >> unquantShift;
995                     X265_CHECK(UNQUANT(maxAbsLevel - 1) == (uint32_t)unquantAbsLevel1, "DQuant check failed\n");
996                     int d1 = abs(signCoef) - unquantAbsLevel1;
997                     int64_t curCost1 = RDCOST(d1, sigCoefBits + levelBits1);
998 
999                     /* Psy RDOQ: bias in favor of higher AC coefficients in the reconstructed frame */
1000                     if (usePsyMask & scanPos)
1001                     {
1002                         int reconCoef;
1003                         reconCoef = abs(unquantAbsLevel0 + SIGN(predictedCoef, signCoef));
1004                         curCost0 -= PSYVALUE(reconCoef);
1005 
1006                         reconCoef = abs(unquantAbsLevel1 + SIGN(predictedCoef, signCoef));
1007                         curCost1 -= PSYVALUE(reconCoef);
1008                     }
1009                     if (curCost0 < costCoeff[scanPos])
1010                     {
1011                         level = maxAbsLevel;
1012                         costCoeff[scanPos] = curCost0;
1013                         costSig[scanPos] = SIGCOST(sigCoefBits);
1014                     }
1015                     if (curCost1 < costCoeff[scanPos])
1016                     {
1017                         level = maxAbsLevel - 1;
1018                         costCoeff[scanPos] = curCost1;
1019                         costSig[scanPos] = SIGCOST(sigCoefBits);
1020                     }
1021                 }
1022 
1023                 dstCoeff[blkPos] = (int16_t)level;
1024                 totalRdCost += costCoeff[scanPos];
1025 
1026                 /* record costs for sign-hiding performed at the end */
1027                 if ((cu.m_slice->m_pps->bSignHideEnabled ? ~0 : 0) & level)
1028                 {
1029                     const int32_t diff0 = level - 1 - baseLevel;
1030                     const int32_t diff2 = level + 1 - baseLevel;
1031                     const int32_t maxVlc = g_goRiceRange[goRiceParam];
1032                     int rate0, rate1, rate2;
1033 
1034                     if (diff0 < -2)  // prob (92.9, 86.5, 74.5)%
1035                     {
1036                         // NOTE: Min: L - 1 - {1,2,1,3} < -2 ==> L < {0,1,0,2}
1037                         //            additional L > 0, so I got (L > 0 && L < 2) ==> L = 1
1038                         X265_CHECK(level == 1, "absLevel check failure\n");
1039 
1040                         const int rateEqual2 = greaterOneBits[1] + levelAbsBits[0];;
1041                         const int rateNotEqual2 = greaterOneBits[0];
1042 
1043                         rate0 = 0;
1044                         rate2 = rateEqual2;
1045                         rate1 = rateNotEqual2;
1046 
1047                         X265_CHECK(rate1 == getICRateNegDiff(level + 0, greaterOneBits, levelAbsBits), "rate1 check failure!\n");
1048                         X265_CHECK(rate2 == getICRateNegDiff(level + 1, greaterOneBits, levelAbsBits), "rate1 check failure!\n");
1049                         X265_CHECK(rate0 == getICRateNegDiff(level - 1, greaterOneBits, levelAbsBits), "rate1 check failure!\n");
1050                     }
1051                     else if (diff0 >= 0 && diff2 <= maxVlc)     // prob except from above path (98.6, 97.9, 96.9)%
1052                     {
                        // NOTE: no c1c2 rate correction here since all of the rates include this factor
1054                         rate1 = getICRateLessVlc(level + 0, diff0 + 1, goRiceParam);
1055                         rate2 = getICRateLessVlc(level + 1, diff0 + 2, goRiceParam);
1056                         rate0 = getICRateLessVlc(level - 1, diff0 + 0, goRiceParam);
1057                     }
1058                     else
1059                     {
1060                         rate1 = getICRate(level + 0, diff0 + 1, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate);
1061                         rate2 = getICRate(level + 1, diff0 + 2, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate);
1062                         rate0 = getICRate(level - 1, diff0 + 0, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate);
1063                     }
1064                     rateIncUp[blkPos] = rate2 - rate1;
1065                     rateIncDown[blkPos] = rate0 - rate1;
1066                 }
1067                 else
1068                 {
1069                     rateIncUp[blkPos] = greaterOneBits[0];
1070                     rateIncDown[blkPos] = 0;
1071                 }
1072 
1073                 /* Update CABAC estimation state */
1074                 if ((level >= baseLevel) && (goRiceParam < 4) && (level > levelThreshold))
1075                 {
1076                     goRiceParam++;
1077                     levelThreshold <<= 1;
1078                 }
1079 
1080                 const uint32_t isNonZero = (uint32_t)(-(int32_t)level) >> 31;
1081                 c1Idx += isNonZero;
1082 
1083                 /* update bin model */
1084                 if (level > 1)
1085                 {
1086                     c1 = 0;
1087                     c2 += (uint32_t)(c2 - 2) >> 31;
1088                     c2Idx++;
1089                 }
1090                 else if (((c1 == 1) | (c1 == 2)) & isNonZero)
1091                     c1++;
1092 
1093                 if (dstCoeff[blkPos])
1094                 {
1095                     sigCoeffGroupFlag64 |= cgBlkPosMask;
1096                     cgRdStats.codedLevelAndDist += costCoeff[scanPos] - costSig[scanPos];
1097                     cgRdStats.uncodedDist += costUncoded[blkPos];
1098                     cgRdStats.nnzBeforePos0 += scanPosinCG;
1099                 }
1100             }
1101 
1102             cgRdStats.sigCost += costSig[scanPos];
1103         } /* end for (scanPosinCG) */
1104 
1105         X265_CHECK((cgScanPos << MLS_CG_SIZE) == (int)scanPos, "scanPos mistake\n");
1106         cgRdStats.sigCost0 = costSig[scanPos];
1107 
1108         costCoeffGroupSig[cgScanPos] = 0;
1109 
1110         /* nothing to do at this case */
1111         X265_CHECK(cgLastScanPos >= 0, "cgLastScanPos check failure\n");
1112 
1113         if (!cgScanPos || cgScanPos == cgLastScanPos)
1114         {
1115             /* coeff group 0 is implied to be present, no signal cost */
1116             /* coeff group with last NZ is implied to be present, handled below */
1117         }
1118         else if (sigCoeffGroupFlag64 & cgBlkPosMask)
1119         {
1120             if (!cgRdStats.nnzBeforePos0)
1121             {
1122                 /* if only coeff 0 in this CG is coded, its significant coeff bit is implied */
1123                 totalRdCost -= cgRdStats.sigCost0;
1124                 cgRdStats.sigCost -= cgRdStats.sigCost0;
1125             }
1126 
1127             /* there are coded coefficients in this group, but now we include the signaling cost
1128              * of the significant coefficient group flag and evaluate whether the RD cost of the
1129              * coded group is more than the RD cost of the uncoded group */
1130 
1131             uint32_t sigCtx = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);
1132 
1133             int64_t costZeroCG = totalRdCost + SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]);
1134             costZeroCG += cgRdStats.uncodedDist;       /* add distortion for resetting non-zero levels to zero levels */
1135             costZeroCG -= cgRdStats.codedLevelAndDist; /* remove distortion and level cost of coded coefficients */
1136             costZeroCG -= cgRdStats.sigCost;           /* remove signaling cost of significant coeff bitmap */
1137 
1138             costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][1]);
1139             totalRdCost += costCoeffGroupSig[cgScanPos];  /* add the cost of 1 bit in significant CG bitmap */
1140 
1141             if (costZeroCG < totalRdCost && m_rdoqLevel > 1)
1142             {
1143                 sigCoeffGroupFlag64 &= ~cgBlkPosMask;
1144                 totalRdCost = costZeroCG;
1145                 costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]);
1146 
1147                 /* reset all coeffs to 0. UNCODE THIS COEFF GROUP! */
1148                 const uint32_t blkPos = codeParams.scan[cgScanPos * cgSize];
1149                 memset(&dstCoeff[blkPos + 0 * trSize], 0, 4 * sizeof(*dstCoeff));
1150                 memset(&dstCoeff[blkPos + 1 * trSize], 0, 4 * sizeof(*dstCoeff));
1151                 memset(&dstCoeff[blkPos + 2 * trSize], 0, 4 * sizeof(*dstCoeff));
1152                 memset(&dstCoeff[blkPos + 3 * trSize], 0, 4 * sizeof(*dstCoeff));
1153             }
1154         }
1155         else
1156         {
1157             /* there were no coded coefficients in this coefficient group */
1158             uint32_t ctxSig = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);
1159             costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[ctxSig][0]);
1160             totalRdCost += costCoeffGroupSig[cgScanPos];  /* add cost of 0 bit in significant CG bitmap */
1161             totalRdCost -= cgRdStats.sigCost;             /* remove cost of significant coefficient bitmap */
1162         }
1163     } /* end for (cgScanPos) */
1164 
1165     X265_CHECK(lastScanPos >= 0, "numSig non zero, but no coded CG\n");
1166 
1167     /* calculate RD cost of uncoded block CBF=0, and add cost of CBF=1 to total */
1168     int64_t bestCost;
1169     if (!cu.isIntra(absPartIdx) && bIsLuma && !cu.m_tuDepth[absPartIdx])
1170     {
1171         bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockRootCbpBits[0]);
1172         totalRdCost += SIGCOST(estBitsSbac.blockRootCbpBits[1]);
1173     }
1174     else
1175     {
1176         int ctx = ctxCbf[ttype][cu.m_tuDepth[absPartIdx]];
1177         bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockCbpBits[ctx][0]);
1178         totalRdCost += SIGCOST(estBitsSbac.blockCbpBits[ctx][1]);
1179     }
1180 
1181     /* This loop starts with the last non-zero found in the first loop and then refines this last
1182      * non-zero by measuring the true RD cost of the last NZ at this position, and then the RD costs
1183      * at all previous coefficients until a coefficient greater than 1 is encountered or we run out
1184      * of coefficients to evaluate.  This will factor in the cost of coding empty groups and empty
1185      * coeff prior to the last NZ. The base best cost is the RD cost of CBF=0 */
1186     int  bestLastIdx = 0;
1187     bool foundLast = false;
1188     for (int cgScanPos = cgLastScanPos; cgScanPos >= 0 && !foundLast; cgScanPos--)
1189     {
1190         if (!cgScanPos || cgScanPos == cgLastScanPos)
1191         {
            /* the presence of these coefficient groups is inferred; they have no bit in
             * sigCoeffGroupFlag64 and no saved costCoeffGroupSig[] cost */
1194         }
1195         else if (sigCoeffGroupFlag64 & (1ULL << codeParams.scanCG[cgScanPos]))
1196         {
1197             /* remove cost of significant coeff group flag, the group's presence would be inferred
1198              * from lastNZ if it were present in this group */
1199             totalRdCost -= costCoeffGroupSig[cgScanPos];
1200         }
1201         else
1202         {
1203             /* remove cost of signaling this empty group as not present */
1204             totalRdCost -= costCoeffGroupSig[cgScanPos];
1205             continue;
1206         }
1207 
1208         for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--)
1209         {
1210             scanPos = cgScanPos * cgSize + scanPosinCG;
1211             if ((int)scanPos > lastScanPos)
1212                 continue;
1213 
1214             /* if the coefficient was coded, measure the RD cost of it as the last non-zero and then
1215              * continue as if it were uncoded. If the coefficient was already uncoded, remove the
1216              * cost of signaling it as not-significant */
1217             uint32_t blkPos = codeParams.scan[scanPos];
1218             if (dstCoeff[blkPos])
1219             {
1220                 // Calculates the cost of signaling the last significant coefficient in the block
1221                 uint32_t pos[2] = { (blkPos & (trSize - 1)), (blkPos >> log2TrSize) };
1222                 if (codeParams.scanType == SCAN_VER)
1223                     std::swap(pos[0], pos[1]);
1224                 uint32_t bitsLastNZ = 0;
1225 
1226                 for (int i = 0; i < 2; i++)
1227                 {
1228                     int temp = g_lastCoeffTable[pos[i]];
1229                     int prefixOnes = temp & 15;
1230                     int suffixLen = temp >> 4;
1231 
1232                     bitsLastNZ += m_entropyCoder->m_estBitsSbac.lastBits[i][prefixOnes];
1233                     bitsLastNZ += IEP_RATE * suffixLen;
1234                 }
1235 
1236                 int64_t costAsLast = totalRdCost - costSig[scanPos] + SIGCOST(bitsLastNZ);
1237 
1238                 if (costAsLast < bestCost)
1239                 {
1240                     bestLastIdx = scanPos + 1;
1241                     bestCost = costAsLast;
1242                 }
1243                 if (dstCoeff[blkPos] > 1 || m_rdoqLevel == 1)
1244                 {
1245                     foundLast = true;
1246                     break;
1247                 }
1248 
1249                 totalRdCost -= costCoeff[scanPos];
1250                 totalRdCost += costUncoded[blkPos];
1251             }
1252             else
1253                 totalRdCost -= costSig[scanPos];
1254         }
1255     }
1256 
1257     /* recount non-zero coefficients and re-apply sign of DCT coef */
1258     numSig = 0;
1259     for (int pos = 0; pos < bestLastIdx; pos++)
1260     {
1261         int blkPos = codeParams.scan[pos];
1262         int level  = dstCoeff[blkPos];
1263         numSig += (level != 0);
1264 
1265         uint32_t mask = (int32_t)m_resiDctCoeff[blkPos] >> 31;
1266         dstCoeff[blkPos] = (int16_t)((level ^ mask) - mask);
1267     }
1268 
1269     // Average 49.62 pixels
1270     /* clean uncoded coefficients */
1271     X265_CHECK((uint32_t)(fastMin(lastScanPos, bestLastIdx) | (SCAN_SET_SIZE - 1)) < trSize * trSize, "array beyond bound\n");
1272     for (int pos = bestLastIdx; pos <= (fastMin(lastScanPos, bestLastIdx) | (SCAN_SET_SIZE - 1)); pos++)
1273     {
1274         dstCoeff[codeParams.scan[pos]] = 0;
1275     }
1276     for (int pos = (bestLastIdx & ~(SCAN_SET_SIZE - 1)) + SCAN_SET_SIZE; pos <= lastScanPos; pos += SCAN_SET_SIZE)
1277     {
1278         const uint32_t blkPos = codeParams.scan[pos];
1279         memset(&dstCoeff[blkPos + 0 * trSize], 0, 4 * sizeof(*dstCoeff));
1280         memset(&dstCoeff[blkPos + 1 * trSize], 0, 4 * sizeof(*dstCoeff));
1281         memset(&dstCoeff[blkPos + 2 * trSize], 0, 4 * sizeof(*dstCoeff));
1282         memset(&dstCoeff[blkPos + 3 * trSize], 0, 4 * sizeof(*dstCoeff));
1283     }
1284 
1285     /* rate-distortion based sign-hiding */
1286     if (cu.m_slice->m_pps->bSignHideEnabled && numSig >= 2)
1287     {
1288         const int realLastScanPos = (bestLastIdx - 1) >> LOG2_SCAN_SET_SIZE;
1289         int lastCG = 1;
1290 
1291         for (int subSet = realLastScanPos; subSet >= 0; subSet--)
1292         {
1293             int subPos = subSet << LOG2_SCAN_SET_SIZE;
1294             int n;
1295 
1296             if (!(sigCoeffGroupFlag64 & (1ULL << codeParams.scanCG[subSet])))
1297                 continue;
1298 
1299             /* measure distance between first and last non-zero coef in this
1300              * coding group */
1301             const uint32_t posFirstLast = primitives.findPosFirstLast(&dstCoeff[codeParams.scan[subPos]], trSize, g_scan4x4[codeParams.scanType]);
1302             const int firstNZPosInCG = (uint8_t)posFirstLast;
1303             const int lastNZPosInCG = (int8_t)(posFirstLast >> 8);
1304             const uint32_t absSumSign = posFirstLast;
1305 
1306             if (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD)
1307             {
1308                 const int32_t signbit = ((int32_t)dstCoeff[codeParams.scan[subPos + firstNZPosInCG]]);
1309 
1310 #if CHECKED_BUILD || _DEBUG
1311                 int32_t absSum_dummy = 0;
1312                 for (n = firstNZPosInCG; n <= lastNZPosInCG; n++)
1313                     absSum_dummy += dstCoeff[codeParams.scan[n + subPos]];
1314                 X265_CHECK(((uint32_t)absSum_dummy & 1) == (absSumSign >> 31), "absSumSign check failure\n");
1315 #endif
1316 
1317                 //if (signbit != absSumSign)
1318                 if (((int32_t)(signbit ^ absSumSign)) < 0)
1319                 {
1320                     /* We must find a coeff to toggle up or down so the sign bit of the first non-zero coeff
1321                      * is properly implied. Note dstCoeff[] are signed by this point but curChange and
1322                      * finalChange imply absolute levels (+1 is away from zero, -1 is towards zero) */
1323 
1324                     int64_t minCostInc = MAX_INT64, curCost = MAX_INT64;
1325                     uint32_t minPos = 0;
1326                     int8_t finalChange = 0;
1327                     int curChange = 0;
1328                     uint32_t lastCoeffAdjust = (lastCG & (abs(dstCoeff[codeParams.scan[lastNZPosInCG + subPos]]) == 1)) * 4 * IEP_RATE;
1329 
1330                     for (n = (lastCG ? lastNZPosInCG : SCAN_SET_SIZE - 1); n >= 0; --n)
1331                     {
1332                         const uint32_t blkPos = codeParams.scan[n + subPos];
1333                         const int32_t signCoef = m_resiDctCoeff[blkPos]; /* pre-quantization DCT coeff */
1334                         const int absLevel = abs(dstCoeff[blkPos]);
1335                         // TODO: this is constant in non-scaling mode
1336                         const uint32_t preDQuantLevelDiff = (unquantScale[blkPos] << per);
1337                         const uint32_t unQuantLevel = (absLevel * (unquantScale[blkPos] << per) + unquantRound);
1338 
1339                         int d = abs(signCoef) - (unQuantLevel >> unquantShift);
1340                         X265_CHECK((uint32_t)UNQUANT(absLevel) == (unQuantLevel >> unquantShift), "dquant check failed\n");
1341 
1342                         const int64_t origDist = (((int64_t)d * d));
1343 
1344 #define DELTARDCOST(d0, d, deltabits) ((((int64_t)d * d - d0) << scaleBits) + ((lambda2 * (int64_t)(deltabits)) >> 8))
1345 
1346                         const uint32_t isOne = (absLevel == 1);
1347                         if (dstCoeff[blkPos])
1348                         {
1349                             d = abs(signCoef) - ((unQuantLevel + preDQuantLevelDiff) >> unquantShift);
1350                             X265_CHECK((uint32_t)UNQUANT(absLevel + 1) == ((unQuantLevel + preDQuantLevelDiff) >> unquantShift), "dquant check failed\n");
1351                             int64_t costUp = DELTARDCOST(origDist, d, rateIncUp[blkPos]);
1352 
1353                             /* if decrementing would make the coeff 0, we can include the
1354                              * significant coeff flag cost savings */
1355                             d = abs(signCoef) - ((unQuantLevel - preDQuantLevelDiff) >> unquantShift);
1356                             X265_CHECK((uint32_t)UNQUANT(absLevel - 1) == ((unQuantLevel - preDQuantLevelDiff) >> unquantShift), "dquant check failed\n");
1357                             int downBits = rateIncDown[blkPos] - (isOne ? (IEP_RATE + sigRateDelta[blkPos]) : 0);
1358                             int64_t costDown = DELTARDCOST(origDist, d, downBits);
1359 
1360                             costDown -= lastCoeffAdjust;
1361                             curCost = ((n == firstNZPosInCG) & isOne) ? MAX_INT64 : costDown;
1362 
1363                             curChange = 2 * (costUp < costDown) - 1;
1364                             curCost = (costUp < costDown) ? costUp : curCost;
1365                         }
1366                         //else if ((n < firstNZPosInCG) & (signbit != ((uint32_t)signCoef >> 31)))
1367                         else if ((n < firstNZPosInCG) & ((signbit ^ signCoef) < 0))
1368                         {
1369                             /* don't try to make a new coded coeff before the first coeff if its
1370                              * sign would be different than the first coeff, the inferred sign would
1371                              * still be wrong and we'd have to do this again. */
1372                             curCost = MAX_INT64;
1373                         }
1374                         else
1375                         {
1376                             /* evaluate changing an uncoded coeff 0 to a coded coeff +/-1 */
1377                             d = abs(signCoef) - ((preDQuantLevelDiff + unquantRound) >> unquantShift);
1378                             X265_CHECK((uint32_t)UNQUANT(1) == ((preDQuantLevelDiff + unquantRound) >> unquantShift), "dquant check failed\n");
1379                             curCost = DELTARDCOST(origDist, d, rateIncUp[blkPos] + IEP_RATE + sigRateDelta[blkPos]);
1380                             curChange = 1;
1381                         }
1382 
1383                         if (curCost < minCostInc)
1384                         {
1385                             minCostInc = curCost;
1386                             finalChange = (int8_t)curChange;
1387                             minPos = blkPos + (absLevel << 16);
1388                         }
1389                         lastCoeffAdjust = 0;
1390                     }
1391 
1392                     const int absInMinPos = (minPos >> 16);
1393                     minPos = (uint16_t)minPos;
1394 
1395                     // if (dstCoeff[minPos] == 32767 || dstCoeff[minPos] == -32768)
1396                     if (absInMinPos >= 32767)
1397                         /* don't allow sign hiding to violate the SPEC range */
1398                         finalChange = -1;
1399 
1400                     // NOTE: Reference code
1401                     //if (dstCoeff[minPos] == 0)
1402                     //    numSig++;
1403                     //else if (finalChange == -1 && abs(dstCoeff[minPos]) == 1)
1404                     //    numSig--;
1405                     numSig += (absInMinPos == 0) - ((finalChange == -1) & (absInMinPos == 1));
1406 
1407 
1408                     // NOTE: Reference code
1409                     //if (m_resiDctCoeff[minPos] >= 0)
1410                     //    dstCoeff[minPos] += finalChange;
1411                     //else
1412                     //    dstCoeff[minPos] -= finalChange;
1413                     const int16_t resiCoeffSign = ((int16_t)m_resiDctCoeff[minPos] >> 16);
1414                     dstCoeff[minPos] += (((int16_t)finalChange ^ resiCoeffSign) - resiCoeffSign);
1415                 }
1416             }
1417 
1418             lastCG = 0;
1419         }
1420     }
1421 
1422     return numSig;
1423 }
1424 
1425 /* Context derivation process of coeff_abs_significant_flag */
getSigCtxInc(uint32_t patternSigCtx,uint32_t log2TrSize,uint32_t trSize,uint32_t blkPos,bool bIsLuma,uint32_t firstSignificanceMapContext)1426 uint32_t Quant::getSigCtxInc(uint32_t patternSigCtx, uint32_t log2TrSize, uint32_t trSize, uint32_t blkPos, bool bIsLuma,
1427                              uint32_t firstSignificanceMapContext)
1428 {
1429     static const uint8_t ctxIndMap[16] =
1430     {
1431         0, 1, 4, 5,
1432         2, 3, 4, 5,
1433         6, 6, 8, 8,
1434         7, 7, 8, 8
1435     };
1436 
1437     if (!blkPos) // special case for the DC context variable
1438         return 0;
1439 
1440     if (log2TrSize == 2) // 4x4
1441         return ctxIndMap[blkPos];
1442 
1443     const uint32_t posY = blkPos >> log2TrSize;
1444     const uint32_t posX = blkPos & (trSize - 1);
1445     X265_CHECK((blkPos - (posY << log2TrSize)) == posX, "block pos check failed\n");
1446 
1447     int posXinSubset = blkPos & 3;
1448     X265_CHECK((posX & 3) == (blkPos & 3), "pos alignment fail\n");
1449     int posYinSubset = posY & 3;
1450 
1451     // NOTE: [patternSigCtx][posXinSubset][posYinSubset]
1452     static const uint8_t table_cnt[4][4][4] =
1453     {
1454         // patternSigCtx = 0
1455         {
1456             { 2, 1, 1, 0 },
1457             { 1, 1, 0, 0 },
1458             { 1, 0, 0, 0 },
1459             { 0, 0, 0, 0 },
1460         },
1461         // patternSigCtx = 1
1462         {
1463             { 2, 1, 0, 0 },
1464             { 2, 1, 0, 0 },
1465             { 2, 1, 0, 0 },
1466             { 2, 1, 0, 0 },
1467         },
1468         // patternSigCtx = 2
1469         {
1470             { 2, 2, 2, 2 },
1471             { 1, 1, 1, 1 },
1472             { 0, 0, 0, 0 },
1473             { 0, 0, 0, 0 },
1474         },
1475         // patternSigCtx = 3
1476         {
1477             { 2, 2, 2, 2 },
1478             { 2, 2, 2, 2 },
1479             { 2, 2, 2, 2 },
1480             { 2, 2, 2, 2 },
1481         }
1482     };
1483 
1484     int cnt = table_cnt[patternSigCtx][posXinSubset][posYinSubset];
1485     int offset = firstSignificanceMapContext;
1486 
1487     offset += cnt;
1488 
1489     return (bIsLuma && (posX | posY) >= 4) ? 3 + offset : offset;
1490 }
1491 
1492