1 /*****************************************************************************
2 * Copyright (C) 2013-2020 MulticoreWare, Inc
3 *
4 * Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
5 *          Steve Borho <steve@borho.org>
6 *          Min Chen <chenm003@163.com>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
21 *
22 * This program is also available under a commercial proprietary license.
23 * For more information, contact us at license @ x265.com.
24 *****************************************************************************/
25 
26 #include "common.h"
27 #include "frame.h"
28 #include "framedata.h"
29 #include "picyuv.h"
30 #include "primitives.h"
31 #include "threading.h"
32 
33 #include "analysis.h"
34 #include "rdcost.h"
35 #include "encoder.h"
36 
37 using namespace X265_NS;
38 
39 /* An explanation of rate distortion levels (--rd-level)
40  *
41  * rd-level 0 generates no recon per CU (NO RDO or Quant)
42  *
43  *   sa8d selection between merge / skip / inter / intra and split
44  *   no recon pixels generated until CTU analysis is complete, requiring
45  *   intra predictions to use source pixels
46  *
47  * rd-level 1 uses RDO for merge and skip, sa8d for all else
48  *
49  *   RDO selection between merge and skip
50  *   sa8d selection between (merge/skip) / inter modes / intra and split
51  *   intra prediction uses reconstructed pixels
52  *
53  * rd-level 2 uses RDO for merge/skip and split
54  *
55  *   RDO selection between merge and skip
56  *   sa8d selection between (merge/skip) / inter modes / intra
57  *   RDO split decisions
58  *
59  * rd-level 3 uses RDO for merge/skip/best inter/intra
60  *
61  *   RDO selection between merge and skip
62  *   sa8d selection of best inter mode
63  *   sa8d decisions include chroma residual cost
64  *   RDO selection between (merge/skip) / best inter mode / intra / split
65  *
66  * rd-level 4 enables RDOQuant
67  *   chroma residual cost included in satd decisions, including subpel refine
68  *    (as a result of --subme 3 being used by preset slow)
69  *
70  * rd-level 5,6 does RDO for each inter mode
71  */
72 
Analysis()73 Analysis::Analysis()
74 {
75     m_reuseInterDataCTU = NULL;
76     m_reuseRef = NULL;
77     m_bHD = false;
78     m_modeFlag[0] = false;
79     m_modeFlag[1] = false;
80     m_checkMergeAndSkipOnly[0] = false;
81     m_checkMergeAndSkipOnly[1] = false;
82     m_evaluateInter = 0;
83 }
84 
create(ThreadLocalData * tld)85 bool Analysis::create(ThreadLocalData *tld)
86 {
87     m_tld = tld;
88     m_bTryLossless = m_param->bCULossless && !m_param->bLossless && m_param->rdLevel >= 2;
89 
90     int costArrSize = 1;
91     uint32_t maxDQPDepth = g_log2Size[m_param->maxCUSize] - g_log2Size[m_param->rc.qgSize];
92     for (uint32_t i = 1; i <= maxDQPDepth; i++)
93         costArrSize += (1 << (i * 2));
94     cacheCost = X265_MALLOC(uint64_t, costArrSize);
95 
96     int csp = m_param->internalCsp;
97     uint32_t cuSize = m_param->maxCUSize;
98 
99     bool ok = true;
100     for (uint32_t depth = 0; depth <= m_param->maxCUDepth; depth++, cuSize >>= 1)
101     {
102         ModeDepth &md = m_modeDepth[depth];
103         ok &= md.cuMemPool.create(depth, csp, MAX_PRED_TYPES, *m_param);
104         ok &= md.fencYuv.create(cuSize, csp);
105         if (ok)
106         {
107             for (int j = 0; j < MAX_PRED_TYPES; j++)
108             {
109                 md.pred[j].cu.initialize(md.cuMemPool, depth, *m_param, j);
110                 ok &= md.pred[j].predYuv.create(cuSize, csp);
111                 ok &= md.pred[j].reconYuv.create(cuSize, csp);
112                 md.pred[j].fencYuv = &md.fencYuv;
113             }
114         }
115     }
116     if (m_param->sourceHeight >= 1080)
117         m_bHD = true;
118 
119     return ok;
120 }
121 
destroy()122 void Analysis::destroy()
123 {
124     for (uint32_t i = 0; i <= m_param->maxCUDepth; i++)
125     {
126         m_modeDepth[i].cuMemPool.destroy();
127         m_modeDepth[i].fencYuv.destroy();
128 
129         for (int j = 0; j < MAX_PRED_TYPES; j++)
130         {
131             m_modeDepth[i].pred[j].predYuv.destroy();
132             m_modeDepth[i].pred[j].reconYuv.destroy();
133         }
134     }
135     X265_FREE(cacheCost);
136 }
137 
compressCTU(CUData & ctu,Frame & frame,const CUGeom & cuGeom,const Entropy & initialContext)138 Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext)
139 {
140     m_slice = ctu.m_slice;
141     m_frame = &frame;
142     m_bChromaSa8d = m_param->rdLevel >= 3;
143     m_param = m_frame->m_param;
144 
145 #if _DEBUG || CHECKED_BUILD
146     invalidateContexts(0);
147 #endif
148 
149     int qp = setLambdaFromQP(ctu, m_slice->m_pps->bUseDQP ? calculateQpforCuSize(ctu, cuGeom) : m_slice->m_sliceQp);
150     ctu.setQPSubParts((int8_t)qp, 0, 0);
151 
152     m_rqt[0].cur.load(initialContext);
153     ctu.m_meanQP = initialContext.m_meanQP;
154     m_modeDepth[0].fencYuv.copyFromPicYuv(*m_frame->m_fencPic, ctu.m_cuAddr, 0);
155 
156     if (m_param->bSsimRd)
157         calculateNormFactor(ctu, qp);
158 
159     uint32_t numPartition = ctu.m_numPartitions;
160     if (m_param->bCTUInfo && (*m_frame->m_ctuInfo + ctu.m_cuAddr))
161     {
162         x265_ctu_info_t* ctuTemp = *m_frame->m_ctuInfo + ctu.m_cuAddr;
163         int32_t depthIdx = 0;
164         uint32_t maxNum8x8Partitions = 64;
165         uint8_t* depthInfoPtr = m_frame->m_addOnDepth[ctu.m_cuAddr];
166         uint8_t* contentInfoPtr = m_frame->m_addOnCtuInfo[ctu.m_cuAddr];
167         int* prevCtuInfoChangePtr = m_frame->m_addOnPrevChange[ctu.m_cuAddr];
168         do
169         {
170             uint8_t depth = (uint8_t)ctuTemp->ctuPartitions[depthIdx];
171             uint8_t content = (uint8_t)(*((int32_t *)ctuTemp->ctuInfo + depthIdx));
172             int prevCtuInfoChange = m_frame->m_prevCtuInfoChange[ctu.m_cuAddr * maxNum8x8Partitions + depthIdx];
173             memset(depthInfoPtr, depth, sizeof(uint8_t) * numPartition >> 2 * depth);
174             memset(contentInfoPtr, content, sizeof(uint8_t) * numPartition >> 2 * depth);
175             memset(prevCtuInfoChangePtr, 0, sizeof(int) * numPartition >> 2 * depth);
176             for (uint32_t l = 0; l < numPartition >> 2 * depth; l++)
177                 prevCtuInfoChangePtr[l] = prevCtuInfoChange;
178             depthInfoPtr += ctu.m_numPartitions >> 2 * depth;
179             contentInfoPtr += ctu.m_numPartitions >> 2 * depth;
180             prevCtuInfoChangePtr += ctu.m_numPartitions >> 2 * depth;
181             depthIdx++;
182         } while (ctuTemp->ctuPartitions[depthIdx] != 0);
183 
184         m_additionalCtuInfo = m_frame->m_addOnCtuInfo[ctu.m_cuAddr];
185         m_prevCtuInfoChange = m_frame->m_addOnPrevChange[ctu.m_cuAddr];
186         memcpy(ctu.m_cuDepth, m_frame->m_addOnDepth[ctu.m_cuAddr], sizeof(uint8_t) * numPartition);
187         //Calculate log2CUSize from depth
188         for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
189             ctu.m_log2CUSize[i] = (uint8_t)m_param->maxLog2CUSize - ctu.m_cuDepth[i];
190     }
191     if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && (m_slice->m_sliceType != I_SLICE))
192     {
193         int numPredDir = m_slice->isInterP() ? 1 : 2;
194         m_reuseInterDataCTU = m_frame->m_analysisData.interData;
195         for (int dir = 0; dir < numPredDir; dir++)
196         {
197             m_reuseMv[dir] = &m_reuseInterDataCTU->mv[dir][ctu.m_cuAddr * ctu.m_numPartitions];
198             m_reuseMvpIdx[dir] = &m_reuseInterDataCTU->mvpIdx[dir][ctu.m_cuAddr * ctu.m_numPartitions];
199         }
200         m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * ctu.m_numPartitions];
201         m_reuseModes = &m_reuseInterDataCTU->modes[ctu.m_cuAddr * ctu.m_numPartitions];
202         m_reuseDepth = &m_reuseInterDataCTU->depth[ctu.m_cuAddr * ctu.m_numPartitions];
203     }
204 
205     int reuseLevel = X265_MAX(m_param->analysisSaveReuseLevel, m_param->analysisLoadReuseLevel);
206     if ((m_param->analysisSave || m_param->analysisLoad) && m_slice->m_sliceType != I_SLICE && reuseLevel > 1 && reuseLevel < 10)
207     {
208         int numPredDir = m_slice->isInterP() ? 1 : 2;
209         m_reuseInterDataCTU = m_frame->m_analysisData.interData;
210         if (((m_param->analysisSaveReuseLevel > 1) && (m_param->analysisSaveReuseLevel < 7)) ||
211             ((m_param->analysisLoadReuseLevel > 1) && (m_param->analysisLoadReuseLevel < 7)))
212             m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
213         m_reuseDepth = &m_reuseInterDataCTU->depth[ctu.m_cuAddr * ctu.m_numPartitions];
214         m_reuseModes = &m_reuseInterDataCTU->modes[ctu.m_cuAddr * ctu.m_numPartitions];
215         if (reuseLevel > 4)
216         {
217             m_reusePartSize = &m_reuseInterDataCTU->partSize[ctu.m_cuAddr * ctu.m_numPartitions];
218             m_reuseMergeFlag = &m_reuseInterDataCTU->mergeFlag[ctu.m_cuAddr * ctu.m_numPartitions];
219         }
220         if (m_param->analysisSave && !m_param->analysisLoad)
221             for (int i = 0; i < X265_MAX_PRED_MODE_PER_CTU * numPredDir; i++)
222                 m_reuseRef[i] = -1;
223     }
224     ProfileCUScope(ctu, totalCTUTime, totalCTUs);
225 
226     if (m_slice->m_sliceType == I_SLICE)
227     {
228         x265_analysis_intra_data* intraDataCTU = m_frame->m_analysisData.intraData;
229         if (m_param->analysisLoadReuseLevel > 1)
230         {
231             memcpy(ctu.m_cuDepth, &intraDataCTU->depth[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
232             memcpy(ctu.m_lumaIntraDir, &intraDataCTU->modes[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
233             memcpy(ctu.m_partSize, &intraDataCTU->partSizes[ctu.m_cuAddr * numPartition], sizeof(char) * numPartition);
234             memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
235         }
236         compressIntraCU(ctu, cuGeom, qp);
237     }
238     else
239     {
240         bool bCopyAnalysis = ((m_param->analysisLoadReuseLevel == 10) || (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel >= 7 && ctu.m_numPartitions <= 16));
241         bool bCompressInterCUrd0_4 = (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel >= 7 && m_param->rdLevel <= 4);
242         bool bCompressInterCUrd5_6 = (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel >= 7 && m_param->rdLevel >= 5 && m_param->rdLevel <= 6);
243         bCopyAnalysis = bCopyAnalysis || bCompressInterCUrd0_4 || bCompressInterCUrd5_6;
244 
245         if (bCopyAnalysis)
246         {
247             x265_analysis_inter_data* interDataCTU = m_frame->m_analysisData.interData;
248             int posCTU = ctu.m_cuAddr * numPartition;
249             memcpy(ctu.m_cuDepth, &interDataCTU->depth[posCTU], sizeof(uint8_t) * numPartition);
250             memcpy(ctu.m_predMode, &interDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
251             memcpy(ctu.m_partSize, &interDataCTU->partSize[posCTU], sizeof(uint8_t) * numPartition);
252             for (int list = 0; list < m_slice->isInterB() + 1; list++)
253                 memcpy(ctu.m_skipFlag[list], &m_frame->m_analysisData.modeFlag[list][posCTU], sizeof(uint8_t) * numPartition);
254 
255             if ((m_slice->m_sliceType == P_SLICE || m_param->bIntraInBFrames) && !(m_param->bAnalysisType == AVC_INFO))
256             {
257                 x265_analysis_intra_data* intraDataCTU = m_frame->m_analysisData.intraData;
258                 memcpy(ctu.m_lumaIntraDir, &intraDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
259                 memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[posCTU], sizeof(uint8_t) * numPartition);
260             }
261             //Calculate log2CUSize from depth
262             for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
263                 ctu.m_log2CUSize[i] = (uint8_t)m_param->maxLog2CUSize - ctu.m_cuDepth[i];
264         }
265 
266         if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE &&
267             ctu.m_cuPelX / m_param->maxCUSize >= frame.m_encData->m_pir.pirStartCol
268             && ctu.m_cuPelX / m_param->maxCUSize < frame.m_encData->m_pir.pirEndCol)
269             compressIntraCU(ctu, cuGeom, qp);
270         else if (!m_param->rdLevel)
271         {
272             /* In RD Level 0/1, copy source pixels into the reconstructed block so
273              * they are available for intra predictions */
274             m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPic, ctu.m_cuAddr, 0);
275 
276             compressInterCU_rd0_4(ctu, cuGeom, qp);
277 
278             /* generate residual for entire CTU at once and copy to reconPic */
279             encodeResidue(ctu, cuGeom);
280         }
281         else if ((m_param->analysisLoadReuseLevel == 10 && (!(m_param->bAnalysisType == HEVC_INFO) || m_slice->m_sliceType != P_SLICE)) ||
282                 ((m_param->bAnalysisType == AVC_INFO) && m_param->analysisLoadReuseLevel >= 7 && ctu.m_numPartitions <= 16))
283         {
284             x265_analysis_inter_data* interDataCTU = m_frame->m_analysisData.interData;
285             int posCTU = ctu.m_cuAddr * numPartition;
286             memcpy(ctu.m_cuDepth, &interDataCTU->depth[posCTU], sizeof(uint8_t) * numPartition);
287             memcpy(ctu.m_predMode, &interDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
288             memcpy(ctu.m_partSize, &interDataCTU->partSize[posCTU], sizeof(uint8_t) * numPartition);
289             if ((m_slice->m_sliceType == P_SLICE || m_param->bIntraInBFrames) && !(m_param->bAnalysisType == AVC_INFO))
290             {
291                 x265_analysis_intra_data* intraDataCTU = m_frame->m_analysisData.intraData;
292                 memcpy(ctu.m_lumaIntraDir, &intraDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
293                 memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[posCTU], sizeof(uint8_t) * numPartition);
294             }
295             //Calculate log2CUSize from depth
296             for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
297                 ctu.m_log2CUSize[i] = (uint8_t)m_param->maxLog2CUSize - ctu.m_cuDepth[i];
298 
299             qprdRefine (ctu, cuGeom, qp, qp);
300             return *m_modeDepth[0].bestMode;
301         }
302         else if (m_param->bDistributeModeAnalysis && m_param->rdLevel >= 2)
303             compressInterCU_dist(ctu, cuGeom, qp);
304         else if (m_param->rdLevel <= 4)
305             compressInterCU_rd0_4(ctu, cuGeom, qp);
306         else
307             compressInterCU_rd5_6(ctu, cuGeom, qp);
308     }
309 
310     if (m_param->bEnableRdRefine || m_param->bOptCUDeltaQP)
311         qprdRefine(ctu, cuGeom, qp, qp);
312 
313     if (m_param->csvLogLevel >= 2)
314         collectPUStatistics(ctu, cuGeom);
315 
316     return *m_modeDepth[0].bestMode;
317 }
318 
collectPUStatistics(const CUData & ctu,const CUGeom & cuGeom)319 void Analysis::collectPUStatistics(const CUData& ctu, const CUGeom& cuGeom)
320 {
321     uint8_t depth = 0;
322     uint8_t partSize = 0;
323     for (uint32_t absPartIdx = 0; absPartIdx < ctu.m_numPartitions; absPartIdx += ctu.m_numPartitions >> (depth * 2))
324     {
325         depth = ctu.m_cuDepth[absPartIdx];
326         partSize = ctu.m_partSize[absPartIdx];
327         uint32_t numPU = nbPartsTable[(int)partSize];
328         int shift = 2 * (m_param->maxCUDepth + 1 - depth);
329         for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
330         {
331             PredictionUnit pu(ctu, cuGeom, puIdx);
332             int puabsPartIdx = ctu.getPUOffset(puIdx, absPartIdx);
333             int mode = 1;
334             if (ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_Nx2N || ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_2NxN)
335                 mode = 2;
336             else if (ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_2NxnU || ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_2NxnD || ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_nLx2N || ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_nRx2N)
337                  mode = 3;
338             if (ctu.m_predMode[puabsPartIdx + absPartIdx] == MODE_SKIP)
339             {
340                 ctu.m_encData->m_frameStats.cntSkipPu[depth] += 1ULL << shift;
341                 ctu.m_encData->m_frameStats.totalPu[depth] += 1ULL << shift;
342             }
343             else if (ctu.m_predMode[puabsPartIdx + absPartIdx] == MODE_INTRA)
344             {
345                 if (ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_NxN)
346                 {
347                     ctu.m_encData->m_frameStats.cnt4x4++;
348                     ctu.m_encData->m_frameStats.totalPu[4]++;
349                 }
350                 else
351                 {
352                     ctu.m_encData->m_frameStats.cntIntraPu[depth] += 1ULL << shift;
353                     ctu.m_encData->m_frameStats.totalPu[depth] += 1ULL << shift;
354                 }
355             }
356             else if (mode == 3)
357             {
358                 ctu.m_encData->m_frameStats.cntAmp[depth] += 1ULL << shift;
359                 ctu.m_encData->m_frameStats.totalPu[depth] += 1ULL << shift;
360                 break;
361             }
362             else
363             {
364                 if (ctu.m_mergeFlag[puabsPartIdx + absPartIdx])
365                     ctu.m_encData->m_frameStats.cntMergePu[depth][ctu.m_partSize[puabsPartIdx + absPartIdx]] += (1 << shift) / mode;
366                 else
367                     ctu.m_encData->m_frameStats.cntInterPu[depth][ctu.m_partSize[puabsPartIdx + absPartIdx]] += (1 << shift) / mode;
368 
369                 ctu.m_encData->m_frameStats.totalPu[depth] += (1 << shift) / mode;
370             }
371         }
372     }
373 }
374 
loadTUDepth(CUGeom cuGeom,CUData parentCTU)375 int32_t Analysis::loadTUDepth(CUGeom cuGeom, CUData parentCTU)
376 {
377     float predDepth = 0;
378     CUData* neighbourCU;
379     uint8_t count = 0;
380     int32_t maxTUDepth = -1;
381     neighbourCU = &m_slice->m_refFrameList[0][0]->m_encData->m_picCTU[parentCTU.m_cuAddr];
382     predDepth += neighbourCU->m_refTuDepth[cuGeom.geomRecurId];
383     count++;
384     if (m_slice->isInterB())
385     {
386         neighbourCU = &m_slice->m_refFrameList[1][0]->m_encData->m_picCTU[parentCTU.m_cuAddr];
387         predDepth += neighbourCU->m_refTuDepth[cuGeom.geomRecurId];
388         count++;
389     }
390     if (parentCTU.m_cuAbove)
391     {
392         predDepth += parentCTU.m_cuAbove->m_refTuDepth[cuGeom.geomRecurId];
393         count++;
394         if (parentCTU.m_cuAboveLeft)
395         {
396             predDepth += parentCTU.m_cuAboveLeft->m_refTuDepth[cuGeom.geomRecurId];
397             count++;
398         }
399         if (parentCTU.m_cuAboveRight)
400         {
401             predDepth += parentCTU.m_cuAboveRight->m_refTuDepth[cuGeom.geomRecurId];
402             count++;
403         }
404     }
405     if (parentCTU.m_cuLeft)
406     {
407         predDepth += parentCTU.m_cuLeft->m_refTuDepth[cuGeom.geomRecurId];
408         count++;
409     }
410     predDepth /= count;
411 
412     if (predDepth == 0)
413         maxTUDepth = 0;
414     else if (predDepth < 1)
415         maxTUDepth = 1;
416     else if (predDepth >= 1 && predDepth <= 1.5)
417         maxTUDepth = 2;
418     else if (predDepth > 1.5 && predDepth <= 2.5)
419         maxTUDepth = 3;
420     else
421         maxTUDepth = -1;
422 
423     return maxTUDepth;
424 }
425 
tryLossless(const CUGeom & cuGeom)426 void Analysis::tryLossless(const CUGeom& cuGeom)
427 {
428     ModeDepth& md = m_modeDepth[cuGeom.depth];
429 
430     if (!md.bestMode->distortion)
431         /* already lossless */
432         return;
433     else if (md.bestMode->cu.isIntra(0))
434     {
435         md.pred[PRED_LOSSLESS].initCosts();
436         md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
437         PartSize size = (PartSize)md.pred[PRED_LOSSLESS].cu.m_partSize[0];
438         checkIntra(md.pred[PRED_LOSSLESS], cuGeom, size);
439         checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
440     }
441     else
442     {
443         md.pred[PRED_LOSSLESS].initCosts();
444         md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
445         md.pred[PRED_LOSSLESS].predYuv.copyFromYuv(md.bestMode->predYuv);
446         encodeResAndCalcRdInterCU(md.pred[PRED_LOSSLESS], cuGeom);
447         checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
448     }
449 }
450 
qprdRefine(const CUData & parentCTU,const CUGeom & cuGeom,int32_t qp,int32_t lqp)451 void Analysis::qprdRefine(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t lqp)
452 {
453     uint32_t depth = cuGeom.depth;
454     ModeDepth& md = m_modeDepth[depth];
455     md.bestMode = NULL;
456 
457     bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
458 
459     int bestCUQP = qp;
460     int lambdaQP = lqp;
461     bool doQPRefine = (bDecidedDepth && depth <= m_slice->m_pps->maxCuDQPDepth) || (!bDecidedDepth && depth == m_slice->m_pps->maxCuDQPDepth);
462     if (m_param->analysisLoadReuseLevel >= 7)
463         doQPRefine = false;
464     if (doQPRefine)
465     {
466         uint64_t bestCUCost, origCUCost, cuCost, cuPrevCost;
467 
468         int cuIdx = (cuGeom.childOffset - 1) / 3;
469         bestCUCost = origCUCost = cacheCost[cuIdx];
470 
471         int direction = m_param->bOptCUDeltaQP ? 1 : 2;
472 
473         for (int dir = direction; dir >= -direction; dir -= (direction * 2))
474         {
475             if (m_param->bOptCUDeltaQP && ((dir != 1) || ((qp + 3) >= (int32_t)parentCTU.m_meanQP)))
476                 break;
477 
478             int threshold = 1;
479             int failure = 0;
480             cuPrevCost = origCUCost;
481 
482             int modCUQP = qp + dir;
483             while (modCUQP >= m_param->rc.qpMin && modCUQP <= QP_MAX_SPEC)
484             {
485                 if (m_param->bOptCUDeltaQP && modCUQP > (int32_t)parentCTU.m_meanQP)
486                     break;
487 
488                 recodeCU(parentCTU, cuGeom, modCUQP, qp);
489                 cuCost = md.bestMode->rdCost;
490 
491                 COPY2_IF_LT(bestCUCost, cuCost, bestCUQP, modCUQP);
492                 if (cuCost < cuPrevCost)
493                     failure = 0;
494                 else
495                     failure++;
496 
497                 if (failure > threshold)
498                     break;
499 
500                 cuPrevCost = cuCost;
501                 modCUQP += dir;
502             }
503         }
504         lambdaQP = bestCUQP;
505     }
506 
507     recodeCU(parentCTU, cuGeom, bestCUQP, lambdaQP);
508 
509     /* Copy best data to encData CTU and recon */
510     md.bestMode->cu.copyToPic(depth);
511     md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
512 }
513 
compressIntraCU(const CUData & parentCTU,const CUGeom & cuGeom,int32_t qp)514 uint64_t Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
515 {
516     uint32_t depth = cuGeom.depth;
517     ModeDepth& md = m_modeDepth[depth];
518     md.bestMode = NULL;
519 
520     bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
521     bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
522 
523     bool bAlreadyDecided = m_param->intraRefine != 4 && parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] != (uint8_t)ALL_IDX && !(m_param->bAnalysisType == HEVC_INFO);
524     bool bDecidedDepth = m_param->intraRefine != 4 && parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
525     int split = 0;
526     if (m_param->intraRefine && m_param->intraRefine != 4)
527     {
528         split = m_param->scaleFactor && bDecidedDepth && (!mightNotSplit ||
529             ((cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1))));
530         if (cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize]) && !bDecidedDepth)
531             bAlreadyDecided = false;
532     }
533 
534     if (bAlreadyDecided)
535     {
536         if (bDecidedDepth && mightNotSplit)
537         {
538             Mode& mode = md.pred[0];
539             md.bestMode = &mode;
540             mode.cu.initSubCU(parentCTU, cuGeom, qp);
541             bool reuseModes = !((m_param->intraRefine == 3) ||
542                                 (m_param->intraRefine == 2 && parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] > DC_IDX));
543             if (reuseModes)
544             {
545                 memcpy(mode.cu.m_lumaIntraDir, parentCTU.m_lumaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
546                 memcpy(mode.cu.m_chromaIntraDir, parentCTU.m_chromaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
547             }
548             checkIntra(mode, cuGeom, (PartSize)parentCTU.m_partSize[cuGeom.absPartIdx]);
549 
550             if (m_bTryLossless)
551                 tryLossless(cuGeom);
552 
553             if (mightSplit)
554                 addSplitFlagCost(*md.bestMode, cuGeom.depth);
555         }
556     }
557     else if (cuGeom.log2CUSize != MAX_LOG2_CU_SIZE && mightNotSplit)
558     {
559         md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
560         checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N);
561         checkBestMode(md.pred[PRED_INTRA], depth);
562 
563         if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
564         {
565             md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
566             checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN);
567             checkBestMode(md.pred[PRED_INTRA_NxN], depth);
568         }
569 
570         if (m_bTryLossless)
571             tryLossless(cuGeom);
572 
573         if (mightSplit)
574             addSplitFlagCost(*md.bestMode, cuGeom.depth);
575     }
576 
577     // stop recursion if we reach the depth of previous analysis decision
578     mightSplit &= !(bAlreadyDecided && bDecidedDepth) || split;
579 
580     if (mightSplit)
581     {
582         Mode* splitPred = &md.pred[PRED_SPLIT];
583         splitPred->initCosts();
584         CUData* splitCU = &splitPred->cu;
585         splitCU->initSubCU(parentCTU, cuGeom, qp);
586 
587         uint32_t nextDepth = depth + 1;
588         ModeDepth& nd = m_modeDepth[nextDepth];
589         invalidateContexts(nextDepth);
590         Entropy* nextContext = &m_rqt[depth].cur;
591         int32_t nextQP = qp;
592         uint64_t curCost = 0;
593         int skipSplitCheck = 0;
594 
595         for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
596         {
597             const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
598             if (childGeom.flags & CUGeom::PRESENT)
599             {
600                 m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
601                 m_rqt[nextDepth].cur.load(*nextContext);
602 
603                 if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
604                     nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
605 
606                 if (m_param->bEnableSplitRdSkip)
607                 {
608                     curCost += compressIntraCU(parentCTU, childGeom, nextQP);
609                     if (m_modeDepth[depth].bestMode && curCost > m_modeDepth[depth].bestMode->rdCost)
610                     {
611                         skipSplitCheck = 1;
612                         break;
613                     }
614                 }
615                 else
616                     compressIntraCU(parentCTU, childGeom, nextQP);
617 
618                 // Save best CU and pred data for this sub CU
619                 splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
620                 splitPred->addSubCosts(*nd.bestMode);
621                 nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
622                 nextContext = &nd.bestMode->contexts;
623             }
624             else
625             {
626                 /* record the depth of this non-present sub-CU */
627                 splitCU->setEmptyPart(childGeom, subPartIdx);
628 
629                 /* Set depth of non-present CU to 0 to ensure that correct CU is fetched as reference to code deltaQP */
630                 if (bAlreadyDecided)
631                     memset(parentCTU.m_cuDepth + childGeom.absPartIdx, 0, childGeom.numPartitions);
632             }
633         }
634         if (!skipSplitCheck)
635         {
636             nextContext->store(splitPred->contexts);
637             if (mightNotSplit)
638                 addSplitFlagCost(*splitPred, cuGeom.depth);
639             else
640                 updateModeCost(*splitPred);
641 
642             checkDQPForSplitPred(*splitPred, cuGeom);
643             checkBestMode(*splitPred, depth);
644         }
645     }
646 
647     if (m_param->bEnableRdRefine && depth <= m_slice->m_pps->maxCuDQPDepth)
648     {
649         int cuIdx = (cuGeom.childOffset - 1) / 3;
650         cacheCost[cuIdx] = md.bestMode->rdCost;
651     }
652 
653     if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
654     {
655         CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr);
656         int8_t maxTUDepth = -1;
657         for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
658             maxTUDepth = X265_MAX(maxTUDepth, md.bestMode->cu.m_tuDepth[i]);
659         ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth;
660     }
661 
662     /* Copy best data to encData CTU and recon */
663     md.bestMode->cu.copyToPic(depth);
664     if (md.bestMode != &md.pred[PRED_SPLIT])
665         md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
666 
667     return md.bestMode->rdCost;
668 }
669 
processTasks(int workerThreadId)670 void Analysis::PMODE::processTasks(int workerThreadId)
671 {
672 #if DETAILED_CU_STATS
673     int fe = master.m_modeDepth[cuGeom.depth].pred[PRED_2Nx2N].cu.m_encData->m_frameEncoderID;
674     master.m_stats[fe].countPModeTasks++;
675     ScopedElapsedTime pmodeTime(master.m_stats[fe].pmodeTime);
676 #endif
677     ProfileScopeEvent(pmode);
678     master.processPmode(*this, master.m_tld[workerThreadId].analysis);
679 }
680 
681 /* process pmode jobs until none remain; may be called by the master thread or by
682  * a bonded peer (slave) thread via pmodeTasks() */
processPmode(PMODE & pmode,Analysis & slave)683 void Analysis::processPmode(PMODE& pmode, Analysis& slave)
684 {
685     /* acquire a mode task, else exit early */
686     int task;
687     pmode.m_lock.acquire();
688     if (pmode.m_jobTotal > pmode.m_jobAcquired)
689     {
690         task = pmode.m_jobAcquired++;
691         pmode.m_lock.release();
692     }
693     else
694     {
695         pmode.m_lock.release();
696         return;
697     }
698 
699     ModeDepth& md = m_modeDepth[pmode.cuGeom.depth];
700 
701     /* setup slave Analysis */
702     if (&slave != this)
703     {
704         slave.m_slice = m_slice;
705         slave.m_frame = m_frame;
706         slave.m_param = m_param;
707         slave.m_bChromaSa8d = m_param->rdLevel >= 3;
708         slave.setLambdaFromQP(md.pred[PRED_2Nx2N].cu, m_rdCost.m_qp);
709         slave.invalidateContexts(0);
710         slave.m_rqt[pmode.cuGeom.depth].cur.load(m_rqt[pmode.cuGeom.depth].cur);
711     }
712 
713     /* perform Mode task, repeat until no more work is available */
714     do
715     {
716         uint32_t refMasks[2] = { 0, 0 };
717 
718         if (m_param->rdLevel <= 4)
719         {
720             switch (pmode.modes[task])
721             {
722             case PRED_INTRA:
723                 slave.checkIntraInInter(md.pred[PRED_INTRA], pmode.cuGeom);
724                 if (m_param->rdLevel > 2)
725                     slave.encodeIntraInInter(md.pred[PRED_INTRA], pmode.cuGeom);
726                 break;
727 
728             case PRED_2Nx2N:
729                 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3];
730 
731                 slave.checkInter_rd0_4(md.pred[PRED_2Nx2N], pmode.cuGeom, SIZE_2Nx2N, refMasks);
732                 if (m_slice->m_sliceType == B_SLICE)
733                     slave.checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], pmode.cuGeom);
734                 break;
735 
736             case PRED_Nx2N:
737                 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[2]; /* left */
738                 refMasks[1] = m_splitRefIdx[1] | m_splitRefIdx[3]; /* right */
739 
740                 slave.checkInter_rd0_4(md.pred[PRED_Nx2N], pmode.cuGeom, SIZE_Nx2N, refMasks);
741                 break;
742 
743             case PRED_2NxN:
744                 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1]; /* top */
745                 refMasks[1] = m_splitRefIdx[2] | m_splitRefIdx[3]; /* bot */
746 
747                 slave.checkInter_rd0_4(md.pred[PRED_2NxN], pmode.cuGeom, SIZE_2NxN, refMasks);
748                 break;
749 
750             case PRED_2NxnU:
751                 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1]; /* 25% top */
752                 refMasks[1] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% bot */
753 
754                 slave.checkInter_rd0_4(md.pred[PRED_2NxnU], pmode.cuGeom, SIZE_2NxnU, refMasks);
755                 break;
756 
757             case PRED_2NxnD:
758                 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% top */
759                 refMasks[1] = m_splitRefIdx[2] | m_splitRefIdx[3]; /* 25% bot */
760 
761                 slave.checkInter_rd0_4(md.pred[PRED_2NxnD], pmode.cuGeom, SIZE_2NxnD, refMasks);
762                 break;
763 
764             case PRED_nLx2N:
765                 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[2]; /* 25% left */
766                 refMasks[1] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% right */
767 
768                 slave.checkInter_rd0_4(md.pred[PRED_nLx2N], pmode.cuGeom, SIZE_nLx2N, refMasks);
769                 break;
770 
771             case PRED_nRx2N:
772                 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% left */
773                 refMasks[1] = m_splitRefIdx[1] | m_splitRefIdx[3]; /* 25% right */
774 
775                 slave.checkInter_rd0_4(md.pred[PRED_nRx2N], pmode.cuGeom, SIZE_nRx2N, refMasks);
776                 break;
777 
778             default:
779                 X265_CHECK(0, "invalid job ID for parallel mode analysis\n");
780                 break;
781             }
782         }
783         else
784         {
785             switch (pmode.modes[task])
786             {
787             case PRED_INTRA:
788                 slave.checkIntra(md.pred[PRED_INTRA], pmode.cuGeom, SIZE_2Nx2N);
789                 if (pmode.cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
790                     slave.checkIntra(md.pred[PRED_INTRA_NxN], pmode.cuGeom, SIZE_NxN);
791                 break;
792 
793             case PRED_2Nx2N:
794                 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3];
795 
796                 slave.checkInter_rd5_6(md.pred[PRED_2Nx2N], pmode.cuGeom, SIZE_2Nx2N, refMasks);
797                 md.pred[PRED_BIDIR].rdCost = MAX_INT64;
798                 if (m_slice->m_sliceType == B_SLICE)
799                 {
800                     slave.checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], pmode.cuGeom);
801                     if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
802                         slave.encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], pmode.cuGeom);
803                 }
804                 break;
805 
806             case PRED_Nx2N:
807                 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[2]; /* left */
808                 refMasks[1] = m_splitRefIdx[1] | m_splitRefIdx[3]; /* right */
809 
810                 slave.checkInter_rd5_6(md.pred[PRED_Nx2N], pmode.cuGeom, SIZE_Nx2N, refMasks);
811                 break;
812 
813             case PRED_2NxN:
814                 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1]; /* top */
815                 refMasks[1] = m_splitRefIdx[2] | m_splitRefIdx[3]; /* bot */
816 
817                 slave.checkInter_rd5_6(md.pred[PRED_2NxN], pmode.cuGeom, SIZE_2NxN, refMasks);
818                 break;
819 
820             case PRED_2NxnU:
821                 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1]; /* 25% top */
822                 refMasks[1] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% bot */
823 
824                 slave.checkInter_rd5_6(md.pred[PRED_2NxnU], pmode.cuGeom, SIZE_2NxnU, refMasks);
825                 break;
826 
827             case PRED_2NxnD:
828                 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% top */
829                 refMasks[1] = m_splitRefIdx[2] | m_splitRefIdx[3]; /* 25% bot */
830                 slave.checkInter_rd5_6(md.pred[PRED_2NxnD], pmode.cuGeom, SIZE_2NxnD, refMasks);
831                 break;
832 
833             case PRED_nLx2N:
834                 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[2]; /* 25% left */
835                 refMasks[1] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% right */
836 
837                 slave.checkInter_rd5_6(md.pred[PRED_nLx2N], pmode.cuGeom, SIZE_nLx2N, refMasks);
838                 break;
839 
840             case PRED_nRx2N:
841                 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% left */
842                 refMasks[1] = m_splitRefIdx[1] | m_splitRefIdx[3]; /* 25% right */
843                 slave.checkInter_rd5_6(md.pred[PRED_nRx2N], pmode.cuGeom, SIZE_nRx2N, refMasks);
844                 break;
845 
846             default:
847                 X265_CHECK(0, "invalid job ID for parallel mode analysis\n");
848                 break;
849             }
850         }
851 
852         task = -1;
853         pmode.m_lock.acquire();
854         if (pmode.m_jobTotal > pmode.m_jobAcquired)
855             task = pmode.m_jobAcquired++;
856         pmode.m_lock.release();
857     }
858     while (task >= 0);
859 }
860 
compressInterCU_dist(const CUData & parentCTU,const CUGeom & cuGeom,int32_t qp)861 uint32_t Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
862 {
863     uint32_t depth = cuGeom.depth;
864     uint32_t cuAddr = parentCTU.m_cuAddr;
865     ModeDepth& md = m_modeDepth[depth];
866     md.bestMode = NULL;
867 
868     bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
869     bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
870     uint32_t minDepth = m_param->rdLevel <= 4 ? topSkipMinDepth(parentCTU, cuGeom) : 0;
871     uint32_t splitRefs[4] = { 0, 0, 0, 0 };
872 
873     X265_CHECK(m_param->rdLevel >= 2, "compressInterCU_dist does not support RD 0 or 1\n");
874 
875     PMODE pmode(*this, cuGeom);
876 
877     if (mightNotSplit && depth >= minDepth)
878     {
879         /* Initialize all prediction CUs based on parentCTU */
880         md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
881         md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
882 
883         if (m_param->rdLevel <= 4)
884             checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
885         else
886             checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
887     }
888 
889     bool bNoSplit = false;
890     bool splitIntra = true;
891     if (md.bestMode)
892     {
893         bNoSplit = md.bestMode->cu.isSkipped(0);
894         if (mightSplit && depth && depth >= minDepth && !bNoSplit && m_param->rdLevel <= 4)
895             bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
896     }
897 
898     if (mightSplit && !bNoSplit)
899     {
900         Mode* splitPred = &md.pred[PRED_SPLIT];
901         splitPred->initCosts();
902         CUData* splitCU = &splitPred->cu;
903         splitCU->initSubCU(parentCTU, cuGeom, qp);
904 
905         uint32_t nextDepth = depth + 1;
906         ModeDepth& nd = m_modeDepth[nextDepth];
907         invalidateContexts(nextDepth);
908         Entropy* nextContext = &m_rqt[depth].cur;
909         int nextQP = qp;
910         splitIntra = false;
911 
912         for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
913         {
914             const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
915             if (childGeom.flags & CUGeom::PRESENT)
916             {
917                 m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
918                 m_rqt[nextDepth].cur.load(*nextContext);
919 
920                 if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
921                     nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
922 
923                 splitRefs[subPartIdx] = compressInterCU_dist(parentCTU, childGeom, nextQP);
924 
925                 // Save best CU and pred data for this sub CU
926                 splitIntra |= nd.bestMode->cu.isIntra(0);
927                 splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
928                 splitPred->addSubCosts(*nd.bestMode);
929 
930                 nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
931                 nextContext = &nd.bestMode->contexts;
932             }
933             else
934                 splitCU->setEmptyPart(childGeom, subPartIdx);
935         }
936         nextContext->store(splitPred->contexts);
937 
938         if (mightNotSplit)
939             addSplitFlagCost(*splitPred, cuGeom.depth);
940         else
941             updateModeCost(*splitPred);
942 
943         checkDQPForSplitPred(*splitPred, cuGeom);
944     }
945 
946     if (mightNotSplit && depth >= minDepth)
947     {
948         int bTryAmp = m_slice->m_sps->maxAMPDepth > depth;
949         int bTryIntra = (m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && (!m_param->limitReferences || splitIntra) && (cuGeom.log2CUSize != MAX_LOG2_CU_SIZE);
950 
951         if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth && m_slice->m_pps->maxCuDQPDepth != 0)
952             setLambdaFromQP(parentCTU, qp);
953 
954         if (bTryIntra)
955         {
956             md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
957             if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3 && m_param->rdLevel >= 5)
958                 md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
959             pmode.modes[pmode.m_jobTotal++] = PRED_INTRA;
960         }
961         md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_2Nx2N;
962         md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom, qp);
963         if (m_param->bEnableRectInter)
964         {
965             md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_2NxN;
966             md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_Nx2N;
967         }
968         if (bTryAmp)
969         {
970             md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_2NxnU;
971             md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_2NxnD;
972             md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_nLx2N;
973             md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_nRx2N;
974         }
975 
976         m_splitRefIdx[0] = splitRefs[0]; m_splitRefIdx[1] = splitRefs[1]; m_splitRefIdx[2] = splitRefs[2]; m_splitRefIdx[3] = splitRefs[3];
977 
978         pmode.tryBondPeers(*m_frame->m_encData->m_jobProvider, pmode.m_jobTotal);
979 
980         /* participate in processing jobs, until all are distributed */
981         processPmode(pmode, *this);
982 
983         /* the master worker thread (this one) does merge analysis. By doing
984          * merge after all the other jobs are at least started, we usually avoid
985          * blocking on another thread */
986 
987         if (m_param->rdLevel <= 4)
988         {
989             {
990                 ProfileCUScope(parentCTU, pmodeBlockTime, countPModeMasters);
991                 pmode.waitForExit();
992             }
993 
994             /* select best inter mode based on sa8d cost */
995             Mode *bestInter = &md.pred[PRED_2Nx2N];
996 
997             if (m_param->bEnableRectInter)
998             {
999                 if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
1000                     bestInter = &md.pred[PRED_Nx2N];
1001                 if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
1002                     bestInter = &md.pred[PRED_2NxN];
1003             }
1004 
1005             if (bTryAmp)
1006             {
1007                 if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
1008                     bestInter = &md.pred[PRED_2NxnU];
1009                 if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
1010                     bestInter = &md.pred[PRED_2NxnD];
1011                 if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
1012                     bestInter = &md.pred[PRED_nLx2N];
1013                 if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
1014                     bestInter = &md.pred[PRED_nRx2N];
1015             }
1016 
1017             if (m_param->rdLevel > 2)
1018             {
1019                 /* RD selection between merge, inter, bidir and intra */
1020                 if (!m_bChromaSa8d && (m_csp != X265_CSP_I400)) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
1021                 {
1022                     uint32_t numPU = bestInter->cu.getNumPartInter(0);
1023                     for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
1024                     {
1025                         PredictionUnit pu(bestInter->cu, cuGeom, puIdx);
1026                         motionCompensation(bestInter->cu, pu, bestInter->predYuv, false, true);
1027                     }
1028                 }
1029                 encodeResAndCalcRdInterCU(*bestInter, cuGeom);
1030                 checkBestMode(*bestInter, depth);
1031 
1032                 /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */
1033                 if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&
1034                     md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17)
1035                 {
1036                     encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
1037                     checkBestMode(md.pred[PRED_BIDIR], depth);
1038                 }
1039 
1040                 if (bTryIntra)
1041                     checkBestMode(md.pred[PRED_INTRA], depth);
1042             }
1043             else /* m_param->rdLevel == 2 */
1044             {
1045                 if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
1046                     md.bestMode = bestInter;
1047 
1048                 if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost)
1049                     md.bestMode = &md.pred[PRED_BIDIR];
1050 
1051                 if (bTryIntra && md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
1052                 {
1053                     md.bestMode = &md.pred[PRED_INTRA];
1054                     encodeIntraInInter(*md.bestMode, cuGeom);
1055                 }
1056                 else if (!md.bestMode->cu.m_mergeFlag[0])
1057                 {
1058                     /* finally code the best mode selected from SA8D costs */
1059                     uint32_t numPU = md.bestMode->cu.getNumPartInter(0);
1060                     for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
1061                     {
1062                         PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
1063                         motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, true);
1064                     }
1065                     encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
1066                 }
1067             }
1068         }
1069         else
1070         {
1071             {
1072                 ProfileCUScope(parentCTU, pmodeBlockTime, countPModeMasters);
1073                 pmode.waitForExit();
1074             }
1075 
1076             checkBestMode(md.pred[PRED_2Nx2N], depth);
1077             if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
1078                 checkBestMode(md.pred[PRED_BIDIR], depth);
1079 
1080             if (m_param->bEnableRectInter)
1081             {
1082                 checkBestMode(md.pred[PRED_Nx2N], depth);
1083                 checkBestMode(md.pred[PRED_2NxN], depth);
1084             }
1085 
1086             if (bTryAmp)
1087             {
1088                 checkBestMode(md.pred[PRED_2NxnU], depth);
1089                 checkBestMode(md.pred[PRED_2NxnD], depth);
1090                 checkBestMode(md.pred[PRED_nLx2N], depth);
1091                 checkBestMode(md.pred[PRED_nRx2N], depth);
1092             }
1093 
1094             if (bTryIntra)
1095             {
1096                 checkBestMode(md.pred[PRED_INTRA], depth);
1097                 if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
1098                     checkBestMode(md.pred[PRED_INTRA_NxN], depth);
1099             }
1100         }
1101 
1102         if (m_bTryLossless)
1103             tryLossless(cuGeom);
1104 
1105         if (mightSplit)
1106             addSplitFlagCost(*md.bestMode, cuGeom.depth);
1107     }
1108 
1109     /* compare split RD cost against best cost */
1110     if (mightSplit && !bNoSplit)
1111         checkBestMode(md.pred[PRED_SPLIT], depth);
1112 
1113     /* determine which motion references the parent CU should search */
1114     uint32_t refMask;
1115     if (!(m_param->limitReferences & X265_REF_LIMIT_DEPTH))
1116         refMask = 0;
1117     else if (md.bestMode == &md.pred[PRED_SPLIT])
1118         refMask = splitRefs[0] | splitRefs[1] | splitRefs[2] | splitRefs[3];
1119     else
1120     {
1121         /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
1122         CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
1123         uint32_t numPU = cu.getNumPartInter(0);
1124         refMask = 0;
1125         for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
1126             refMask |= cu.getBestRefIdx(subPartIdx);
1127     }
1128 
1129     if (mightNotSplit)
1130     {
1131         /* early-out statistics */
1132         FrameData& curEncData = *m_frame->m_encData;
1133         FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
1134         uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
1135         cuStat.count[depth] += 1;
1136         cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth];
1137     }
1138 
1139     /* Copy best data to encData CTU and recon */
1140     md.bestMode->cu.copyToPic(depth);
1141     md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.absPartIdx);
1142 
1143     return refMask;
1144 }
1145 
compressInterCU_rd0_4(const CUData & parentCTU,const CUGeom & cuGeom,int32_t qp)1146 SplitData Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
1147 {
1148     if (parentCTU.m_vbvAffected && calculateQpforCuSize(parentCTU, cuGeom, 1))
1149         return compressInterCU_rd5_6(parentCTU, cuGeom, qp);
1150 
1151     uint32_t depth = cuGeom.depth;
1152     uint32_t cuAddr = parentCTU.m_cuAddr;
1153     ModeDepth& md = m_modeDepth[depth];
1154 
1155 
1156     if (m_param->searchMethod == X265_SEA)
1157     {
1158         int numPredDir = m_slice->isInterP() ? 1 : 2;
1159         int offset = (int)(m_frame->m_reconPic->m_cuOffsetY[parentCTU.m_cuAddr] + m_frame->m_reconPic->m_buOffsetY[cuGeom.absPartIdx]);
1160         for (int list = 0; list < numPredDir; list++)
1161             for (int i = 0; i < m_frame->m_encData->m_slice->m_numRefIdx[list]; i++)
1162                 for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
1163                     m_modeDepth[depth].fencYuv.m_integral[list][i][planes] = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_encData->m_meIntegral[planes] + offset;
1164     }
1165 
1166     PicYuv& reconPic = *m_frame->m_reconPic;
1167     SplitData splitCUData;
1168 
1169     bool bHEVCBlockAnalysis = (m_param->bAnalysisType == AVC_INFO && cuGeom.numPartitions > 16);
1170     bool bRefineAVCAnalysis = (m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1]));
1171     bool bNooffloading = !(m_param->bAnalysisType == AVC_INFO);
1172 
1173     if (bHEVCBlockAnalysis || bRefineAVCAnalysis || bNooffloading)
1174     {
1175         md.bestMode = NULL;
1176         bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
1177         bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
1178         uint32_t minDepth = topSkipMinDepth(parentCTU, cuGeom);
1179         bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
1180         bool skipModes = false; /* Skip any remaining mode analyses at current depth */
1181         bool skipRecursion = false; /* Skip recursion */
1182         bool splitIntra = true;
1183         bool skipRectAmp = false;
1184         bool chooseMerge = false;
1185         bool bCtuInfoCheck = false;
1186         int sameContentRef = 0;
1187 
1188         if (m_evaluateInter)
1189         {
1190             if (m_refineLevel == 2)
1191             {
1192                 if (parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP)
1193                     skipModes = true;
1194                 if (parentCTU.m_partSize[cuGeom.absPartIdx] == SIZE_2Nx2N)
1195                     skipRectAmp = true;
1196             }
1197             mightSplit &= false;
1198             minDepth = depth;
1199         }
1200 
1201         if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
1202             m_maxTUDepth = loadTUDepth(cuGeom, parentCTU);
1203 
1204         SplitData splitData[4];
1205         splitData[0].initSplitCUData();
1206         splitData[1].initSplitCUData();
1207         splitData[2].initSplitCUData();
1208         splitData[3].initSplitCUData();
1209 
1210         // avoid uninitialize value in below reference
1211         if (m_param->limitModes)
1212         {
1213             md.pred[PRED_2Nx2N].bestME[0][0].mvCost = 0; // L0
1214             md.pred[PRED_2Nx2N].bestME[0][1].mvCost = 0; // L1
1215             md.pred[PRED_2Nx2N].sa8dCost = 0;
1216         }
1217 
1218         if (m_param->bCTUInfo && depth <= parentCTU.m_cuDepth[cuGeom.absPartIdx])
1219         {
1220             if (bDecidedDepth && m_additionalCtuInfo[cuGeom.absPartIdx])
1221                 sameContentRef = findSameContentRefCount(parentCTU, cuGeom);
1222             if (depth < parentCTU.m_cuDepth[cuGeom.absPartIdx])
1223             {
1224                 mightNotSplit &= bDecidedDepth;
1225                 bCtuInfoCheck = skipRecursion = false;
1226                 skipModes = true;
1227             }
1228             else if (mightNotSplit && bDecidedDepth)
1229             {
1230                 if (m_additionalCtuInfo[cuGeom.absPartIdx])
1231                 {
1232                     bCtuInfoCheck = skipRecursion = true;
1233                     md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1234                     md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1235                     checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
1236                     if (!sameContentRef)
1237                     {
1238                         if ((m_param->bCTUInfo & 2) && (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth))
1239                         {
1240                             qp -= int32_t(0.04 * qp);
1241                             setLambdaFromQP(parentCTU, qp);
1242                         }
1243                         if (m_param->bCTUInfo & 4)
1244                             skipModes = false;
1245                     }
1246                     if (sameContentRef || (!sameContentRef && !(m_param->bCTUInfo & 4)))
1247                     {
1248                         if (m_param->rdLevel)
1249                             skipModes = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0);
1250                         if ((m_param->bCTUInfo & 4) && sameContentRef)
1251                             skipModes = md.bestMode && true;
1252                     }
1253                 }
1254                 else
1255                 {
1256                     md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1257                     md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1258                     checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
1259                     if (m_param->rdLevel)
1260                         skipModes = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0);
1261                 }
1262                 mightSplit &= !bDecidedDepth;
1263             }
1264         }
1265         if ((m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10))
1266         {
1267             if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx])
1268             {
1269                 if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
1270                 {
1271                     md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1272                     md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1273                     checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
1274 
1275                     skipRecursion = !!m_param->recursionSkipMode && md.bestMode;
1276                     if (m_param->rdLevel)
1277                         skipModes = m_param->bEnableEarlySkip && md.bestMode;
1278                 }
1279                 if (m_param->analysisLoadReuseLevel > 4 && m_reusePartSize[cuGeom.absPartIdx] == SIZE_2Nx2N)
1280                 {
1281                     if (m_reuseModes[cuGeom.absPartIdx] != MODE_INTRA  && m_reuseModes[cuGeom.absPartIdx] != 4)
1282                     {
1283                         skipRectAmp = true && !!md.bestMode;
1284                         chooseMerge = !!m_reuseMergeFlag[cuGeom.absPartIdx] && !!md.bestMode;
1285                     }
1286                 }
1287             }
1288         }
1289         if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU)
1290         {
1291             if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx])
1292             {
1293                 if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
1294                 {
1295                     md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1296                     md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1297                     checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
1298 
1299                     skipRecursion = !!m_param->recursionSkipMode && md.bestMode;
1300                     if (m_param->rdLevel)
1301                         skipModes = m_param->bEnableEarlySkip && md.bestMode;
1302                 }
1303             }
1304         }
1305         /* Step 1. Evaluate Merge/Skip candidates for likely early-outs, if skip mode was not set above */
1306         if ((mightNotSplit && depth >= minDepth && !md.bestMode && !bCtuInfoCheck) || (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1])))
1307             /* TODO: Re-evaluate if analysis load/save still works */
1308         {
1309             /* Compute Merge Cost */
1310             md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1311             md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1312             checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
1313             if (m_param->rdLevel)
1314                 skipModes = (m_param->bEnableEarlySkip || m_refineLevel == 2)
1315                 && md.bestMode && md.bestMode->cu.isSkipped(0); // TODO: sa8d threshold per depth
1316         }
1317         if (md.bestMode && m_param->recursionSkipMode && !bCtuInfoCheck && !(m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1])))
1318         {
1319             skipRecursion = md.bestMode->cu.isSkipped(0);
1320             if (mightSplit && !skipRecursion)
1321             {
1322                 if (depth >= minDepth && m_param->recursionSkipMode == RDCOST_BASED_RSKIP)
1323                 {
1324                     if (depth)
1325                         skipRecursion = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
1326                     if (m_bHD && !skipRecursion && m_param->rdLevel == 2 && md.fencYuv.m_size != MAX_CU_SIZE)
1327                         skipRecursion = complexityCheckCU(*md.bestMode);
1328                 }
1329                 else if (cuGeom.log2CUSize >= MAX_LOG2_CU_SIZE - 1 && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
1330                 {
1331                     skipRecursion = complexityCheckCU(*md.bestMode);
1332                 }
1333 
1334             }
1335         }
1336         if (m_param->bAnalysisType == AVC_INFO && md.bestMode && cuGeom.numPartitions <= 16 && m_param->analysisLoadReuseLevel == 7)
1337             skipRecursion = true;
1338         /* Step 2. Evaluate each of the 4 split sub-blocks in series */
1339         if (mightSplit && !skipRecursion)
1340         {
1341             if (bCtuInfoCheck && m_param->bCTUInfo & 2)
1342                 qp = int((1 / 0.96) * qp + 0.5);
1343             Mode* splitPred = &md.pred[PRED_SPLIT];
1344             splitPred->initCosts();
1345             CUData* splitCU = &splitPred->cu;
1346             splitCU->initSubCU(parentCTU, cuGeom, qp);
1347 
1348             uint32_t nextDepth = depth + 1;
1349             ModeDepth& nd = m_modeDepth[nextDepth];
1350             invalidateContexts(nextDepth);
1351             Entropy* nextContext = &m_rqt[depth].cur;
1352             int nextQP = qp;
1353             splitIntra = false;
1354 
1355             for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
1356             {
1357                 const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
1358                 if (childGeom.flags & CUGeom::PRESENT)
1359                 {
1360                     m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
1361                     m_rqt[nextDepth].cur.load(*nextContext);
1362 
1363                     if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
1364                         nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
1365 
1366                     splitData[subPartIdx] = compressInterCU_rd0_4(parentCTU, childGeom, nextQP);
1367 
1368                     // Save best CU and pred data for this sub CU
1369                     splitIntra |= nd.bestMode->cu.isIntra(0);
1370                     splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
1371                     splitPred->addSubCosts(*nd.bestMode);
1372 
1373                     if (m_param->rdLevel)
1374                         nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
1375                     else
1376                         nd.bestMode->predYuv.copyToPartYuv(splitPred->predYuv, childGeom.numPartitions * subPartIdx);
1377                     if (m_param->rdLevel > 1)
1378                         nextContext = &nd.bestMode->contexts;
1379                 }
1380                 else
1381                     splitCU->setEmptyPart(childGeom, subPartIdx);
1382             }
1383             nextContext->store(splitPred->contexts);
1384 
1385             if (mightNotSplit)
1386                 addSplitFlagCost(*splitPred, cuGeom.depth);
1387             else if (m_param->rdLevel > 1)
1388                 updateModeCost(*splitPred);
1389             else
1390                 splitPred->sa8dCost = m_rdCost.calcRdSADCost((uint32_t)splitPred->distortion, splitPred->sa8dBits);
1391         }
1392         /* If analysis mode is simple do not Evaluate other modes */
1393         if (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7)
1394         {
1395             if (m_slice->m_sliceType == P_SLICE)
1396             {
1397                 if (m_checkMergeAndSkipOnly[0])
1398                     skipModes = true;
1399             }
1400             else
1401             {
1402                 if (m_checkMergeAndSkipOnly[0] && m_checkMergeAndSkipOnly[1])
1403                     skipModes = true;
1404             }
1405         }
1406         /* Split CUs
1407          *   0  1
1408          *   2  3 */
1409         uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
1410         /* Step 3. Evaluate ME (2Nx2N, rect, amp) and intra modes at current depth */
1411         if (mightNotSplit && (depth >= minDepth || (m_param->bCTUInfo && !md.bestMode)))
1412         {
1413             if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth && m_slice->m_pps->maxCuDQPDepth != 0)
1414                 setLambdaFromQP(parentCTU, qp);
1415 
1416             if (!skipModes)
1417             {
1418                 uint32_t refMasks[2];
1419                 refMasks[0] = allSplitRefs;
1420                 md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1421                 checkInter_rd0_4(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
1422 
1423                 if (m_param->limitReferences & X265_REF_LIMIT_CU)
1424                 {
1425                     CUData& cu = md.pred[PRED_2Nx2N].cu;
1426                     uint32_t refMask = cu.getBestRefIdx(0);
1427                     allSplitRefs = splitData[0].splitRefs = splitData[1].splitRefs = splitData[2].splitRefs = splitData[3].splitRefs = refMask;
1428                 }
1429 
1430                 if (m_slice->m_sliceType == B_SLICE)
1431                 {
1432                     md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom, qp);
1433                     checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
1434                 }
1435 
1436                 Mode *bestInter = &md.pred[PRED_2Nx2N];
1437                 if (!skipRectAmp)
1438                 {
1439                     if (m_param->bEnableRectInter)
1440                     {
1441                         uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
1442                         uint32_t threshold_2NxN, threshold_Nx2N;
1443 
1444                         if (m_slice->m_sliceType == P_SLICE)
1445                         {
1446                             threshold_2NxN = splitData[0].mvCost[0] + splitData[1].mvCost[0];
1447                             threshold_Nx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
1448                         }
1449                         else
1450                         {
1451                             threshold_2NxN = (splitData[0].mvCost[0] + splitData[1].mvCost[0]
1452                                 + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
1453                             threshold_Nx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]
1454                                 + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
1455                         }
1456 
1457                         int try_2NxN_first = threshold_2NxN < threshold_Nx2N;
1458                         if (try_2NxN_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxN)
1459                         {
1460                             refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
1461                             refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
1462                             md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
1463                             checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
1464                             if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
1465                                 bestInter = &md.pred[PRED_2NxN];
1466                         }
1467 
1468                         if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_Nx2N)
1469                         {
1470                             refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* left */
1471                             refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* right */
1472                             md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1473                             checkInter_rd0_4(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks);
1474                             if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
1475                                 bestInter = &md.pred[PRED_Nx2N];
1476                         }
1477 
1478                         if (!try_2NxN_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxN)
1479                         {
1480                             refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
1481                             refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
1482                             md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
1483                             checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
1484                             if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
1485                                 bestInter = &md.pred[PRED_2NxN];
1486                         }
1487                     }
1488 
1489                     if (m_slice->m_sps->maxAMPDepth > depth)
1490                     {
1491                         uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
1492                         uint32_t threshold_2NxnU, threshold_2NxnD, threshold_nLx2N, threshold_nRx2N;
1493 
1494                         if (m_slice->m_sliceType == P_SLICE)
1495                         {
1496                             threshold_2NxnU = splitData[0].mvCost[0] + splitData[1].mvCost[0];
1497                             threshold_2NxnD = splitData[2].mvCost[0] + splitData[3].mvCost[0];
1498 
1499                             threshold_nLx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
1500                             threshold_nRx2N = splitData[1].mvCost[0] + splitData[3].mvCost[0];
1501                         }
1502                         else
1503                         {
1504                             threshold_2NxnU = (splitData[0].mvCost[0] + splitData[1].mvCost[0]
1505                                 + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
1506                             threshold_2NxnD = (splitData[2].mvCost[0] + splitData[3].mvCost[0]
1507                                 + splitData[2].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
1508 
1509                             threshold_nLx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]
1510                                 + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
1511                             threshold_nRx2N = (splitData[1].mvCost[0] + splitData[3].mvCost[0]
1512                                 + splitData[1].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
1513                         }
1514 
1515                         bool bHor = false, bVer = false;
1516                         if (bestInter->cu.m_partSize[0] == SIZE_2NxN)
1517                             bHor = true;
1518                         else if (bestInter->cu.m_partSize[0] == SIZE_Nx2N)
1519                             bVer = true;
1520                         else if (bestInter->cu.m_partSize[0] == SIZE_2Nx2N &&
1521                             md.bestMode && md.bestMode->cu.getQtRootCbf(0))
1522                         {
1523                             bHor = true;
1524                             bVer = true;
1525                         }
1526 
1527                         if (bHor)
1528                         {
1529                             int try_2NxnD_first = threshold_2NxnD < threshold_2NxnU;
1530                             if (try_2NxnD_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnD)
1531                             {
1532                                 refMasks[0] = allSplitRefs;                                    /* 75% top */
1533                                 refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
1534                                 md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
1535                                 checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
1536                                 if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
1537                                     bestInter = &md.pred[PRED_2NxnD];
1538                             }
1539 
1540                             if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnU)
1541                             {
1542                                 refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* 25% top */
1543                                 refMasks[1] = allSplitRefs;                                    /* 75% bot */
1544                                 md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp);
1545                                 checkInter_rd0_4(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, refMasks);
1546                                 if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
1547                                     bestInter = &md.pred[PRED_2NxnU];
1548                             }
1549 
1550                             if (!try_2NxnD_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnD)
1551                             {
1552                                 refMasks[0] = allSplitRefs;                                    /* 75% top */
1553                                 refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
1554                                 md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
1555                                 checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
1556                                 if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
1557                                     bestInter = &md.pred[PRED_2NxnD];
1558                             }
1559                         }
1560                         if (bVer)
1561                         {
1562                             int try_nRx2N_first = threshold_nRx2N < threshold_nLx2N;
1563                             if (try_nRx2N_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nRx2N)
1564                             {
1565                                 refMasks[0] = allSplitRefs;                                    /* 75% left  */
1566                                 refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
1567                                 md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1568                                 checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
1569                                 if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
1570                                     bestInter = &md.pred[PRED_nRx2N];
1571                             }
1572 
1573                             if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nLx2N)
1574                             {
1575                                 refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* 25% left  */
1576                                 refMasks[1] = allSplitRefs;                                    /* 75% right */
1577                                 md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1578                                 checkInter_rd0_4(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, refMasks);
1579                                 if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
1580                                     bestInter = &md.pred[PRED_nLx2N];
1581                             }
1582 
1583                             if (!try_nRx2N_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nRx2N)
1584                             {
1585                                 refMasks[0] = allSplitRefs;                                    /* 75% left  */
1586                                 refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
1587                                 md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1588                                 checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
1589                                 if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
1590                                     bestInter = &md.pred[PRED_nRx2N];
1591                             }
1592                         }
1593                     }
1594                 }
1595                 bool bTryIntra = (m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && cuGeom.log2CUSize != MAX_LOG2_CU_SIZE && !((m_param->bCTUInfo & 4) && bCtuInfoCheck);
1596                 if (m_param->rdLevel >= 3)
1597                 {
1598                     /* Calculate RD cost of best inter option */
1599                     if ((!m_bChromaSa8d && (m_csp != X265_CSP_I400)) || (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
1600                     {
1601                         uint32_t numPU = bestInter->cu.getNumPartInter(0);
1602                         for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
1603                         {
1604                             PredictionUnit pu(bestInter->cu, cuGeom, puIdx);
1605                             motionCompensation(bestInter->cu, pu, bestInter->predYuv, false, true);
1606                         }
1607                     }
1608 
1609                     if (!chooseMerge)
1610                     {
1611                         encodeResAndCalcRdInterCU(*bestInter, cuGeom);
1612                         checkBestMode(*bestInter, depth);
1613 
1614                         /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */
1615                         if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&
1616                             md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17)
1617                         {
1618                             uint32_t numPU = md.pred[PRED_BIDIR].cu.getNumPartInter(0);
1619                             if (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)
1620                                 for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
1621                                 {
1622                                     PredictionUnit pu(md.pred[PRED_BIDIR].cu, cuGeom, puIdx);
1623                                     motionCompensation(md.pred[PRED_BIDIR].cu, pu, md.pred[PRED_BIDIR].predYuv, true, true);
1624                                 }
1625                             encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
1626                             checkBestMode(md.pred[PRED_BIDIR], depth);
1627                         }
1628                     }
1629 
1630                     if ((bTryIntra && md.bestMode->cu.getQtRootCbf(0)) ||
1631                         md.bestMode->sa8dCost == MAX_INT64)
1632                     {
1633                         if (!m_param->limitReferences || splitIntra)
1634                         {
1635                             ProfileCounter(parentCTU, totalIntraCU[cuGeom.depth]);
1636                             md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
1637                             checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
1638                             encodeIntraInInter(md.pred[PRED_INTRA], cuGeom);
1639                             checkBestMode(md.pred[PRED_INTRA], depth);
1640                         }
1641                         else
1642                         {
1643                             ProfileCounter(parentCTU, skippedIntraCU[cuGeom.depth]);
1644                         }
1645                     }
1646                 }
1647                 else
1648                 {
1649                     /* SA8D choice between merge/skip, inter, bidir, and intra */
1650                     if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
1651                         md.bestMode = bestInter;
1652 
1653                     if (m_slice->m_sliceType == B_SLICE &&
1654                         md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost)
1655                         md.bestMode = &md.pred[PRED_BIDIR];
1656 
1657                     if (bTryIntra || md.bestMode->sa8dCost == MAX_INT64)
1658                     {
1659                         if (!m_param->limitReferences || splitIntra)
1660                         {
1661                             ProfileCounter(parentCTU, totalIntraCU[cuGeom.depth]);
1662                             md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
1663                             checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
1664                             if (md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
1665                                 md.bestMode = &md.pred[PRED_INTRA];
1666                         }
1667                         else
1668                         {
1669                             ProfileCounter(parentCTU, skippedIntraCU[cuGeom.depth]);
1670                         }
1671                     }
1672 
1673                     /* finally code the best mode selected by SA8D costs:
1674                      * RD level 2 - fully encode the best mode
1675                      * RD level 1 - generate recon pixels
1676                      * RD level 0 - generate chroma prediction */
1677                     if (md.bestMode->cu.m_mergeFlag[0] && md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N)
1678                     {
1679                         /* prediction already generated for this CU, and if rd level
1680                          * is not 0, it is already fully encoded */
1681                     }
1682                     else if (md.bestMode->cu.isInter(0))
1683                     {
1684                         uint32_t numPU = md.bestMode->cu.getNumPartInter(0);
1685                         if (m_csp != X265_CSP_I400)
1686                         {
1687                             for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
1688                             {
1689                                 PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
1690                                 motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, true);
1691                             }
1692                         }
1693                         if (m_param->rdLevel == 2)
1694                             encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
1695                         else if (m_param->rdLevel == 1)
1696                         {
1697                             /* generate recon pixels with no rate distortion considerations */
1698                             CUData& cu = md.bestMode->cu;
1699 
1700                             uint32_t tuDepthRange[2];
1701                             cu.getInterTUQtDepthRange(tuDepthRange, 0);
1702                             m_rqt[cuGeom.depth].tmpResiYuv.subtract(*md.bestMode->fencYuv, md.bestMode->predYuv, cuGeom.log2CUSize, m_frame->m_fencPic->m_picCsp);
1703                             residualTransformQuantInter(*md.bestMode, cuGeom, 0, 0, tuDepthRange);
1704                             if (cu.getQtRootCbf(0))
1705                                 md.bestMode->reconYuv.addClip(md.bestMode->predYuv, m_rqt[cuGeom.depth].tmpResiYuv, cu.m_log2CUSize[0], m_frame->m_fencPic->m_picCsp);
1706                             else
1707                             {
1708                                 md.bestMode->reconYuv.copyFromYuv(md.bestMode->predYuv);
1709                                 if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N)
1710                                     cu.setPredModeSubParts(MODE_SKIP);
1711                             }
1712                         }
1713                     }
1714                     else
1715                     {
1716                         if (m_param->rdLevel == 2)
1717                             encodeIntraInInter(*md.bestMode, cuGeom);
1718                         else if (m_param->rdLevel == 1)
1719                         {
1720                             /* generate recon pixels with no rate distortion considerations */
1721                             CUData& cu = md.bestMode->cu;
1722 
1723                             uint32_t tuDepthRange[2];
1724                             cu.getIntraTUQtDepthRange(tuDepthRange, 0);
1725 
1726                             residualTransformQuantIntra(*md.bestMode, cuGeom, 0, 0, tuDepthRange);
1727                             if (m_csp != X265_CSP_I400)
1728                             {
1729                                 getBestIntraModeChroma(*md.bestMode, cuGeom);
1730                                 residualQTIntraChroma(*md.bestMode, cuGeom, 0, 0);
1731                             }
1732                             md.bestMode->reconYuv.copyFromPicYuv(reconPic, cu.m_cuAddr, cuGeom.absPartIdx); // TODO:
1733                         }
1734                     }
1735                 }
1736             } // !earlyskip
1737 
1738             if (m_bTryLossless)
1739                 tryLossless(cuGeom);
1740 
1741             if (mightSplit)
1742                 addSplitFlagCost(*md.bestMode, cuGeom.depth);
1743         }
1744 
1745         if (mightSplit && !skipRecursion)
1746         {
1747             Mode* splitPred = &md.pred[PRED_SPLIT];
1748             if (!md.bestMode)
1749                 md.bestMode = splitPred;
1750             else if (m_param->rdLevel > 1)
1751                 checkBestMode(*splitPred, cuGeom.depth);
1752             else if (splitPred->sa8dCost < md.bestMode->sa8dCost)
1753                 md.bestMode = splitPred;
1754 
1755             checkDQPForSplitPred(*md.bestMode, cuGeom);
1756         }
1757 
1758         /* determine which motion references the parent CU should search */
1759         splitCUData.initSplitCUData();
1760 
1761         if (m_param->limitReferences & X265_REF_LIMIT_DEPTH)
1762         {
1763             if (md.bestMode == &md.pred[PRED_SPLIT])
1764                 splitCUData.splitRefs = allSplitRefs;
1765             else
1766             {
1767                 /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
1768                 CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
1769                 uint32_t numPU = cu.getNumPartInter(0);
1770                 for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
1771                     splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx);
1772             }
1773         }
1774 
1775         if (m_param->limitModes)
1776         {
1777             splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0
1778             splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1
1779             splitCUData.sa8dCost = md.pred[PRED_2Nx2N].sa8dCost;
1780         }
1781 
1782         if (mightNotSplit && md.bestMode->cu.isSkipped(0))
1783         {
1784             FrameData& curEncData = *m_frame->m_encData;
1785             FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
1786             uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
1787             cuStat.count[depth] += 1;
1788             cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth];
1789         }
1790 
1791         /* Copy best data to encData CTU and recon */
1792         md.bestMode->cu.copyToPic(depth);
1793         if (m_param->rdLevel)
1794             md.bestMode->reconYuv.copyToPicYuv(reconPic, cuAddr, cuGeom.absPartIdx);
1795 
1796         if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
1797         {
1798             if (mightNotSplit)
1799             {
1800                 CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr);
1801                 int8_t maxTUDepth = -1;
1802                 for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
1803                     maxTUDepth = X265_MAX(maxTUDepth, md.bestMode->cu.m_tuDepth[i]);
1804                 ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth;
1805             }
1806         }
1807     }
1808     else
1809     {
1810         if (m_param->bAnalysisType == AVC_INFO && cuGeom.numPartitions <= 16)
1811         {
1812             qprdRefine(parentCTU, cuGeom, qp, qp);
1813 
1814             SplitData splitData[4];
1815             splitData[0].initSplitCUData();
1816             splitData[1].initSplitCUData();
1817             splitData[2].initSplitCUData();
1818             splitData[3].initSplitCUData();
1819 
1820             uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
1821 
1822             splitCUData.initSplitCUData();
1823 
1824             if (m_param->limitReferences & X265_REF_LIMIT_DEPTH)
1825             {
1826                 if (md.bestMode == &md.pred[PRED_SPLIT])
1827                     splitCUData.splitRefs = allSplitRefs;
1828                 else
1829                 {
1830                     /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
1831                     CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
1832                     uint32_t numPU = cu.getNumPartInter(0);
1833                     for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
1834                         splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx);
1835                 }
1836             }
1837 
1838             if (m_param->limitModes)
1839             {
1840                 splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0
1841                 splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1
1842                 splitCUData.sa8dCost = md.pred[PRED_2Nx2N].sa8dCost;
1843             }
1844         }
1845     }
1846 
1847     return splitCUData;
1848 }
1849 
compressInterCU_rd5_6(const CUData & parentCTU,const CUGeom & cuGeom,int32_t qp)1850 SplitData Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
1851 {
1852     if (parentCTU.m_vbvAffected && !calculateQpforCuSize(parentCTU, cuGeom, 1))
1853         return compressInterCU_rd0_4(parentCTU, cuGeom, qp);
1854 
1855     uint32_t depth = cuGeom.depth;
1856     ModeDepth& md = m_modeDepth[depth];
1857     md.bestMode = NULL;
1858 
1859     if (m_param->searchMethod == X265_SEA)
1860     {
1861         int numPredDir = m_slice->isInterP() ? 1 : 2;
1862         int offset = (int)(m_frame->m_reconPic->m_cuOffsetY[parentCTU.m_cuAddr] + m_frame->m_reconPic->m_buOffsetY[cuGeom.absPartIdx]);
1863         for (int list = 0; list < numPredDir; list++)
1864             for (int i = 0; i < m_frame->m_encData->m_slice->m_numRefIdx[list]; i++)
1865                 for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
1866                     m_modeDepth[depth].fencYuv.m_integral[list][i][planes] = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_encData->m_meIntegral[planes] + offset;
1867     }
1868 
1869     SplitData splitCUData;
1870 
1871     bool bHEVCBlockAnalysis = (m_param->bAnalysisType == AVC_INFO && cuGeom.numPartitions > 16);
1872     bool bRefineAVCAnalysis = (m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1]));
1873     bool bNooffloading = !(m_param->bAnalysisType == AVC_INFO);
1874 
1875     if (bHEVCBlockAnalysis || bRefineAVCAnalysis || bNooffloading)
1876     {
1877         bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
1878         bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
1879         bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
1880         bool skipRecursion = false;
1881         bool skipModes = false;
1882         bool splitIntra = true;
1883         bool skipRectAmp = false;
1884         bool bCtuInfoCheck = false;
1885         int sameContentRef = 0;
1886 
1887         if (m_evaluateInter)
1888         {
1889             if (m_refineLevel == 2)
1890             {
1891                 if (parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP)
1892                     skipModes = true;
1893                 if (parentCTU.m_partSize[cuGeom.absPartIdx] == SIZE_2Nx2N)
1894                     skipRectAmp = true;
1895             }
1896             mightSplit &= false;
1897         }
1898 
1899         // avoid uninitialize value in below reference
1900         if (m_param->limitModes)
1901         {
1902             md.pred[PRED_2Nx2N].bestME[0][0].mvCost = 0; // L0
1903             md.pred[PRED_2Nx2N].bestME[0][1].mvCost = 0; // L1
1904             md.pred[PRED_2Nx2N].rdCost = 0;
1905         }
1906 
1907         if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
1908             m_maxTUDepth = loadTUDepth(cuGeom, parentCTU);
1909 
1910         SplitData splitData[4];
1911         splitData[0].initSplitCUData();
1912         splitData[1].initSplitCUData();
1913         splitData[2].initSplitCUData();
1914         splitData[3].initSplitCUData();
1915         uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
1916         uint32_t refMasks[2];
1917         if (m_param->bCTUInfo && depth <= parentCTU.m_cuDepth[cuGeom.absPartIdx])
1918         {
1919             if (bDecidedDepth && m_additionalCtuInfo[cuGeom.absPartIdx])
1920                 sameContentRef = findSameContentRefCount(parentCTU, cuGeom);
1921             if (depth < parentCTU.m_cuDepth[cuGeom.absPartIdx])
1922             {
1923                 mightNotSplit &= bDecidedDepth;
1924                 bCtuInfoCheck = skipRecursion = false;
1925                 skipModes = true;
1926             }
1927             else if (mightNotSplit && bDecidedDepth)
1928             {
1929                 if (m_additionalCtuInfo[cuGeom.absPartIdx])
1930                 {
1931                     bCtuInfoCheck = skipRecursion = true;
1932                     refMasks[0] = allSplitRefs;
1933                     md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1934                     checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
1935                     checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
1936                     if (!sameContentRef)
1937                     {
1938                         if ((m_param->bCTUInfo & 2) && (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth))
1939                         {
1940                             qp -= int32_t(0.04 * qp);
1941                             setLambdaFromQP(parentCTU, qp);
1942                         }
1943                         if (m_param->bCTUInfo & 4)
1944                             skipModes = false;
1945                     }
1946                     if (sameContentRef || (!sameContentRef && !(m_param->bCTUInfo & 4)))
1947                     {
1948                         if (m_param->rdLevel)
1949                             skipModes = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0);
1950                         if ((m_param->bCTUInfo & 4) && sameContentRef)
1951                             skipModes = md.bestMode && true;
1952                     }
1953                 }
1954                 else
1955                 {
1956                     md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1957                     md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1958                     checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
1959                     skipModes = !!m_param->bEnableEarlySkip && md.bestMode;
1960                     refMasks[0] = allSplitRefs;
1961                     md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1962                     checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
1963                     checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
1964                 }
1965                 mightSplit &= !bDecidedDepth;
1966             }
1967         }
1968         if (m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10)
1969         {
1970             if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx])
1971             {
1972                 if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
1973                 {
1974                     md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1975                     md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1976                     checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
1977                     skipModes = !!m_param->bEnableEarlySkip && md.bestMode;
1978                     refMasks[0] = allSplitRefs;
1979                     md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1980                     checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
1981                     checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
1982 
1983                     if (m_param->recursionSkipMode && depth && m_modeDepth[depth - 1].bestMode)
1984                         skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
1985                 }
1986                 if (m_param->analysisLoadReuseLevel > 4 && m_reusePartSize[cuGeom.absPartIdx] == SIZE_2Nx2N)
1987                     skipRectAmp = true && !!md.bestMode;
1988             }
1989         }
1990 
1991         if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU)
1992         {
1993             if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx])
1994             {
1995                 if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
1996                 {
1997                     md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1998                     md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1999                     checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
2000 
2001                     skipModes = !!m_param->bEnableEarlySkip && md.bestMode;
2002                     refMasks[0] = allSplitRefs;
2003                     md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
2004                     checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
2005                     checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
2006 
2007                     if (m_param->recursionSkipMode && depth && m_modeDepth[depth - 1].bestMode)
2008                         skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
2009                 }
2010             }
2011         }
2012         /* Step 1. Evaluate Merge/Skip candidates for likely early-outs */
2013         if ((mightNotSplit && !md.bestMode && !bCtuInfoCheck) ||
2014             (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1])))
2015         {
2016             md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
2017             md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
2018             checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
2019             skipModes = (m_param->bEnableEarlySkip || m_refineLevel == 2) &&
2020                 md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
2021             refMasks[0] = allSplitRefs;
2022             md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
2023             checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
2024             checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
2025 
2026             if (m_param->recursionSkipMode == RDCOST_BASED_RSKIP && depth && m_modeDepth[depth - 1].bestMode)
2027                 skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
2028             else if (cuGeom.log2CUSize >= MAX_LOG2_CU_SIZE - 1 && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
2029                 skipRecursion = md.bestMode && complexityCheckCU(*md.bestMode);
2030         }
2031         if (m_param->bAnalysisType == AVC_INFO && md.bestMode && cuGeom.numPartitions <= 16 && m_param->analysisLoadReuseLevel == 7)
2032             skipRecursion = true;
2033         // estimate split cost
2034         /* Step 2. Evaluate each of the 4 split sub-blocks in series */
2035         if (mightSplit && !skipRecursion)
2036         {
2037             if (bCtuInfoCheck && m_param->bCTUInfo & 2)
2038                 qp = int((1 / 0.96) * qp + 0.5);
2039             Mode* splitPred = &md.pred[PRED_SPLIT];
2040             splitPred->initCosts();
2041             CUData* splitCU = &splitPred->cu;
2042             splitCU->initSubCU(parentCTU, cuGeom, qp);
2043 
2044             uint32_t nextDepth = depth + 1;
2045             ModeDepth& nd = m_modeDepth[nextDepth];
2046             invalidateContexts(nextDepth);
2047             Entropy* nextContext = &m_rqt[depth].cur;
2048             int nextQP = qp;
2049             splitIntra = false;
2050 
2051             for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
2052             {
2053                 const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
2054                 if (childGeom.flags & CUGeom::PRESENT)
2055                 {
2056                     m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
2057                     m_rqt[nextDepth].cur.load(*nextContext);
2058 
2059                     if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
2060                         nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
2061 
2062                     splitData[subPartIdx] = compressInterCU_rd5_6(parentCTU, childGeom, nextQP);
2063 
2064                     // Save best CU and pred data for this sub CU
2065                     splitIntra |= nd.bestMode->cu.isIntra(0);
2066                     splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
2067                     splitPred->addSubCosts(*nd.bestMode);
2068                     nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
2069                     nextContext = &nd.bestMode->contexts;
2070                 }
2071                 else
2072                 {
2073                     splitCU->setEmptyPart(childGeom, subPartIdx);
2074                 }
2075             }
2076             nextContext->store(splitPred->contexts);
2077             if (mightNotSplit)
2078                 addSplitFlagCost(*splitPred, cuGeom.depth);
2079             else
2080                 updateModeCost(*splitPred);
2081 
2082             checkDQPForSplitPred(*splitPred, cuGeom);
2083         }
2084         /* If analysis mode is simple do not Evaluate other modes */
2085         if (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7)
2086         {
2087             if (m_slice->m_sliceType == P_SLICE)
2088             {
2089                 if (m_checkMergeAndSkipOnly[0])
2090                     skipModes = true;
2091             }
2092             else
2093             {
2094                 if (m_checkMergeAndSkipOnly[0] && m_checkMergeAndSkipOnly[1])
2095                     skipModes = true;
2096             }
2097         }
2098         /* Split CUs
2099          *   0  1
2100          *   2  3 */
2101         allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
2102         /* Step 3. Evaluate ME (2Nx2N, rect, amp) and intra modes at current depth */
2103         if (mightNotSplit)
2104         {
2105             if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth && m_slice->m_pps->maxCuDQPDepth != 0)
2106                 setLambdaFromQP(parentCTU, qp);
2107 
2108             if (!skipModes)
2109             {
2110                 refMasks[0] = allSplitRefs;
2111 
2112                 if (m_param->limitReferences & X265_REF_LIMIT_CU)
2113                 {
2114                     CUData& cu = md.pred[PRED_2Nx2N].cu;
2115                     uint32_t refMask = cu.getBestRefIdx(0);
2116                     allSplitRefs = splitData[0].splitRefs = splitData[1].splitRefs = splitData[2].splitRefs = splitData[3].splitRefs = refMask;
2117                 }
2118 
2119                 if (m_slice->m_sliceType == B_SLICE)
2120                 {
2121                     md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom, qp);
2122                     checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
2123                     if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
2124                     {
2125                         uint32_t numPU = md.pred[PRED_BIDIR].cu.getNumPartInter(0);
2126                         if (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)
2127                             for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
2128                             {
2129                                 PredictionUnit pu(md.pred[PRED_BIDIR].cu, cuGeom, puIdx);
2130                                 motionCompensation(md.pred[PRED_BIDIR].cu, pu, md.pred[PRED_BIDIR].predYuv, true, true);
2131                             }
2132                         encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
2133                         checkBestMode(md.pred[PRED_BIDIR], cuGeom.depth);
2134                     }
2135                 }
2136 
2137                 if (!skipRectAmp)
2138                 {
2139                     if (m_param->bEnableRectInter)
2140                     {
2141                         uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
2142                         uint32_t threshold_2NxN, threshold_Nx2N;
2143 
2144                         if (m_slice->m_sliceType == P_SLICE)
2145                         {
2146                             threshold_2NxN = splitData[0].mvCost[0] + splitData[1].mvCost[0];
2147                             threshold_Nx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
2148                         }
2149                         else
2150                         {
2151                             threshold_2NxN = (splitData[0].mvCost[0] + splitData[1].mvCost[0]
2152                                 + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
2153                             threshold_Nx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]
2154                                 + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
2155                         }
2156 
2157                         int try_2NxN_first = threshold_2NxN < threshold_Nx2N;
2158                         if (try_2NxN_first && splitCost < md.bestMode->rdCost + threshold_2NxN)
2159                         {
2160                             refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
2161                             refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
2162                             md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
2163                             checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
2164                             checkBestMode(md.pred[PRED_2NxN], cuGeom.depth);
2165                         }
2166 
2167                         if (splitCost < md.bestMode->rdCost + threshold_Nx2N)
2168                         {
2169                             refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* left */
2170                             refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* right */
2171                             md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
2172                             checkInter_rd5_6(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks);
2173                             checkBestMode(md.pred[PRED_Nx2N], cuGeom.depth);
2174                         }
2175 
2176                         if (!try_2NxN_first && splitCost < md.bestMode->rdCost + threshold_2NxN)
2177                         {
2178                             refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
2179                             refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
2180                             md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
2181                             checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
2182                             checkBestMode(md.pred[PRED_2NxN], cuGeom.depth);
2183                         }
2184                     }
2185 
2186                     // Try AMP (SIZE_2NxnU, SIZE_2NxnD, SIZE_nLx2N, SIZE_nRx2N)
2187                     if (m_slice->m_sps->maxAMPDepth > depth)
2188                     {
2189                         uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
2190                         uint32_t threshold_2NxnU, threshold_2NxnD, threshold_nLx2N, threshold_nRx2N;
2191 
2192                         if (m_slice->m_sliceType == P_SLICE)
2193                         {
2194                             threshold_2NxnU = splitData[0].mvCost[0] + splitData[1].mvCost[0];
2195                             threshold_2NxnD = splitData[2].mvCost[0] + splitData[3].mvCost[0];
2196 
2197                             threshold_nLx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
2198                             threshold_nRx2N = splitData[1].mvCost[0] + splitData[3].mvCost[0];
2199                         }
2200                         else
2201                         {
2202                             threshold_2NxnU = (splitData[0].mvCost[0] + splitData[1].mvCost[0]
2203                                 + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
2204                             threshold_2NxnD = (splitData[2].mvCost[0] + splitData[3].mvCost[0]
2205                                 + splitData[2].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
2206 
2207                             threshold_nLx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]
2208                                 + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
2209                             threshold_nRx2N = (splitData[1].mvCost[0] + splitData[3].mvCost[0]
2210                                 + splitData[1].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
2211                         }
2212 
2213                         bool bHor = false, bVer = false;
2214                         if (md.bestMode->cu.m_partSize[0] == SIZE_2NxN)
2215                             bHor = true;
2216                         else if (md.bestMode->cu.m_partSize[0] == SIZE_Nx2N)
2217                             bVer = true;
2218                         else if (md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N && !md.bestMode->cu.m_mergeFlag[0])
2219                         {
2220                             bHor = true;
2221                             bVer = true;
2222                         }
2223 
2224                         if (bHor)
2225                         {
2226                             int try_2NxnD_first = threshold_2NxnD < threshold_2NxnU;
2227                             if (try_2NxnD_first && splitCost < md.bestMode->rdCost + threshold_2NxnD)
2228                             {
2229                                 refMasks[0] = allSplitRefs;                                    /* 75% top */
2230                                 refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
2231                                 md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
2232                                 checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
2233                                 checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth);
2234                             }
2235 
2236                             if (splitCost < md.bestMode->rdCost + threshold_2NxnU)
2237                             {
2238                                 refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* 25% top */
2239                                 refMasks[1] = allSplitRefs;                                    /* 75% bot */
2240                                 md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp);
2241                                 checkInter_rd5_6(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, refMasks);
2242                                 checkBestMode(md.pred[PRED_2NxnU], cuGeom.depth);
2243                             }
2244 
2245                             if (!try_2NxnD_first && splitCost < md.bestMode->rdCost + threshold_2NxnD)
2246                             {
2247                                 refMasks[0] = allSplitRefs;                                    /* 75% top */
2248                                 refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
2249                                 md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
2250                                 checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
2251                                 checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth);
2252                             }
2253                         }
2254 
2255                         if (bVer)
2256                         {
2257                             int try_nRx2N_first = threshold_nRx2N < threshold_nLx2N;
2258                             if (try_nRx2N_first && splitCost < md.bestMode->rdCost + threshold_nRx2N)
2259                             {
2260                                 refMasks[0] = allSplitRefs;                                    /* 75% left  */
2261                                 refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
2262                                 md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
2263                                 checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
2264                                 checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth);
2265                             }
2266 
2267                             if (splitCost < md.bestMode->rdCost + threshold_nLx2N)
2268                             {
2269                                 refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* 25% left  */
2270                                 refMasks[1] = allSplitRefs;                                    /* 75% right */
2271                                 md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp);
2272                                 checkInter_rd5_6(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, refMasks);
2273                                 checkBestMode(md.pred[PRED_nLx2N], cuGeom.depth);
2274                             }
2275 
2276                             if (!try_nRx2N_first && splitCost < md.bestMode->rdCost + threshold_nRx2N)
2277                             {
2278                                 refMasks[0] = allSplitRefs;                                    /* 75% left  */
2279                                 refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
2280                                 md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
2281                                 checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
2282                                 checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth);
2283                             }
2284                         }
2285                     }
2286                 }
2287 
2288                 if ((m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && (cuGeom.log2CUSize != MAX_LOG2_CU_SIZE) && !((m_param->bCTUInfo & 4) && bCtuInfoCheck))
2289                 {
2290                     if (!m_param->limitReferences || splitIntra)
2291                     {
2292                         ProfileCounter(parentCTU, totalIntraCU[cuGeom.depth]);
2293                         md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
2294                         checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N);
2295                         checkBestMode(md.pred[PRED_INTRA], depth);
2296 
2297                         if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
2298                         {
2299                             md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
2300                             checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN);
2301                             checkBestMode(md.pred[PRED_INTRA_NxN], depth);
2302                         }
2303                     }
2304                     else
2305                     {
2306                         ProfileCounter(parentCTU, skippedIntraCU[cuGeom.depth]);
2307                     }
2308                 }
2309             }
2310 
2311             if ((md.bestMode->cu.isInter(0) && !(md.bestMode->cu.m_mergeFlag[0] && md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N)) && (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400))
2312             {
2313                 uint32_t numPU = md.bestMode->cu.getNumPartInter(0);
2314 
2315                 for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
2316                 {
2317                     PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
2318                     motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, m_csp != X265_CSP_I400);
2319                 }
2320                 encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
2321             }
2322             if (m_bTryLossless)
2323                 tryLossless(cuGeom);
2324 
2325             if (mightSplit)
2326                 addSplitFlagCost(*md.bestMode, cuGeom.depth);
2327         }
2328 
2329         if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
2330         {
2331             if (mightNotSplit)
2332             {
2333                 CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr);
2334                 int8_t maxTUDepth = -1;
2335                 for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
2336                     maxTUDepth = X265_MAX(maxTUDepth, md.bestMode->cu.m_tuDepth[i]);
2337                 ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth;
2338             }
2339         }
2340 
2341         /* compare split RD cost against best cost */
2342         if (mightSplit && !skipRecursion)
2343             checkBestMode(md.pred[PRED_SPLIT], depth);
2344 
2345         if (m_param->bEnableRdRefine && depth <= m_slice->m_pps->maxCuDQPDepth)
2346         {
2347             int cuIdx = (cuGeom.childOffset - 1) / 3;
2348             cacheCost[cuIdx] = md.bestMode->rdCost;
2349         }
2350 
2351         /* determine which motion references the parent CU should search */
2352         splitCUData.initSplitCUData();
2353         if (m_param->limitReferences & X265_REF_LIMIT_DEPTH)
2354         {
2355             if (md.bestMode == &md.pred[PRED_SPLIT])
2356                 splitCUData.splitRefs = allSplitRefs;
2357             else
2358             {
2359                 /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
2360                 CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
2361                 uint32_t numPU = cu.getNumPartInter(0);
2362                 for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
2363                     splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx);
2364             }
2365         }
2366 
2367         if (m_param->limitModes)
2368         {
2369             splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0
2370             splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1
2371             splitCUData.sa8dCost = md.pred[PRED_2Nx2N].rdCost;
2372         }
2373 
2374         /* Copy best data to encData CTU and recon */
2375         md.bestMode->cu.copyToPic(depth);
2376         md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
2377     }
2378     else
2379     {
2380         if (m_param->bAnalysisType == AVC_INFO && cuGeom.numPartitions <= 16)
2381         {
2382             qprdRefine(parentCTU, cuGeom, qp, qp);
2383 
2384             SplitData splitData[4];
2385             splitData[0].initSplitCUData();
2386             splitData[1].initSplitCUData();
2387             splitData[2].initSplitCUData();
2388             splitData[3].initSplitCUData();
2389 
2390             uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
2391 
2392             splitCUData.initSplitCUData();
2393             if (m_param->limitReferences & X265_REF_LIMIT_DEPTH)
2394             {
2395                 if (md.bestMode == &md.pred[PRED_SPLIT])
2396                     splitCUData.splitRefs = allSplitRefs;
2397                 else
2398                 {
2399                     /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
2400                     CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
2401                     uint32_t numPU = cu.getNumPartInter(0);
2402                     for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
2403                         splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx);
2404                 }
2405             }
2406 
2407             if (m_param->limitModes)
2408             {
2409                 splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0
2410                 splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1
2411                 splitCUData.sa8dCost = md.pred[PRED_2Nx2N].rdCost;
2412             }
2413         }
2414     }
2415 
2416     return splitCUData;
2417 }
2418 
recodeCU(const CUData & parentCTU,const CUGeom & cuGeom,int32_t qp,int32_t lqp)2419 void Analysis::recodeCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t lqp)
2420 {
2421     uint32_t depth = cuGeom.depth;
2422     ModeDepth& md = m_modeDepth[depth];
2423     md.bestMode = NULL;
2424 
2425     m_evaluateInter = 0;
2426     bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
2427     bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
2428     bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
2429     int split = 0;
2430 
2431     TrainingData td;
2432     td.init(parentCTU, cuGeom);
2433 
2434     if (!m_param->bDynamicRefine)
2435         m_refineLevel = m_param->interRefine;
2436     else
2437         m_refineLevel = m_frame->m_classifyFrame ? 1 : 3;
2438 
2439     if (m_param->interRefine == 1)
2440         split = (m_param->scaleFactor && bDecidedDepth && parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP && (!mightNotSplit ||
2441                 (m_refineLevel && cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1))));
2442     else
2443         split = (m_param->scaleFactor && bDecidedDepth && (!mightNotSplit ||
2444                 (m_refineLevel && cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1))));
2445     td.split = split;
2446 
2447     if ((bDecidedDepth && mightNotSplit) || (m_param->bAnalysisType == HEVC_INFO && parentCTU.m_cuDepth[cuGeom.absPartIdx] == 4))
2448     {
2449         setLambdaFromQP(parentCTU, qp, lqp);
2450 
2451         Mode& mode = md.pred[0];
2452         md.bestMode = &mode;
2453         mode.cu.initSubCU(parentCTU, cuGeom, qp);
2454         PartSize size = (PartSize)parentCTU.m_partSize[cuGeom.absPartIdx];
2455         if (parentCTU.isIntra(cuGeom.absPartIdx) && m_refineLevel < 2)
2456         {
2457             if (m_param->intraRefine == 4)
2458                 compressIntraCU(parentCTU, cuGeom, qp);
2459             else
2460             {
2461                 bool reuseModes = !((m_param->intraRefine == 3) ||
2462                     (m_param->intraRefine == 2 && parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] > DC_IDX));
2463                 if (reuseModes)
2464                 {
2465                     memcpy(mode.cu.m_lumaIntraDir, parentCTU.m_lumaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
2466                     memcpy(mode.cu.m_chromaIntraDir, parentCTU.m_chromaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
2467                 }
2468                 checkIntra(mode, cuGeom, size);
2469             }
2470         }
2471         else if (!parentCTU.isIntra(cuGeom.absPartIdx) && m_refineLevel < 2)
2472         {
2473             mode.cu.copyFromPic(parentCTU, cuGeom, m_csp, false);
2474             uint32_t numPU = parentCTU.getNumPartInter(cuGeom.absPartIdx);
2475             for (uint32_t part = 0; part < numPU; part++)
2476             {
2477                 PredictionUnit pu(mode.cu, cuGeom, part);
2478                 if (m_param->analysisLoadReuseLevel == 10 || (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel >= 7))
2479                 {
2480                     x265_analysis_inter_data* interDataCTU = m_frame->m_analysisData.interData;
2481                     int cuIdx = (mode.cu.m_cuAddr * parentCTU.m_numPartitions) + cuGeom.absPartIdx;
2482                     mode.cu.m_mergeFlag[pu.puAbsPartIdx] = interDataCTU->mergeFlag[cuIdx + part];
2483                     mode.cu.setPUInterDir(interDataCTU->interDir[cuIdx + part], pu.puAbsPartIdx, part);
2484                     for (int list = 0; list < m_slice->isInterB() + 1; list++)
2485                     {
2486                         mode.cu.setPUMv(list, interDataCTU->mv[list][cuIdx + part].word, pu.puAbsPartIdx, part);
2487                         mode.cu.setPURefIdx(list, interDataCTU->refIdx[list][cuIdx + part], pu.puAbsPartIdx, part);
2488                         mode.cu.m_mvpIdx[list][pu.puAbsPartIdx] = interDataCTU->mvpIdx[list][cuIdx + part];
2489                     }
2490                     if (!mode.cu.m_mergeFlag[pu.puAbsPartIdx])
2491                     {
2492                         if (m_param->interRefine == 1)
2493                             m_me.setSourcePU(*mode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, false);
2494                         //AMVP
2495                         MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
2496                         mode.cu.getNeighbourMV(part, pu.puAbsPartIdx, mode.interNeighbours);
2497                         for (int list = 0; list < m_slice->isInterB() + 1; list++)
2498                         {
2499                             int ref = mode.cu.m_refIdx[list][pu.puAbsPartIdx];
2500                             if (ref == -1)
2501                                 continue;
2502                             MV mvp;
2503 
2504                             int numMvc = mode.cu.getPMV(mode.interNeighbours, list, ref, mode.amvpCand[list][ref], mvc);
2505                             mvp = mode.amvpCand[list][ref][mode.cu.m_mvpIdx[list][pu.puAbsPartIdx]];
2506                             if (m_param->interRefine == 1)
2507                             {
2508                                 MV outmv, mvpSelect[3];
2509                                 mvpSelect[0] = interDataCTU->mv[list][cuIdx + part].word;
2510                                 if (m_param->mvRefine > 1)
2511                                 {
2512                                     mvpSelect[1] = mvp;
2513                                     if(m_param->mvRefine > 2)
2514                                         mvpSelect[2] = mode.amvpCand[list][ref][!(mode.cu.m_mvpIdx[list][pu.puAbsPartIdx])];
2515                                 }
2516                                 searchMV(mode, list, ref, outmv, mvpSelect, numMvc, mvc);
2517                                 mode.cu.setPUMv(list, outmv, pu.puAbsPartIdx, part);
2518                             }
2519                             mode.cu.m_mvd[list][pu.puAbsPartIdx] = mode.cu.m_mv[list][pu.puAbsPartIdx] - mode.amvpCand[list][ref][mode.cu.m_mvpIdx[list][pu.puAbsPartIdx]]/*mvp*/;
2520                         }
2521                     }
2522                     else
2523                     {
2524                         MVField candMvField[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
2525                         uint8_t candDir[MRG_MAX_NUM_CANDS];
2526                         mode.cu.getInterMergeCandidates(pu.puAbsPartIdx, part, candMvField, candDir);
2527                         uint8_t mvpIdx = mode.cu.m_mvpIdx[0][pu.puAbsPartIdx];
2528                         if (mode.cu.isBipredRestriction())
2529                         {
2530                             /* do not allow bidir merge candidates if PU is smaller than 8x8, drop L1 reference */
2531                             if (candDir[mvpIdx] == 3)
2532                             {
2533                                 candDir[mvpIdx] = 1;
2534                                 candMvField[mvpIdx][1].refIdx = REF_NOT_VALID;
2535                             }
2536                         }
2537                         mode.cu.setPUInterDir(candDir[mvpIdx], pu.puAbsPartIdx, part);
2538                         mode.cu.setPUMv(0, candMvField[mvpIdx][0].mv, pu.puAbsPartIdx, part);
2539                         mode.cu.setPUMv(1, candMvField[mvpIdx][1].mv, pu.puAbsPartIdx, part);
2540                         mode.cu.setPURefIdx(0, (int8_t)candMvField[mvpIdx][0].refIdx, pu.puAbsPartIdx, part);
2541                         mode.cu.setPURefIdx(1, (int8_t)candMvField[mvpIdx][1].refIdx, pu.puAbsPartIdx, part);
2542                     }
2543                 }
2544                 motionCompensation(mode.cu, pu, mode.predYuv, true, (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400));
2545             }
2546             if (!m_param->interRefine && !m_param->bDynamicRefine && parentCTU.isSkipped(cuGeom.absPartIdx))
2547                 encodeResAndCalcRdSkipCU(mode);
2548             else
2549                 encodeResAndCalcRdInterCU(mode, cuGeom);
2550 
2551             /* checkMerge2Nx2N function performs checkDQP after encoding residual, do the same */
2552             bool mergeInter2Nx2N = size == SIZE_2Nx2N && mode.cu.m_mergeFlag[0];
2553             if (parentCTU.isSkipped(cuGeom.absPartIdx) || mergeInter2Nx2N)
2554                 checkDQP(mode, cuGeom);
2555         }
2556 
2557         if (m_refineLevel < 2)
2558         {
2559             if (m_bTryLossless)
2560                 tryLossless(cuGeom);
2561 
2562             if (mightSplit)
2563                 addSplitFlagCost(*md.bestMode, cuGeom.depth);
2564 
2565             if (mightSplit && m_param->rdLevel < 5)
2566                 checkDQPForSplitPred(*md.bestMode, cuGeom);
2567         }
2568 
2569         if (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7)
2570         {
2571             for (int list = 0; list < m_slice->isInterB() + 1; list++)
2572             {
2573                 m_modeFlag[list] = true;
2574                 if (parentCTU.m_skipFlag[list][cuGeom.absPartIdx] == 1 && cuGeom.numPartitions <= 16)
2575                     m_checkMergeAndSkipOnly[list] = true;
2576             }
2577             m_param->rdLevel > 4 ? compressInterCU_rd5_6(parentCTU, cuGeom, qp) : compressInterCU_rd0_4(parentCTU, cuGeom, qp);
2578             for (int list = 0; list < m_slice->isInterB() + 1; list++)
2579             {
2580                 m_modeFlag[list] = false;
2581                 m_checkMergeAndSkipOnly[list] = false;
2582             }
2583         }
2584 
2585         if (m_param->bDynamicRefine)
2586             classifyCU(parentCTU,cuGeom, *md.bestMode, td);
2587 
2588         if (m_refineLevel > 1 || (m_refineLevel && parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP  && !mode.cu.isSkipped(0)))
2589         {
2590             if (parentCTU.m_cuDepth[cuGeom.absPartIdx] < 4 && mightNotSplit)
2591                 m_evaluateInter = 1;
2592             else
2593                 bDecidedDepth = true;
2594             m_param->rdLevel > 4 ? compressInterCU_rd5_6(parentCTU, cuGeom, qp) : compressInterCU_rd0_4(parentCTU, cuGeom, qp);
2595             m_evaluateInter = 0;
2596         }
2597     }
2598     if (!bDecidedDepth || split)
2599     {
2600         Mode* splitPred = &md.pred[PRED_SPLIT];
2601         if (!split)
2602             md.bestMode = splitPred;
2603         splitPred->initCosts();
2604         CUData* splitCU = &splitPred->cu;
2605         splitCU->initSubCU(parentCTU, cuGeom, qp);
2606 
2607         uint32_t nextDepth = depth + 1;
2608         ModeDepth& nd = m_modeDepth[nextDepth];
2609         invalidateContexts(nextDepth);
2610         Entropy* nextContext = &m_rqt[depth].cur;
2611         int nextQP = qp;
2612 
2613         for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
2614         {
2615             const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
2616             if (childGeom.flags & CUGeom::PRESENT)
2617             {
2618                 m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
2619                 m_rqt[nextDepth].cur.load(*nextContext);
2620 
2621                 if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
2622                     nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
2623 
2624                 int lamdaQP = (m_param->analysisLoadReuseLevel >= 7) ? nextQP : lqp;
2625 
2626                 if (split)
2627                     m_param->rdLevel > 4 ? compressInterCU_rd5_6(parentCTU, childGeom, nextQP) : compressInterCU_rd0_4(parentCTU, childGeom, nextQP);
2628                 else
2629                     qprdRefine(parentCTU, childGeom, nextQP, lamdaQP);
2630 
2631                 // Save best CU and pred data for this sub CU
2632                 splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
2633                 splitPred->addSubCosts(*nd.bestMode);
2634                 nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
2635                 nextContext = &nd.bestMode->contexts;
2636             }
2637             else
2638             {
2639                 splitCU->setEmptyPart(childGeom, subPartIdx);
2640                 // Set depth of non-present CU to 0 to ensure that correct CU is fetched as reference to code deltaQP
2641                 memset(parentCTU.m_cuDepth + childGeom.absPartIdx, 0, childGeom.numPartitions);
2642             }
2643         }
2644         nextContext->store(splitPred->contexts);
2645         if (mightNotSplit)
2646             addSplitFlagCost(*splitPred, cuGeom.depth);
2647         else
2648             updateModeCost(*splitPred);
2649 
2650         if (m_refineLevel)
2651         {
2652             if (m_param->rdLevel > 1)
2653                 checkBestMode(*splitPred, cuGeom.depth);
2654             else if (splitPred->sa8dCost < md.bestMode->sa8dCost)
2655                 md.bestMode = splitPred;
2656         }
2657 
2658         checkDQPForSplitPred(*splitPred, cuGeom);
2659 
2660         /* Copy best data to encData CTU and recon */
2661         md.bestMode->cu.copyToPic(depth);
2662         md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
2663     }
2664     if (m_param->bDynamicRefine && bDecidedDepth)
2665         trainCU(parentCTU, cuGeom, *md.bestMode, td);
2666 }
2667 
classifyCU(const CUData & ctu,const CUGeom & cuGeom,const Mode & bestMode,TrainingData & trainData)2668 void Analysis::classifyCU(const CUData& ctu, const CUGeom& cuGeom, const Mode& bestMode, TrainingData& trainData)
2669 {
2670     uint32_t depth = cuGeom.depth;
2671     trainData.cuVariance = calculateCUVariance(ctu, cuGeom);
2672     if (m_frame->m_classifyFrame)
2673     {
2674         uint64_t diffRefine[X265_REFINE_INTER_LEVELS];
2675         uint64_t diffRefineRd[X265_REFINE_INTER_LEVELS];
2676         float probRefine[X265_REFINE_INTER_LEVELS] = { 0 };
2677         uint8_t varRefineLevel = 1;
2678         uint8_t rdRefineLevel = 1;
2679         uint64_t cuCost = bestMode.rdCost;
2680         int offset = (depth * X265_REFINE_INTER_LEVELS);
2681         if (cuCost < m_frame->m_classifyRd[offset])
2682             m_refineLevel = 1;
2683         else
2684         {
2685             uint64_t trainingCount = 0;
2686             for (uint8_t i = 0; i < X265_REFINE_INTER_LEVELS; i++)
2687             {
2688                 offset = (depth * X265_REFINE_INTER_LEVELS) + i;
2689                 trainingCount += m_frame->m_classifyCount[offset];
2690             }
2691             for (uint8_t i = 0; i < X265_REFINE_INTER_LEVELS; i++)
2692             {
2693                 offset = (depth * X265_REFINE_INTER_LEVELS) + i;
2694                 /* Calculate distance values */
2695                 diffRefine[i] = abs((int64_t)(trainData.cuVariance - m_frame->m_classifyVariance[offset]));
2696                 diffRefineRd[i] = abs((int64_t)(cuCost - m_frame->m_classifyRd[offset]));
2697 
2698                 /* Calculate prior probability - ranges between 0 and 1 */
2699                 if (trainingCount)
2700                     probRefine[i] = ((float)m_frame->m_classifyCount[offset] / (float)trainingCount);
2701 
2702                 /* Bayesian classification - P(c|x)P(x) = P(x|c)P(c)
2703                 P(c|x) is the posterior probability of class given predictor.
2704                 P(c) is the prior probability of class.
2705                 P(x|c) is the likelihood which is the probability of predictor given class.
2706                 P(x) is the prior probability of predictor.*/
2707                 int curRefineLevel = m_refineLevel - 1;
2708                 if ((diffRefine[i] * probRefine[curRefineLevel]) < (diffRefine[curRefineLevel] * probRefine[i]))
2709                     varRefineLevel = i + 1;
2710                 if ((diffRefineRd[i] * probRefine[curRefineLevel]) < (diffRefineRd[curRefineLevel] * probRefine[i]))
2711                     rdRefineLevel = i + 1;
2712             }
2713             m_refineLevel = X265_MAX(varRefineLevel, rdRefineLevel);
2714         }
2715     }
2716 }
2717 
trainCU(const CUData & ctu,const CUGeom & cuGeom,const Mode & bestMode,TrainingData & trainData)2718 void Analysis::trainCU(const CUData& ctu, const CUGeom& cuGeom, const Mode& bestMode, TrainingData& trainData)
2719 {
2720     uint32_t depth = cuGeom.depth;
2721     int classify = 1;
2722     if (!m_frame->m_classifyFrame)
2723     {
2724         /* classify = 1 : CUs for which the save data matches with that after encoding with refine-inter 3
2725                           and CUs that has split.
2726            classify = 2 : CUs which are encoded as simple modes (Skip/Merge/2Nx2N).
2727            classify = 3 : CUs encoded as any other mode. */
2728 
2729         bool refineInter0 = (trainData.predMode == ctu.m_predMode[cuGeom.absPartIdx] &&
2730             trainData.partSize == ctu.m_partSize[cuGeom.absPartIdx] &&
2731             trainData.mergeFlag == ctu.m_mergeFlag[cuGeom.absPartIdx]);
2732         bool refineInter1 = (depth == m_param->maxCUDepth - 1) && trainData.split;
2733         if (refineInter0 || refineInter1)
2734             classify = 1;
2735         else if (trainData.partSize == SIZE_2Nx2N && trainData.partSize == ctu.m_partSize[cuGeom.absPartIdx])
2736             classify = 2;
2737         else
2738             classify = 3;
2739     }
2740     else
2741         classify = m_refineLevel;
2742     uint64_t cuCost = bestMode.rdCost;
2743     int offset = (depth * X265_REFINE_INTER_LEVELS) + classify - 1;
2744     ctu.m_collectCURd[offset] += cuCost;
2745     ctu.m_collectCUVariance[offset] += trainData.cuVariance;
2746     ctu.m_collectCUCount[offset]++;
2747 }
2748 
2749 /* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
checkMerge2Nx2N_rd0_4(Mode & skip,Mode & merge,const CUGeom & cuGeom)2750 void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGeom)
2751 {
2752     uint32_t depth = cuGeom.depth;
2753     ModeDepth& md = m_modeDepth[depth];
2754     Yuv *fencYuv = &md.fencYuv;
2755 
2756     /* Note that these two Mode instances are named MERGE and SKIP but they may
2757      * hold the reverse when the function returns. We toggle between the two modes */
2758     Mode* tempPred = &merge;
2759     Mode* bestPred = &skip;
2760 
2761     X265_CHECK(m_slice->m_sliceType != I_SLICE, "Evaluating merge in I slice\n");
2762 
2763     tempPred->initCosts();
2764     tempPred->cu.setPartSizeSubParts(SIZE_2Nx2N);
2765     tempPred->cu.setPredModeSubParts(MODE_INTER);
2766     tempPred->cu.m_mergeFlag[0] = true;
2767 
2768     bestPred->initCosts();
2769     bestPred->cu.setPartSizeSubParts(SIZE_2Nx2N);
2770     bestPred->cu.setPredModeSubParts(MODE_INTER);
2771     bestPred->cu.m_mergeFlag[0] = true;
2772 
2773     MVField candMvField[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
2774     uint8_t candDir[MRG_MAX_NUM_CANDS];
2775     uint32_t numMergeCand = tempPred->cu.getInterMergeCandidates(0, 0, candMvField, candDir);
2776     PredictionUnit pu(merge.cu, cuGeom, 0);
2777 
2778     bestPred->sa8dCost = MAX_INT64;
2779     int bestSadCand = -1;
2780     int sizeIdx = cuGeom.log2CUSize - 2;
2781     int safeX, maxSafeMv;
2782     if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE)
2783     {
2784         safeX = m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol * m_param->maxCUSize - 3;
2785         maxSafeMv = (safeX - tempPred->cu.m_cuPelX) * 4;
2786     }
2787     for (uint32_t i = 0; i < numMergeCand; ++i)
2788     {
2789         if (m_bFrameParallel)
2790         {
2791             // Parallel slices bound check
2792             if (m_param->maxSlices > 1)
2793             {
2794                 // NOTE: First row in slice can't negative
2795                 if (X265_MIN(candMvField[i][0].mv.y, candMvField[i][1].mv.y) < m_sliceMinY)
2796                     continue;
2797 
2798                 // Last row in slice can't reference beyond bound since it is another slice area
2799                 // TODO: we may beyond bound in future since these area have a chance to finish because we use parallel slices. Necessary prepare research on load balance
2800                 if (X265_MAX(candMvField[i][0].mv.y, candMvField[i][1].mv.y) > m_sliceMaxY)
2801                     continue;
2802             }
2803 
2804             if (candMvField[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
2805                 candMvField[i][1].mv.y >= (m_param->searchRange + 1) * 4)
2806                 continue;
2807         }
2808 
2809         if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE &&
2810             tempPred->cu.m_cuPelX / m_param->maxCUSize < m_frame->m_encData->m_pir.pirEndCol &&
2811             candMvField[i][0].mv.x > maxSafeMv)
2812             // skip merge candidates which reference beyond safe reference area
2813             continue;
2814 
2815         tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; // merge candidate ID is stored in L0 MVP idx
2816         X265_CHECK(m_slice->m_sliceType == B_SLICE || !(candDir[i] & 0x10), " invalid merge for P slice\n");
2817         tempPred->cu.m_interDir[0] = candDir[i];
2818         tempPred->cu.m_mv[0][0] = candMvField[i][0].mv;
2819         tempPred->cu.m_mv[1][0] = candMvField[i][1].mv;
2820         tempPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx;
2821         tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx;
2822         motionCompensation(tempPred->cu, pu, tempPred->predYuv, true, m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400));
2823 
2824         tempPred->sa8dBits = getTUBits(i, numMergeCand);
2825         tempPred->distortion = primitives.cu[sizeIdx].sa8d(fencYuv->m_buf[0], fencYuv->m_size, tempPred->predYuv.m_buf[0], tempPred->predYuv.m_size);
2826         if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
2827         {
2828             tempPred->distortion += primitives.chroma[m_csp].cu[sizeIdx].sa8d(fencYuv->m_buf[1], fencYuv->m_csize, tempPred->predYuv.m_buf[1], tempPred->predYuv.m_csize);
2829             tempPred->distortion += primitives.chroma[m_csp].cu[sizeIdx].sa8d(fencYuv->m_buf[2], fencYuv->m_csize, tempPred->predYuv.m_buf[2], tempPred->predYuv.m_csize);
2830         }
2831         tempPred->sa8dCost = m_rdCost.calcRdSADCost((uint32_t)tempPred->distortion, tempPred->sa8dBits);
2832 
2833         if (tempPred->sa8dCost < bestPred->sa8dCost)
2834         {
2835             bestSadCand = i;
2836             std::swap(tempPred, bestPred);
2837         }
2838     }
2839 
2840     /* force mode decision to take inter or intra */
2841     if (bestSadCand < 0)
2842         return;
2843 
2844     /* calculate the motion compensation for chroma for the best mode selected */
2845     if ((!m_bChromaSa8d && (m_csp != X265_CSP_I400)) || (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)) /* Chroma MC was done above */
2846         motionCompensation(bestPred->cu, pu, bestPred->predYuv, false, true);
2847 
2848     if (m_param->rdLevel)
2849     {
2850         if (m_param->bLossless)
2851             bestPred->rdCost = MAX_INT64;
2852         else
2853             encodeResAndCalcRdSkipCU(*bestPred);
2854 
2855         /* Encode with residual */
2856         tempPred->cu.m_mvpIdx[0][0] = (uint8_t)bestSadCand;
2857         tempPred->cu.setPUInterDir(candDir[bestSadCand], 0, 0);
2858         tempPred->cu.setPUMv(0, candMvField[bestSadCand][0].mv, 0, 0);
2859         tempPred->cu.setPUMv(1, candMvField[bestSadCand][1].mv, 0, 0);
2860         tempPred->cu.setPURefIdx(0, (int8_t)candMvField[bestSadCand][0].refIdx, 0, 0);
2861         tempPred->cu.setPURefIdx(1, (int8_t)candMvField[bestSadCand][1].refIdx, 0, 0);
2862         tempPred->sa8dCost = bestPred->sa8dCost;
2863         tempPred->sa8dBits = bestPred->sa8dBits;
2864         tempPred->predYuv.copyFromYuv(bestPred->predYuv);
2865 
2866         encodeResAndCalcRdInterCU(*tempPred, cuGeom);
2867 
2868         md.bestMode = tempPred->rdCost < bestPred->rdCost ? tempPred : bestPred;
2869     }
2870     else
2871         md.bestMode = bestPred;
2872 
2873     /* broadcast sets of MV field data */
2874     md.bestMode->cu.setPUInterDir(candDir[bestSadCand], 0, 0);
2875     md.bestMode->cu.setPUMv(0, candMvField[bestSadCand][0].mv, 0, 0);
2876     md.bestMode->cu.setPUMv(1, candMvField[bestSadCand][1].mv, 0, 0);
2877     md.bestMode->cu.setPURefIdx(0, (int8_t)candMvField[bestSadCand][0].refIdx, 0, 0);
2878     md.bestMode->cu.setPURefIdx(1, (int8_t)candMvField[bestSadCand][1].refIdx, 0, 0);
2879     checkDQP(*md.bestMode, cuGeom);
2880 }
2881 
2882 /* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
checkMerge2Nx2N_rd5_6(Mode & skip,Mode & merge,const CUGeom & cuGeom)2883 void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom)
2884 {
2885     uint32_t depth = cuGeom.depth;
2886 
2887     /* Note that these two Mode instances are named MERGE and SKIP but they may
2888      * hold the reverse when the function returns. We toggle between the two modes */
2889     Mode* tempPred = &merge;
2890     Mode* bestPred = &skip;
2891 
2892     merge.initCosts();
2893     merge.cu.setPredModeSubParts(MODE_INTER);
2894     merge.cu.setPartSizeSubParts(SIZE_2Nx2N);
2895     merge.cu.m_mergeFlag[0] = true;
2896 
2897     skip.initCosts();
2898     skip.cu.setPredModeSubParts(MODE_INTER);
2899     skip.cu.setPartSizeSubParts(SIZE_2Nx2N);
2900     skip.cu.m_mergeFlag[0] = true;
2901 
2902     MVField candMvField[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
2903     uint8_t candDir[MRG_MAX_NUM_CANDS];
2904     uint32_t numMergeCand = merge.cu.getInterMergeCandidates(0, 0, candMvField, candDir);
2905     PredictionUnit pu(merge.cu, cuGeom, 0);
2906 
2907     bool foundCbf0Merge = false;
2908     bool triedPZero = false, triedBZero = false;
2909     bestPred->rdCost = MAX_INT64;
2910 
2911     int safeX, maxSafeMv;
2912     if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE)
2913     {
2914         safeX = m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol * m_param->maxCUSize - 3;
2915         maxSafeMv = (safeX - tempPred->cu.m_cuPelX) * 4;
2916     }
2917     for (uint32_t i = 0; i < numMergeCand; i++)
2918     {
2919         if (m_bFrameParallel)
2920         {
2921             // Parallel slices bound check
2922             if (m_param->maxSlices > 1)
2923             {
2924                 // NOTE: First row in slice can't negative
2925                 if (X265_MIN(candMvField[i][0].mv.y, candMvField[i][1].mv.y) < m_sliceMinY)
2926                     continue;
2927 
2928                 // Last row in slice can't reference beyond bound since it is another slice area
2929                 // TODO: we may beyond bound in future since these area have a chance to finish because we use parallel slices. Necessary prepare research on load balance
2930                 if (X265_MAX(candMvField[i][0].mv.y, candMvField[i][1].mv.y) > m_sliceMaxY)
2931                     continue;
2932             }
2933 
2934             if (candMvField[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
2935                 candMvField[i][1].mv.y >= (m_param->searchRange + 1) * 4)
2936                 continue;
2937         }
2938 
2939         /* the merge candidate list is packed with MV(0,0) ref 0 when it is not full */
2940         if (candDir[i] == 1 && !candMvField[i][0].mv.word && !candMvField[i][0].refIdx)
2941         {
2942             if (triedPZero)
2943                 continue;
2944             triedPZero = true;
2945         }
2946         else if (candDir[i] == 3 &&
2947             !candMvField[i][0].mv.word && !candMvField[i][0].refIdx &&
2948             !candMvField[i][1].mv.word && !candMvField[i][1].refIdx)
2949         {
2950             if (triedBZero)
2951                 continue;
2952             triedBZero = true;
2953         }
2954         if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE &&
2955             tempPred->cu.m_cuPelX / m_param->maxCUSize < m_frame->m_encData->m_pir.pirEndCol &&
2956             candMvField[i][0].mv.x > maxSafeMv)
2957             // skip merge candidates which reference beyond safe reference area
2958             continue;
2959         tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i;    /* merge candidate ID is stored in L0 MVP idx */
2960         tempPred->cu.m_interDir[0] = candDir[i];
2961         tempPred->cu.m_mv[0][0] = candMvField[i][0].mv;
2962         tempPred->cu.m_mv[1][0] = candMvField[i][1].mv;
2963         tempPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx;
2964         tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx;
2965         tempPred->cu.setPredModeSubParts(MODE_INTER); /* must be cleared between encode iterations */
2966 
2967         motionCompensation(tempPred->cu, pu, tempPred->predYuv, true, m_csp != X265_CSP_I400);
2968 
2969         uint8_t hasCbf = true;
2970         bool swapped = false;
2971         if (!foundCbf0Merge)
2972         {
2973             /* if the best prediction has CBF (not a skip) then try merge with residual */
2974 
2975             encodeResAndCalcRdInterCU(*tempPred, cuGeom);
2976             hasCbf = tempPred->cu.getQtRootCbf(0);
2977             foundCbf0Merge = !hasCbf;
2978 
2979             if (tempPred->rdCost < bestPred->rdCost)
2980             {
2981                 std::swap(tempPred, bestPred);
2982                 swapped = true;
2983             }
2984         }
2985         if (!m_param->bLossless && hasCbf)
2986         {
2987             /* try merge without residual (skip), if not lossless coding */
2988 
2989             if (swapped)
2990             {
2991                 tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i;
2992                 tempPred->cu.m_interDir[0] = candDir[i];
2993                 tempPred->cu.m_mv[0][0] = candMvField[i][0].mv;
2994                 tempPred->cu.m_mv[1][0] = candMvField[i][1].mv;
2995                 tempPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx;
2996                 tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx;
2997                 tempPred->cu.setPredModeSubParts(MODE_INTER);
2998                 tempPred->predYuv.copyFromYuv(bestPred->predYuv);
2999             }
3000 
3001             encodeResAndCalcRdSkipCU(*tempPred);
3002 
3003             if (tempPred->rdCost < bestPred->rdCost)
3004                 std::swap(tempPred, bestPred);
3005         }
3006     }
3007 
3008     if (bestPred->rdCost < MAX_INT64)
3009     {
3010         m_modeDepth[depth].bestMode = bestPred;
3011 
3012         /* broadcast sets of MV field data */
3013         uint32_t bestCand = bestPred->cu.m_mvpIdx[0][0];
3014         bestPred->cu.setPUInterDir(candDir[bestCand], 0, 0);
3015         bestPred->cu.setPUMv(0, candMvField[bestCand][0].mv, 0, 0);
3016         bestPred->cu.setPUMv(1, candMvField[bestCand][1].mv, 0, 0);
3017         bestPred->cu.setPURefIdx(0, (int8_t)candMvField[bestCand][0].refIdx, 0, 0);
3018         bestPred->cu.setPURefIdx(1, (int8_t)candMvField[bestCand][1].refIdx, 0, 0);
3019         checkDQP(*bestPred, cuGeom);
3020     }
3021 }
3022 
checkInter_rd0_4(Mode & interMode,const CUGeom & cuGeom,PartSize partSize,uint32_t refMask[2])3023 void Analysis::checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refMask[2])
3024 {
3025     interMode.initCosts();
3026     interMode.cu.setPartSizeSubParts(partSize);
3027     interMode.cu.setPredModeSubParts(MODE_INTER);
3028     int numPredDir = m_slice->isInterP() ? 1 : 2;
3029 
3030     if (m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10 && m_reuseInterDataCTU)
3031     {
3032         int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2;
3033         int index = 0;
3034 
3035         uint32_t numPU = interMode.cu.getNumPartInter(0);
3036         for (uint32_t part = 0; part < numPU; part++)
3037         {
3038             MotionData* bestME = interMode.bestME[part];
3039             for (int32_t i = 0; i < numPredDir; i++)
3040                 bestME[i].ref = m_reuseRef[refOffset + index++];
3041         }
3042     }
3043 
3044     if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU)
3045     {
3046         uint32_t numPU = interMode.cu.getNumPartInter(0);
3047         for (uint32_t part = 0; part < numPU; part++)
3048         {
3049             MotionData* bestME = interMode.bestME[part];
3050             for (int32_t i = 0; i < numPredDir; i++)
3051             {
3052                 int* ref = &m_reuseRef[i * m_frame->m_analysisData.numPartitions * m_frame->m_analysisData.numCUsInFrame];
3053                 bestME[i].ref = ref[cuGeom.absPartIdx];
3054                 bestME[i].mv = m_reuseMv[i][cuGeom.absPartIdx].word;
3055                 bestME[i].mvpIdx = m_reuseMvpIdx[i][cuGeom.absPartIdx];
3056             }
3057         }
3058     }
3059     predInterSearch(interMode, cuGeom, m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400), refMask);
3060 
3061     /* predInterSearch sets interMode.sa8dBits */
3062     const Yuv& fencYuv = *interMode.fencYuv;
3063     Yuv& predYuv = interMode.predYuv;
3064     int part = partitionFromLog2Size(cuGeom.log2CUSize);
3065     interMode.distortion = primitives.cu[part].sa8d(fencYuv.m_buf[0], fencYuv.m_size, predYuv.m_buf[0], predYuv.m_size);
3066     if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
3067     {
3068         interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, predYuv.m_buf[1], predYuv.m_csize);
3069         interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, predYuv.m_buf[2], predYuv.m_csize);
3070     }
3071     interMode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)interMode.distortion, interMode.sa8dBits);
3072 
3073     if (m_param->analysisSaveReuseLevel > 1 && m_reuseInterDataCTU)
3074     {
3075         int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2;
3076         int index = 0;
3077 
3078         uint32_t numPU = interMode.cu.getNumPartInter(0);
3079         for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
3080         {
3081             MotionData* bestME = interMode.bestME[puIdx];
3082             for (int32_t i = 0; i < numPredDir; i++)
3083                 m_reuseRef[refOffset + index++] = bestME[i].ref;
3084         }
3085     }
3086 }
3087 
checkInter_rd5_6(Mode & interMode,const CUGeom & cuGeom,PartSize partSize,uint32_t refMask[2])3088 void Analysis::checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refMask[2])
3089 {
3090     interMode.initCosts();
3091     interMode.cu.setPartSizeSubParts(partSize);
3092     interMode.cu.setPredModeSubParts(MODE_INTER);
3093     int numPredDir = m_slice->isInterP() ? 1 : 2;
3094 
3095     if (m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10 && m_reuseInterDataCTU)
3096     {
3097         int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2;
3098         int index = 0;
3099 
3100         uint32_t numPU = interMode.cu.getNumPartInter(0);
3101         for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
3102         {
3103             MotionData* bestME = interMode.bestME[puIdx];
3104             for (int32_t i = 0; i < numPredDir; i++)
3105                 bestME[i].ref = m_reuseRef[refOffset + index++];
3106         }
3107     }
3108 
3109     if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU)
3110     {
3111         uint32_t numPU = interMode.cu.getNumPartInter(0);
3112         for (uint32_t part = 0; part < numPU; part++)
3113         {
3114             MotionData* bestME = interMode.bestME[part];
3115             for (int32_t i = 0; i < numPredDir; i++)
3116             {
3117                 int* ref = &m_reuseRef[i * m_frame->m_analysisData.numPartitions * m_frame->m_analysisData.numCUsInFrame];
3118                 bestME[i].ref = ref[cuGeom.absPartIdx];
3119                 bestME[i].mv = m_reuseMv[i][cuGeom.absPartIdx].word;
3120                 bestME[i].mvpIdx = m_reuseMvpIdx[i][cuGeom.absPartIdx];
3121             }
3122         }
3123     }
3124 
3125     predInterSearch(interMode, cuGeom, m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400, refMask);
3126 
3127     /* predInterSearch sets interMode.sa8dBits, but this is ignored */
3128     encodeResAndCalcRdInterCU(interMode, cuGeom);
3129 
3130     if (m_param->analysisSaveReuseLevel > 1 && m_reuseInterDataCTU)
3131     {
3132         int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2;
3133         int index = 0;
3134 
3135         uint32_t numPU = interMode.cu.getNumPartInter(0);
3136         for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
3137         {
3138             MotionData* bestME = interMode.bestME[puIdx];
3139             for (int32_t i = 0; i < numPredDir; i++)
3140                 m_reuseRef[refOffset + index++] = bestME[i].ref;
3141         }
3142     }
3143 }
3144 
checkBidir2Nx2N(Mode & inter2Nx2N,Mode & bidir2Nx2N,const CUGeom & cuGeom)3145 void Analysis::checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const CUGeom& cuGeom)
3146 {
3147     CUData& cu = bidir2Nx2N.cu;
3148 
3149     if (cu.isBipredRestriction() || inter2Nx2N.bestME[0][0].cost == MAX_UINT || inter2Nx2N.bestME[0][1].cost == MAX_UINT)
3150     {
3151         bidir2Nx2N.sa8dCost = MAX_INT64;
3152         bidir2Nx2N.rdCost = MAX_INT64;
3153         return;
3154     }
3155 
3156     const Yuv& fencYuv = *bidir2Nx2N.fencYuv;
3157     MV   mvzero(0, 0);
3158     int  partEnum = cuGeom.log2CUSize - 2;
3159 
3160     bidir2Nx2N.bestME[0][0] = inter2Nx2N.bestME[0][0];
3161     bidir2Nx2N.bestME[0][1] = inter2Nx2N.bestME[0][1];
3162     MotionData* bestME = bidir2Nx2N.bestME[0];
3163     int ref0    = bestME[0].ref;
3164     MV  mvp0    = bestME[0].mvp;
3165     int mvpIdx0 = bestME[0].mvpIdx;
3166     int ref1    = bestME[1].ref;
3167     MV  mvp1    = bestME[1].mvp;
3168     int mvpIdx1 = bestME[1].mvpIdx;
3169 
3170     bidir2Nx2N.initCosts();
3171     cu.setPartSizeSubParts(SIZE_2Nx2N);
3172     cu.setPredModeSubParts(MODE_INTER);
3173     cu.setPUInterDir(3, 0, 0);
3174     cu.setPURefIdx(0, (int8_t)ref0, 0, 0);
3175     cu.setPURefIdx(1, (int8_t)ref1, 0, 0);
3176     cu.m_mvpIdx[0][0] = (uint8_t)mvpIdx0;
3177     cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1;
3178     cu.m_mergeFlag[0] = 0;
3179 
3180     /* Estimate cost of BIDIR using best 2Nx2N L0 and L1 motion vectors */
3181     cu.setPUMv(0, bestME[0].mv, 0, 0);
3182     cu.m_mvd[0][0] = bestME[0].mv - mvp0;
3183 
3184     cu.setPUMv(1, bestME[1].mv, 0, 0);
3185     cu.m_mvd[1][0] = bestME[1].mv - mvp1;
3186 
3187     PredictionUnit pu(cu, cuGeom, 0);
3188     motionCompensation(cu, pu, bidir2Nx2N.predYuv, true, m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400));
3189 
3190     int sa8d = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, bidir2Nx2N.predYuv.m_buf[0], bidir2Nx2N.predYuv.m_size);
3191     if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
3192     {
3193         /* Add in chroma distortion */
3194         sa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[1], bidir2Nx2N.predYuv.m_csize);
3195         sa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[2], bidir2Nx2N.predYuv.m_csize);
3196     }
3197     bidir2Nx2N.sa8dBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
3198     bidir2Nx2N.sa8dCost = sa8d + m_rdCost.getCost(bidir2Nx2N.sa8dBits);
3199 
3200     bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
3201     if (bTryZero)
3202     {
3203         /* Do not try zero MV if unidir motion predictors are beyond
3204          * valid search area */
3205         MV mvmin, mvmax;
3206         int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight);
3207         setSearchRange(cu, mvzero, merange, mvmin, mvmax);
3208         mvmax.y += 2; // there is some pad for subpel refine
3209         mvmin <<= 2;
3210         mvmax <<= 2;
3211 
3212         bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
3213         bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
3214     }
3215     if (bTryZero)
3216     {
3217         /* Estimate cost of BIDIR using coincident blocks */
3218         Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
3219 
3220         int zsa8d;
3221 
3222         if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
3223         {
3224             cu.m_mv[0][0] = mvzero;
3225             cu.m_mv[1][0] = mvzero;
3226 
3227             motionCompensation(cu, pu, tmpPredYuv, true, true);
3228             zsa8d  = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
3229             zsa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, tmpPredYuv.m_buf[1], tmpPredYuv.m_csize);
3230             zsa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, tmpPredYuv.m_buf[2], tmpPredYuv.m_csize);
3231 
3232         }
3233         else
3234         {
3235             pixel *fref0 = m_slice->m_mref[0][ref0].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx);
3236             pixel *fref1 = m_slice->m_mref[1][ref1].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx);
3237             intptr_t refStride = m_slice->m_mref[0][0].lumaStride;
3238             primitives.pu[partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (refStride % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, fref0, refStride, fref1, refStride, 32);
3239             zsa8d = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
3240         }
3241         uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
3242         uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
3243         uint32_t zcost = zsa8d + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
3244 
3245         /* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */
3246         mvp0 = checkBestMVP(inter2Nx2N.amvpCand[0][ref0], mvzero, mvpIdx0, bits0, zcost);
3247         mvp1 = checkBestMVP(inter2Nx2N.amvpCand[1][ref1], mvzero, mvpIdx1, bits1, zcost);
3248 
3249         uint32_t zbits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
3250         zcost = zsa8d + m_rdCost.getCost(zbits);
3251 
3252         if (zcost < bidir2Nx2N.sa8dCost)
3253         {
3254             bidir2Nx2N.sa8dBits = zbits;
3255             bidir2Nx2N.sa8dCost = zcost;
3256 
3257             cu.setPUMv(0, mvzero, 0, 0);
3258             cu.m_mvd[0][0] = mvzero - mvp0;
3259             cu.m_mvpIdx[0][0] = (uint8_t)mvpIdx0;
3260 
3261             cu.setPUMv(1, mvzero, 0, 0);
3262             cu.m_mvd[1][0] = mvzero - mvp1;
3263             cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1;
3264 
3265             if (m_bChromaSa8d) /* real MC was already performed */
3266                 bidir2Nx2N.predYuv.copyFromYuv(tmpPredYuv);
3267             else
3268                 motionCompensation(cu, pu, bidir2Nx2N.predYuv, true, m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400);
3269         }
3270         else if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
3271         {
3272             /* recover overwritten motion vectors */
3273             cu.m_mv[0][0] = bestME[0].mv;
3274             cu.m_mv[1][0] = bestME[1].mv;
3275         }
3276     }
3277 }
3278 
encodeResidue(const CUData & ctu,const CUGeom & cuGeom)3279 void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom)
3280 {
3281     if (cuGeom.depth < ctu.m_cuDepth[cuGeom.absPartIdx] && cuGeom.depth < ctu.m_encData->m_param->maxCUDepth)
3282     {
3283         for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
3284         {
3285             const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
3286             if (childGeom.flags & CUGeom::PRESENT)
3287                 encodeResidue(ctu, childGeom);
3288         }
3289         return;
3290     }
3291 
3292     uint32_t absPartIdx = cuGeom.absPartIdx;
3293     int sizeIdx = cuGeom.log2CUSize - 2;
3294 
3295     /* reuse the bestMode data structures at the current depth */
3296     Mode *bestMode = m_modeDepth[cuGeom.depth].bestMode;
3297     CUData& cu = bestMode->cu;
3298 
3299     cu.copyFromPic(ctu, cuGeom, m_csp);
3300 
3301     PicYuv& reconPic = *m_frame->m_reconPic;
3302 
3303     Yuv& fencYuv = m_modeDepth[cuGeom.depth].fencYuv;
3304     if (cuGeom.depth)
3305         m_modeDepth[0].fencYuv.copyPartToYuv(fencYuv, absPartIdx);
3306     X265_CHECK(bestMode->fencYuv == &fencYuv, "invalid fencYuv\n");
3307 
3308     if (cu.isIntra(0))
3309     {
3310         ProfileCUScope(ctu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]); // not really RDO, but close enough
3311 
3312         uint32_t tuDepthRange[2];
3313         cu.getIntraTUQtDepthRange(tuDepthRange, 0);
3314 
3315         residualTransformQuantIntra(*bestMode, cuGeom, 0, 0, tuDepthRange);
3316         if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
3317         {
3318             getBestIntraModeChroma(*bestMode, cuGeom);
3319             residualQTIntraChroma(*bestMode, cuGeom, 0, 0);
3320         }
3321     }
3322     else // if (cu.isInter(0))
3323     {
3324         ProfileCUScope(ctu, interRDOElapsedTime[cuGeom.depth], countInterRDO[cuGeom.depth]); // not really RDO, but close enough
3325 
3326         X265_CHECK(!ctu.isSkipped(absPartIdx), "skip not expected prior to transform\n");
3327 
3328         /* Calculate residual for current CU part into depth sized resiYuv */
3329 
3330         ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
3331 
3332         /* at RD 0, the prediction pixels are accumulated into the top depth predYuv */
3333         Yuv& predYuv = m_modeDepth[0].bestMode->predYuv;
3334         pixel* predY = predYuv.getLumaAddr(absPartIdx);
3335 
3336         primitives.cu[sizeIdx].sub_ps(resiYuv.m_buf[0], resiYuv.m_size,
3337                                       fencYuv.m_buf[0], predY,
3338                                       fencYuv.m_size, predYuv.m_size);
3339 
3340         if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
3341         {
3342             pixel* predU = predYuv.getCbAddr(absPartIdx);
3343             pixel* predV = predYuv.getCrAddr(absPartIdx);
3344             primitives.chroma[m_csp].cu[sizeIdx].sub_ps(resiYuv.m_buf[1], resiYuv.m_csize,
3345                                                  fencYuv.m_buf[1], predU,
3346                                                  fencYuv.m_csize, predYuv.m_csize);
3347 
3348             primitives.chroma[m_csp].cu[sizeIdx].sub_ps(resiYuv.m_buf[2], resiYuv.m_csize,
3349                                                  fencYuv.m_buf[2], predV,
3350                                                  fencYuv.m_csize, predYuv.m_csize);
3351         }
3352 
3353         uint32_t tuDepthRange[2];
3354         cu.getInterTUQtDepthRange(tuDepthRange, 0);
3355 
3356         residualTransformQuantInter(*bestMode, cuGeom, 0, 0, tuDepthRange);
3357 
3358         if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
3359             cu.setPredModeSubParts(MODE_SKIP);
3360 
3361         /* residualTransformQuantInter() wrote transformed residual back into
3362          * resiYuv. Generate the recon pixels by adding it to the prediction */
3363 
3364         if (cu.m_cbf[0][0])
3365         {
3366             bool reconPicAlign = (reconPic.m_cuOffsetY[cu.m_cuAddr] + reconPic.m_buOffsetY[absPartIdx]) % 64 == 0;
3367             bool predYalign = predYuv.getAddrOffset(absPartIdx, predYuv.m_size) % 64 == 0;
3368             primitives.cu[sizeIdx].add_ps[reconPicAlign && predYalign && (reconPic.m_stride % 64 == 0) && (predYuv.m_size % 64 == 0) &&
3369                 (resiYuv.m_size % 64 == 0)](reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride, predY, resiYuv.m_buf[0], predYuv.m_size, resiYuv.m_size);
3370         }
3371         else
3372             primitives.cu[sizeIdx].copy_pp(reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
3373                                            predY, predYuv.m_size);
3374         if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
3375         {
3376              pixel* predU = predYuv.getCbAddr(absPartIdx);
3377              pixel* predV = predYuv.getCrAddr(absPartIdx);
3378              if (cu.m_cbf[1][0])
3379              {
3380                  bool reconPicAlign = (reconPic.m_cuOffsetC[cu.m_cuAddr] + reconPic.m_buOffsetC[absPartIdx]) % 64 == 0;
3381                  bool predUalign = predYuv.getChromaAddrOffset(absPartIdx) % 64 == 0;
3382                  primitives.chroma[m_csp].cu[sizeIdx].add_ps[reconPicAlign && predUalign && (reconPic.m_strideC % 64 == 0) && (predYuv.m_csize % 64 == 0) &&
3383                      (resiYuv.m_csize % 64 == 0)](reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC, predU, resiYuv.m_buf[1], predYuv.m_csize, resiYuv.m_csize);
3384              }
3385             else
3386                 primitives.chroma[m_csp].cu[sizeIdx].copy_pp(reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
3387                                                          predU, predYuv.m_csize);
3388 
3389             if (cu.m_cbf[2][0])
3390             {
3391                 bool reconPicAlign = (reconPic.m_cuOffsetC[cu.m_cuAddr] + reconPic.m_buOffsetC[absPartIdx]) % 64 == 0;
3392                 bool predValign = predYuv.getChromaAddrOffset(absPartIdx) % 64 == 0;
3393                 primitives.chroma[m_csp].cu[sizeIdx].add_ps[reconPicAlign && predValign && (reconPic.m_strideC % 64 == 0) && (predYuv.m_csize % 64 == 0) &&
3394                     (resiYuv.m_csize % 64 == 0)](reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC, predV, resiYuv.m_buf[2], predYuv.m_csize, resiYuv.m_csize);
3395             }
3396             else
3397                 primitives.chroma[m_csp].cu[sizeIdx].copy_pp(reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
3398                                                          predV, predYuv.m_csize);
3399         }
3400     }
3401 
3402     cu.updatePic(cuGeom.depth, m_frame->m_fencPic->m_picCsp);
3403 }
3404 
addSplitFlagCost(Mode & mode,uint32_t depth)3405 void Analysis::addSplitFlagCost(Mode& mode, uint32_t depth)
3406 {
3407     if (m_param->rdLevel >= 3)
3408     {
3409         /* code the split flag (0 or 1) and update bit costs */
3410         mode.contexts.resetBits();
3411         mode.contexts.codeSplitFlag(mode.cu, 0, depth);
3412         uint32_t bits = mode.contexts.getNumberOfWrittenBits();
3413         mode.totalBits += bits;
3414         updateModeCost(mode);
3415     }
3416     else if (m_param->rdLevel <= 1)
3417     {
3418         mode.sa8dBits++;
3419         mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits);
3420     }
3421     else
3422     {
3423         mode.totalBits++;
3424         updateModeCost(mode);
3425     }
3426 }
3427 
topSkipMinDepth(const CUData & parentCTU,const CUGeom & cuGeom)3428 uint32_t Analysis::topSkipMinDepth(const CUData& parentCTU, const CUGeom& cuGeom)
3429 {
3430     /* Do not attempt to code a block larger than the largest block in the
3431      * co-located CTUs in L0 and L1 */
3432     int currentQP = parentCTU.m_qp[0];
3433     int previousQP = currentQP;
3434     uint32_t minDepth0 = 4, minDepth1 = 4;
3435     uint32_t sum = 0;
3436     int numRefs = 0;
3437     if (m_slice->m_numRefIdx[0])
3438     {
3439         numRefs++;
3440         const CUData& cu = *m_slice->m_refFrameList[0][0]->m_encData->getPicCTU(parentCTU.m_cuAddr);
3441         previousQP = cu.m_qp[0];
3442         if (!cu.m_cuDepth[cuGeom.absPartIdx])
3443             return 0;
3444         for (uint32_t i = 0; i < cuGeom.numPartitions; i += 4)
3445         {
3446             uint32_t d = cu.m_cuDepth[cuGeom.absPartIdx + i];
3447             minDepth0 = X265_MIN(d, minDepth0);
3448             sum += d;
3449         }
3450     }
3451     if (m_slice->m_numRefIdx[1])
3452     {
3453         numRefs++;
3454         const CUData& cu = *m_slice->m_refFrameList[1][0]->m_encData->getPicCTU(parentCTU.m_cuAddr);
3455         if (!cu.m_cuDepth[cuGeom.absPartIdx])
3456             return 0;
3457         for (uint32_t i = 0; i < cuGeom.numPartitions; i += 4)
3458         {
3459             uint32_t d = cu.m_cuDepth[cuGeom.absPartIdx + i];
3460             minDepth1 = X265_MIN(d, minDepth1);
3461             sum += d;
3462         }
3463     }
3464     if (!numRefs)
3465         return 0;
3466 
3467     uint32_t minDepth = X265_MIN(minDepth0, minDepth1);
3468     uint32_t thresh = minDepth * numRefs * (cuGeom.numPartitions >> 2);
3469 
3470     /* allow block size growth if QP is raising or avg depth is
3471      * less than 1.5 of min depth */
3472     if (minDepth && currentQP >= previousQP && (sum <= thresh + (thresh >> 1)))
3473         minDepth -= 1;
3474 
3475     return minDepth;
3476 }
3477 
3478 /* returns true if recursion should be stopped */
recursionDepthCheck(const CUData & parentCTU,const CUGeom & cuGeom,const Mode & bestMode)3479 bool Analysis::recursionDepthCheck(const CUData& parentCTU, const CUGeom& cuGeom, const Mode& bestMode)
3480 {
3481     /* early exit when the RD cost of best mode at depth n is less than the sum
3482      * of average of RD cost of the neighbor CU's(above, aboveleft, aboveright,
3483      * left, colocated) and avg cost of that CU at depth "n" with weightage for
3484      * each quantity */
3485 
3486     uint32_t depth = cuGeom.depth;
3487     FrameData& curEncData = *m_frame->m_encData;
3488     FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
3489     uint64_t cuCost = cuStat.avgCost[depth] * cuStat.count[depth];
3490     uint64_t cuCount = cuStat.count[depth];
3491 
3492     uint64_t neighCost = 0, neighCount = 0;
3493     const CUData* above = parentCTU.m_cuAbove;
3494     if (above)
3495     {
3496         FrameData::RCStatCU& astat = curEncData.m_cuStat[above->m_cuAddr];
3497         neighCost += astat.avgCost[depth] * astat.count[depth];
3498         neighCount += astat.count[depth];
3499 
3500         const CUData* aboveLeft = parentCTU.m_cuAboveLeft;
3501         if (aboveLeft)
3502         {
3503             FrameData::RCStatCU& lstat = curEncData.m_cuStat[aboveLeft->m_cuAddr];
3504             neighCost += lstat.avgCost[depth] * lstat.count[depth];
3505             neighCount += lstat.count[depth];
3506         }
3507 
3508         const CUData* aboveRight = parentCTU.m_cuAboveRight;
3509         if (aboveRight)
3510         {
3511             FrameData::RCStatCU& rstat = curEncData.m_cuStat[aboveRight->m_cuAddr];
3512             neighCost += rstat.avgCost[depth] * rstat.count[depth];
3513             neighCount += rstat.count[depth];
3514         }
3515     }
3516     const CUData* left = parentCTU.m_cuLeft;
3517     if (left)
3518     {
3519         FrameData::RCStatCU& nstat = curEncData.m_cuStat[left->m_cuAddr];
3520         neighCost += nstat.avgCost[depth] * nstat.count[depth];
3521         neighCount += nstat.count[depth];
3522     }
3523 
3524     // give 60% weight to all CU's and 40% weight to neighbour CU's
3525     if (neighCount + cuCount)
3526     {
3527         uint64_t avgCost = ((3 * cuCost) + (2 * neighCost)) / ((3 * cuCount) + (2 * neighCount));
3528         uint64_t curCost = m_param->rdLevel > 1 ? bestMode.rdCost : bestMode.sa8dCost;
3529         if (curCost < avgCost && avgCost)
3530             return true;
3531     }
3532 
3533     return false;
3534 }
3535 
complexityCheckCU(const Mode & bestMode)3536 bool Analysis::complexityCheckCU(const Mode& bestMode)
3537 {
3538     if (m_param->recursionSkipMode == RDCOST_BASED_RSKIP)
3539     {
3540         uint32_t mean = 0;
3541         uint32_t homo = 0;
3542         uint32_t cuSize = bestMode.fencYuv->m_size;
3543         for (uint32_t y = 0; y < cuSize; y++) {
3544             for (uint32_t x = 0; x < cuSize; x++) {
3545                 mean += (bestMode.fencYuv->m_buf[0][y * cuSize + x]);
3546             }
3547         }
3548         mean = mean / (cuSize * cuSize);
3549         for (uint32_t y = 0; y < cuSize; y++) {
3550             for (uint32_t x = 0; x < cuSize; x++) {
3551                 homo += abs(int(bestMode.fencYuv->m_buf[0][y * cuSize + x] - mean));
3552             }
3553         }
3554         homo = homo / (cuSize * cuSize);
3555 
3556         if (homo < (.1 * mean))
3557             return true;
3558 
3559         return false;
3560     }
3561     else
3562     {
3563         int blockType = bestMode.cu.m_log2CUSize[0] - LOG2_UNIT_SIZE;
3564         int shift = bestMode.cu.m_log2CUSize[0] * LOG2_UNIT_SIZE;
3565         intptr_t stride = m_frame->m_fencPic->m_stride;
3566         intptr_t blockOffsetLuma = bestMode.cu.m_cuPelX + bestMode.cu.m_cuPelY * stride;
3567         uint64_t sum_ss = primitives.cu[blockType].var(m_frame->m_edgeBitPic + blockOffsetLuma, stride);
3568         uint32_t sum = (uint32_t)sum_ss;
3569         uint32_t ss = (uint32_t)(sum_ss >> 32);
3570         uint32_t pixelCount = 1 << shift;
3571         double cuEdgeVariance = (ss - ((double)sum * sum / pixelCount)) / pixelCount;
3572 
3573         if (cuEdgeVariance > (double)m_param->edgeVarThreshold)
3574             return false;
3575         else
3576             return true;
3577     }
3578  }
3579 
calculateCUVariance(const CUData & ctu,const CUGeom & cuGeom)3580 uint32_t Analysis::calculateCUVariance(const CUData& ctu, const CUGeom& cuGeom)
3581 {
3582     uint32_t cuVariance = 0;
3583     uint32_t *blockVariance = m_frame->m_lowres.blockVariance;
3584     int loopIncr = (m_param->rc.qgSize == 8) ? 8 : 16;
3585 
3586     uint32_t width = m_frame->m_fencPic->m_picWidth;
3587     uint32_t height = m_frame->m_fencPic->m_picHeight;
3588     uint32_t block_x = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx];
3589     uint32_t block_y = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx];
3590     uint32_t maxCols = (m_frame->m_fencPic->m_picWidth + (loopIncr - 1)) / loopIncr;
3591     uint32_t blockSize = m_param->maxCUSize >> cuGeom.depth;
3592     uint32_t cnt = 0;
3593 
3594     for (uint32_t block_yy = block_y; block_yy < block_y + blockSize && block_yy < height; block_yy += loopIncr)
3595     {
3596         for (uint32_t block_xx = block_x; block_xx < block_x + blockSize && block_xx < width; block_xx += loopIncr)
3597         {
3598             uint32_t idx = ((block_yy / loopIncr) * (maxCols)) + (block_xx / loopIncr);
3599             cuVariance += blockVariance[idx];
3600             cnt++;
3601         }
3602     }
3603     return cuVariance / cnt;
3604 }
3605 
aqQPOffset(const CUData & ctu,const CUGeom & cuGeom)3606 double Analysis::aqQPOffset(const CUData& ctu, const CUGeom& cuGeom)
3607 {
3608     uint32_t aqDepth = X265_MIN(cuGeom.depth, m_frame->m_lowres.maxAQDepth - 1);
3609     PicQPAdaptationLayer* pQPLayer = &m_frame->m_lowres.pAQLayer[aqDepth];
3610 
3611     uint32_t aqPosX = (ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx]) / pQPLayer->aqPartWidth;
3612     uint32_t aqPosY = (ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx]) / pQPLayer->aqPartHeight;
3613 
3614     uint32_t aqStride = pQPLayer->numAQPartInWidth;
3615 
3616     double dQpOffset = pQPLayer->dQpOffset[aqPosY * aqStride + aqPosX];
3617     return dQpOffset;
3618 }
3619 
cuTreeQPOffset(const CUData & ctu,const CUGeom & cuGeom)3620 double Analysis::cuTreeQPOffset(const CUData& ctu, const CUGeom& cuGeom)
3621 {
3622     uint32_t aqDepth = X265_MIN(cuGeom.depth, m_frame->m_lowres.maxAQDepth - 1);
3623     PicQPAdaptationLayer* pcAQLayer = &m_frame->m_lowres.pAQLayer[aqDepth];
3624 
3625     uint32_t aqPosX = (ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx]) / pcAQLayer->aqPartWidth;
3626     uint32_t aqPosY = (ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx]) / pcAQLayer->aqPartHeight;
3627 
3628     uint32_t aqStride = pcAQLayer->numAQPartInWidth;
3629 
3630     double dQpOffset = pcAQLayer->dCuTreeOffset[aqPosY * aqStride + aqPosX];
3631     return dQpOffset;
3632 }
3633 
calculateQpforCuSize(const CUData & ctu,const CUGeom & cuGeom,int32_t complexCheck,double baseQp)3634 int Analysis::calculateQpforCuSize(const CUData& ctu, const CUGeom& cuGeom, int32_t complexCheck, double baseQp)
3635 {
3636     FrameData& curEncData = *m_frame->m_encData;
3637     double qp = baseQp >= 0 ? baseQp : curEncData.m_cuStat[ctu.m_cuAddr].baseQp;
3638     bool bCuTreeOffset = IS_REFERENCED(m_frame) && m_param->rc.cuTree && !complexCheck;
3639 
3640     if ((m_param->analysisMultiPassDistortion && m_param->rc.bStatRead) || (m_param->ctuDistortionRefine && m_param->analysisLoad))
3641     {
3642         x265_analysis_distortion_data* distortionData = m_frame->m_analysisData.distortionData;
3643         if ((distortionData->threshold[ctu.m_cuAddr] < 0.9 || distortionData->threshold[ctu.m_cuAddr] > 1.1)
3644             && distortionData->highDistortionCtuCount && distortionData->lowDistortionCtuCount)
3645             qp += distortionData->offset[ctu.m_cuAddr];
3646     }
3647 
3648     if (m_param->analysisLoadReuseLevel == 10 && m_param->rc.cuTree)
3649     {
3650         int cuIdx = (ctu.m_cuAddr * ctu.m_numPartitions) + cuGeom.absPartIdx;
3651         if (ctu.m_slice->m_sliceType == I_SLICE)
3652             return x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, (int32_t)(qp + 0.5 + ((x265_analysis_intra_data*)m_frame->m_analysisData.intraData)->cuQPOff[cuIdx]));
3653         else
3654             return x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, (int32_t)(qp + 0.5 + ((x265_analysis_inter_data*)m_frame->m_analysisData.interData)->cuQPOff[cuIdx]));
3655     }
3656     if (m_param->rc.hevcAq)
3657     {
3658         /* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */
3659         double dQpOffset = 0;
3660         if (bCuTreeOffset)
3661         {
3662             dQpOffset = cuTreeQPOffset(ctu, cuGeom);
3663         }
3664         else
3665         {
3666             dQpOffset = aqQPOffset(ctu, cuGeom);
3667             if (complexCheck)
3668             {
3669                 int32_t offset = (int32_t)(dQpOffset * 100 + .5);
3670                 double threshold = (1 - ((x265_ADAPT_RD_STRENGTH - m_param->dynamicRd) * 0.5));
3671                 int32_t max_threshold = (int32_t)(threshold * 100 + .5);
3672                 return (offset < max_threshold);
3673             }
3674         }
3675         qp += dQpOffset;
3676     }
3677     else
3678     {
3679         int loopIncr = (m_param->rc.qgSize == 8) ? 8 : 16;
3680         /* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */
3681         double *qpoffs = bCuTreeOffset ? m_frame->m_lowres.qpCuTreeOffset : m_frame->m_lowres.qpAqOffset;
3682         if (qpoffs)
3683         {
3684             uint32_t width = m_frame->m_fencPic->m_picWidth;
3685             uint32_t height = m_frame->m_fencPic->m_picHeight;
3686             uint32_t block_x = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx];
3687             uint32_t block_y = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx];
3688             uint32_t maxCols = (m_frame->m_fencPic->m_picWidth + (loopIncr - 1)) / loopIncr;
3689             uint32_t blockSize = m_param->maxCUSize >> cuGeom.depth;
3690             double dQpOffset = 0;
3691             uint32_t cnt = 0;
3692             for (uint32_t block_yy = block_y; block_yy < block_y + blockSize && block_yy < height; block_yy += loopIncr)
3693             {
3694                 for (uint32_t block_xx = block_x; block_xx < block_x + blockSize && block_xx < width; block_xx += loopIncr)
3695                 {
3696                     uint32_t idx = ((block_yy / loopIncr) * (maxCols)) + (block_xx / loopIncr);
3697                     dQpOffset += qpoffs[idx];
3698                     cnt++;
3699                 }
3700             }
3701             dQpOffset /= cnt;
3702             qp += dQpOffset;
3703             if (complexCheck)
3704             {
3705                 int32_t offset = (int32_t)(dQpOffset * 100 + .5);
3706                 double threshold = (1 - ((x265_ADAPT_RD_STRENGTH - m_param->dynamicRd) * 0.5));
3707                 int32_t max_threshold = (int32_t)(threshold * 100 + .5);
3708                 return (offset < max_threshold);
3709             }
3710         }
3711     }
3712 
3713     return x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, (int)(qp + 0.5));
3714 }
3715 
normFactor(const pixel * src,uint32_t blockSize,CUData & ctu,int qp,TextType ttype)3716 void Analysis::normFactor(const pixel* src, uint32_t blockSize, CUData& ctu, int qp, TextType ttype)
3717 {
3718     static const int ssim_c1 = (int)(.01 * .01 * PIXEL_MAX * PIXEL_MAX * 64 + .5); // 416
3719     static const int ssim_c2 = (int)(.03 * .03 * PIXEL_MAX * PIXEL_MAX * 64 * 63 + .5); // 235963
3720     int shift = (X265_DEPTH - 8);
3721 
3722     double s = 1 + 0.005 * qp;
3723 
3724     // Calculate denominator of normalization factor
3725     uint64_t fDc_den = 0, fAc_den = 0;
3726 
3727     // 1. Calculate dc component
3728     uint64_t z_o = 0;
3729     for (uint32_t block_yy = 0; block_yy < blockSize; block_yy += 4)
3730     {
3731         for (uint32_t block_xx = 0; block_xx < blockSize; block_xx += 4)
3732         {
3733             uint32_t temp = src[block_yy * blockSize + block_xx] >> shift;
3734             z_o += temp * temp; // 2 * (Z(0)) pow(2)
3735         }
3736     }
3737     fDc_den = (2 * z_o)  + (blockSize * blockSize * ssim_c1); // 2 * (Z(0)) pow(2) + N * C1
3738     fDc_den /= ((blockSize >> 2) * (blockSize >> 2));
3739 
3740     // 2. Calculate ac component
3741     uint64_t z_k = 0;
3742     int block = (int)(((log(blockSize) / log(2)) - 2) + 0.5);
3743     primitives.cu[block].normFact(src, blockSize, shift, &z_k);
3744 
3745     // Remove the DC part
3746     z_k -= z_o;
3747 
3748     fAc_den = z_k + int(s * z_k) + ssim_c2;
3749     fAc_den /= ((blockSize >> 2) * (blockSize >> 2));
3750 
3751     ctu.m_fAc_den[ttype] = fAc_den;
3752     ctu.m_fDc_den[ttype] = fDc_den;
3753 }
3754 
calculateNormFactor(CUData & ctu,int qp)3755 void Analysis::calculateNormFactor(CUData& ctu, int qp)
3756 {
3757     const pixel* srcY = m_modeDepth[0].fencYuv.m_buf[0];
3758     uint32_t blockSize = m_modeDepth[0].fencYuv.m_size;
3759 
3760     normFactor(srcY, blockSize, ctu, qp, TEXT_LUMA);
3761 
3762     if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
3763     {
3764         const pixel* srcU = m_modeDepth[0].fencYuv.m_buf[1];
3765         const pixel* srcV = m_modeDepth[0].fencYuv.m_buf[2];
3766         uint32_t blockSizeC = m_modeDepth[0].fencYuv.m_csize;
3767 
3768         normFactor(srcU, blockSizeC, ctu, qp, TEXT_CHROMA_U);
3769         normFactor(srcV, blockSizeC, ctu, qp, TEXT_CHROMA_V);
3770     }
3771 }
3772 
findSameContentRefCount(const CUData & parentCTU,const CUGeom & cuGeom)3773 int Analysis::findSameContentRefCount(const CUData& parentCTU, const CUGeom& cuGeom)
3774 {
3775     int sameContentRef = 0;
3776     int m_curPoc = parentCTU.m_slice->m_poc;
3777     int prevChange = m_prevCtuInfoChange[cuGeom.absPartIdx];
3778     int numPredDir = m_slice->isInterP() ? 1 : 2;
3779     for (int list = 0; list < numPredDir; list++)
3780     {
3781         for (int i = 0; i < m_frame->m_encData->m_slice->m_numRefIdx[list]; i++)
3782         {
3783             int refPoc = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_poc;
3784             int refPrevChange = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_addOnPrevChange[parentCTU.m_cuAddr][cuGeom.absPartIdx];
3785             if ((refPoc < prevChange && refPoc < m_curPoc) || (refPoc > m_curPoc && prevChange < m_curPoc && refPrevChange > m_curPoc) || ((refPoc == prevChange) && (m_additionalCtuInfo[cuGeom.absPartIdx] == CTU_INFO_CHANGE)))
3786                 sameContentRef++;    /* Content changed */
3787         }
3788     }
3789     return sameContentRef;
3790 }
3791