1 /*****************************************************************************
2 * Copyright (C) 2013-2020 MulticoreWare, Inc
3 *
4 * Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
5 * Steve Borho <steve@borho.org>
6 * Min Chen <chenm003@163.com>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21 *
22 * This program is also available under a commercial proprietary license.
23 * For more information, contact us at license @ x265.com.
24 *****************************************************************************/
25
26 #include "common.h"
27 #include "frame.h"
28 #include "framedata.h"
29 #include "picyuv.h"
30 #include "primitives.h"
31 #include "threading.h"
32
33 #include "analysis.h"
34 #include "rdcost.h"
35 #include "encoder.h"
36
37 using namespace X265_NS;
38
39 /* An explanation of rate distortion levels (--rd-level)
40 *
41 * rd-level 0 generates no recon per CU (NO RDO or Quant)
42 *
43 * sa8d selection between merge / skip / inter / intra and split
44 * no recon pixels generated until CTU analysis is complete, requiring
45 * intra predictions to use source pixels
46 *
47 * rd-level 1 uses RDO for merge and skip, sa8d for all else
48 *
49 * RDO selection between merge and skip
50 * sa8d selection between (merge/skip) / inter modes / intra and split
51 * intra prediction uses reconstructed pixels
52 *
53 * rd-level 2 uses RDO for merge/skip and split
54 *
55 * RDO selection between merge and skip
56 * sa8d selection between (merge/skip) / inter modes / intra
57 * RDO split decisions
58 *
59 * rd-level 3 uses RDO for merge/skip/best inter/intra
60 *
61 * RDO selection between merge and skip
62 * sa8d selection of best inter mode
63 * sa8d decisions include chroma residual cost
64 * RDO selection between (merge/skip) / best inter mode / intra / split
65 *
66 * rd-level 4 enables RDOQuant
67 * chroma residual cost included in satd decisions, including subpel refine
68 * (as a result of --subme 3 being used by preset slow)
69 *
70 * rd-level 5,6 does RDO for each inter mode
71 */
72
Analysis()73 Analysis::Analysis()
74 {
75 m_reuseInterDataCTU = NULL;
76 m_reuseRef = NULL;
77 m_bHD = false;
78 m_modeFlag[0] = false;
79 m_modeFlag[1] = false;
80 m_checkMergeAndSkipOnly[0] = false;
81 m_checkMergeAndSkipOnly[1] = false;
82 m_evaluateInter = 0;
83 }
84
create(ThreadLocalData * tld)85 bool Analysis::create(ThreadLocalData *tld)
86 {
87 m_tld = tld;
88 m_bTryLossless = m_param->bCULossless && !m_param->bLossless && m_param->rdLevel >= 2;
89
90 int costArrSize = 1;
91 uint32_t maxDQPDepth = g_log2Size[m_param->maxCUSize] - g_log2Size[m_param->rc.qgSize];
92 for (uint32_t i = 1; i <= maxDQPDepth; i++)
93 costArrSize += (1 << (i * 2));
94 cacheCost = X265_MALLOC(uint64_t, costArrSize);
95
96 int csp = m_param->internalCsp;
97 uint32_t cuSize = m_param->maxCUSize;
98
99 bool ok = true;
100 for (uint32_t depth = 0; depth <= m_param->maxCUDepth; depth++, cuSize >>= 1)
101 {
102 ModeDepth &md = m_modeDepth[depth];
103 ok &= md.cuMemPool.create(depth, csp, MAX_PRED_TYPES, *m_param);
104 ok &= md.fencYuv.create(cuSize, csp);
105 if (ok)
106 {
107 for (int j = 0; j < MAX_PRED_TYPES; j++)
108 {
109 md.pred[j].cu.initialize(md.cuMemPool, depth, *m_param, j);
110 ok &= md.pred[j].predYuv.create(cuSize, csp);
111 ok &= md.pred[j].reconYuv.create(cuSize, csp);
112 md.pred[j].fencYuv = &md.fencYuv;
113 }
114 }
115 }
116 if (m_param->sourceHeight >= 1080)
117 m_bHD = true;
118
119 return ok;
120 }
121
destroy()122 void Analysis::destroy()
123 {
124 for (uint32_t i = 0; i <= m_param->maxCUDepth; i++)
125 {
126 m_modeDepth[i].cuMemPool.destroy();
127 m_modeDepth[i].fencYuv.destroy();
128
129 for (int j = 0; j < MAX_PRED_TYPES; j++)
130 {
131 m_modeDepth[i].pred[j].predYuv.destroy();
132 m_modeDepth[i].pred[j].reconYuv.destroy();
133 }
134 }
135 X265_FREE(cacheCost);
136 }
137
compressCTU(CUData & ctu,Frame & frame,const CUGeom & cuGeom,const Entropy & initialContext)138 Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext)
139 {
140 m_slice = ctu.m_slice;
141 m_frame = &frame;
142 m_bChromaSa8d = m_param->rdLevel >= 3;
143 m_param = m_frame->m_param;
144
145 #if _DEBUG || CHECKED_BUILD
146 invalidateContexts(0);
147 #endif
148
149 int qp = setLambdaFromQP(ctu, m_slice->m_pps->bUseDQP ? calculateQpforCuSize(ctu, cuGeom) : m_slice->m_sliceQp);
150 ctu.setQPSubParts((int8_t)qp, 0, 0);
151
152 m_rqt[0].cur.load(initialContext);
153 ctu.m_meanQP = initialContext.m_meanQP;
154 m_modeDepth[0].fencYuv.copyFromPicYuv(*m_frame->m_fencPic, ctu.m_cuAddr, 0);
155
156 if (m_param->bSsimRd)
157 calculateNormFactor(ctu, qp);
158
159 uint32_t numPartition = ctu.m_numPartitions;
160 if (m_param->bCTUInfo && (*m_frame->m_ctuInfo + ctu.m_cuAddr))
161 {
162 x265_ctu_info_t* ctuTemp = *m_frame->m_ctuInfo + ctu.m_cuAddr;
163 int32_t depthIdx = 0;
164 uint32_t maxNum8x8Partitions = 64;
165 uint8_t* depthInfoPtr = m_frame->m_addOnDepth[ctu.m_cuAddr];
166 uint8_t* contentInfoPtr = m_frame->m_addOnCtuInfo[ctu.m_cuAddr];
167 int* prevCtuInfoChangePtr = m_frame->m_addOnPrevChange[ctu.m_cuAddr];
168 do
169 {
170 uint8_t depth = (uint8_t)ctuTemp->ctuPartitions[depthIdx];
171 uint8_t content = (uint8_t)(*((int32_t *)ctuTemp->ctuInfo + depthIdx));
172 int prevCtuInfoChange = m_frame->m_prevCtuInfoChange[ctu.m_cuAddr * maxNum8x8Partitions + depthIdx];
173 memset(depthInfoPtr, depth, sizeof(uint8_t) * numPartition >> 2 * depth);
174 memset(contentInfoPtr, content, sizeof(uint8_t) * numPartition >> 2 * depth);
175 memset(prevCtuInfoChangePtr, 0, sizeof(int) * numPartition >> 2 * depth);
176 for (uint32_t l = 0; l < numPartition >> 2 * depth; l++)
177 prevCtuInfoChangePtr[l] = prevCtuInfoChange;
178 depthInfoPtr += ctu.m_numPartitions >> 2 * depth;
179 contentInfoPtr += ctu.m_numPartitions >> 2 * depth;
180 prevCtuInfoChangePtr += ctu.m_numPartitions >> 2 * depth;
181 depthIdx++;
182 } while (ctuTemp->ctuPartitions[depthIdx] != 0);
183
184 m_additionalCtuInfo = m_frame->m_addOnCtuInfo[ctu.m_cuAddr];
185 m_prevCtuInfoChange = m_frame->m_addOnPrevChange[ctu.m_cuAddr];
186 memcpy(ctu.m_cuDepth, m_frame->m_addOnDepth[ctu.m_cuAddr], sizeof(uint8_t) * numPartition);
187 //Calculate log2CUSize from depth
188 for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
189 ctu.m_log2CUSize[i] = (uint8_t)m_param->maxLog2CUSize - ctu.m_cuDepth[i];
190 }
191 if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && (m_slice->m_sliceType != I_SLICE))
192 {
193 int numPredDir = m_slice->isInterP() ? 1 : 2;
194 m_reuseInterDataCTU = m_frame->m_analysisData.interData;
195 for (int dir = 0; dir < numPredDir; dir++)
196 {
197 m_reuseMv[dir] = &m_reuseInterDataCTU->mv[dir][ctu.m_cuAddr * ctu.m_numPartitions];
198 m_reuseMvpIdx[dir] = &m_reuseInterDataCTU->mvpIdx[dir][ctu.m_cuAddr * ctu.m_numPartitions];
199 }
200 m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * ctu.m_numPartitions];
201 m_reuseModes = &m_reuseInterDataCTU->modes[ctu.m_cuAddr * ctu.m_numPartitions];
202 m_reuseDepth = &m_reuseInterDataCTU->depth[ctu.m_cuAddr * ctu.m_numPartitions];
203 }
204
205 int reuseLevel = X265_MAX(m_param->analysisSaveReuseLevel, m_param->analysisLoadReuseLevel);
206 if ((m_param->analysisSave || m_param->analysisLoad) && m_slice->m_sliceType != I_SLICE && reuseLevel > 1 && reuseLevel < 10)
207 {
208 int numPredDir = m_slice->isInterP() ? 1 : 2;
209 m_reuseInterDataCTU = m_frame->m_analysisData.interData;
210 if (((m_param->analysisSaveReuseLevel > 1) && (m_param->analysisSaveReuseLevel < 7)) ||
211 ((m_param->analysisLoadReuseLevel > 1) && (m_param->analysisLoadReuseLevel < 7)))
212 m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
213 m_reuseDepth = &m_reuseInterDataCTU->depth[ctu.m_cuAddr * ctu.m_numPartitions];
214 m_reuseModes = &m_reuseInterDataCTU->modes[ctu.m_cuAddr * ctu.m_numPartitions];
215 if (reuseLevel > 4)
216 {
217 m_reusePartSize = &m_reuseInterDataCTU->partSize[ctu.m_cuAddr * ctu.m_numPartitions];
218 m_reuseMergeFlag = &m_reuseInterDataCTU->mergeFlag[ctu.m_cuAddr * ctu.m_numPartitions];
219 }
220 if (m_param->analysisSave && !m_param->analysisLoad)
221 for (int i = 0; i < X265_MAX_PRED_MODE_PER_CTU * numPredDir; i++)
222 m_reuseRef[i] = -1;
223 }
224 ProfileCUScope(ctu, totalCTUTime, totalCTUs);
225
226 if (m_slice->m_sliceType == I_SLICE)
227 {
228 x265_analysis_intra_data* intraDataCTU = m_frame->m_analysisData.intraData;
229 if (m_param->analysisLoadReuseLevel > 1)
230 {
231 memcpy(ctu.m_cuDepth, &intraDataCTU->depth[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
232 memcpy(ctu.m_lumaIntraDir, &intraDataCTU->modes[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
233 memcpy(ctu.m_partSize, &intraDataCTU->partSizes[ctu.m_cuAddr * numPartition], sizeof(char) * numPartition);
234 memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
235 }
236 compressIntraCU(ctu, cuGeom, qp);
237 }
238 else
239 {
240 bool bCopyAnalysis = ((m_param->analysisLoadReuseLevel == 10) || (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel >= 7 && ctu.m_numPartitions <= 16));
241 bool bCompressInterCUrd0_4 = (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel >= 7 && m_param->rdLevel <= 4);
242 bool bCompressInterCUrd5_6 = (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel >= 7 && m_param->rdLevel >= 5 && m_param->rdLevel <= 6);
243 bCopyAnalysis = bCopyAnalysis || bCompressInterCUrd0_4 || bCompressInterCUrd5_6;
244
245 if (bCopyAnalysis)
246 {
247 x265_analysis_inter_data* interDataCTU = m_frame->m_analysisData.interData;
248 int posCTU = ctu.m_cuAddr * numPartition;
249 memcpy(ctu.m_cuDepth, &interDataCTU->depth[posCTU], sizeof(uint8_t) * numPartition);
250 memcpy(ctu.m_predMode, &interDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
251 memcpy(ctu.m_partSize, &interDataCTU->partSize[posCTU], sizeof(uint8_t) * numPartition);
252 for (int list = 0; list < m_slice->isInterB() + 1; list++)
253 memcpy(ctu.m_skipFlag[list], &m_frame->m_analysisData.modeFlag[list][posCTU], sizeof(uint8_t) * numPartition);
254
255 if ((m_slice->m_sliceType == P_SLICE || m_param->bIntraInBFrames) && !(m_param->bAnalysisType == AVC_INFO))
256 {
257 x265_analysis_intra_data* intraDataCTU = m_frame->m_analysisData.intraData;
258 memcpy(ctu.m_lumaIntraDir, &intraDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
259 memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[posCTU], sizeof(uint8_t) * numPartition);
260 }
261 //Calculate log2CUSize from depth
262 for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
263 ctu.m_log2CUSize[i] = (uint8_t)m_param->maxLog2CUSize - ctu.m_cuDepth[i];
264 }
265
266 if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE &&
267 ctu.m_cuPelX / m_param->maxCUSize >= frame.m_encData->m_pir.pirStartCol
268 && ctu.m_cuPelX / m_param->maxCUSize < frame.m_encData->m_pir.pirEndCol)
269 compressIntraCU(ctu, cuGeom, qp);
270 else if (!m_param->rdLevel)
271 {
272 /* In RD Level 0/1, copy source pixels into the reconstructed block so
273 * they are available for intra predictions */
274 m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPic, ctu.m_cuAddr, 0);
275
276 compressInterCU_rd0_4(ctu, cuGeom, qp);
277
278 /* generate residual for entire CTU at once and copy to reconPic */
279 encodeResidue(ctu, cuGeom);
280 }
281 else if ((m_param->analysisLoadReuseLevel == 10 && (!(m_param->bAnalysisType == HEVC_INFO) || m_slice->m_sliceType != P_SLICE)) ||
282 ((m_param->bAnalysisType == AVC_INFO) && m_param->analysisLoadReuseLevel >= 7 && ctu.m_numPartitions <= 16))
283 {
284 x265_analysis_inter_data* interDataCTU = m_frame->m_analysisData.interData;
285 int posCTU = ctu.m_cuAddr * numPartition;
286 memcpy(ctu.m_cuDepth, &interDataCTU->depth[posCTU], sizeof(uint8_t) * numPartition);
287 memcpy(ctu.m_predMode, &interDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
288 memcpy(ctu.m_partSize, &interDataCTU->partSize[posCTU], sizeof(uint8_t) * numPartition);
289 if ((m_slice->m_sliceType == P_SLICE || m_param->bIntraInBFrames) && !(m_param->bAnalysisType == AVC_INFO))
290 {
291 x265_analysis_intra_data* intraDataCTU = m_frame->m_analysisData.intraData;
292 memcpy(ctu.m_lumaIntraDir, &intraDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
293 memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[posCTU], sizeof(uint8_t) * numPartition);
294 }
295 //Calculate log2CUSize from depth
296 for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
297 ctu.m_log2CUSize[i] = (uint8_t)m_param->maxLog2CUSize - ctu.m_cuDepth[i];
298
299 qprdRefine (ctu, cuGeom, qp, qp);
300 return *m_modeDepth[0].bestMode;
301 }
302 else if (m_param->bDistributeModeAnalysis && m_param->rdLevel >= 2)
303 compressInterCU_dist(ctu, cuGeom, qp);
304 else if (m_param->rdLevel <= 4)
305 compressInterCU_rd0_4(ctu, cuGeom, qp);
306 else
307 compressInterCU_rd5_6(ctu, cuGeom, qp);
308 }
309
310 if (m_param->bEnableRdRefine || m_param->bOptCUDeltaQP)
311 qprdRefine(ctu, cuGeom, qp, qp);
312
313 if (m_param->csvLogLevel >= 2)
314 collectPUStatistics(ctu, cuGeom);
315
316 return *m_modeDepth[0].bestMode;
317 }
318
collectPUStatistics(const CUData & ctu,const CUGeom & cuGeom)319 void Analysis::collectPUStatistics(const CUData& ctu, const CUGeom& cuGeom)
320 {
321 uint8_t depth = 0;
322 uint8_t partSize = 0;
323 for (uint32_t absPartIdx = 0; absPartIdx < ctu.m_numPartitions; absPartIdx += ctu.m_numPartitions >> (depth * 2))
324 {
325 depth = ctu.m_cuDepth[absPartIdx];
326 partSize = ctu.m_partSize[absPartIdx];
327 uint32_t numPU = nbPartsTable[(int)partSize];
328 int shift = 2 * (m_param->maxCUDepth + 1 - depth);
329 for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
330 {
331 PredictionUnit pu(ctu, cuGeom, puIdx);
332 int puabsPartIdx = ctu.getPUOffset(puIdx, absPartIdx);
333 int mode = 1;
334 if (ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_Nx2N || ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_2NxN)
335 mode = 2;
336 else if (ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_2NxnU || ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_2NxnD || ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_nLx2N || ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_nRx2N)
337 mode = 3;
338 if (ctu.m_predMode[puabsPartIdx + absPartIdx] == MODE_SKIP)
339 {
340 ctu.m_encData->m_frameStats.cntSkipPu[depth] += 1ULL << shift;
341 ctu.m_encData->m_frameStats.totalPu[depth] += 1ULL << shift;
342 }
343 else if (ctu.m_predMode[puabsPartIdx + absPartIdx] == MODE_INTRA)
344 {
345 if (ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_NxN)
346 {
347 ctu.m_encData->m_frameStats.cnt4x4++;
348 ctu.m_encData->m_frameStats.totalPu[4]++;
349 }
350 else
351 {
352 ctu.m_encData->m_frameStats.cntIntraPu[depth] += 1ULL << shift;
353 ctu.m_encData->m_frameStats.totalPu[depth] += 1ULL << shift;
354 }
355 }
356 else if (mode == 3)
357 {
358 ctu.m_encData->m_frameStats.cntAmp[depth] += 1ULL << shift;
359 ctu.m_encData->m_frameStats.totalPu[depth] += 1ULL << shift;
360 break;
361 }
362 else
363 {
364 if (ctu.m_mergeFlag[puabsPartIdx + absPartIdx])
365 ctu.m_encData->m_frameStats.cntMergePu[depth][ctu.m_partSize[puabsPartIdx + absPartIdx]] += (1 << shift) / mode;
366 else
367 ctu.m_encData->m_frameStats.cntInterPu[depth][ctu.m_partSize[puabsPartIdx + absPartIdx]] += (1 << shift) / mode;
368
369 ctu.m_encData->m_frameStats.totalPu[depth] += (1 << shift) / mode;
370 }
371 }
372 }
373 }
374
loadTUDepth(CUGeom cuGeom,CUData parentCTU)375 int32_t Analysis::loadTUDepth(CUGeom cuGeom, CUData parentCTU)
376 {
377 float predDepth = 0;
378 CUData* neighbourCU;
379 uint8_t count = 0;
380 int32_t maxTUDepth = -1;
381 neighbourCU = &m_slice->m_refFrameList[0][0]->m_encData->m_picCTU[parentCTU.m_cuAddr];
382 predDepth += neighbourCU->m_refTuDepth[cuGeom.geomRecurId];
383 count++;
384 if (m_slice->isInterB())
385 {
386 neighbourCU = &m_slice->m_refFrameList[1][0]->m_encData->m_picCTU[parentCTU.m_cuAddr];
387 predDepth += neighbourCU->m_refTuDepth[cuGeom.geomRecurId];
388 count++;
389 }
390 if (parentCTU.m_cuAbove)
391 {
392 predDepth += parentCTU.m_cuAbove->m_refTuDepth[cuGeom.geomRecurId];
393 count++;
394 if (parentCTU.m_cuAboveLeft)
395 {
396 predDepth += parentCTU.m_cuAboveLeft->m_refTuDepth[cuGeom.geomRecurId];
397 count++;
398 }
399 if (parentCTU.m_cuAboveRight)
400 {
401 predDepth += parentCTU.m_cuAboveRight->m_refTuDepth[cuGeom.geomRecurId];
402 count++;
403 }
404 }
405 if (parentCTU.m_cuLeft)
406 {
407 predDepth += parentCTU.m_cuLeft->m_refTuDepth[cuGeom.geomRecurId];
408 count++;
409 }
410 predDepth /= count;
411
412 if (predDepth == 0)
413 maxTUDepth = 0;
414 else if (predDepth < 1)
415 maxTUDepth = 1;
416 else if (predDepth >= 1 && predDepth <= 1.5)
417 maxTUDepth = 2;
418 else if (predDepth > 1.5 && predDepth <= 2.5)
419 maxTUDepth = 3;
420 else
421 maxTUDepth = -1;
422
423 return maxTUDepth;
424 }
425
tryLossless(const CUGeom & cuGeom)426 void Analysis::tryLossless(const CUGeom& cuGeom)
427 {
428 ModeDepth& md = m_modeDepth[cuGeom.depth];
429
430 if (!md.bestMode->distortion)
431 /* already lossless */
432 return;
433 else if (md.bestMode->cu.isIntra(0))
434 {
435 md.pred[PRED_LOSSLESS].initCosts();
436 md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
437 PartSize size = (PartSize)md.pred[PRED_LOSSLESS].cu.m_partSize[0];
438 checkIntra(md.pred[PRED_LOSSLESS], cuGeom, size);
439 checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
440 }
441 else
442 {
443 md.pred[PRED_LOSSLESS].initCosts();
444 md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
445 md.pred[PRED_LOSSLESS].predYuv.copyFromYuv(md.bestMode->predYuv);
446 encodeResAndCalcRdInterCU(md.pred[PRED_LOSSLESS], cuGeom);
447 checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
448 }
449 }
450
qprdRefine(const CUData & parentCTU,const CUGeom & cuGeom,int32_t qp,int32_t lqp)451 void Analysis::qprdRefine(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t lqp)
452 {
453 uint32_t depth = cuGeom.depth;
454 ModeDepth& md = m_modeDepth[depth];
455 md.bestMode = NULL;
456
457 bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
458
459 int bestCUQP = qp;
460 int lambdaQP = lqp;
461 bool doQPRefine = (bDecidedDepth && depth <= m_slice->m_pps->maxCuDQPDepth) || (!bDecidedDepth && depth == m_slice->m_pps->maxCuDQPDepth);
462 if (m_param->analysisLoadReuseLevel >= 7)
463 doQPRefine = false;
464 if (doQPRefine)
465 {
466 uint64_t bestCUCost, origCUCost, cuCost, cuPrevCost;
467
468 int cuIdx = (cuGeom.childOffset - 1) / 3;
469 bestCUCost = origCUCost = cacheCost[cuIdx];
470
471 int direction = m_param->bOptCUDeltaQP ? 1 : 2;
472
473 for (int dir = direction; dir >= -direction; dir -= (direction * 2))
474 {
475 if (m_param->bOptCUDeltaQP && ((dir != 1) || ((qp + 3) >= (int32_t)parentCTU.m_meanQP)))
476 break;
477
478 int threshold = 1;
479 int failure = 0;
480 cuPrevCost = origCUCost;
481
482 int modCUQP = qp + dir;
483 while (modCUQP >= m_param->rc.qpMin && modCUQP <= QP_MAX_SPEC)
484 {
485 if (m_param->bOptCUDeltaQP && modCUQP > (int32_t)parentCTU.m_meanQP)
486 break;
487
488 recodeCU(parentCTU, cuGeom, modCUQP, qp);
489 cuCost = md.bestMode->rdCost;
490
491 COPY2_IF_LT(bestCUCost, cuCost, bestCUQP, modCUQP);
492 if (cuCost < cuPrevCost)
493 failure = 0;
494 else
495 failure++;
496
497 if (failure > threshold)
498 break;
499
500 cuPrevCost = cuCost;
501 modCUQP += dir;
502 }
503 }
504 lambdaQP = bestCUQP;
505 }
506
507 recodeCU(parentCTU, cuGeom, bestCUQP, lambdaQP);
508
509 /* Copy best data to encData CTU and recon */
510 md.bestMode->cu.copyToPic(depth);
511 md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
512 }
513
compressIntraCU(const CUData & parentCTU,const CUGeom & cuGeom,int32_t qp)514 uint64_t Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
515 {
516 uint32_t depth = cuGeom.depth;
517 ModeDepth& md = m_modeDepth[depth];
518 md.bestMode = NULL;
519
520 bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
521 bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
522
523 bool bAlreadyDecided = m_param->intraRefine != 4 && parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] != (uint8_t)ALL_IDX && !(m_param->bAnalysisType == HEVC_INFO);
524 bool bDecidedDepth = m_param->intraRefine != 4 && parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
525 int split = 0;
526 if (m_param->intraRefine && m_param->intraRefine != 4)
527 {
528 split = m_param->scaleFactor && bDecidedDepth && (!mightNotSplit ||
529 ((cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1))));
530 if (cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize]) && !bDecidedDepth)
531 bAlreadyDecided = false;
532 }
533
534 if (bAlreadyDecided)
535 {
536 if (bDecidedDepth && mightNotSplit)
537 {
538 Mode& mode = md.pred[0];
539 md.bestMode = &mode;
540 mode.cu.initSubCU(parentCTU, cuGeom, qp);
541 bool reuseModes = !((m_param->intraRefine == 3) ||
542 (m_param->intraRefine == 2 && parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] > DC_IDX));
543 if (reuseModes)
544 {
545 memcpy(mode.cu.m_lumaIntraDir, parentCTU.m_lumaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
546 memcpy(mode.cu.m_chromaIntraDir, parentCTU.m_chromaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
547 }
548 checkIntra(mode, cuGeom, (PartSize)parentCTU.m_partSize[cuGeom.absPartIdx]);
549
550 if (m_bTryLossless)
551 tryLossless(cuGeom);
552
553 if (mightSplit)
554 addSplitFlagCost(*md.bestMode, cuGeom.depth);
555 }
556 }
557 else if (cuGeom.log2CUSize != MAX_LOG2_CU_SIZE && mightNotSplit)
558 {
559 md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
560 checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N);
561 checkBestMode(md.pred[PRED_INTRA], depth);
562
563 if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
564 {
565 md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
566 checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN);
567 checkBestMode(md.pred[PRED_INTRA_NxN], depth);
568 }
569
570 if (m_bTryLossless)
571 tryLossless(cuGeom);
572
573 if (mightSplit)
574 addSplitFlagCost(*md.bestMode, cuGeom.depth);
575 }
576
577 // stop recursion if we reach the depth of previous analysis decision
578 mightSplit &= !(bAlreadyDecided && bDecidedDepth) || split;
579
580 if (mightSplit)
581 {
582 Mode* splitPred = &md.pred[PRED_SPLIT];
583 splitPred->initCosts();
584 CUData* splitCU = &splitPred->cu;
585 splitCU->initSubCU(parentCTU, cuGeom, qp);
586
587 uint32_t nextDepth = depth + 1;
588 ModeDepth& nd = m_modeDepth[nextDepth];
589 invalidateContexts(nextDepth);
590 Entropy* nextContext = &m_rqt[depth].cur;
591 int32_t nextQP = qp;
592 uint64_t curCost = 0;
593 int skipSplitCheck = 0;
594
595 for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
596 {
597 const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
598 if (childGeom.flags & CUGeom::PRESENT)
599 {
600 m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
601 m_rqt[nextDepth].cur.load(*nextContext);
602
603 if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
604 nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
605
606 if (m_param->bEnableSplitRdSkip)
607 {
608 curCost += compressIntraCU(parentCTU, childGeom, nextQP);
609 if (m_modeDepth[depth].bestMode && curCost > m_modeDepth[depth].bestMode->rdCost)
610 {
611 skipSplitCheck = 1;
612 break;
613 }
614 }
615 else
616 compressIntraCU(parentCTU, childGeom, nextQP);
617
618 // Save best CU and pred data for this sub CU
619 splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
620 splitPred->addSubCosts(*nd.bestMode);
621 nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
622 nextContext = &nd.bestMode->contexts;
623 }
624 else
625 {
626 /* record the depth of this non-present sub-CU */
627 splitCU->setEmptyPart(childGeom, subPartIdx);
628
629 /* Set depth of non-present CU to 0 to ensure that correct CU is fetched as reference to code deltaQP */
630 if (bAlreadyDecided)
631 memset(parentCTU.m_cuDepth + childGeom.absPartIdx, 0, childGeom.numPartitions);
632 }
633 }
634 if (!skipSplitCheck)
635 {
636 nextContext->store(splitPred->contexts);
637 if (mightNotSplit)
638 addSplitFlagCost(*splitPred, cuGeom.depth);
639 else
640 updateModeCost(*splitPred);
641
642 checkDQPForSplitPred(*splitPred, cuGeom);
643 checkBestMode(*splitPred, depth);
644 }
645 }
646
647 if (m_param->bEnableRdRefine && depth <= m_slice->m_pps->maxCuDQPDepth)
648 {
649 int cuIdx = (cuGeom.childOffset - 1) / 3;
650 cacheCost[cuIdx] = md.bestMode->rdCost;
651 }
652
653 if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
654 {
655 CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr);
656 int8_t maxTUDepth = -1;
657 for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
658 maxTUDepth = X265_MAX(maxTUDepth, md.bestMode->cu.m_tuDepth[i]);
659 ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth;
660 }
661
662 /* Copy best data to encData CTU and recon */
663 md.bestMode->cu.copyToPic(depth);
664 if (md.bestMode != &md.pred[PRED_SPLIT])
665 md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
666
667 return md.bestMode->rdCost;
668 }
669
processTasks(int workerThreadId)670 void Analysis::PMODE::processTasks(int workerThreadId)
671 {
672 #if DETAILED_CU_STATS
673 int fe = master.m_modeDepth[cuGeom.depth].pred[PRED_2Nx2N].cu.m_encData->m_frameEncoderID;
674 master.m_stats[fe].countPModeTasks++;
675 ScopedElapsedTime pmodeTime(master.m_stats[fe].pmodeTime);
676 #endif
677 ProfileScopeEvent(pmode);
678 master.processPmode(*this, master.m_tld[workerThreadId].analysis);
679 }
680
681 /* process pmode jobs until none remain; may be called by the master thread or by
682 * a bonded peer (slave) thread via pmodeTasks() */
processPmode(PMODE & pmode,Analysis & slave)683 void Analysis::processPmode(PMODE& pmode, Analysis& slave)
684 {
685 /* acquire a mode task, else exit early */
686 int task;
687 pmode.m_lock.acquire();
688 if (pmode.m_jobTotal > pmode.m_jobAcquired)
689 {
690 task = pmode.m_jobAcquired++;
691 pmode.m_lock.release();
692 }
693 else
694 {
695 pmode.m_lock.release();
696 return;
697 }
698
699 ModeDepth& md = m_modeDepth[pmode.cuGeom.depth];
700
701 /* setup slave Analysis */
702 if (&slave != this)
703 {
704 slave.m_slice = m_slice;
705 slave.m_frame = m_frame;
706 slave.m_param = m_param;
707 slave.m_bChromaSa8d = m_param->rdLevel >= 3;
708 slave.setLambdaFromQP(md.pred[PRED_2Nx2N].cu, m_rdCost.m_qp);
709 slave.invalidateContexts(0);
710 slave.m_rqt[pmode.cuGeom.depth].cur.load(m_rqt[pmode.cuGeom.depth].cur);
711 }
712
713 /* perform Mode task, repeat until no more work is available */
714 do
715 {
716 uint32_t refMasks[2] = { 0, 0 };
717
718 if (m_param->rdLevel <= 4)
719 {
720 switch (pmode.modes[task])
721 {
722 case PRED_INTRA:
723 slave.checkIntraInInter(md.pred[PRED_INTRA], pmode.cuGeom);
724 if (m_param->rdLevel > 2)
725 slave.encodeIntraInInter(md.pred[PRED_INTRA], pmode.cuGeom);
726 break;
727
728 case PRED_2Nx2N:
729 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3];
730
731 slave.checkInter_rd0_4(md.pred[PRED_2Nx2N], pmode.cuGeom, SIZE_2Nx2N, refMasks);
732 if (m_slice->m_sliceType == B_SLICE)
733 slave.checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], pmode.cuGeom);
734 break;
735
736 case PRED_Nx2N:
737 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[2]; /* left */
738 refMasks[1] = m_splitRefIdx[1] | m_splitRefIdx[3]; /* right */
739
740 slave.checkInter_rd0_4(md.pred[PRED_Nx2N], pmode.cuGeom, SIZE_Nx2N, refMasks);
741 break;
742
743 case PRED_2NxN:
744 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1]; /* top */
745 refMasks[1] = m_splitRefIdx[2] | m_splitRefIdx[3]; /* bot */
746
747 slave.checkInter_rd0_4(md.pred[PRED_2NxN], pmode.cuGeom, SIZE_2NxN, refMasks);
748 break;
749
750 case PRED_2NxnU:
751 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1]; /* 25% top */
752 refMasks[1] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% bot */
753
754 slave.checkInter_rd0_4(md.pred[PRED_2NxnU], pmode.cuGeom, SIZE_2NxnU, refMasks);
755 break;
756
757 case PRED_2NxnD:
758 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% top */
759 refMasks[1] = m_splitRefIdx[2] | m_splitRefIdx[3]; /* 25% bot */
760
761 slave.checkInter_rd0_4(md.pred[PRED_2NxnD], pmode.cuGeom, SIZE_2NxnD, refMasks);
762 break;
763
764 case PRED_nLx2N:
765 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[2]; /* 25% left */
766 refMasks[1] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% right */
767
768 slave.checkInter_rd0_4(md.pred[PRED_nLx2N], pmode.cuGeom, SIZE_nLx2N, refMasks);
769 break;
770
771 case PRED_nRx2N:
772 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% left */
773 refMasks[1] = m_splitRefIdx[1] | m_splitRefIdx[3]; /* 25% right */
774
775 slave.checkInter_rd0_4(md.pred[PRED_nRx2N], pmode.cuGeom, SIZE_nRx2N, refMasks);
776 break;
777
778 default:
779 X265_CHECK(0, "invalid job ID for parallel mode analysis\n");
780 break;
781 }
782 }
783 else
784 {
785 switch (pmode.modes[task])
786 {
787 case PRED_INTRA:
788 slave.checkIntra(md.pred[PRED_INTRA], pmode.cuGeom, SIZE_2Nx2N);
789 if (pmode.cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
790 slave.checkIntra(md.pred[PRED_INTRA_NxN], pmode.cuGeom, SIZE_NxN);
791 break;
792
793 case PRED_2Nx2N:
794 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3];
795
796 slave.checkInter_rd5_6(md.pred[PRED_2Nx2N], pmode.cuGeom, SIZE_2Nx2N, refMasks);
797 md.pred[PRED_BIDIR].rdCost = MAX_INT64;
798 if (m_slice->m_sliceType == B_SLICE)
799 {
800 slave.checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], pmode.cuGeom);
801 if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
802 slave.encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], pmode.cuGeom);
803 }
804 break;
805
806 case PRED_Nx2N:
807 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[2]; /* left */
808 refMasks[1] = m_splitRefIdx[1] | m_splitRefIdx[3]; /* right */
809
810 slave.checkInter_rd5_6(md.pred[PRED_Nx2N], pmode.cuGeom, SIZE_Nx2N, refMasks);
811 break;
812
813 case PRED_2NxN:
814 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1]; /* top */
815 refMasks[1] = m_splitRefIdx[2] | m_splitRefIdx[3]; /* bot */
816
817 slave.checkInter_rd5_6(md.pred[PRED_2NxN], pmode.cuGeom, SIZE_2NxN, refMasks);
818 break;
819
820 case PRED_2NxnU:
821 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1]; /* 25% top */
822 refMasks[1] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% bot */
823
824 slave.checkInter_rd5_6(md.pred[PRED_2NxnU], pmode.cuGeom, SIZE_2NxnU, refMasks);
825 break;
826
827 case PRED_2NxnD:
828 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% top */
829 refMasks[1] = m_splitRefIdx[2] | m_splitRefIdx[3]; /* 25% bot */
830 slave.checkInter_rd5_6(md.pred[PRED_2NxnD], pmode.cuGeom, SIZE_2NxnD, refMasks);
831 break;
832
833 case PRED_nLx2N:
834 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[2]; /* 25% left */
835 refMasks[1] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% right */
836
837 slave.checkInter_rd5_6(md.pred[PRED_nLx2N], pmode.cuGeom, SIZE_nLx2N, refMasks);
838 break;
839
840 case PRED_nRx2N:
841 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% left */
842 refMasks[1] = m_splitRefIdx[1] | m_splitRefIdx[3]; /* 25% right */
843 slave.checkInter_rd5_6(md.pred[PRED_nRx2N], pmode.cuGeom, SIZE_nRx2N, refMasks);
844 break;
845
846 default:
847 X265_CHECK(0, "invalid job ID for parallel mode analysis\n");
848 break;
849 }
850 }
851
852 task = -1;
853 pmode.m_lock.acquire();
854 if (pmode.m_jobTotal > pmode.m_jobAcquired)
855 task = pmode.m_jobAcquired++;
856 pmode.m_lock.release();
857 }
858 while (task >= 0);
859 }
860
compressInterCU_dist(const CUData & parentCTU,const CUGeom & cuGeom,int32_t qp)861 uint32_t Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
862 {
863 uint32_t depth = cuGeom.depth;
864 uint32_t cuAddr = parentCTU.m_cuAddr;
865 ModeDepth& md = m_modeDepth[depth];
866 md.bestMode = NULL;
867
868 bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
869 bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
870 uint32_t minDepth = m_param->rdLevel <= 4 ? topSkipMinDepth(parentCTU, cuGeom) : 0;
871 uint32_t splitRefs[4] = { 0, 0, 0, 0 };
872
873 X265_CHECK(m_param->rdLevel >= 2, "compressInterCU_dist does not support RD 0 or 1\n");
874
875 PMODE pmode(*this, cuGeom);
876
877 if (mightNotSplit && depth >= minDepth)
878 {
879 /* Initialize all prediction CUs based on parentCTU */
880 md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
881 md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
882
883 if (m_param->rdLevel <= 4)
884 checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
885 else
886 checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
887 }
888
889 bool bNoSplit = false;
890 bool splitIntra = true;
891 if (md.bestMode)
892 {
893 bNoSplit = md.bestMode->cu.isSkipped(0);
894 if (mightSplit && depth && depth >= minDepth && !bNoSplit && m_param->rdLevel <= 4)
895 bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
896 }
897
898 if (mightSplit && !bNoSplit)
899 {
900 Mode* splitPred = &md.pred[PRED_SPLIT];
901 splitPred->initCosts();
902 CUData* splitCU = &splitPred->cu;
903 splitCU->initSubCU(parentCTU, cuGeom, qp);
904
905 uint32_t nextDepth = depth + 1;
906 ModeDepth& nd = m_modeDepth[nextDepth];
907 invalidateContexts(nextDepth);
908 Entropy* nextContext = &m_rqt[depth].cur;
909 int nextQP = qp;
910 splitIntra = false;
911
912 for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
913 {
914 const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
915 if (childGeom.flags & CUGeom::PRESENT)
916 {
917 m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
918 m_rqt[nextDepth].cur.load(*nextContext);
919
920 if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
921 nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
922
923 splitRefs[subPartIdx] = compressInterCU_dist(parentCTU, childGeom, nextQP);
924
925 // Save best CU and pred data for this sub CU
926 splitIntra |= nd.bestMode->cu.isIntra(0);
927 splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
928 splitPred->addSubCosts(*nd.bestMode);
929
930 nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
931 nextContext = &nd.bestMode->contexts;
932 }
933 else
934 splitCU->setEmptyPart(childGeom, subPartIdx);
935 }
936 nextContext->store(splitPred->contexts);
937
938 if (mightNotSplit)
939 addSplitFlagCost(*splitPred, cuGeom.depth);
940 else
941 updateModeCost(*splitPred);
942
943 checkDQPForSplitPred(*splitPred, cuGeom);
944 }
945
946 if (mightNotSplit && depth >= minDepth)
947 {
948 int bTryAmp = m_slice->m_sps->maxAMPDepth > depth;
949 int bTryIntra = (m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && (!m_param->limitReferences || splitIntra) && (cuGeom.log2CUSize != MAX_LOG2_CU_SIZE);
950
951 if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth && m_slice->m_pps->maxCuDQPDepth != 0)
952 setLambdaFromQP(parentCTU, qp);
953
954 if (bTryIntra)
955 {
956 md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
957 if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3 && m_param->rdLevel >= 5)
958 md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
959 pmode.modes[pmode.m_jobTotal++] = PRED_INTRA;
960 }
961 md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_2Nx2N;
962 md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom, qp);
963 if (m_param->bEnableRectInter)
964 {
965 md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_2NxN;
966 md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_Nx2N;
967 }
968 if (bTryAmp)
969 {
970 md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_2NxnU;
971 md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_2NxnD;
972 md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_nLx2N;
973 md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_nRx2N;
974 }
975
976 m_splitRefIdx[0] = splitRefs[0]; m_splitRefIdx[1] = splitRefs[1]; m_splitRefIdx[2] = splitRefs[2]; m_splitRefIdx[3] = splitRefs[3];
977
978 pmode.tryBondPeers(*m_frame->m_encData->m_jobProvider, pmode.m_jobTotal);
979
980 /* participate in processing jobs, until all are distributed */
981 processPmode(pmode, *this);
982
983 /* the master worker thread (this one) does merge analysis. By doing
984 * merge after all the other jobs are at least started, we usually avoid
985 * blocking on another thread */
986
987 if (m_param->rdLevel <= 4)
988 {
989 {
990 ProfileCUScope(parentCTU, pmodeBlockTime, countPModeMasters);
991 pmode.waitForExit();
992 }
993
994 /* select best inter mode based on sa8d cost */
995 Mode *bestInter = &md.pred[PRED_2Nx2N];
996
997 if (m_param->bEnableRectInter)
998 {
999 if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
1000 bestInter = &md.pred[PRED_Nx2N];
1001 if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
1002 bestInter = &md.pred[PRED_2NxN];
1003 }
1004
1005 if (bTryAmp)
1006 {
1007 if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
1008 bestInter = &md.pred[PRED_2NxnU];
1009 if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
1010 bestInter = &md.pred[PRED_2NxnD];
1011 if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
1012 bestInter = &md.pred[PRED_nLx2N];
1013 if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
1014 bestInter = &md.pred[PRED_nRx2N];
1015 }
1016
1017 if (m_param->rdLevel > 2)
1018 {
1019 /* RD selection between merge, inter, bidir and intra */
1020 if (!m_bChromaSa8d && (m_csp != X265_CSP_I400)) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
1021 {
1022 uint32_t numPU = bestInter->cu.getNumPartInter(0);
1023 for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
1024 {
1025 PredictionUnit pu(bestInter->cu, cuGeom, puIdx);
1026 motionCompensation(bestInter->cu, pu, bestInter->predYuv, false, true);
1027 }
1028 }
1029 encodeResAndCalcRdInterCU(*bestInter, cuGeom);
1030 checkBestMode(*bestInter, depth);
1031
1032 /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */
1033 if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&
1034 md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17)
1035 {
1036 encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
1037 checkBestMode(md.pred[PRED_BIDIR], depth);
1038 }
1039
1040 if (bTryIntra)
1041 checkBestMode(md.pred[PRED_INTRA], depth);
1042 }
1043 else /* m_param->rdLevel == 2 */
1044 {
1045 if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
1046 md.bestMode = bestInter;
1047
1048 if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost)
1049 md.bestMode = &md.pred[PRED_BIDIR];
1050
1051 if (bTryIntra && md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
1052 {
1053 md.bestMode = &md.pred[PRED_INTRA];
1054 encodeIntraInInter(*md.bestMode, cuGeom);
1055 }
1056 else if (!md.bestMode->cu.m_mergeFlag[0])
1057 {
1058 /* finally code the best mode selected from SA8D costs */
1059 uint32_t numPU = md.bestMode->cu.getNumPartInter(0);
1060 for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
1061 {
1062 PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
1063 motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, true);
1064 }
1065 encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
1066 }
1067 }
1068 }
1069 else
1070 {
1071 {
1072 ProfileCUScope(parentCTU, pmodeBlockTime, countPModeMasters);
1073 pmode.waitForExit();
1074 }
1075
1076 checkBestMode(md.pred[PRED_2Nx2N], depth);
1077 if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
1078 checkBestMode(md.pred[PRED_BIDIR], depth);
1079
1080 if (m_param->bEnableRectInter)
1081 {
1082 checkBestMode(md.pred[PRED_Nx2N], depth);
1083 checkBestMode(md.pred[PRED_2NxN], depth);
1084 }
1085
1086 if (bTryAmp)
1087 {
1088 checkBestMode(md.pred[PRED_2NxnU], depth);
1089 checkBestMode(md.pred[PRED_2NxnD], depth);
1090 checkBestMode(md.pred[PRED_nLx2N], depth);
1091 checkBestMode(md.pred[PRED_nRx2N], depth);
1092 }
1093
1094 if (bTryIntra)
1095 {
1096 checkBestMode(md.pred[PRED_INTRA], depth);
1097 if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
1098 checkBestMode(md.pred[PRED_INTRA_NxN], depth);
1099 }
1100 }
1101
1102 if (m_bTryLossless)
1103 tryLossless(cuGeom);
1104
1105 if (mightSplit)
1106 addSplitFlagCost(*md.bestMode, cuGeom.depth);
1107 }
1108
1109 /* compare split RD cost against best cost */
1110 if (mightSplit && !bNoSplit)
1111 checkBestMode(md.pred[PRED_SPLIT], depth);
1112
1113 /* determine which motion references the parent CU should search */
1114 uint32_t refMask;
1115 if (!(m_param->limitReferences & X265_REF_LIMIT_DEPTH))
1116 refMask = 0;
1117 else if (md.bestMode == &md.pred[PRED_SPLIT])
1118 refMask = splitRefs[0] | splitRefs[1] | splitRefs[2] | splitRefs[3];
1119 else
1120 {
1121 /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
1122 CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
1123 uint32_t numPU = cu.getNumPartInter(0);
1124 refMask = 0;
1125 for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
1126 refMask |= cu.getBestRefIdx(subPartIdx);
1127 }
1128
1129 if (mightNotSplit)
1130 {
1131 /* early-out statistics */
1132 FrameData& curEncData = *m_frame->m_encData;
1133 FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
1134 uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
1135 cuStat.count[depth] += 1;
1136 cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth];
1137 }
1138
1139 /* Copy best data to encData CTU and recon */
1140 md.bestMode->cu.copyToPic(depth);
1141 md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.absPartIdx);
1142
1143 return refMask;
1144 }
1145
compressInterCU_rd0_4(const CUData & parentCTU,const CUGeom & cuGeom,int32_t qp)1146 SplitData Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
1147 {
1148 if (parentCTU.m_vbvAffected && calculateQpforCuSize(parentCTU, cuGeom, 1))
1149 return compressInterCU_rd5_6(parentCTU, cuGeom, qp);
1150
1151 uint32_t depth = cuGeom.depth;
1152 uint32_t cuAddr = parentCTU.m_cuAddr;
1153 ModeDepth& md = m_modeDepth[depth];
1154
1155
1156 if (m_param->searchMethod == X265_SEA)
1157 {
1158 int numPredDir = m_slice->isInterP() ? 1 : 2;
1159 int offset = (int)(m_frame->m_reconPic->m_cuOffsetY[parentCTU.m_cuAddr] + m_frame->m_reconPic->m_buOffsetY[cuGeom.absPartIdx]);
1160 for (int list = 0; list < numPredDir; list++)
1161 for (int i = 0; i < m_frame->m_encData->m_slice->m_numRefIdx[list]; i++)
1162 for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
1163 m_modeDepth[depth].fencYuv.m_integral[list][i][planes] = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_encData->m_meIntegral[planes] + offset;
1164 }
1165
1166 PicYuv& reconPic = *m_frame->m_reconPic;
1167 SplitData splitCUData;
1168
1169 bool bHEVCBlockAnalysis = (m_param->bAnalysisType == AVC_INFO && cuGeom.numPartitions > 16);
1170 bool bRefineAVCAnalysis = (m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1]));
1171 bool bNooffloading = !(m_param->bAnalysisType == AVC_INFO);
1172
1173 if (bHEVCBlockAnalysis || bRefineAVCAnalysis || bNooffloading)
1174 {
1175 md.bestMode = NULL;
1176 bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
1177 bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
1178 uint32_t minDepth = topSkipMinDepth(parentCTU, cuGeom);
1179 bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
1180 bool skipModes = false; /* Skip any remaining mode analyses at current depth */
1181 bool skipRecursion = false; /* Skip recursion */
1182 bool splitIntra = true;
1183 bool skipRectAmp = false;
1184 bool chooseMerge = false;
1185 bool bCtuInfoCheck = false;
1186 int sameContentRef = 0;
1187
1188 if (m_evaluateInter)
1189 {
1190 if (m_refineLevel == 2)
1191 {
1192 if (parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP)
1193 skipModes = true;
1194 if (parentCTU.m_partSize[cuGeom.absPartIdx] == SIZE_2Nx2N)
1195 skipRectAmp = true;
1196 }
1197 mightSplit &= false;
1198 minDepth = depth;
1199 }
1200
1201 if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
1202 m_maxTUDepth = loadTUDepth(cuGeom, parentCTU);
1203
1204 SplitData splitData[4];
1205 splitData[0].initSplitCUData();
1206 splitData[1].initSplitCUData();
1207 splitData[2].initSplitCUData();
1208 splitData[3].initSplitCUData();
1209
1210 // avoid uninitialize value in below reference
1211 if (m_param->limitModes)
1212 {
1213 md.pred[PRED_2Nx2N].bestME[0][0].mvCost = 0; // L0
1214 md.pred[PRED_2Nx2N].bestME[0][1].mvCost = 0; // L1
1215 md.pred[PRED_2Nx2N].sa8dCost = 0;
1216 }
1217
1218 if (m_param->bCTUInfo && depth <= parentCTU.m_cuDepth[cuGeom.absPartIdx])
1219 {
1220 if (bDecidedDepth && m_additionalCtuInfo[cuGeom.absPartIdx])
1221 sameContentRef = findSameContentRefCount(parentCTU, cuGeom);
1222 if (depth < parentCTU.m_cuDepth[cuGeom.absPartIdx])
1223 {
1224 mightNotSplit &= bDecidedDepth;
1225 bCtuInfoCheck = skipRecursion = false;
1226 skipModes = true;
1227 }
1228 else if (mightNotSplit && bDecidedDepth)
1229 {
1230 if (m_additionalCtuInfo[cuGeom.absPartIdx])
1231 {
1232 bCtuInfoCheck = skipRecursion = true;
1233 md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1234 md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1235 checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
1236 if (!sameContentRef)
1237 {
1238 if ((m_param->bCTUInfo & 2) && (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth))
1239 {
1240 qp -= int32_t(0.04 * qp);
1241 setLambdaFromQP(parentCTU, qp);
1242 }
1243 if (m_param->bCTUInfo & 4)
1244 skipModes = false;
1245 }
1246 if (sameContentRef || (!sameContentRef && !(m_param->bCTUInfo & 4)))
1247 {
1248 if (m_param->rdLevel)
1249 skipModes = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0);
1250 if ((m_param->bCTUInfo & 4) && sameContentRef)
1251 skipModes = md.bestMode && true;
1252 }
1253 }
1254 else
1255 {
1256 md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1257 md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1258 checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
1259 if (m_param->rdLevel)
1260 skipModes = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0);
1261 }
1262 mightSplit &= !bDecidedDepth;
1263 }
1264 }
1265 if ((m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10))
1266 {
1267 if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx])
1268 {
1269 if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
1270 {
1271 md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1272 md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1273 checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
1274
1275 skipRecursion = !!m_param->recursionSkipMode && md.bestMode;
1276 if (m_param->rdLevel)
1277 skipModes = m_param->bEnableEarlySkip && md.bestMode;
1278 }
1279 if (m_param->analysisLoadReuseLevel > 4 && m_reusePartSize[cuGeom.absPartIdx] == SIZE_2Nx2N)
1280 {
1281 if (m_reuseModes[cuGeom.absPartIdx] != MODE_INTRA && m_reuseModes[cuGeom.absPartIdx] != 4)
1282 {
1283 skipRectAmp = true && !!md.bestMode;
1284 chooseMerge = !!m_reuseMergeFlag[cuGeom.absPartIdx] && !!md.bestMode;
1285 }
1286 }
1287 }
1288 }
1289 if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU)
1290 {
1291 if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx])
1292 {
1293 if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
1294 {
1295 md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1296 md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1297 checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
1298
1299 skipRecursion = !!m_param->recursionSkipMode && md.bestMode;
1300 if (m_param->rdLevel)
1301 skipModes = m_param->bEnableEarlySkip && md.bestMode;
1302 }
1303 }
1304 }
1305 /* Step 1. Evaluate Merge/Skip candidates for likely early-outs, if skip mode was not set above */
1306 if ((mightNotSplit && depth >= minDepth && !md.bestMode && !bCtuInfoCheck) || (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1])))
1307 /* TODO: Re-evaluate if analysis load/save still works */
1308 {
1309 /* Compute Merge Cost */
1310 md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1311 md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1312 checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
1313 if (m_param->rdLevel)
1314 skipModes = (m_param->bEnableEarlySkip || m_refineLevel == 2)
1315 && md.bestMode && md.bestMode->cu.isSkipped(0); // TODO: sa8d threshold per depth
1316 }
1317 if (md.bestMode && m_param->recursionSkipMode && !bCtuInfoCheck && !(m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1])))
1318 {
1319 skipRecursion = md.bestMode->cu.isSkipped(0);
1320 if (mightSplit && !skipRecursion)
1321 {
1322 if (depth >= minDepth && m_param->recursionSkipMode == RDCOST_BASED_RSKIP)
1323 {
1324 if (depth)
1325 skipRecursion = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
1326 if (m_bHD && !skipRecursion && m_param->rdLevel == 2 && md.fencYuv.m_size != MAX_CU_SIZE)
1327 skipRecursion = complexityCheckCU(*md.bestMode);
1328 }
1329 else if (cuGeom.log2CUSize >= MAX_LOG2_CU_SIZE - 1 && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
1330 {
1331 skipRecursion = complexityCheckCU(*md.bestMode);
1332 }
1333
1334 }
1335 }
1336 if (m_param->bAnalysisType == AVC_INFO && md.bestMode && cuGeom.numPartitions <= 16 && m_param->analysisLoadReuseLevel == 7)
1337 skipRecursion = true;
1338 /* Step 2. Evaluate each of the 4 split sub-blocks in series */
1339 if (mightSplit && !skipRecursion)
1340 {
1341 if (bCtuInfoCheck && m_param->bCTUInfo & 2)
1342 qp = int((1 / 0.96) * qp + 0.5);
1343 Mode* splitPred = &md.pred[PRED_SPLIT];
1344 splitPred->initCosts();
1345 CUData* splitCU = &splitPred->cu;
1346 splitCU->initSubCU(parentCTU, cuGeom, qp);
1347
1348 uint32_t nextDepth = depth + 1;
1349 ModeDepth& nd = m_modeDepth[nextDepth];
1350 invalidateContexts(nextDepth);
1351 Entropy* nextContext = &m_rqt[depth].cur;
1352 int nextQP = qp;
1353 splitIntra = false;
1354
1355 for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
1356 {
1357 const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
1358 if (childGeom.flags & CUGeom::PRESENT)
1359 {
1360 m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
1361 m_rqt[nextDepth].cur.load(*nextContext);
1362
1363 if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
1364 nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
1365
1366 splitData[subPartIdx] = compressInterCU_rd0_4(parentCTU, childGeom, nextQP);
1367
1368 // Save best CU and pred data for this sub CU
1369 splitIntra |= nd.bestMode->cu.isIntra(0);
1370 splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
1371 splitPred->addSubCosts(*nd.bestMode);
1372
1373 if (m_param->rdLevel)
1374 nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
1375 else
1376 nd.bestMode->predYuv.copyToPartYuv(splitPred->predYuv, childGeom.numPartitions * subPartIdx);
1377 if (m_param->rdLevel > 1)
1378 nextContext = &nd.bestMode->contexts;
1379 }
1380 else
1381 splitCU->setEmptyPart(childGeom, subPartIdx);
1382 }
1383 nextContext->store(splitPred->contexts);
1384
1385 if (mightNotSplit)
1386 addSplitFlagCost(*splitPred, cuGeom.depth);
1387 else if (m_param->rdLevel > 1)
1388 updateModeCost(*splitPred);
1389 else
1390 splitPred->sa8dCost = m_rdCost.calcRdSADCost((uint32_t)splitPred->distortion, splitPred->sa8dBits);
1391 }
1392 /* If analysis mode is simple do not Evaluate other modes */
1393 if (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7)
1394 {
1395 if (m_slice->m_sliceType == P_SLICE)
1396 {
1397 if (m_checkMergeAndSkipOnly[0])
1398 skipModes = true;
1399 }
1400 else
1401 {
1402 if (m_checkMergeAndSkipOnly[0] && m_checkMergeAndSkipOnly[1])
1403 skipModes = true;
1404 }
1405 }
1406 /* Split CUs
1407 * 0 1
1408 * 2 3 */
1409 uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
1410 /* Step 3. Evaluate ME (2Nx2N, rect, amp) and intra modes at current depth */
1411 if (mightNotSplit && (depth >= minDepth || (m_param->bCTUInfo && !md.bestMode)))
1412 {
1413 if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth && m_slice->m_pps->maxCuDQPDepth != 0)
1414 setLambdaFromQP(parentCTU, qp);
1415
1416 if (!skipModes)
1417 {
1418 uint32_t refMasks[2];
1419 refMasks[0] = allSplitRefs;
1420 md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1421 checkInter_rd0_4(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
1422
1423 if (m_param->limitReferences & X265_REF_LIMIT_CU)
1424 {
1425 CUData& cu = md.pred[PRED_2Nx2N].cu;
1426 uint32_t refMask = cu.getBestRefIdx(0);
1427 allSplitRefs = splitData[0].splitRefs = splitData[1].splitRefs = splitData[2].splitRefs = splitData[3].splitRefs = refMask;
1428 }
1429
1430 if (m_slice->m_sliceType == B_SLICE)
1431 {
1432 md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom, qp);
1433 checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
1434 }
1435
1436 Mode *bestInter = &md.pred[PRED_2Nx2N];
1437 if (!skipRectAmp)
1438 {
1439 if (m_param->bEnableRectInter)
1440 {
1441 uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
1442 uint32_t threshold_2NxN, threshold_Nx2N;
1443
1444 if (m_slice->m_sliceType == P_SLICE)
1445 {
1446 threshold_2NxN = splitData[0].mvCost[0] + splitData[1].mvCost[0];
1447 threshold_Nx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
1448 }
1449 else
1450 {
1451 threshold_2NxN = (splitData[0].mvCost[0] + splitData[1].mvCost[0]
1452 + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
1453 threshold_Nx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]
1454 + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
1455 }
1456
1457 int try_2NxN_first = threshold_2NxN < threshold_Nx2N;
1458 if (try_2NxN_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxN)
1459 {
1460 refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
1461 refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
1462 md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
1463 checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
1464 if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
1465 bestInter = &md.pred[PRED_2NxN];
1466 }
1467
1468 if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_Nx2N)
1469 {
1470 refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* left */
1471 refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* right */
1472 md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1473 checkInter_rd0_4(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks);
1474 if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
1475 bestInter = &md.pred[PRED_Nx2N];
1476 }
1477
1478 if (!try_2NxN_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxN)
1479 {
1480 refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
1481 refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
1482 md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
1483 checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
1484 if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
1485 bestInter = &md.pred[PRED_2NxN];
1486 }
1487 }
1488
1489 if (m_slice->m_sps->maxAMPDepth > depth)
1490 {
1491 uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
1492 uint32_t threshold_2NxnU, threshold_2NxnD, threshold_nLx2N, threshold_nRx2N;
1493
1494 if (m_slice->m_sliceType == P_SLICE)
1495 {
1496 threshold_2NxnU = splitData[0].mvCost[0] + splitData[1].mvCost[0];
1497 threshold_2NxnD = splitData[2].mvCost[0] + splitData[3].mvCost[0];
1498
1499 threshold_nLx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
1500 threshold_nRx2N = splitData[1].mvCost[0] + splitData[3].mvCost[0];
1501 }
1502 else
1503 {
1504 threshold_2NxnU = (splitData[0].mvCost[0] + splitData[1].mvCost[0]
1505 + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
1506 threshold_2NxnD = (splitData[2].mvCost[0] + splitData[3].mvCost[0]
1507 + splitData[2].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
1508
1509 threshold_nLx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]
1510 + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
1511 threshold_nRx2N = (splitData[1].mvCost[0] + splitData[3].mvCost[0]
1512 + splitData[1].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
1513 }
1514
1515 bool bHor = false, bVer = false;
1516 if (bestInter->cu.m_partSize[0] == SIZE_2NxN)
1517 bHor = true;
1518 else if (bestInter->cu.m_partSize[0] == SIZE_Nx2N)
1519 bVer = true;
1520 else if (bestInter->cu.m_partSize[0] == SIZE_2Nx2N &&
1521 md.bestMode && md.bestMode->cu.getQtRootCbf(0))
1522 {
1523 bHor = true;
1524 bVer = true;
1525 }
1526
1527 if (bHor)
1528 {
1529 int try_2NxnD_first = threshold_2NxnD < threshold_2NxnU;
1530 if (try_2NxnD_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnD)
1531 {
1532 refMasks[0] = allSplitRefs; /* 75% top */
1533 refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
1534 md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
1535 checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
1536 if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
1537 bestInter = &md.pred[PRED_2NxnD];
1538 }
1539
1540 if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnU)
1541 {
1542 refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* 25% top */
1543 refMasks[1] = allSplitRefs; /* 75% bot */
1544 md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp);
1545 checkInter_rd0_4(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, refMasks);
1546 if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
1547 bestInter = &md.pred[PRED_2NxnU];
1548 }
1549
1550 if (!try_2NxnD_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnD)
1551 {
1552 refMasks[0] = allSplitRefs; /* 75% top */
1553 refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
1554 md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
1555 checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
1556 if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
1557 bestInter = &md.pred[PRED_2NxnD];
1558 }
1559 }
1560 if (bVer)
1561 {
1562 int try_nRx2N_first = threshold_nRx2N < threshold_nLx2N;
1563 if (try_nRx2N_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nRx2N)
1564 {
1565 refMasks[0] = allSplitRefs; /* 75% left */
1566 refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
1567 md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1568 checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
1569 if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
1570 bestInter = &md.pred[PRED_nRx2N];
1571 }
1572
1573 if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nLx2N)
1574 {
1575 refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* 25% left */
1576 refMasks[1] = allSplitRefs; /* 75% right */
1577 md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1578 checkInter_rd0_4(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, refMasks);
1579 if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
1580 bestInter = &md.pred[PRED_nLx2N];
1581 }
1582
1583 if (!try_nRx2N_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nRx2N)
1584 {
1585 refMasks[0] = allSplitRefs; /* 75% left */
1586 refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
1587 md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1588 checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
1589 if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
1590 bestInter = &md.pred[PRED_nRx2N];
1591 }
1592 }
1593 }
1594 }
1595 bool bTryIntra = (m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && cuGeom.log2CUSize != MAX_LOG2_CU_SIZE && !((m_param->bCTUInfo & 4) && bCtuInfoCheck);
1596 if (m_param->rdLevel >= 3)
1597 {
1598 /* Calculate RD cost of best inter option */
1599 if ((!m_bChromaSa8d && (m_csp != X265_CSP_I400)) || (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
1600 {
1601 uint32_t numPU = bestInter->cu.getNumPartInter(0);
1602 for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
1603 {
1604 PredictionUnit pu(bestInter->cu, cuGeom, puIdx);
1605 motionCompensation(bestInter->cu, pu, bestInter->predYuv, false, true);
1606 }
1607 }
1608
1609 if (!chooseMerge)
1610 {
1611 encodeResAndCalcRdInterCU(*bestInter, cuGeom);
1612 checkBestMode(*bestInter, depth);
1613
1614 /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */
1615 if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&
1616 md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17)
1617 {
1618 uint32_t numPU = md.pred[PRED_BIDIR].cu.getNumPartInter(0);
1619 if (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)
1620 for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
1621 {
1622 PredictionUnit pu(md.pred[PRED_BIDIR].cu, cuGeom, puIdx);
1623 motionCompensation(md.pred[PRED_BIDIR].cu, pu, md.pred[PRED_BIDIR].predYuv, true, true);
1624 }
1625 encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
1626 checkBestMode(md.pred[PRED_BIDIR], depth);
1627 }
1628 }
1629
1630 if ((bTryIntra && md.bestMode->cu.getQtRootCbf(0)) ||
1631 md.bestMode->sa8dCost == MAX_INT64)
1632 {
1633 if (!m_param->limitReferences || splitIntra)
1634 {
1635 ProfileCounter(parentCTU, totalIntraCU[cuGeom.depth]);
1636 md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
1637 checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
1638 encodeIntraInInter(md.pred[PRED_INTRA], cuGeom);
1639 checkBestMode(md.pred[PRED_INTRA], depth);
1640 }
1641 else
1642 {
1643 ProfileCounter(parentCTU, skippedIntraCU[cuGeom.depth]);
1644 }
1645 }
1646 }
1647 else
1648 {
1649 /* SA8D choice between merge/skip, inter, bidir, and intra */
1650 if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
1651 md.bestMode = bestInter;
1652
1653 if (m_slice->m_sliceType == B_SLICE &&
1654 md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost)
1655 md.bestMode = &md.pred[PRED_BIDIR];
1656
1657 if (bTryIntra || md.bestMode->sa8dCost == MAX_INT64)
1658 {
1659 if (!m_param->limitReferences || splitIntra)
1660 {
1661 ProfileCounter(parentCTU, totalIntraCU[cuGeom.depth]);
1662 md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
1663 checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
1664 if (md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
1665 md.bestMode = &md.pred[PRED_INTRA];
1666 }
1667 else
1668 {
1669 ProfileCounter(parentCTU, skippedIntraCU[cuGeom.depth]);
1670 }
1671 }
1672
1673 /* finally code the best mode selected by SA8D costs:
1674 * RD level 2 - fully encode the best mode
1675 * RD level 1 - generate recon pixels
1676 * RD level 0 - generate chroma prediction */
1677 if (md.bestMode->cu.m_mergeFlag[0] && md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N)
1678 {
1679 /* prediction already generated for this CU, and if rd level
1680 * is not 0, it is already fully encoded */
1681 }
1682 else if (md.bestMode->cu.isInter(0))
1683 {
1684 uint32_t numPU = md.bestMode->cu.getNumPartInter(0);
1685 if (m_csp != X265_CSP_I400)
1686 {
1687 for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
1688 {
1689 PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
1690 motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, true);
1691 }
1692 }
1693 if (m_param->rdLevel == 2)
1694 encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
1695 else if (m_param->rdLevel == 1)
1696 {
1697 /* generate recon pixels with no rate distortion considerations */
1698 CUData& cu = md.bestMode->cu;
1699
1700 uint32_t tuDepthRange[2];
1701 cu.getInterTUQtDepthRange(tuDepthRange, 0);
1702 m_rqt[cuGeom.depth].tmpResiYuv.subtract(*md.bestMode->fencYuv, md.bestMode->predYuv, cuGeom.log2CUSize, m_frame->m_fencPic->m_picCsp);
1703 residualTransformQuantInter(*md.bestMode, cuGeom, 0, 0, tuDepthRange);
1704 if (cu.getQtRootCbf(0))
1705 md.bestMode->reconYuv.addClip(md.bestMode->predYuv, m_rqt[cuGeom.depth].tmpResiYuv, cu.m_log2CUSize[0], m_frame->m_fencPic->m_picCsp);
1706 else
1707 {
1708 md.bestMode->reconYuv.copyFromYuv(md.bestMode->predYuv);
1709 if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N)
1710 cu.setPredModeSubParts(MODE_SKIP);
1711 }
1712 }
1713 }
1714 else
1715 {
1716 if (m_param->rdLevel == 2)
1717 encodeIntraInInter(*md.bestMode, cuGeom);
1718 else if (m_param->rdLevel == 1)
1719 {
1720 /* generate recon pixels with no rate distortion considerations */
1721 CUData& cu = md.bestMode->cu;
1722
1723 uint32_t tuDepthRange[2];
1724 cu.getIntraTUQtDepthRange(tuDepthRange, 0);
1725
1726 residualTransformQuantIntra(*md.bestMode, cuGeom, 0, 0, tuDepthRange);
1727 if (m_csp != X265_CSP_I400)
1728 {
1729 getBestIntraModeChroma(*md.bestMode, cuGeom);
1730 residualQTIntraChroma(*md.bestMode, cuGeom, 0, 0);
1731 }
1732 md.bestMode->reconYuv.copyFromPicYuv(reconPic, cu.m_cuAddr, cuGeom.absPartIdx); // TODO:
1733 }
1734 }
1735 }
1736 } // !earlyskip
1737
1738 if (m_bTryLossless)
1739 tryLossless(cuGeom);
1740
1741 if (mightSplit)
1742 addSplitFlagCost(*md.bestMode, cuGeom.depth);
1743 }
1744
1745 if (mightSplit && !skipRecursion)
1746 {
1747 Mode* splitPred = &md.pred[PRED_SPLIT];
1748 if (!md.bestMode)
1749 md.bestMode = splitPred;
1750 else if (m_param->rdLevel > 1)
1751 checkBestMode(*splitPred, cuGeom.depth);
1752 else if (splitPred->sa8dCost < md.bestMode->sa8dCost)
1753 md.bestMode = splitPred;
1754
1755 checkDQPForSplitPred(*md.bestMode, cuGeom);
1756 }
1757
1758 /* determine which motion references the parent CU should search */
1759 splitCUData.initSplitCUData();
1760
1761 if (m_param->limitReferences & X265_REF_LIMIT_DEPTH)
1762 {
1763 if (md.bestMode == &md.pred[PRED_SPLIT])
1764 splitCUData.splitRefs = allSplitRefs;
1765 else
1766 {
1767 /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
1768 CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
1769 uint32_t numPU = cu.getNumPartInter(0);
1770 for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
1771 splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx);
1772 }
1773 }
1774
1775 if (m_param->limitModes)
1776 {
1777 splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0
1778 splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1
1779 splitCUData.sa8dCost = md.pred[PRED_2Nx2N].sa8dCost;
1780 }
1781
1782 if (mightNotSplit && md.bestMode->cu.isSkipped(0))
1783 {
1784 FrameData& curEncData = *m_frame->m_encData;
1785 FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
1786 uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
1787 cuStat.count[depth] += 1;
1788 cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth];
1789 }
1790
1791 /* Copy best data to encData CTU and recon */
1792 md.bestMode->cu.copyToPic(depth);
1793 if (m_param->rdLevel)
1794 md.bestMode->reconYuv.copyToPicYuv(reconPic, cuAddr, cuGeom.absPartIdx);
1795
1796 if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
1797 {
1798 if (mightNotSplit)
1799 {
1800 CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr);
1801 int8_t maxTUDepth = -1;
1802 for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
1803 maxTUDepth = X265_MAX(maxTUDepth, md.bestMode->cu.m_tuDepth[i]);
1804 ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth;
1805 }
1806 }
1807 }
1808 else
1809 {
1810 if (m_param->bAnalysisType == AVC_INFO && cuGeom.numPartitions <= 16)
1811 {
1812 qprdRefine(parentCTU, cuGeom, qp, qp);
1813
1814 SplitData splitData[4];
1815 splitData[0].initSplitCUData();
1816 splitData[1].initSplitCUData();
1817 splitData[2].initSplitCUData();
1818 splitData[3].initSplitCUData();
1819
1820 uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
1821
1822 splitCUData.initSplitCUData();
1823
1824 if (m_param->limitReferences & X265_REF_LIMIT_DEPTH)
1825 {
1826 if (md.bestMode == &md.pred[PRED_SPLIT])
1827 splitCUData.splitRefs = allSplitRefs;
1828 else
1829 {
1830 /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
1831 CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
1832 uint32_t numPU = cu.getNumPartInter(0);
1833 for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
1834 splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx);
1835 }
1836 }
1837
1838 if (m_param->limitModes)
1839 {
1840 splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0
1841 splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1
1842 splitCUData.sa8dCost = md.pred[PRED_2Nx2N].sa8dCost;
1843 }
1844 }
1845 }
1846
1847 return splitCUData;
1848 }
1849
compressInterCU_rd5_6(const CUData & parentCTU,const CUGeom & cuGeom,int32_t qp)1850 SplitData Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
1851 {
1852 if (parentCTU.m_vbvAffected && !calculateQpforCuSize(parentCTU, cuGeom, 1))
1853 return compressInterCU_rd0_4(parentCTU, cuGeom, qp);
1854
1855 uint32_t depth = cuGeom.depth;
1856 ModeDepth& md = m_modeDepth[depth];
1857 md.bestMode = NULL;
1858
1859 if (m_param->searchMethod == X265_SEA)
1860 {
1861 int numPredDir = m_slice->isInterP() ? 1 : 2;
1862 int offset = (int)(m_frame->m_reconPic->m_cuOffsetY[parentCTU.m_cuAddr] + m_frame->m_reconPic->m_buOffsetY[cuGeom.absPartIdx]);
1863 for (int list = 0; list < numPredDir; list++)
1864 for (int i = 0; i < m_frame->m_encData->m_slice->m_numRefIdx[list]; i++)
1865 for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
1866 m_modeDepth[depth].fencYuv.m_integral[list][i][planes] = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_encData->m_meIntegral[planes] + offset;
1867 }
1868
1869 SplitData splitCUData;
1870
1871 bool bHEVCBlockAnalysis = (m_param->bAnalysisType == AVC_INFO && cuGeom.numPartitions > 16);
1872 bool bRefineAVCAnalysis = (m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1]));
1873 bool bNooffloading = !(m_param->bAnalysisType == AVC_INFO);
1874
1875 if (bHEVCBlockAnalysis || bRefineAVCAnalysis || bNooffloading)
1876 {
1877 bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
1878 bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
1879 bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
1880 bool skipRecursion = false;
1881 bool skipModes = false;
1882 bool splitIntra = true;
1883 bool skipRectAmp = false;
1884 bool bCtuInfoCheck = false;
1885 int sameContentRef = 0;
1886
1887 if (m_evaluateInter)
1888 {
1889 if (m_refineLevel == 2)
1890 {
1891 if (parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP)
1892 skipModes = true;
1893 if (parentCTU.m_partSize[cuGeom.absPartIdx] == SIZE_2Nx2N)
1894 skipRectAmp = true;
1895 }
1896 mightSplit &= false;
1897 }
1898
1899 // avoid uninitialize value in below reference
1900 if (m_param->limitModes)
1901 {
1902 md.pred[PRED_2Nx2N].bestME[0][0].mvCost = 0; // L0
1903 md.pred[PRED_2Nx2N].bestME[0][1].mvCost = 0; // L1
1904 md.pred[PRED_2Nx2N].rdCost = 0;
1905 }
1906
1907 if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
1908 m_maxTUDepth = loadTUDepth(cuGeom, parentCTU);
1909
1910 SplitData splitData[4];
1911 splitData[0].initSplitCUData();
1912 splitData[1].initSplitCUData();
1913 splitData[2].initSplitCUData();
1914 splitData[3].initSplitCUData();
1915 uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
1916 uint32_t refMasks[2];
1917 if (m_param->bCTUInfo && depth <= parentCTU.m_cuDepth[cuGeom.absPartIdx])
1918 {
1919 if (bDecidedDepth && m_additionalCtuInfo[cuGeom.absPartIdx])
1920 sameContentRef = findSameContentRefCount(parentCTU, cuGeom);
1921 if (depth < parentCTU.m_cuDepth[cuGeom.absPartIdx])
1922 {
1923 mightNotSplit &= bDecidedDepth;
1924 bCtuInfoCheck = skipRecursion = false;
1925 skipModes = true;
1926 }
1927 else if (mightNotSplit && bDecidedDepth)
1928 {
1929 if (m_additionalCtuInfo[cuGeom.absPartIdx])
1930 {
1931 bCtuInfoCheck = skipRecursion = true;
1932 refMasks[0] = allSplitRefs;
1933 md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1934 checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
1935 checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
1936 if (!sameContentRef)
1937 {
1938 if ((m_param->bCTUInfo & 2) && (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth))
1939 {
1940 qp -= int32_t(0.04 * qp);
1941 setLambdaFromQP(parentCTU, qp);
1942 }
1943 if (m_param->bCTUInfo & 4)
1944 skipModes = false;
1945 }
1946 if (sameContentRef || (!sameContentRef && !(m_param->bCTUInfo & 4)))
1947 {
1948 if (m_param->rdLevel)
1949 skipModes = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0);
1950 if ((m_param->bCTUInfo & 4) && sameContentRef)
1951 skipModes = md.bestMode && true;
1952 }
1953 }
1954 else
1955 {
1956 md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1957 md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1958 checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
1959 skipModes = !!m_param->bEnableEarlySkip && md.bestMode;
1960 refMasks[0] = allSplitRefs;
1961 md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1962 checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
1963 checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
1964 }
1965 mightSplit &= !bDecidedDepth;
1966 }
1967 }
1968 if (m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10)
1969 {
1970 if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx])
1971 {
1972 if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
1973 {
1974 md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1975 md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1976 checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
1977 skipModes = !!m_param->bEnableEarlySkip && md.bestMode;
1978 refMasks[0] = allSplitRefs;
1979 md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1980 checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
1981 checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
1982
1983 if (m_param->recursionSkipMode && depth && m_modeDepth[depth - 1].bestMode)
1984 skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
1985 }
1986 if (m_param->analysisLoadReuseLevel > 4 && m_reusePartSize[cuGeom.absPartIdx] == SIZE_2Nx2N)
1987 skipRectAmp = true && !!md.bestMode;
1988 }
1989 }
1990
1991 if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU)
1992 {
1993 if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx])
1994 {
1995 if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
1996 {
1997 md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1998 md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1999 checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
2000
2001 skipModes = !!m_param->bEnableEarlySkip && md.bestMode;
2002 refMasks[0] = allSplitRefs;
2003 md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
2004 checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
2005 checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
2006
2007 if (m_param->recursionSkipMode && depth && m_modeDepth[depth - 1].bestMode)
2008 skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
2009 }
2010 }
2011 }
2012 /* Step 1. Evaluate Merge/Skip candidates for likely early-outs */
2013 if ((mightNotSplit && !md.bestMode && !bCtuInfoCheck) ||
2014 (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1])))
2015 {
2016 md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
2017 md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
2018 checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
2019 skipModes = (m_param->bEnableEarlySkip || m_refineLevel == 2) &&
2020 md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
2021 refMasks[0] = allSplitRefs;
2022 md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
2023 checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
2024 checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
2025
2026 if (m_param->recursionSkipMode == RDCOST_BASED_RSKIP && depth && m_modeDepth[depth - 1].bestMode)
2027 skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
2028 else if (cuGeom.log2CUSize >= MAX_LOG2_CU_SIZE - 1 && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
2029 skipRecursion = md.bestMode && complexityCheckCU(*md.bestMode);
2030 }
2031 if (m_param->bAnalysisType == AVC_INFO && md.bestMode && cuGeom.numPartitions <= 16 && m_param->analysisLoadReuseLevel == 7)
2032 skipRecursion = true;
2033 // estimate split cost
2034 /* Step 2. Evaluate each of the 4 split sub-blocks in series */
2035 if (mightSplit && !skipRecursion)
2036 {
2037 if (bCtuInfoCheck && m_param->bCTUInfo & 2)
2038 qp = int((1 / 0.96) * qp + 0.5);
2039 Mode* splitPred = &md.pred[PRED_SPLIT];
2040 splitPred->initCosts();
2041 CUData* splitCU = &splitPred->cu;
2042 splitCU->initSubCU(parentCTU, cuGeom, qp);
2043
2044 uint32_t nextDepth = depth + 1;
2045 ModeDepth& nd = m_modeDepth[nextDepth];
2046 invalidateContexts(nextDepth);
2047 Entropy* nextContext = &m_rqt[depth].cur;
2048 int nextQP = qp;
2049 splitIntra = false;
2050
2051 for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
2052 {
2053 const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
2054 if (childGeom.flags & CUGeom::PRESENT)
2055 {
2056 m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
2057 m_rqt[nextDepth].cur.load(*nextContext);
2058
2059 if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
2060 nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
2061
2062 splitData[subPartIdx] = compressInterCU_rd5_6(parentCTU, childGeom, nextQP);
2063
2064 // Save best CU and pred data for this sub CU
2065 splitIntra |= nd.bestMode->cu.isIntra(0);
2066 splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
2067 splitPred->addSubCosts(*nd.bestMode);
2068 nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
2069 nextContext = &nd.bestMode->contexts;
2070 }
2071 else
2072 {
2073 splitCU->setEmptyPart(childGeom, subPartIdx);
2074 }
2075 }
2076 nextContext->store(splitPred->contexts);
2077 if (mightNotSplit)
2078 addSplitFlagCost(*splitPred, cuGeom.depth);
2079 else
2080 updateModeCost(*splitPred);
2081
2082 checkDQPForSplitPred(*splitPred, cuGeom);
2083 }
2084 /* If analysis mode is simple do not Evaluate other modes */
2085 if (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7)
2086 {
2087 if (m_slice->m_sliceType == P_SLICE)
2088 {
2089 if (m_checkMergeAndSkipOnly[0])
2090 skipModes = true;
2091 }
2092 else
2093 {
2094 if (m_checkMergeAndSkipOnly[0] && m_checkMergeAndSkipOnly[1])
2095 skipModes = true;
2096 }
2097 }
2098 /* Split CUs
2099 * 0 1
2100 * 2 3 */
2101 allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
2102 /* Step 3. Evaluate ME (2Nx2N, rect, amp) and intra modes at current depth */
2103 if (mightNotSplit)
2104 {
2105 if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth && m_slice->m_pps->maxCuDQPDepth != 0)
2106 setLambdaFromQP(parentCTU, qp);
2107
2108 if (!skipModes)
2109 {
2110 refMasks[0] = allSplitRefs;
2111
2112 if (m_param->limitReferences & X265_REF_LIMIT_CU)
2113 {
2114 CUData& cu = md.pred[PRED_2Nx2N].cu;
2115 uint32_t refMask = cu.getBestRefIdx(0);
2116 allSplitRefs = splitData[0].splitRefs = splitData[1].splitRefs = splitData[2].splitRefs = splitData[3].splitRefs = refMask;
2117 }
2118
2119 if (m_slice->m_sliceType == B_SLICE)
2120 {
2121 md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom, qp);
2122 checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
2123 if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
2124 {
2125 uint32_t numPU = md.pred[PRED_BIDIR].cu.getNumPartInter(0);
2126 if (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)
2127 for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
2128 {
2129 PredictionUnit pu(md.pred[PRED_BIDIR].cu, cuGeom, puIdx);
2130 motionCompensation(md.pred[PRED_BIDIR].cu, pu, md.pred[PRED_BIDIR].predYuv, true, true);
2131 }
2132 encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
2133 checkBestMode(md.pred[PRED_BIDIR], cuGeom.depth);
2134 }
2135 }
2136
2137 if (!skipRectAmp)
2138 {
2139 if (m_param->bEnableRectInter)
2140 {
2141 uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
2142 uint32_t threshold_2NxN, threshold_Nx2N;
2143
2144 if (m_slice->m_sliceType == P_SLICE)
2145 {
2146 threshold_2NxN = splitData[0].mvCost[0] + splitData[1].mvCost[0];
2147 threshold_Nx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
2148 }
2149 else
2150 {
2151 threshold_2NxN = (splitData[0].mvCost[0] + splitData[1].mvCost[0]
2152 + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
2153 threshold_Nx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]
2154 + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
2155 }
2156
2157 int try_2NxN_first = threshold_2NxN < threshold_Nx2N;
2158 if (try_2NxN_first && splitCost < md.bestMode->rdCost + threshold_2NxN)
2159 {
2160 refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
2161 refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
2162 md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
2163 checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
2164 checkBestMode(md.pred[PRED_2NxN], cuGeom.depth);
2165 }
2166
2167 if (splitCost < md.bestMode->rdCost + threshold_Nx2N)
2168 {
2169 refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* left */
2170 refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* right */
2171 md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
2172 checkInter_rd5_6(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks);
2173 checkBestMode(md.pred[PRED_Nx2N], cuGeom.depth);
2174 }
2175
2176 if (!try_2NxN_first && splitCost < md.bestMode->rdCost + threshold_2NxN)
2177 {
2178 refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
2179 refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
2180 md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
2181 checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
2182 checkBestMode(md.pred[PRED_2NxN], cuGeom.depth);
2183 }
2184 }
2185
2186 // Try AMP (SIZE_2NxnU, SIZE_2NxnD, SIZE_nLx2N, SIZE_nRx2N)
2187 if (m_slice->m_sps->maxAMPDepth > depth)
2188 {
2189 uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
2190 uint32_t threshold_2NxnU, threshold_2NxnD, threshold_nLx2N, threshold_nRx2N;
2191
2192 if (m_slice->m_sliceType == P_SLICE)
2193 {
2194 threshold_2NxnU = splitData[0].mvCost[0] + splitData[1].mvCost[0];
2195 threshold_2NxnD = splitData[2].mvCost[0] + splitData[3].mvCost[0];
2196
2197 threshold_nLx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
2198 threshold_nRx2N = splitData[1].mvCost[0] + splitData[3].mvCost[0];
2199 }
2200 else
2201 {
2202 threshold_2NxnU = (splitData[0].mvCost[0] + splitData[1].mvCost[0]
2203 + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
2204 threshold_2NxnD = (splitData[2].mvCost[0] + splitData[3].mvCost[0]
2205 + splitData[2].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
2206
2207 threshold_nLx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]
2208 + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
2209 threshold_nRx2N = (splitData[1].mvCost[0] + splitData[3].mvCost[0]
2210 + splitData[1].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
2211 }
2212
2213 bool bHor = false, bVer = false;
2214 if (md.bestMode->cu.m_partSize[0] == SIZE_2NxN)
2215 bHor = true;
2216 else if (md.bestMode->cu.m_partSize[0] == SIZE_Nx2N)
2217 bVer = true;
2218 else if (md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N && !md.bestMode->cu.m_mergeFlag[0])
2219 {
2220 bHor = true;
2221 bVer = true;
2222 }
2223
2224 if (bHor)
2225 {
2226 int try_2NxnD_first = threshold_2NxnD < threshold_2NxnU;
2227 if (try_2NxnD_first && splitCost < md.bestMode->rdCost + threshold_2NxnD)
2228 {
2229 refMasks[0] = allSplitRefs; /* 75% top */
2230 refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
2231 md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
2232 checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
2233 checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth);
2234 }
2235
2236 if (splitCost < md.bestMode->rdCost + threshold_2NxnU)
2237 {
2238 refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* 25% top */
2239 refMasks[1] = allSplitRefs; /* 75% bot */
2240 md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp);
2241 checkInter_rd5_6(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, refMasks);
2242 checkBestMode(md.pred[PRED_2NxnU], cuGeom.depth);
2243 }
2244
2245 if (!try_2NxnD_first && splitCost < md.bestMode->rdCost + threshold_2NxnD)
2246 {
2247 refMasks[0] = allSplitRefs; /* 75% top */
2248 refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
2249 md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
2250 checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
2251 checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth);
2252 }
2253 }
2254
2255 if (bVer)
2256 {
2257 int try_nRx2N_first = threshold_nRx2N < threshold_nLx2N;
2258 if (try_nRx2N_first && splitCost < md.bestMode->rdCost + threshold_nRx2N)
2259 {
2260 refMasks[0] = allSplitRefs; /* 75% left */
2261 refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
2262 md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
2263 checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
2264 checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth);
2265 }
2266
2267 if (splitCost < md.bestMode->rdCost + threshold_nLx2N)
2268 {
2269 refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* 25% left */
2270 refMasks[1] = allSplitRefs; /* 75% right */
2271 md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp);
2272 checkInter_rd5_6(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, refMasks);
2273 checkBestMode(md.pred[PRED_nLx2N], cuGeom.depth);
2274 }
2275
2276 if (!try_nRx2N_first && splitCost < md.bestMode->rdCost + threshold_nRx2N)
2277 {
2278 refMasks[0] = allSplitRefs; /* 75% left */
2279 refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
2280 md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
2281 checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
2282 checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth);
2283 }
2284 }
2285 }
2286 }
2287
2288 if ((m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && (cuGeom.log2CUSize != MAX_LOG2_CU_SIZE) && !((m_param->bCTUInfo & 4) && bCtuInfoCheck))
2289 {
2290 if (!m_param->limitReferences || splitIntra)
2291 {
2292 ProfileCounter(parentCTU, totalIntraCU[cuGeom.depth]);
2293 md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
2294 checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N);
2295 checkBestMode(md.pred[PRED_INTRA], depth);
2296
2297 if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
2298 {
2299 md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
2300 checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN);
2301 checkBestMode(md.pred[PRED_INTRA_NxN], depth);
2302 }
2303 }
2304 else
2305 {
2306 ProfileCounter(parentCTU, skippedIntraCU[cuGeom.depth]);
2307 }
2308 }
2309 }
2310
2311 if ((md.bestMode->cu.isInter(0) && !(md.bestMode->cu.m_mergeFlag[0] && md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N)) && (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400))
2312 {
2313 uint32_t numPU = md.bestMode->cu.getNumPartInter(0);
2314
2315 for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
2316 {
2317 PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
2318 motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, m_csp != X265_CSP_I400);
2319 }
2320 encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
2321 }
2322 if (m_bTryLossless)
2323 tryLossless(cuGeom);
2324
2325 if (mightSplit)
2326 addSplitFlagCost(*md.bestMode, cuGeom.depth);
2327 }
2328
2329 if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
2330 {
2331 if (mightNotSplit)
2332 {
2333 CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr);
2334 int8_t maxTUDepth = -1;
2335 for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
2336 maxTUDepth = X265_MAX(maxTUDepth, md.bestMode->cu.m_tuDepth[i]);
2337 ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth;
2338 }
2339 }
2340
2341 /* compare split RD cost against best cost */
2342 if (mightSplit && !skipRecursion)
2343 checkBestMode(md.pred[PRED_SPLIT], depth);
2344
2345 if (m_param->bEnableRdRefine && depth <= m_slice->m_pps->maxCuDQPDepth)
2346 {
2347 int cuIdx = (cuGeom.childOffset - 1) / 3;
2348 cacheCost[cuIdx] = md.bestMode->rdCost;
2349 }
2350
2351 /* determine which motion references the parent CU should search */
2352 splitCUData.initSplitCUData();
2353 if (m_param->limitReferences & X265_REF_LIMIT_DEPTH)
2354 {
2355 if (md.bestMode == &md.pred[PRED_SPLIT])
2356 splitCUData.splitRefs = allSplitRefs;
2357 else
2358 {
2359 /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
2360 CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
2361 uint32_t numPU = cu.getNumPartInter(0);
2362 for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
2363 splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx);
2364 }
2365 }
2366
2367 if (m_param->limitModes)
2368 {
2369 splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0
2370 splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1
2371 splitCUData.sa8dCost = md.pred[PRED_2Nx2N].rdCost;
2372 }
2373
2374 /* Copy best data to encData CTU and recon */
2375 md.bestMode->cu.copyToPic(depth);
2376 md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
2377 }
2378 else
2379 {
2380 if (m_param->bAnalysisType == AVC_INFO && cuGeom.numPartitions <= 16)
2381 {
2382 qprdRefine(parentCTU, cuGeom, qp, qp);
2383
2384 SplitData splitData[4];
2385 splitData[0].initSplitCUData();
2386 splitData[1].initSplitCUData();
2387 splitData[2].initSplitCUData();
2388 splitData[3].initSplitCUData();
2389
2390 uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
2391
2392 splitCUData.initSplitCUData();
2393 if (m_param->limitReferences & X265_REF_LIMIT_DEPTH)
2394 {
2395 if (md.bestMode == &md.pred[PRED_SPLIT])
2396 splitCUData.splitRefs = allSplitRefs;
2397 else
2398 {
2399 /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
2400 CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
2401 uint32_t numPU = cu.getNumPartInter(0);
2402 for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
2403 splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx);
2404 }
2405 }
2406
2407 if (m_param->limitModes)
2408 {
2409 splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0
2410 splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1
2411 splitCUData.sa8dCost = md.pred[PRED_2Nx2N].rdCost;
2412 }
2413 }
2414 }
2415
2416 return splitCUData;
2417 }
2418
recodeCU(const CUData & parentCTU,const CUGeom & cuGeom,int32_t qp,int32_t lqp)2419 void Analysis::recodeCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t lqp)
2420 {
2421 uint32_t depth = cuGeom.depth;
2422 ModeDepth& md = m_modeDepth[depth];
2423 md.bestMode = NULL;
2424
2425 m_evaluateInter = 0;
2426 bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
2427 bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
2428 bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
2429 int split = 0;
2430
2431 TrainingData td;
2432 td.init(parentCTU, cuGeom);
2433
2434 if (!m_param->bDynamicRefine)
2435 m_refineLevel = m_param->interRefine;
2436 else
2437 m_refineLevel = m_frame->m_classifyFrame ? 1 : 3;
2438
2439 if (m_param->interRefine == 1)
2440 split = (m_param->scaleFactor && bDecidedDepth && parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP && (!mightNotSplit ||
2441 (m_refineLevel && cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1))));
2442 else
2443 split = (m_param->scaleFactor && bDecidedDepth && (!mightNotSplit ||
2444 (m_refineLevel && cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1))));
2445 td.split = split;
2446
2447 if ((bDecidedDepth && mightNotSplit) || (m_param->bAnalysisType == HEVC_INFO && parentCTU.m_cuDepth[cuGeom.absPartIdx] == 4))
2448 {
2449 setLambdaFromQP(parentCTU, qp, lqp);
2450
2451 Mode& mode = md.pred[0];
2452 md.bestMode = &mode;
2453 mode.cu.initSubCU(parentCTU, cuGeom, qp);
2454 PartSize size = (PartSize)parentCTU.m_partSize[cuGeom.absPartIdx];
2455 if (parentCTU.isIntra(cuGeom.absPartIdx) && m_refineLevel < 2)
2456 {
2457 if (m_param->intraRefine == 4)
2458 compressIntraCU(parentCTU, cuGeom, qp);
2459 else
2460 {
2461 bool reuseModes = !((m_param->intraRefine == 3) ||
2462 (m_param->intraRefine == 2 && parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] > DC_IDX));
2463 if (reuseModes)
2464 {
2465 memcpy(mode.cu.m_lumaIntraDir, parentCTU.m_lumaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
2466 memcpy(mode.cu.m_chromaIntraDir, parentCTU.m_chromaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
2467 }
2468 checkIntra(mode, cuGeom, size);
2469 }
2470 }
2471 else if (!parentCTU.isIntra(cuGeom.absPartIdx) && m_refineLevel < 2)
2472 {
2473 mode.cu.copyFromPic(parentCTU, cuGeom, m_csp, false);
2474 uint32_t numPU = parentCTU.getNumPartInter(cuGeom.absPartIdx);
2475 for (uint32_t part = 0; part < numPU; part++)
2476 {
2477 PredictionUnit pu(mode.cu, cuGeom, part);
2478 if (m_param->analysisLoadReuseLevel == 10 || (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel >= 7))
2479 {
2480 x265_analysis_inter_data* interDataCTU = m_frame->m_analysisData.interData;
2481 int cuIdx = (mode.cu.m_cuAddr * parentCTU.m_numPartitions) + cuGeom.absPartIdx;
2482 mode.cu.m_mergeFlag[pu.puAbsPartIdx] = interDataCTU->mergeFlag[cuIdx + part];
2483 mode.cu.setPUInterDir(interDataCTU->interDir[cuIdx + part], pu.puAbsPartIdx, part);
2484 for (int list = 0; list < m_slice->isInterB() + 1; list++)
2485 {
2486 mode.cu.setPUMv(list, interDataCTU->mv[list][cuIdx + part].word, pu.puAbsPartIdx, part);
2487 mode.cu.setPURefIdx(list, interDataCTU->refIdx[list][cuIdx + part], pu.puAbsPartIdx, part);
2488 mode.cu.m_mvpIdx[list][pu.puAbsPartIdx] = interDataCTU->mvpIdx[list][cuIdx + part];
2489 }
2490 if (!mode.cu.m_mergeFlag[pu.puAbsPartIdx])
2491 {
2492 if (m_param->interRefine == 1)
2493 m_me.setSourcePU(*mode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, false);
2494 //AMVP
2495 MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
2496 mode.cu.getNeighbourMV(part, pu.puAbsPartIdx, mode.interNeighbours);
2497 for (int list = 0; list < m_slice->isInterB() + 1; list++)
2498 {
2499 int ref = mode.cu.m_refIdx[list][pu.puAbsPartIdx];
2500 if (ref == -1)
2501 continue;
2502 MV mvp;
2503
2504 int numMvc = mode.cu.getPMV(mode.interNeighbours, list, ref, mode.amvpCand[list][ref], mvc);
2505 mvp = mode.amvpCand[list][ref][mode.cu.m_mvpIdx[list][pu.puAbsPartIdx]];
2506 if (m_param->interRefine == 1)
2507 {
2508 MV outmv, mvpSelect[3];
2509 mvpSelect[0] = interDataCTU->mv[list][cuIdx + part].word;
2510 if (m_param->mvRefine > 1)
2511 {
2512 mvpSelect[1] = mvp;
2513 if(m_param->mvRefine > 2)
2514 mvpSelect[2] = mode.amvpCand[list][ref][!(mode.cu.m_mvpIdx[list][pu.puAbsPartIdx])];
2515 }
2516 searchMV(mode, list, ref, outmv, mvpSelect, numMvc, mvc);
2517 mode.cu.setPUMv(list, outmv, pu.puAbsPartIdx, part);
2518 }
2519 mode.cu.m_mvd[list][pu.puAbsPartIdx] = mode.cu.m_mv[list][pu.puAbsPartIdx] - mode.amvpCand[list][ref][mode.cu.m_mvpIdx[list][pu.puAbsPartIdx]]/*mvp*/;
2520 }
2521 }
2522 else
2523 {
2524 MVField candMvField[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
2525 uint8_t candDir[MRG_MAX_NUM_CANDS];
2526 mode.cu.getInterMergeCandidates(pu.puAbsPartIdx, part, candMvField, candDir);
2527 uint8_t mvpIdx = mode.cu.m_mvpIdx[0][pu.puAbsPartIdx];
2528 if (mode.cu.isBipredRestriction())
2529 {
2530 /* do not allow bidir merge candidates if PU is smaller than 8x8, drop L1 reference */
2531 if (candDir[mvpIdx] == 3)
2532 {
2533 candDir[mvpIdx] = 1;
2534 candMvField[mvpIdx][1].refIdx = REF_NOT_VALID;
2535 }
2536 }
2537 mode.cu.setPUInterDir(candDir[mvpIdx], pu.puAbsPartIdx, part);
2538 mode.cu.setPUMv(0, candMvField[mvpIdx][0].mv, pu.puAbsPartIdx, part);
2539 mode.cu.setPUMv(1, candMvField[mvpIdx][1].mv, pu.puAbsPartIdx, part);
2540 mode.cu.setPURefIdx(0, (int8_t)candMvField[mvpIdx][0].refIdx, pu.puAbsPartIdx, part);
2541 mode.cu.setPURefIdx(1, (int8_t)candMvField[mvpIdx][1].refIdx, pu.puAbsPartIdx, part);
2542 }
2543 }
2544 motionCompensation(mode.cu, pu, mode.predYuv, true, (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400));
2545 }
2546 if (!m_param->interRefine && !m_param->bDynamicRefine && parentCTU.isSkipped(cuGeom.absPartIdx))
2547 encodeResAndCalcRdSkipCU(mode);
2548 else
2549 encodeResAndCalcRdInterCU(mode, cuGeom);
2550
2551 /* checkMerge2Nx2N function performs checkDQP after encoding residual, do the same */
2552 bool mergeInter2Nx2N = size == SIZE_2Nx2N && mode.cu.m_mergeFlag[0];
2553 if (parentCTU.isSkipped(cuGeom.absPartIdx) || mergeInter2Nx2N)
2554 checkDQP(mode, cuGeom);
2555 }
2556
2557 if (m_refineLevel < 2)
2558 {
2559 if (m_bTryLossless)
2560 tryLossless(cuGeom);
2561
2562 if (mightSplit)
2563 addSplitFlagCost(*md.bestMode, cuGeom.depth);
2564
2565 if (mightSplit && m_param->rdLevel < 5)
2566 checkDQPForSplitPred(*md.bestMode, cuGeom);
2567 }
2568
2569 if (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7)
2570 {
2571 for (int list = 0; list < m_slice->isInterB() + 1; list++)
2572 {
2573 m_modeFlag[list] = true;
2574 if (parentCTU.m_skipFlag[list][cuGeom.absPartIdx] == 1 && cuGeom.numPartitions <= 16)
2575 m_checkMergeAndSkipOnly[list] = true;
2576 }
2577 m_param->rdLevel > 4 ? compressInterCU_rd5_6(parentCTU, cuGeom, qp) : compressInterCU_rd0_4(parentCTU, cuGeom, qp);
2578 for (int list = 0; list < m_slice->isInterB() + 1; list++)
2579 {
2580 m_modeFlag[list] = false;
2581 m_checkMergeAndSkipOnly[list] = false;
2582 }
2583 }
2584
2585 if (m_param->bDynamicRefine)
2586 classifyCU(parentCTU,cuGeom, *md.bestMode, td);
2587
2588 if (m_refineLevel > 1 || (m_refineLevel && parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP && !mode.cu.isSkipped(0)))
2589 {
2590 if (parentCTU.m_cuDepth[cuGeom.absPartIdx] < 4 && mightNotSplit)
2591 m_evaluateInter = 1;
2592 else
2593 bDecidedDepth = true;
2594 m_param->rdLevel > 4 ? compressInterCU_rd5_6(parentCTU, cuGeom, qp) : compressInterCU_rd0_4(parentCTU, cuGeom, qp);
2595 m_evaluateInter = 0;
2596 }
2597 }
2598 if (!bDecidedDepth || split)
2599 {
2600 Mode* splitPred = &md.pred[PRED_SPLIT];
2601 if (!split)
2602 md.bestMode = splitPred;
2603 splitPred->initCosts();
2604 CUData* splitCU = &splitPred->cu;
2605 splitCU->initSubCU(parentCTU, cuGeom, qp);
2606
2607 uint32_t nextDepth = depth + 1;
2608 ModeDepth& nd = m_modeDepth[nextDepth];
2609 invalidateContexts(nextDepth);
2610 Entropy* nextContext = &m_rqt[depth].cur;
2611 int nextQP = qp;
2612
2613 for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
2614 {
2615 const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
2616 if (childGeom.flags & CUGeom::PRESENT)
2617 {
2618 m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
2619 m_rqt[nextDepth].cur.load(*nextContext);
2620
2621 if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
2622 nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
2623
2624 int lamdaQP = (m_param->analysisLoadReuseLevel >= 7) ? nextQP : lqp;
2625
2626 if (split)
2627 m_param->rdLevel > 4 ? compressInterCU_rd5_6(parentCTU, childGeom, nextQP) : compressInterCU_rd0_4(parentCTU, childGeom, nextQP);
2628 else
2629 qprdRefine(parentCTU, childGeom, nextQP, lamdaQP);
2630
2631 // Save best CU and pred data for this sub CU
2632 splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
2633 splitPred->addSubCosts(*nd.bestMode);
2634 nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
2635 nextContext = &nd.bestMode->contexts;
2636 }
2637 else
2638 {
2639 splitCU->setEmptyPart(childGeom, subPartIdx);
2640 // Set depth of non-present CU to 0 to ensure that correct CU is fetched as reference to code deltaQP
2641 memset(parentCTU.m_cuDepth + childGeom.absPartIdx, 0, childGeom.numPartitions);
2642 }
2643 }
2644 nextContext->store(splitPred->contexts);
2645 if (mightNotSplit)
2646 addSplitFlagCost(*splitPred, cuGeom.depth);
2647 else
2648 updateModeCost(*splitPred);
2649
2650 if (m_refineLevel)
2651 {
2652 if (m_param->rdLevel > 1)
2653 checkBestMode(*splitPred, cuGeom.depth);
2654 else if (splitPred->sa8dCost < md.bestMode->sa8dCost)
2655 md.bestMode = splitPred;
2656 }
2657
2658 checkDQPForSplitPred(*splitPred, cuGeom);
2659
2660 /* Copy best data to encData CTU and recon */
2661 md.bestMode->cu.copyToPic(depth);
2662 md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
2663 }
2664 if (m_param->bDynamicRefine && bDecidedDepth)
2665 trainCU(parentCTU, cuGeom, *md.bestMode, td);
2666 }
2667
classifyCU(const CUData & ctu,const CUGeom & cuGeom,const Mode & bestMode,TrainingData & trainData)2668 void Analysis::classifyCU(const CUData& ctu, const CUGeom& cuGeom, const Mode& bestMode, TrainingData& trainData)
2669 {
2670 uint32_t depth = cuGeom.depth;
2671 trainData.cuVariance = calculateCUVariance(ctu, cuGeom);
2672 if (m_frame->m_classifyFrame)
2673 {
2674 uint64_t diffRefine[X265_REFINE_INTER_LEVELS];
2675 uint64_t diffRefineRd[X265_REFINE_INTER_LEVELS];
2676 float probRefine[X265_REFINE_INTER_LEVELS] = { 0 };
2677 uint8_t varRefineLevel = 1;
2678 uint8_t rdRefineLevel = 1;
2679 uint64_t cuCost = bestMode.rdCost;
2680 int offset = (depth * X265_REFINE_INTER_LEVELS);
2681 if (cuCost < m_frame->m_classifyRd[offset])
2682 m_refineLevel = 1;
2683 else
2684 {
2685 uint64_t trainingCount = 0;
2686 for (uint8_t i = 0; i < X265_REFINE_INTER_LEVELS; i++)
2687 {
2688 offset = (depth * X265_REFINE_INTER_LEVELS) + i;
2689 trainingCount += m_frame->m_classifyCount[offset];
2690 }
2691 for (uint8_t i = 0; i < X265_REFINE_INTER_LEVELS; i++)
2692 {
2693 offset = (depth * X265_REFINE_INTER_LEVELS) + i;
2694 /* Calculate distance values */
2695 diffRefine[i] = abs((int64_t)(trainData.cuVariance - m_frame->m_classifyVariance[offset]));
2696 diffRefineRd[i] = abs((int64_t)(cuCost - m_frame->m_classifyRd[offset]));
2697
2698 /* Calculate prior probability - ranges between 0 and 1 */
2699 if (trainingCount)
2700 probRefine[i] = ((float)m_frame->m_classifyCount[offset] / (float)trainingCount);
2701
2702 /* Bayesian classification - P(c|x)P(x) = P(x|c)P(c)
2703 P(c|x) is the posterior probability of class given predictor.
2704 P(c) is the prior probability of class.
2705 P(x|c) is the likelihood which is the probability of predictor given class.
2706 P(x) is the prior probability of predictor.*/
2707 int curRefineLevel = m_refineLevel - 1;
2708 if ((diffRefine[i] * probRefine[curRefineLevel]) < (diffRefine[curRefineLevel] * probRefine[i]))
2709 varRefineLevel = i + 1;
2710 if ((diffRefineRd[i] * probRefine[curRefineLevel]) < (diffRefineRd[curRefineLevel] * probRefine[i]))
2711 rdRefineLevel = i + 1;
2712 }
2713 m_refineLevel = X265_MAX(varRefineLevel, rdRefineLevel);
2714 }
2715 }
2716 }
2717
trainCU(const CUData & ctu,const CUGeom & cuGeom,const Mode & bestMode,TrainingData & trainData)2718 void Analysis::trainCU(const CUData& ctu, const CUGeom& cuGeom, const Mode& bestMode, TrainingData& trainData)
2719 {
2720 uint32_t depth = cuGeom.depth;
2721 int classify = 1;
2722 if (!m_frame->m_classifyFrame)
2723 {
2724 /* classify = 1 : CUs for which the save data matches with that after encoding with refine-inter 3
2725 and CUs that has split.
2726 classify = 2 : CUs which are encoded as simple modes (Skip/Merge/2Nx2N).
2727 classify = 3 : CUs encoded as any other mode. */
2728
2729 bool refineInter0 = (trainData.predMode == ctu.m_predMode[cuGeom.absPartIdx] &&
2730 trainData.partSize == ctu.m_partSize[cuGeom.absPartIdx] &&
2731 trainData.mergeFlag == ctu.m_mergeFlag[cuGeom.absPartIdx]);
2732 bool refineInter1 = (depth == m_param->maxCUDepth - 1) && trainData.split;
2733 if (refineInter0 || refineInter1)
2734 classify = 1;
2735 else if (trainData.partSize == SIZE_2Nx2N && trainData.partSize == ctu.m_partSize[cuGeom.absPartIdx])
2736 classify = 2;
2737 else
2738 classify = 3;
2739 }
2740 else
2741 classify = m_refineLevel;
2742 uint64_t cuCost = bestMode.rdCost;
2743 int offset = (depth * X265_REFINE_INTER_LEVELS) + classify - 1;
2744 ctu.m_collectCURd[offset] += cuCost;
2745 ctu.m_collectCUVariance[offset] += trainData.cuVariance;
2746 ctu.m_collectCUCount[offset]++;
2747 }
2748
2749 /* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
checkMerge2Nx2N_rd0_4(Mode & skip,Mode & merge,const CUGeom & cuGeom)2750 void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGeom)
2751 {
2752 uint32_t depth = cuGeom.depth;
2753 ModeDepth& md = m_modeDepth[depth];
2754 Yuv *fencYuv = &md.fencYuv;
2755
2756 /* Note that these two Mode instances are named MERGE and SKIP but they may
2757 * hold the reverse when the function returns. We toggle between the two modes */
2758 Mode* tempPred = &merge;
2759 Mode* bestPred = &skip;
2760
2761 X265_CHECK(m_slice->m_sliceType != I_SLICE, "Evaluating merge in I slice\n");
2762
2763 tempPred->initCosts();
2764 tempPred->cu.setPartSizeSubParts(SIZE_2Nx2N);
2765 tempPred->cu.setPredModeSubParts(MODE_INTER);
2766 tempPred->cu.m_mergeFlag[0] = true;
2767
2768 bestPred->initCosts();
2769 bestPred->cu.setPartSizeSubParts(SIZE_2Nx2N);
2770 bestPred->cu.setPredModeSubParts(MODE_INTER);
2771 bestPred->cu.m_mergeFlag[0] = true;
2772
2773 MVField candMvField[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
2774 uint8_t candDir[MRG_MAX_NUM_CANDS];
2775 uint32_t numMergeCand = tempPred->cu.getInterMergeCandidates(0, 0, candMvField, candDir);
2776 PredictionUnit pu(merge.cu, cuGeom, 0);
2777
2778 bestPred->sa8dCost = MAX_INT64;
2779 int bestSadCand = -1;
2780 int sizeIdx = cuGeom.log2CUSize - 2;
2781 int safeX, maxSafeMv;
2782 if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE)
2783 {
2784 safeX = m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol * m_param->maxCUSize - 3;
2785 maxSafeMv = (safeX - tempPred->cu.m_cuPelX) * 4;
2786 }
2787 for (uint32_t i = 0; i < numMergeCand; ++i)
2788 {
2789 if (m_bFrameParallel)
2790 {
2791 // Parallel slices bound check
2792 if (m_param->maxSlices > 1)
2793 {
2794 // NOTE: First row in slice can't negative
2795 if (X265_MIN(candMvField[i][0].mv.y, candMvField[i][1].mv.y) < m_sliceMinY)
2796 continue;
2797
2798 // Last row in slice can't reference beyond bound since it is another slice area
2799 // TODO: we may beyond bound in future since these area have a chance to finish because we use parallel slices. Necessary prepare research on load balance
2800 if (X265_MAX(candMvField[i][0].mv.y, candMvField[i][1].mv.y) > m_sliceMaxY)
2801 continue;
2802 }
2803
2804 if (candMvField[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
2805 candMvField[i][1].mv.y >= (m_param->searchRange + 1) * 4)
2806 continue;
2807 }
2808
2809 if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE &&
2810 tempPred->cu.m_cuPelX / m_param->maxCUSize < m_frame->m_encData->m_pir.pirEndCol &&
2811 candMvField[i][0].mv.x > maxSafeMv)
2812 // skip merge candidates which reference beyond safe reference area
2813 continue;
2814
2815 tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; // merge candidate ID is stored in L0 MVP idx
2816 X265_CHECK(m_slice->m_sliceType == B_SLICE || !(candDir[i] & 0x10), " invalid merge for P slice\n");
2817 tempPred->cu.m_interDir[0] = candDir[i];
2818 tempPred->cu.m_mv[0][0] = candMvField[i][0].mv;
2819 tempPred->cu.m_mv[1][0] = candMvField[i][1].mv;
2820 tempPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx;
2821 tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx;
2822 motionCompensation(tempPred->cu, pu, tempPred->predYuv, true, m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400));
2823
2824 tempPred->sa8dBits = getTUBits(i, numMergeCand);
2825 tempPred->distortion = primitives.cu[sizeIdx].sa8d(fencYuv->m_buf[0], fencYuv->m_size, tempPred->predYuv.m_buf[0], tempPred->predYuv.m_size);
2826 if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
2827 {
2828 tempPred->distortion += primitives.chroma[m_csp].cu[sizeIdx].sa8d(fencYuv->m_buf[1], fencYuv->m_csize, tempPred->predYuv.m_buf[1], tempPred->predYuv.m_csize);
2829 tempPred->distortion += primitives.chroma[m_csp].cu[sizeIdx].sa8d(fencYuv->m_buf[2], fencYuv->m_csize, tempPred->predYuv.m_buf[2], tempPred->predYuv.m_csize);
2830 }
2831 tempPred->sa8dCost = m_rdCost.calcRdSADCost((uint32_t)tempPred->distortion, tempPred->sa8dBits);
2832
2833 if (tempPred->sa8dCost < bestPred->sa8dCost)
2834 {
2835 bestSadCand = i;
2836 std::swap(tempPred, bestPred);
2837 }
2838 }
2839
2840 /* force mode decision to take inter or intra */
2841 if (bestSadCand < 0)
2842 return;
2843
2844 /* calculate the motion compensation for chroma for the best mode selected */
2845 if ((!m_bChromaSa8d && (m_csp != X265_CSP_I400)) || (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)) /* Chroma MC was done above */
2846 motionCompensation(bestPred->cu, pu, bestPred->predYuv, false, true);
2847
2848 if (m_param->rdLevel)
2849 {
2850 if (m_param->bLossless)
2851 bestPred->rdCost = MAX_INT64;
2852 else
2853 encodeResAndCalcRdSkipCU(*bestPred);
2854
2855 /* Encode with residual */
2856 tempPred->cu.m_mvpIdx[0][0] = (uint8_t)bestSadCand;
2857 tempPred->cu.setPUInterDir(candDir[bestSadCand], 0, 0);
2858 tempPred->cu.setPUMv(0, candMvField[bestSadCand][0].mv, 0, 0);
2859 tempPred->cu.setPUMv(1, candMvField[bestSadCand][1].mv, 0, 0);
2860 tempPred->cu.setPURefIdx(0, (int8_t)candMvField[bestSadCand][0].refIdx, 0, 0);
2861 tempPred->cu.setPURefIdx(1, (int8_t)candMvField[bestSadCand][1].refIdx, 0, 0);
2862 tempPred->sa8dCost = bestPred->sa8dCost;
2863 tempPred->sa8dBits = bestPred->sa8dBits;
2864 tempPred->predYuv.copyFromYuv(bestPred->predYuv);
2865
2866 encodeResAndCalcRdInterCU(*tempPred, cuGeom);
2867
2868 md.bestMode = tempPred->rdCost < bestPred->rdCost ? tempPred : bestPred;
2869 }
2870 else
2871 md.bestMode = bestPred;
2872
2873 /* broadcast sets of MV field data */
2874 md.bestMode->cu.setPUInterDir(candDir[bestSadCand], 0, 0);
2875 md.bestMode->cu.setPUMv(0, candMvField[bestSadCand][0].mv, 0, 0);
2876 md.bestMode->cu.setPUMv(1, candMvField[bestSadCand][1].mv, 0, 0);
2877 md.bestMode->cu.setPURefIdx(0, (int8_t)candMvField[bestSadCand][0].refIdx, 0, 0);
2878 md.bestMode->cu.setPURefIdx(1, (int8_t)candMvField[bestSadCand][1].refIdx, 0, 0);
2879 checkDQP(*md.bestMode, cuGeom);
2880 }
2881
2882 /* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
checkMerge2Nx2N_rd5_6(Mode & skip,Mode & merge,const CUGeom & cuGeom)2883 void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom)
2884 {
2885 uint32_t depth = cuGeom.depth;
2886
2887 /* Note that these two Mode instances are named MERGE and SKIP but they may
2888 * hold the reverse when the function returns. We toggle between the two modes */
2889 Mode* tempPred = &merge;
2890 Mode* bestPred = &skip;
2891
2892 merge.initCosts();
2893 merge.cu.setPredModeSubParts(MODE_INTER);
2894 merge.cu.setPartSizeSubParts(SIZE_2Nx2N);
2895 merge.cu.m_mergeFlag[0] = true;
2896
2897 skip.initCosts();
2898 skip.cu.setPredModeSubParts(MODE_INTER);
2899 skip.cu.setPartSizeSubParts(SIZE_2Nx2N);
2900 skip.cu.m_mergeFlag[0] = true;
2901
2902 MVField candMvField[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
2903 uint8_t candDir[MRG_MAX_NUM_CANDS];
2904 uint32_t numMergeCand = merge.cu.getInterMergeCandidates(0, 0, candMvField, candDir);
2905 PredictionUnit pu(merge.cu, cuGeom, 0);
2906
2907 bool foundCbf0Merge = false;
2908 bool triedPZero = false, triedBZero = false;
2909 bestPred->rdCost = MAX_INT64;
2910
2911 int safeX, maxSafeMv;
2912 if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE)
2913 {
2914 safeX = m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol * m_param->maxCUSize - 3;
2915 maxSafeMv = (safeX - tempPred->cu.m_cuPelX) * 4;
2916 }
2917 for (uint32_t i = 0; i < numMergeCand; i++)
2918 {
2919 if (m_bFrameParallel)
2920 {
2921 // Parallel slices bound check
2922 if (m_param->maxSlices > 1)
2923 {
2924 // NOTE: First row in slice can't negative
2925 if (X265_MIN(candMvField[i][0].mv.y, candMvField[i][1].mv.y) < m_sliceMinY)
2926 continue;
2927
2928 // Last row in slice can't reference beyond bound since it is another slice area
2929 // TODO: we may beyond bound in future since these area have a chance to finish because we use parallel slices. Necessary prepare research on load balance
2930 if (X265_MAX(candMvField[i][0].mv.y, candMvField[i][1].mv.y) > m_sliceMaxY)
2931 continue;
2932 }
2933
2934 if (candMvField[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
2935 candMvField[i][1].mv.y >= (m_param->searchRange + 1) * 4)
2936 continue;
2937 }
2938
2939 /* the merge candidate list is packed with MV(0,0) ref 0 when it is not full */
2940 if (candDir[i] == 1 && !candMvField[i][0].mv.word && !candMvField[i][0].refIdx)
2941 {
2942 if (triedPZero)
2943 continue;
2944 triedPZero = true;
2945 }
2946 else if (candDir[i] == 3 &&
2947 !candMvField[i][0].mv.word && !candMvField[i][0].refIdx &&
2948 !candMvField[i][1].mv.word && !candMvField[i][1].refIdx)
2949 {
2950 if (triedBZero)
2951 continue;
2952 triedBZero = true;
2953 }
2954 if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE &&
2955 tempPred->cu.m_cuPelX / m_param->maxCUSize < m_frame->m_encData->m_pir.pirEndCol &&
2956 candMvField[i][0].mv.x > maxSafeMv)
2957 // skip merge candidates which reference beyond safe reference area
2958 continue;
2959 tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; /* merge candidate ID is stored in L0 MVP idx */
2960 tempPred->cu.m_interDir[0] = candDir[i];
2961 tempPred->cu.m_mv[0][0] = candMvField[i][0].mv;
2962 tempPred->cu.m_mv[1][0] = candMvField[i][1].mv;
2963 tempPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx;
2964 tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx;
2965 tempPred->cu.setPredModeSubParts(MODE_INTER); /* must be cleared between encode iterations */
2966
2967 motionCompensation(tempPred->cu, pu, tempPred->predYuv, true, m_csp != X265_CSP_I400);
2968
2969 uint8_t hasCbf = true;
2970 bool swapped = false;
2971 if (!foundCbf0Merge)
2972 {
2973 /* if the best prediction has CBF (not a skip) then try merge with residual */
2974
2975 encodeResAndCalcRdInterCU(*tempPred, cuGeom);
2976 hasCbf = tempPred->cu.getQtRootCbf(0);
2977 foundCbf0Merge = !hasCbf;
2978
2979 if (tempPred->rdCost < bestPred->rdCost)
2980 {
2981 std::swap(tempPred, bestPred);
2982 swapped = true;
2983 }
2984 }
2985 if (!m_param->bLossless && hasCbf)
2986 {
2987 /* try merge without residual (skip), if not lossless coding */
2988
2989 if (swapped)
2990 {
2991 tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i;
2992 tempPred->cu.m_interDir[0] = candDir[i];
2993 tempPred->cu.m_mv[0][0] = candMvField[i][0].mv;
2994 tempPred->cu.m_mv[1][0] = candMvField[i][1].mv;
2995 tempPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx;
2996 tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx;
2997 tempPred->cu.setPredModeSubParts(MODE_INTER);
2998 tempPred->predYuv.copyFromYuv(bestPred->predYuv);
2999 }
3000
3001 encodeResAndCalcRdSkipCU(*tempPred);
3002
3003 if (tempPred->rdCost < bestPred->rdCost)
3004 std::swap(tempPred, bestPred);
3005 }
3006 }
3007
3008 if (bestPred->rdCost < MAX_INT64)
3009 {
3010 m_modeDepth[depth].bestMode = bestPred;
3011
3012 /* broadcast sets of MV field data */
3013 uint32_t bestCand = bestPred->cu.m_mvpIdx[0][0];
3014 bestPred->cu.setPUInterDir(candDir[bestCand], 0, 0);
3015 bestPred->cu.setPUMv(0, candMvField[bestCand][0].mv, 0, 0);
3016 bestPred->cu.setPUMv(1, candMvField[bestCand][1].mv, 0, 0);
3017 bestPred->cu.setPURefIdx(0, (int8_t)candMvField[bestCand][0].refIdx, 0, 0);
3018 bestPred->cu.setPURefIdx(1, (int8_t)candMvField[bestCand][1].refIdx, 0, 0);
3019 checkDQP(*bestPred, cuGeom);
3020 }
3021 }
3022
checkInter_rd0_4(Mode & interMode,const CUGeom & cuGeom,PartSize partSize,uint32_t refMask[2])3023 void Analysis::checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refMask[2])
3024 {
3025 interMode.initCosts();
3026 interMode.cu.setPartSizeSubParts(partSize);
3027 interMode.cu.setPredModeSubParts(MODE_INTER);
3028 int numPredDir = m_slice->isInterP() ? 1 : 2;
3029
3030 if (m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10 && m_reuseInterDataCTU)
3031 {
3032 int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2;
3033 int index = 0;
3034
3035 uint32_t numPU = interMode.cu.getNumPartInter(0);
3036 for (uint32_t part = 0; part < numPU; part++)
3037 {
3038 MotionData* bestME = interMode.bestME[part];
3039 for (int32_t i = 0; i < numPredDir; i++)
3040 bestME[i].ref = m_reuseRef[refOffset + index++];
3041 }
3042 }
3043
3044 if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU)
3045 {
3046 uint32_t numPU = interMode.cu.getNumPartInter(0);
3047 for (uint32_t part = 0; part < numPU; part++)
3048 {
3049 MotionData* bestME = interMode.bestME[part];
3050 for (int32_t i = 0; i < numPredDir; i++)
3051 {
3052 int* ref = &m_reuseRef[i * m_frame->m_analysisData.numPartitions * m_frame->m_analysisData.numCUsInFrame];
3053 bestME[i].ref = ref[cuGeom.absPartIdx];
3054 bestME[i].mv = m_reuseMv[i][cuGeom.absPartIdx].word;
3055 bestME[i].mvpIdx = m_reuseMvpIdx[i][cuGeom.absPartIdx];
3056 }
3057 }
3058 }
3059 predInterSearch(interMode, cuGeom, m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400), refMask);
3060
3061 /* predInterSearch sets interMode.sa8dBits */
3062 const Yuv& fencYuv = *interMode.fencYuv;
3063 Yuv& predYuv = interMode.predYuv;
3064 int part = partitionFromLog2Size(cuGeom.log2CUSize);
3065 interMode.distortion = primitives.cu[part].sa8d(fencYuv.m_buf[0], fencYuv.m_size, predYuv.m_buf[0], predYuv.m_size);
3066 if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
3067 {
3068 interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, predYuv.m_buf[1], predYuv.m_csize);
3069 interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, predYuv.m_buf[2], predYuv.m_csize);
3070 }
3071 interMode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)interMode.distortion, interMode.sa8dBits);
3072
3073 if (m_param->analysisSaveReuseLevel > 1 && m_reuseInterDataCTU)
3074 {
3075 int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2;
3076 int index = 0;
3077
3078 uint32_t numPU = interMode.cu.getNumPartInter(0);
3079 for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
3080 {
3081 MotionData* bestME = interMode.bestME[puIdx];
3082 for (int32_t i = 0; i < numPredDir; i++)
3083 m_reuseRef[refOffset + index++] = bestME[i].ref;
3084 }
3085 }
3086 }
3087
checkInter_rd5_6(Mode & interMode,const CUGeom & cuGeom,PartSize partSize,uint32_t refMask[2])3088 void Analysis::checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refMask[2])
3089 {
3090 interMode.initCosts();
3091 interMode.cu.setPartSizeSubParts(partSize);
3092 interMode.cu.setPredModeSubParts(MODE_INTER);
3093 int numPredDir = m_slice->isInterP() ? 1 : 2;
3094
3095 if (m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10 && m_reuseInterDataCTU)
3096 {
3097 int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2;
3098 int index = 0;
3099
3100 uint32_t numPU = interMode.cu.getNumPartInter(0);
3101 for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
3102 {
3103 MotionData* bestME = interMode.bestME[puIdx];
3104 for (int32_t i = 0; i < numPredDir; i++)
3105 bestME[i].ref = m_reuseRef[refOffset + index++];
3106 }
3107 }
3108
3109 if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU)
3110 {
3111 uint32_t numPU = interMode.cu.getNumPartInter(0);
3112 for (uint32_t part = 0; part < numPU; part++)
3113 {
3114 MotionData* bestME = interMode.bestME[part];
3115 for (int32_t i = 0; i < numPredDir; i++)
3116 {
3117 int* ref = &m_reuseRef[i * m_frame->m_analysisData.numPartitions * m_frame->m_analysisData.numCUsInFrame];
3118 bestME[i].ref = ref[cuGeom.absPartIdx];
3119 bestME[i].mv = m_reuseMv[i][cuGeom.absPartIdx].word;
3120 bestME[i].mvpIdx = m_reuseMvpIdx[i][cuGeom.absPartIdx];
3121 }
3122 }
3123 }
3124
3125 predInterSearch(interMode, cuGeom, m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400, refMask);
3126
3127 /* predInterSearch sets interMode.sa8dBits, but this is ignored */
3128 encodeResAndCalcRdInterCU(interMode, cuGeom);
3129
3130 if (m_param->analysisSaveReuseLevel > 1 && m_reuseInterDataCTU)
3131 {
3132 int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2;
3133 int index = 0;
3134
3135 uint32_t numPU = interMode.cu.getNumPartInter(0);
3136 for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
3137 {
3138 MotionData* bestME = interMode.bestME[puIdx];
3139 for (int32_t i = 0; i < numPredDir; i++)
3140 m_reuseRef[refOffset + index++] = bestME[i].ref;
3141 }
3142 }
3143 }
3144
checkBidir2Nx2N(Mode & inter2Nx2N,Mode & bidir2Nx2N,const CUGeom & cuGeom)3145 void Analysis::checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const CUGeom& cuGeom)
3146 {
3147 CUData& cu = bidir2Nx2N.cu;
3148
3149 if (cu.isBipredRestriction() || inter2Nx2N.bestME[0][0].cost == MAX_UINT || inter2Nx2N.bestME[0][1].cost == MAX_UINT)
3150 {
3151 bidir2Nx2N.sa8dCost = MAX_INT64;
3152 bidir2Nx2N.rdCost = MAX_INT64;
3153 return;
3154 }
3155
3156 const Yuv& fencYuv = *bidir2Nx2N.fencYuv;
3157 MV mvzero(0, 0);
3158 int partEnum = cuGeom.log2CUSize - 2;
3159
3160 bidir2Nx2N.bestME[0][0] = inter2Nx2N.bestME[0][0];
3161 bidir2Nx2N.bestME[0][1] = inter2Nx2N.bestME[0][1];
3162 MotionData* bestME = bidir2Nx2N.bestME[0];
3163 int ref0 = bestME[0].ref;
3164 MV mvp0 = bestME[0].mvp;
3165 int mvpIdx0 = bestME[0].mvpIdx;
3166 int ref1 = bestME[1].ref;
3167 MV mvp1 = bestME[1].mvp;
3168 int mvpIdx1 = bestME[1].mvpIdx;
3169
3170 bidir2Nx2N.initCosts();
3171 cu.setPartSizeSubParts(SIZE_2Nx2N);
3172 cu.setPredModeSubParts(MODE_INTER);
3173 cu.setPUInterDir(3, 0, 0);
3174 cu.setPURefIdx(0, (int8_t)ref0, 0, 0);
3175 cu.setPURefIdx(1, (int8_t)ref1, 0, 0);
3176 cu.m_mvpIdx[0][0] = (uint8_t)mvpIdx0;
3177 cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1;
3178 cu.m_mergeFlag[0] = 0;
3179
3180 /* Estimate cost of BIDIR using best 2Nx2N L0 and L1 motion vectors */
3181 cu.setPUMv(0, bestME[0].mv, 0, 0);
3182 cu.m_mvd[0][0] = bestME[0].mv - mvp0;
3183
3184 cu.setPUMv(1, bestME[1].mv, 0, 0);
3185 cu.m_mvd[1][0] = bestME[1].mv - mvp1;
3186
3187 PredictionUnit pu(cu, cuGeom, 0);
3188 motionCompensation(cu, pu, bidir2Nx2N.predYuv, true, m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400));
3189
3190 int sa8d = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, bidir2Nx2N.predYuv.m_buf[0], bidir2Nx2N.predYuv.m_size);
3191 if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
3192 {
3193 /* Add in chroma distortion */
3194 sa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[1], bidir2Nx2N.predYuv.m_csize);
3195 sa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[2], bidir2Nx2N.predYuv.m_csize);
3196 }
3197 bidir2Nx2N.sa8dBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
3198 bidir2Nx2N.sa8dCost = sa8d + m_rdCost.getCost(bidir2Nx2N.sa8dBits);
3199
3200 bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
3201 if (bTryZero)
3202 {
3203 /* Do not try zero MV if unidir motion predictors are beyond
3204 * valid search area */
3205 MV mvmin, mvmax;
3206 int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight);
3207 setSearchRange(cu, mvzero, merange, mvmin, mvmax);
3208 mvmax.y += 2; // there is some pad for subpel refine
3209 mvmin <<= 2;
3210 mvmax <<= 2;
3211
3212 bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
3213 bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
3214 }
3215 if (bTryZero)
3216 {
3217 /* Estimate cost of BIDIR using coincident blocks */
3218 Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
3219
3220 int zsa8d;
3221
3222 if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
3223 {
3224 cu.m_mv[0][0] = mvzero;
3225 cu.m_mv[1][0] = mvzero;
3226
3227 motionCompensation(cu, pu, tmpPredYuv, true, true);
3228 zsa8d = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
3229 zsa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, tmpPredYuv.m_buf[1], tmpPredYuv.m_csize);
3230 zsa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, tmpPredYuv.m_buf[2], tmpPredYuv.m_csize);
3231
3232 }
3233 else
3234 {
3235 pixel *fref0 = m_slice->m_mref[0][ref0].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx);
3236 pixel *fref1 = m_slice->m_mref[1][ref1].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx);
3237 intptr_t refStride = m_slice->m_mref[0][0].lumaStride;
3238 primitives.pu[partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (refStride % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, fref0, refStride, fref1, refStride, 32);
3239 zsa8d = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
3240 }
3241 uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
3242 uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
3243 uint32_t zcost = zsa8d + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
3244
3245 /* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */
3246 mvp0 = checkBestMVP(inter2Nx2N.amvpCand[0][ref0], mvzero, mvpIdx0, bits0, zcost);
3247 mvp1 = checkBestMVP(inter2Nx2N.amvpCand[1][ref1], mvzero, mvpIdx1, bits1, zcost);
3248
3249 uint32_t zbits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
3250 zcost = zsa8d + m_rdCost.getCost(zbits);
3251
3252 if (zcost < bidir2Nx2N.sa8dCost)
3253 {
3254 bidir2Nx2N.sa8dBits = zbits;
3255 bidir2Nx2N.sa8dCost = zcost;
3256
3257 cu.setPUMv(0, mvzero, 0, 0);
3258 cu.m_mvd[0][0] = mvzero - mvp0;
3259 cu.m_mvpIdx[0][0] = (uint8_t)mvpIdx0;
3260
3261 cu.setPUMv(1, mvzero, 0, 0);
3262 cu.m_mvd[1][0] = mvzero - mvp1;
3263 cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1;
3264
3265 if (m_bChromaSa8d) /* real MC was already performed */
3266 bidir2Nx2N.predYuv.copyFromYuv(tmpPredYuv);
3267 else
3268 motionCompensation(cu, pu, bidir2Nx2N.predYuv, true, m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400);
3269 }
3270 else if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
3271 {
3272 /* recover overwritten motion vectors */
3273 cu.m_mv[0][0] = bestME[0].mv;
3274 cu.m_mv[1][0] = bestME[1].mv;
3275 }
3276 }
3277 }
3278
encodeResidue(const CUData & ctu,const CUGeom & cuGeom)3279 void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom)
3280 {
3281 if (cuGeom.depth < ctu.m_cuDepth[cuGeom.absPartIdx] && cuGeom.depth < ctu.m_encData->m_param->maxCUDepth)
3282 {
3283 for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
3284 {
3285 const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
3286 if (childGeom.flags & CUGeom::PRESENT)
3287 encodeResidue(ctu, childGeom);
3288 }
3289 return;
3290 }
3291
3292 uint32_t absPartIdx = cuGeom.absPartIdx;
3293 int sizeIdx = cuGeom.log2CUSize - 2;
3294
3295 /* reuse the bestMode data structures at the current depth */
3296 Mode *bestMode = m_modeDepth[cuGeom.depth].bestMode;
3297 CUData& cu = bestMode->cu;
3298
3299 cu.copyFromPic(ctu, cuGeom, m_csp);
3300
3301 PicYuv& reconPic = *m_frame->m_reconPic;
3302
3303 Yuv& fencYuv = m_modeDepth[cuGeom.depth].fencYuv;
3304 if (cuGeom.depth)
3305 m_modeDepth[0].fencYuv.copyPartToYuv(fencYuv, absPartIdx);
3306 X265_CHECK(bestMode->fencYuv == &fencYuv, "invalid fencYuv\n");
3307
3308 if (cu.isIntra(0))
3309 {
3310 ProfileCUScope(ctu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]); // not really RDO, but close enough
3311
3312 uint32_t tuDepthRange[2];
3313 cu.getIntraTUQtDepthRange(tuDepthRange, 0);
3314
3315 residualTransformQuantIntra(*bestMode, cuGeom, 0, 0, tuDepthRange);
3316 if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
3317 {
3318 getBestIntraModeChroma(*bestMode, cuGeom);
3319 residualQTIntraChroma(*bestMode, cuGeom, 0, 0);
3320 }
3321 }
3322 else // if (cu.isInter(0))
3323 {
3324 ProfileCUScope(ctu, interRDOElapsedTime[cuGeom.depth], countInterRDO[cuGeom.depth]); // not really RDO, but close enough
3325
3326 X265_CHECK(!ctu.isSkipped(absPartIdx), "skip not expected prior to transform\n");
3327
3328 /* Calculate residual for current CU part into depth sized resiYuv */
3329
3330 ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
3331
3332 /* at RD 0, the prediction pixels are accumulated into the top depth predYuv */
3333 Yuv& predYuv = m_modeDepth[0].bestMode->predYuv;
3334 pixel* predY = predYuv.getLumaAddr(absPartIdx);
3335
3336 primitives.cu[sizeIdx].sub_ps(resiYuv.m_buf[0], resiYuv.m_size,
3337 fencYuv.m_buf[0], predY,
3338 fencYuv.m_size, predYuv.m_size);
3339
3340 if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
3341 {
3342 pixel* predU = predYuv.getCbAddr(absPartIdx);
3343 pixel* predV = predYuv.getCrAddr(absPartIdx);
3344 primitives.chroma[m_csp].cu[sizeIdx].sub_ps(resiYuv.m_buf[1], resiYuv.m_csize,
3345 fencYuv.m_buf[1], predU,
3346 fencYuv.m_csize, predYuv.m_csize);
3347
3348 primitives.chroma[m_csp].cu[sizeIdx].sub_ps(resiYuv.m_buf[2], resiYuv.m_csize,
3349 fencYuv.m_buf[2], predV,
3350 fencYuv.m_csize, predYuv.m_csize);
3351 }
3352
3353 uint32_t tuDepthRange[2];
3354 cu.getInterTUQtDepthRange(tuDepthRange, 0);
3355
3356 residualTransformQuantInter(*bestMode, cuGeom, 0, 0, tuDepthRange);
3357
3358 if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
3359 cu.setPredModeSubParts(MODE_SKIP);
3360
3361 /* residualTransformQuantInter() wrote transformed residual back into
3362 * resiYuv. Generate the recon pixels by adding it to the prediction */
3363
3364 if (cu.m_cbf[0][0])
3365 {
3366 bool reconPicAlign = (reconPic.m_cuOffsetY[cu.m_cuAddr] + reconPic.m_buOffsetY[absPartIdx]) % 64 == 0;
3367 bool predYalign = predYuv.getAddrOffset(absPartIdx, predYuv.m_size) % 64 == 0;
3368 primitives.cu[sizeIdx].add_ps[reconPicAlign && predYalign && (reconPic.m_stride % 64 == 0) && (predYuv.m_size % 64 == 0) &&
3369 (resiYuv.m_size % 64 == 0)](reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride, predY, resiYuv.m_buf[0], predYuv.m_size, resiYuv.m_size);
3370 }
3371 else
3372 primitives.cu[sizeIdx].copy_pp(reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
3373 predY, predYuv.m_size);
3374 if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
3375 {
3376 pixel* predU = predYuv.getCbAddr(absPartIdx);
3377 pixel* predV = predYuv.getCrAddr(absPartIdx);
3378 if (cu.m_cbf[1][0])
3379 {
3380 bool reconPicAlign = (reconPic.m_cuOffsetC[cu.m_cuAddr] + reconPic.m_buOffsetC[absPartIdx]) % 64 == 0;
3381 bool predUalign = predYuv.getChromaAddrOffset(absPartIdx) % 64 == 0;
3382 primitives.chroma[m_csp].cu[sizeIdx].add_ps[reconPicAlign && predUalign && (reconPic.m_strideC % 64 == 0) && (predYuv.m_csize % 64 == 0) &&
3383 (resiYuv.m_csize % 64 == 0)](reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC, predU, resiYuv.m_buf[1], predYuv.m_csize, resiYuv.m_csize);
3384 }
3385 else
3386 primitives.chroma[m_csp].cu[sizeIdx].copy_pp(reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
3387 predU, predYuv.m_csize);
3388
3389 if (cu.m_cbf[2][0])
3390 {
3391 bool reconPicAlign = (reconPic.m_cuOffsetC[cu.m_cuAddr] + reconPic.m_buOffsetC[absPartIdx]) % 64 == 0;
3392 bool predValign = predYuv.getChromaAddrOffset(absPartIdx) % 64 == 0;
3393 primitives.chroma[m_csp].cu[sizeIdx].add_ps[reconPicAlign && predValign && (reconPic.m_strideC % 64 == 0) && (predYuv.m_csize % 64 == 0) &&
3394 (resiYuv.m_csize % 64 == 0)](reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC, predV, resiYuv.m_buf[2], predYuv.m_csize, resiYuv.m_csize);
3395 }
3396 else
3397 primitives.chroma[m_csp].cu[sizeIdx].copy_pp(reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
3398 predV, predYuv.m_csize);
3399 }
3400 }
3401
3402 cu.updatePic(cuGeom.depth, m_frame->m_fencPic->m_picCsp);
3403 }
3404
addSplitFlagCost(Mode & mode,uint32_t depth)3405 void Analysis::addSplitFlagCost(Mode& mode, uint32_t depth)
3406 {
3407 if (m_param->rdLevel >= 3)
3408 {
3409 /* code the split flag (0 or 1) and update bit costs */
3410 mode.contexts.resetBits();
3411 mode.contexts.codeSplitFlag(mode.cu, 0, depth);
3412 uint32_t bits = mode.contexts.getNumberOfWrittenBits();
3413 mode.totalBits += bits;
3414 updateModeCost(mode);
3415 }
3416 else if (m_param->rdLevel <= 1)
3417 {
3418 mode.sa8dBits++;
3419 mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits);
3420 }
3421 else
3422 {
3423 mode.totalBits++;
3424 updateModeCost(mode);
3425 }
3426 }
3427
topSkipMinDepth(const CUData & parentCTU,const CUGeom & cuGeom)3428 uint32_t Analysis::topSkipMinDepth(const CUData& parentCTU, const CUGeom& cuGeom)
3429 {
3430 /* Do not attempt to code a block larger than the largest block in the
3431 * co-located CTUs in L0 and L1 */
3432 int currentQP = parentCTU.m_qp[0];
3433 int previousQP = currentQP;
3434 uint32_t minDepth0 = 4, minDepth1 = 4;
3435 uint32_t sum = 0;
3436 int numRefs = 0;
3437 if (m_slice->m_numRefIdx[0])
3438 {
3439 numRefs++;
3440 const CUData& cu = *m_slice->m_refFrameList[0][0]->m_encData->getPicCTU(parentCTU.m_cuAddr);
3441 previousQP = cu.m_qp[0];
3442 if (!cu.m_cuDepth[cuGeom.absPartIdx])
3443 return 0;
3444 for (uint32_t i = 0; i < cuGeom.numPartitions; i += 4)
3445 {
3446 uint32_t d = cu.m_cuDepth[cuGeom.absPartIdx + i];
3447 minDepth0 = X265_MIN(d, minDepth0);
3448 sum += d;
3449 }
3450 }
3451 if (m_slice->m_numRefIdx[1])
3452 {
3453 numRefs++;
3454 const CUData& cu = *m_slice->m_refFrameList[1][0]->m_encData->getPicCTU(parentCTU.m_cuAddr);
3455 if (!cu.m_cuDepth[cuGeom.absPartIdx])
3456 return 0;
3457 for (uint32_t i = 0; i < cuGeom.numPartitions; i += 4)
3458 {
3459 uint32_t d = cu.m_cuDepth[cuGeom.absPartIdx + i];
3460 minDepth1 = X265_MIN(d, minDepth1);
3461 sum += d;
3462 }
3463 }
3464 if (!numRefs)
3465 return 0;
3466
3467 uint32_t minDepth = X265_MIN(minDepth0, minDepth1);
3468 uint32_t thresh = minDepth * numRefs * (cuGeom.numPartitions >> 2);
3469
3470 /* allow block size growth if QP is raising or avg depth is
3471 * less than 1.5 of min depth */
3472 if (minDepth && currentQP >= previousQP && (sum <= thresh + (thresh >> 1)))
3473 minDepth -= 1;
3474
3475 return minDepth;
3476 }
3477
3478 /* returns true if recursion should be stopped */
recursionDepthCheck(const CUData & parentCTU,const CUGeom & cuGeom,const Mode & bestMode)3479 bool Analysis::recursionDepthCheck(const CUData& parentCTU, const CUGeom& cuGeom, const Mode& bestMode)
3480 {
3481 /* early exit when the RD cost of best mode at depth n is less than the sum
3482 * of average of RD cost of the neighbor CU's(above, aboveleft, aboveright,
3483 * left, colocated) and avg cost of that CU at depth "n" with weightage for
3484 * each quantity */
3485
3486 uint32_t depth = cuGeom.depth;
3487 FrameData& curEncData = *m_frame->m_encData;
3488 FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
3489 uint64_t cuCost = cuStat.avgCost[depth] * cuStat.count[depth];
3490 uint64_t cuCount = cuStat.count[depth];
3491
3492 uint64_t neighCost = 0, neighCount = 0;
3493 const CUData* above = parentCTU.m_cuAbove;
3494 if (above)
3495 {
3496 FrameData::RCStatCU& astat = curEncData.m_cuStat[above->m_cuAddr];
3497 neighCost += astat.avgCost[depth] * astat.count[depth];
3498 neighCount += astat.count[depth];
3499
3500 const CUData* aboveLeft = parentCTU.m_cuAboveLeft;
3501 if (aboveLeft)
3502 {
3503 FrameData::RCStatCU& lstat = curEncData.m_cuStat[aboveLeft->m_cuAddr];
3504 neighCost += lstat.avgCost[depth] * lstat.count[depth];
3505 neighCount += lstat.count[depth];
3506 }
3507
3508 const CUData* aboveRight = parentCTU.m_cuAboveRight;
3509 if (aboveRight)
3510 {
3511 FrameData::RCStatCU& rstat = curEncData.m_cuStat[aboveRight->m_cuAddr];
3512 neighCost += rstat.avgCost[depth] * rstat.count[depth];
3513 neighCount += rstat.count[depth];
3514 }
3515 }
3516 const CUData* left = parentCTU.m_cuLeft;
3517 if (left)
3518 {
3519 FrameData::RCStatCU& nstat = curEncData.m_cuStat[left->m_cuAddr];
3520 neighCost += nstat.avgCost[depth] * nstat.count[depth];
3521 neighCount += nstat.count[depth];
3522 }
3523
3524 // give 60% weight to all CU's and 40% weight to neighbour CU's
3525 if (neighCount + cuCount)
3526 {
3527 uint64_t avgCost = ((3 * cuCost) + (2 * neighCost)) / ((3 * cuCount) + (2 * neighCount));
3528 uint64_t curCost = m_param->rdLevel > 1 ? bestMode.rdCost : bestMode.sa8dCost;
3529 if (curCost < avgCost && avgCost)
3530 return true;
3531 }
3532
3533 return false;
3534 }
3535
complexityCheckCU(const Mode & bestMode)3536 bool Analysis::complexityCheckCU(const Mode& bestMode)
3537 {
3538 if (m_param->recursionSkipMode == RDCOST_BASED_RSKIP)
3539 {
3540 uint32_t mean = 0;
3541 uint32_t homo = 0;
3542 uint32_t cuSize = bestMode.fencYuv->m_size;
3543 for (uint32_t y = 0; y < cuSize; y++) {
3544 for (uint32_t x = 0; x < cuSize; x++) {
3545 mean += (bestMode.fencYuv->m_buf[0][y * cuSize + x]);
3546 }
3547 }
3548 mean = mean / (cuSize * cuSize);
3549 for (uint32_t y = 0; y < cuSize; y++) {
3550 for (uint32_t x = 0; x < cuSize; x++) {
3551 homo += abs(int(bestMode.fencYuv->m_buf[0][y * cuSize + x] - mean));
3552 }
3553 }
3554 homo = homo / (cuSize * cuSize);
3555
3556 if (homo < (.1 * mean))
3557 return true;
3558
3559 return false;
3560 }
3561 else
3562 {
3563 int blockType = bestMode.cu.m_log2CUSize[0] - LOG2_UNIT_SIZE;
3564 int shift = bestMode.cu.m_log2CUSize[0] * LOG2_UNIT_SIZE;
3565 intptr_t stride = m_frame->m_fencPic->m_stride;
3566 intptr_t blockOffsetLuma = bestMode.cu.m_cuPelX + bestMode.cu.m_cuPelY * stride;
3567 uint64_t sum_ss = primitives.cu[blockType].var(m_frame->m_edgeBitPic + blockOffsetLuma, stride);
3568 uint32_t sum = (uint32_t)sum_ss;
3569 uint32_t ss = (uint32_t)(sum_ss >> 32);
3570 uint32_t pixelCount = 1 << shift;
3571 double cuEdgeVariance = (ss - ((double)sum * sum / pixelCount)) / pixelCount;
3572
3573 if (cuEdgeVariance > (double)m_param->edgeVarThreshold)
3574 return false;
3575 else
3576 return true;
3577 }
3578 }
3579
calculateCUVariance(const CUData & ctu,const CUGeom & cuGeom)3580 uint32_t Analysis::calculateCUVariance(const CUData& ctu, const CUGeom& cuGeom)
3581 {
3582 uint32_t cuVariance = 0;
3583 uint32_t *blockVariance = m_frame->m_lowres.blockVariance;
3584 int loopIncr = (m_param->rc.qgSize == 8) ? 8 : 16;
3585
3586 uint32_t width = m_frame->m_fencPic->m_picWidth;
3587 uint32_t height = m_frame->m_fencPic->m_picHeight;
3588 uint32_t block_x = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx];
3589 uint32_t block_y = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx];
3590 uint32_t maxCols = (m_frame->m_fencPic->m_picWidth + (loopIncr - 1)) / loopIncr;
3591 uint32_t blockSize = m_param->maxCUSize >> cuGeom.depth;
3592 uint32_t cnt = 0;
3593
3594 for (uint32_t block_yy = block_y; block_yy < block_y + blockSize && block_yy < height; block_yy += loopIncr)
3595 {
3596 for (uint32_t block_xx = block_x; block_xx < block_x + blockSize && block_xx < width; block_xx += loopIncr)
3597 {
3598 uint32_t idx = ((block_yy / loopIncr) * (maxCols)) + (block_xx / loopIncr);
3599 cuVariance += blockVariance[idx];
3600 cnt++;
3601 }
3602 }
3603 return cuVariance / cnt;
3604 }
3605
aqQPOffset(const CUData & ctu,const CUGeom & cuGeom)3606 double Analysis::aqQPOffset(const CUData& ctu, const CUGeom& cuGeom)
3607 {
3608 uint32_t aqDepth = X265_MIN(cuGeom.depth, m_frame->m_lowres.maxAQDepth - 1);
3609 PicQPAdaptationLayer* pQPLayer = &m_frame->m_lowres.pAQLayer[aqDepth];
3610
3611 uint32_t aqPosX = (ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx]) / pQPLayer->aqPartWidth;
3612 uint32_t aqPosY = (ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx]) / pQPLayer->aqPartHeight;
3613
3614 uint32_t aqStride = pQPLayer->numAQPartInWidth;
3615
3616 double dQpOffset = pQPLayer->dQpOffset[aqPosY * aqStride + aqPosX];
3617 return dQpOffset;
3618 }
3619
cuTreeQPOffset(const CUData & ctu,const CUGeom & cuGeom)3620 double Analysis::cuTreeQPOffset(const CUData& ctu, const CUGeom& cuGeom)
3621 {
3622 uint32_t aqDepth = X265_MIN(cuGeom.depth, m_frame->m_lowres.maxAQDepth - 1);
3623 PicQPAdaptationLayer* pcAQLayer = &m_frame->m_lowres.pAQLayer[aqDepth];
3624
3625 uint32_t aqPosX = (ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx]) / pcAQLayer->aqPartWidth;
3626 uint32_t aqPosY = (ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx]) / pcAQLayer->aqPartHeight;
3627
3628 uint32_t aqStride = pcAQLayer->numAQPartInWidth;
3629
3630 double dQpOffset = pcAQLayer->dCuTreeOffset[aqPosY * aqStride + aqPosX];
3631 return dQpOffset;
3632 }
3633
calculateQpforCuSize(const CUData & ctu,const CUGeom & cuGeom,int32_t complexCheck,double baseQp)3634 int Analysis::calculateQpforCuSize(const CUData& ctu, const CUGeom& cuGeom, int32_t complexCheck, double baseQp)
3635 {
3636 FrameData& curEncData = *m_frame->m_encData;
3637 double qp = baseQp >= 0 ? baseQp : curEncData.m_cuStat[ctu.m_cuAddr].baseQp;
3638 bool bCuTreeOffset = IS_REFERENCED(m_frame) && m_param->rc.cuTree && !complexCheck;
3639
3640 if ((m_param->analysisMultiPassDistortion && m_param->rc.bStatRead) || (m_param->ctuDistortionRefine && m_param->analysisLoad))
3641 {
3642 x265_analysis_distortion_data* distortionData = m_frame->m_analysisData.distortionData;
3643 if ((distortionData->threshold[ctu.m_cuAddr] < 0.9 || distortionData->threshold[ctu.m_cuAddr] > 1.1)
3644 && distortionData->highDistortionCtuCount && distortionData->lowDistortionCtuCount)
3645 qp += distortionData->offset[ctu.m_cuAddr];
3646 }
3647
3648 if (m_param->analysisLoadReuseLevel == 10 && m_param->rc.cuTree)
3649 {
3650 int cuIdx = (ctu.m_cuAddr * ctu.m_numPartitions) + cuGeom.absPartIdx;
3651 if (ctu.m_slice->m_sliceType == I_SLICE)
3652 return x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, (int32_t)(qp + 0.5 + ((x265_analysis_intra_data*)m_frame->m_analysisData.intraData)->cuQPOff[cuIdx]));
3653 else
3654 return x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, (int32_t)(qp + 0.5 + ((x265_analysis_inter_data*)m_frame->m_analysisData.interData)->cuQPOff[cuIdx]));
3655 }
3656 if (m_param->rc.hevcAq)
3657 {
3658 /* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */
3659 double dQpOffset = 0;
3660 if (bCuTreeOffset)
3661 {
3662 dQpOffset = cuTreeQPOffset(ctu, cuGeom);
3663 }
3664 else
3665 {
3666 dQpOffset = aqQPOffset(ctu, cuGeom);
3667 if (complexCheck)
3668 {
3669 int32_t offset = (int32_t)(dQpOffset * 100 + .5);
3670 double threshold = (1 - ((x265_ADAPT_RD_STRENGTH - m_param->dynamicRd) * 0.5));
3671 int32_t max_threshold = (int32_t)(threshold * 100 + .5);
3672 return (offset < max_threshold);
3673 }
3674 }
3675 qp += dQpOffset;
3676 }
3677 else
3678 {
3679 int loopIncr = (m_param->rc.qgSize == 8) ? 8 : 16;
3680 /* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */
3681 double *qpoffs = bCuTreeOffset ? m_frame->m_lowres.qpCuTreeOffset : m_frame->m_lowres.qpAqOffset;
3682 if (qpoffs)
3683 {
3684 uint32_t width = m_frame->m_fencPic->m_picWidth;
3685 uint32_t height = m_frame->m_fencPic->m_picHeight;
3686 uint32_t block_x = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx];
3687 uint32_t block_y = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx];
3688 uint32_t maxCols = (m_frame->m_fencPic->m_picWidth + (loopIncr - 1)) / loopIncr;
3689 uint32_t blockSize = m_param->maxCUSize >> cuGeom.depth;
3690 double dQpOffset = 0;
3691 uint32_t cnt = 0;
3692 for (uint32_t block_yy = block_y; block_yy < block_y + blockSize && block_yy < height; block_yy += loopIncr)
3693 {
3694 for (uint32_t block_xx = block_x; block_xx < block_x + blockSize && block_xx < width; block_xx += loopIncr)
3695 {
3696 uint32_t idx = ((block_yy / loopIncr) * (maxCols)) + (block_xx / loopIncr);
3697 dQpOffset += qpoffs[idx];
3698 cnt++;
3699 }
3700 }
3701 dQpOffset /= cnt;
3702 qp += dQpOffset;
3703 if (complexCheck)
3704 {
3705 int32_t offset = (int32_t)(dQpOffset * 100 + .5);
3706 double threshold = (1 - ((x265_ADAPT_RD_STRENGTH - m_param->dynamicRd) * 0.5));
3707 int32_t max_threshold = (int32_t)(threshold * 100 + .5);
3708 return (offset < max_threshold);
3709 }
3710 }
3711 }
3712
3713 return x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, (int)(qp + 0.5));
3714 }
3715
normFactor(const pixel * src,uint32_t blockSize,CUData & ctu,int qp,TextType ttype)3716 void Analysis::normFactor(const pixel* src, uint32_t blockSize, CUData& ctu, int qp, TextType ttype)
3717 {
3718 static const int ssim_c1 = (int)(.01 * .01 * PIXEL_MAX * PIXEL_MAX * 64 + .5); // 416
3719 static const int ssim_c2 = (int)(.03 * .03 * PIXEL_MAX * PIXEL_MAX * 64 * 63 + .5); // 235963
3720 int shift = (X265_DEPTH - 8);
3721
3722 double s = 1 + 0.005 * qp;
3723
3724 // Calculate denominator of normalization factor
3725 uint64_t fDc_den = 0, fAc_den = 0;
3726
3727 // 1. Calculate dc component
3728 uint64_t z_o = 0;
3729 for (uint32_t block_yy = 0; block_yy < blockSize; block_yy += 4)
3730 {
3731 for (uint32_t block_xx = 0; block_xx < blockSize; block_xx += 4)
3732 {
3733 uint32_t temp = src[block_yy * blockSize + block_xx] >> shift;
3734 z_o += temp * temp; // 2 * (Z(0)) pow(2)
3735 }
3736 }
3737 fDc_den = (2 * z_o) + (blockSize * blockSize * ssim_c1); // 2 * (Z(0)) pow(2) + N * C1
3738 fDc_den /= ((blockSize >> 2) * (blockSize >> 2));
3739
3740 // 2. Calculate ac component
3741 uint64_t z_k = 0;
3742 int block = (int)(((log(blockSize) / log(2)) - 2) + 0.5);
3743 primitives.cu[block].normFact(src, blockSize, shift, &z_k);
3744
3745 // Remove the DC part
3746 z_k -= z_o;
3747
3748 fAc_den = z_k + int(s * z_k) + ssim_c2;
3749 fAc_den /= ((blockSize >> 2) * (blockSize >> 2));
3750
3751 ctu.m_fAc_den[ttype] = fAc_den;
3752 ctu.m_fDc_den[ttype] = fDc_den;
3753 }
3754
calculateNormFactor(CUData & ctu,int qp)3755 void Analysis::calculateNormFactor(CUData& ctu, int qp)
3756 {
3757 const pixel* srcY = m_modeDepth[0].fencYuv.m_buf[0];
3758 uint32_t blockSize = m_modeDepth[0].fencYuv.m_size;
3759
3760 normFactor(srcY, blockSize, ctu, qp, TEXT_LUMA);
3761
3762 if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
3763 {
3764 const pixel* srcU = m_modeDepth[0].fencYuv.m_buf[1];
3765 const pixel* srcV = m_modeDepth[0].fencYuv.m_buf[2];
3766 uint32_t blockSizeC = m_modeDepth[0].fencYuv.m_csize;
3767
3768 normFactor(srcU, blockSizeC, ctu, qp, TEXT_CHROMA_U);
3769 normFactor(srcV, blockSizeC, ctu, qp, TEXT_CHROMA_V);
3770 }
3771 }
3772
findSameContentRefCount(const CUData & parentCTU,const CUGeom & cuGeom)3773 int Analysis::findSameContentRefCount(const CUData& parentCTU, const CUGeom& cuGeom)
3774 {
3775 int sameContentRef = 0;
3776 int m_curPoc = parentCTU.m_slice->m_poc;
3777 int prevChange = m_prevCtuInfoChange[cuGeom.absPartIdx];
3778 int numPredDir = m_slice->isInterP() ? 1 : 2;
3779 for (int list = 0; list < numPredDir; list++)
3780 {
3781 for (int i = 0; i < m_frame->m_encData->m_slice->m_numRefIdx[list]; i++)
3782 {
3783 int refPoc = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_poc;
3784 int refPrevChange = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_addOnPrevChange[parentCTU.m_cuAddr][cuGeom.absPartIdx];
3785 if ((refPoc < prevChange && refPoc < m_curPoc) || (refPoc > m_curPoc && prevChange < m_curPoc && refPrevChange > m_curPoc) || ((refPoc == prevChange) && (m_additionalCtuInfo[cuGeom.absPartIdx] == CTU_INFO_CHANGE)))
3786 sameContentRef++; /* Content changed */
3787 }
3788 }
3789 return sameContentRef;
3790 }
3791