1 /*****************************************************************************
2  * Copyright (C) 2013-2020 MulticoreWare, Inc
3  *
4  * Authors: Chung Shin Yee <shinyee@multicorewareinc.com>
5  *          Min Chen <chenm003@163.com>
6  *          Steve Borho <steve@borho.org>
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
21  *
22  * This program is also available under a commercial proprietary license.
23  * For more information, contact us at license @ x265.com.
24  *****************************************************************************/
25 
26 #include "common.h"
27 #include "frame.h"
28 #include "framedata.h"
29 #include "wavefront.h"
30 #include "param.h"
31 
32 #include "encoder.h"
33 #include "frameencoder.h"
34 #include "common.h"
35 #include "slicetype.h"
36 #include "nal.h"
37 
38 namespace X265_NS {
39 void weightAnalyse(Slice& slice, Frame& frame, x265_param& param);
40 
FrameEncoder()41 FrameEncoder::FrameEncoder()
42 {
43     m_prevOutputTime = x265_mdate();
44     m_reconfigure = false;
45     m_isFrameEncoder = true;
46     m_threadActive = true;
47     m_slicetypeWaitTime = 0;
48     m_activeWorkerCount = 0;
49     m_completionCount = 0;
50     m_bAllRowsStop = false;
51     m_vbvResetTriggerRow = -1;
52     m_outStreams = NULL;
53     m_backupStreams = NULL;
54     m_substreamSizes = NULL;
55     m_nr = NULL;
56     m_tld = NULL;
57     m_rows = NULL;
58     m_top = NULL;
59     m_param = NULL;
60     m_frame = NULL;
61     m_cuGeoms = NULL;
62     m_ctuGeomMap = NULL;
63     m_localTldIdx = 0;
64     memset(&m_rce, 0, sizeof(RateControlEntry));
65 }
66 
destroy()67 void FrameEncoder::destroy()
68 {
69     if (m_pool)
70     {
71         if (!m_jpId)
72         {
73             int numTLD = m_pool->m_numWorkers;
74             if (!m_param->bEnableWavefront)
75                 numTLD += m_pool->m_numProviders;
76             for (int i = 0; i < numTLD; i++)
77                 m_tld[i].destroy();
78             delete [] m_tld;
79         }
80     }
81     else
82     {
83         m_tld->destroy();
84         delete m_tld;
85     }
86 
87     delete[] m_rows;
88     delete[] m_outStreams;
89     delete[] m_backupStreams;
90     X265_FREE(m_sliceBaseRow);
91     X265_FREE(m_sliceMaxBlockRow);
92     X265_FREE(m_cuGeoms);
93     X265_FREE(m_ctuGeomMap);
94     X265_FREE(m_substreamSizes);
95     X265_FREE(m_nr);
96 
97     m_frameFilter.destroy();
98 
99     if (m_param->bEmitHRDSEI || !!m_param->interlaceMode)
100     {
101         delete m_rce.picTimingSEI;
102         delete m_rce.hrdTiming;
103     }
104 }
105 
init(Encoder * top,int numRows,int numCols)106 bool FrameEncoder::init(Encoder *top, int numRows, int numCols)
107 {
108     m_top = top;
109     m_param = top->m_param;
110     m_numRows = numRows;
111     m_numCols = numCols;
112     m_reconfigure = false;
113     m_filterRowDelay = ((m_param->bEnableSAO && m_param->bSaoNonDeblocked)
114                         || (!m_param->bEnableLoopFilter && m_param->bEnableSAO)) ?
115                         2 : (m_param->bEnableSAO || m_param->bEnableLoopFilter ? 1 : 0);
116     m_filterRowDelayCus = m_filterRowDelay * numCols;
117     m_rows = new CTURow[m_numRows];
118     bool ok = !!m_numRows;
119 
120     m_sliceBaseRow = X265_MALLOC(uint32_t, m_param->maxSlices + 1);
121     ok &= !!m_sliceBaseRow;
122     m_sliceGroupSize = (uint16_t)(m_numRows + m_param->maxSlices - 1) / m_param->maxSlices;
123     uint32_t sliceGroupSizeAccu = (m_numRows << 8) / m_param->maxSlices;
124     uint32_t rowSum = sliceGroupSizeAccu;
125     uint32_t sidx = 0;
126     for (uint32_t i = 0; i < m_numRows; i++)
127     {
128         const uint32_t rowRange = (rowSum >> 8);
129         if ((i >= rowRange) & (sidx != m_param->maxSlices - 1))
130         {
131             rowSum += sliceGroupSizeAccu;
132             m_sliceBaseRow[++sidx] = i;
133         }
134     }
135     X265_CHECK(sidx < m_param->maxSlices, "sliceID check failed!");
136     m_sliceBaseRow[0] = 0;
137     m_sliceBaseRow[m_param->maxSlices] = m_numRows;
138 
139     m_sliceMaxBlockRow = X265_MALLOC(uint32_t, m_param->maxSlices + 1);
140     ok &= !!m_sliceMaxBlockRow;
141     uint32_t maxBlockRows = (m_param->sourceHeight + (16 - 1)) / 16;
142     sliceGroupSizeAccu = (maxBlockRows << 8) / m_param->maxSlices;
143     rowSum = sliceGroupSizeAccu;
144     sidx = 0;
145     for (uint32_t i = 0; i < maxBlockRows; i++)
146     {
147         const uint32_t rowRange = (rowSum >> 8);
148         if ((i >= rowRange) & (sidx != m_param->maxSlices - 1))
149         {
150             rowSum += sliceGroupSizeAccu;
151             m_sliceMaxBlockRow[++sidx] = i;
152         }
153     }
154     m_sliceMaxBlockRow[0] = 0;
155     m_sliceMaxBlockRow[m_param->maxSlices] = maxBlockRows;
156 
157     /* determine full motion search range */
158     int range  = m_param->searchRange;       /* fpel search */
159     range += !!(m_param->searchMethod < 2);  /* diamond/hex range check lag */
160     range += NTAPS_LUMA / 2;                 /* subpel filter half-length */
161     range += 2 + (MotionEstimate::hpelIterationCount(m_param->subpelRefine) + 1) / 2; /* subpel refine steps */
162     m_refLagRows = /*(m_param->maxSlices > 1 ? 1 : 0) +*/ 1 + ((range + m_param->maxCUSize - 1) / m_param->maxCUSize);
163 
164     // NOTE: 2 times of numRows because both Encoder and Filter in same queue
165     if (!WaveFront::init(m_numRows * 2))
166     {
167         x265_log(m_param, X265_LOG_ERROR, "unable to initialize wavefront queue\n");
168         m_pool = NULL;
169     }
170 
171     m_frameFilter.init(top, this, numRows, numCols);
172 
173     // initialize HRD parameters of SPS
174     if (m_param->bEmitHRDSEI || !!m_param->interlaceMode)
175     {
176         m_rce.picTimingSEI = new SEIPictureTiming;
177         m_rce.hrdTiming = new HRDTiming;
178 
179         ok &= m_rce.picTimingSEI && m_rce.hrdTiming;
180     }
181 
182     if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
183         m_nr = X265_MALLOC(NoiseReduction, 1);
184     if (m_nr)
185         memset(m_nr, 0, sizeof(NoiseReduction));
186     else
187         m_param->noiseReductionIntra = m_param->noiseReductionInter = 0;
188 
189     // 7.4.7.1 - Ceil( Log2( PicSizeInCtbsY ) ) bits
190     {
191         unsigned long tmp;
192         CLZ(tmp, (numRows * numCols - 1));
193         m_sliceAddrBits = (uint16_t)(tmp + 1);
194     }
195 
196     return ok;
197 }
198 
199 /* Generate a complete list of unique geom sets for the current picture dimensions */
initializeGeoms()200 bool FrameEncoder::initializeGeoms()
201 {
202     /* Geoms only vary between CTUs in the presence of picture edges */
203     int maxCUSize = m_param->maxCUSize;
204     int minCUSize = m_param->minCUSize;
205     int heightRem = m_param->sourceHeight & (maxCUSize - 1);
206     int widthRem = m_param->sourceWidth & (maxCUSize - 1);
207     int allocGeoms = 1; // body
208     if (heightRem && widthRem)
209         allocGeoms = 4; // body, right, bottom, corner
210     else if (heightRem || widthRem)
211         allocGeoms = 2; // body, right or bottom
212 
213     m_ctuGeomMap = X265_MALLOC(uint32_t, m_numRows * m_numCols);
214     m_cuGeoms = X265_MALLOC(CUGeom, allocGeoms * CUGeom::MAX_GEOMS);
215     if (!m_cuGeoms || !m_ctuGeomMap)
216         return false;
217 
218     // body
219     CUData::calcCTUGeoms(maxCUSize, maxCUSize, maxCUSize, minCUSize, m_cuGeoms);
220     memset(m_ctuGeomMap, 0, sizeof(uint32_t) * m_numRows * m_numCols);
221     if (allocGeoms == 1)
222         return true;
223 
224     int countGeoms = 1;
225     if (widthRem)
226     {
227         // right
228         CUData::calcCTUGeoms(widthRem, maxCUSize, maxCUSize, minCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS);
229         for (uint32_t i = 0; i < m_numRows; i++)
230         {
231             uint32_t ctuAddr = m_numCols * (i + 1) - 1;
232             m_ctuGeomMap[ctuAddr] = countGeoms * CUGeom::MAX_GEOMS;
233         }
234         countGeoms++;
235     }
236     if (heightRem)
237     {
238         // bottom
239         CUData::calcCTUGeoms(maxCUSize, heightRem, maxCUSize, minCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS);
240         for (uint32_t i = 0; i < m_numCols; i++)
241         {
242             uint32_t ctuAddr = m_numCols * (m_numRows - 1) + i;
243             m_ctuGeomMap[ctuAddr] = countGeoms * CUGeom::MAX_GEOMS;
244         }
245         countGeoms++;
246 
247         if (widthRem)
248         {
249             // corner
250             CUData::calcCTUGeoms(widthRem, heightRem, maxCUSize, minCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS);
251 
252             uint32_t ctuAddr = m_numCols * m_numRows - 1;
253             m_ctuGeomMap[ctuAddr] = countGeoms * CUGeom::MAX_GEOMS;
254             countGeoms++;
255         }
256         X265_CHECK(countGeoms == allocGeoms, "geometry match check failure\n");
257     }
258 
259     return true;
260 }
261 
startCompressFrame(Frame * curFrame)262 bool FrameEncoder::startCompressFrame(Frame* curFrame)
263 {
264     m_slicetypeWaitTime = x265_mdate() - m_prevOutputTime;
265     m_frame = curFrame;
266     m_sliceType = curFrame->m_lowres.sliceType;
267     curFrame->m_encData->m_frameEncoderID = m_jpId;
268     curFrame->m_encData->m_jobProvider = this;
269     curFrame->m_encData->m_slice->m_mref = m_mref;
270 
271     if (!m_cuGeoms)
272     {
273         if (!initializeGeoms())
274             return false;
275     }
276 
277     m_enable.trigger();
278     return true;
279 }
280 
threadMain()281 void FrameEncoder::threadMain()
282 {
283     THREAD_NAME("Frame", m_jpId);
284 
285     if (m_pool)
286     {
287         m_pool->setCurrentThreadAffinity();
288 
289         /* the first FE on each NUMA node is responsible for allocating thread
290          * local data for all worker threads in that pool. If WPP is disabled, then
291          * each FE also needs a TLD instance */
292         if (!m_jpId)
293         {
294             int numTLD = m_pool->m_numWorkers;
295             if (!m_param->bEnableWavefront)
296                 numTLD += m_pool->m_numProviders;
297 
298             m_tld = new ThreadLocalData[numTLD];
299             for (int i = 0; i < numTLD; i++)
300             {
301                 m_tld[i].analysis.initSearch(*m_param, m_top->m_scalingList);
302                 m_tld[i].analysis.create(m_tld);
303             }
304 
305             for (int i = 0; i < m_pool->m_numProviders; i++)
306             {
307                 if (m_pool->m_jpTable[i]->m_isFrameEncoder) /* ugh; over-allocation and other issues here */
308                 {
309                     FrameEncoder *peer = dynamic_cast<FrameEncoder*>(m_pool->m_jpTable[i]);
310                     peer->m_tld = m_tld;
311                 }
312             }
313         }
314 
315         if (m_param->bEnableWavefront)
316             m_localTldIdx = -1; // cause exception if used
317         else
318             m_localTldIdx = m_pool->m_numWorkers + m_jpId;
319     }
320     else
321     {
322         m_tld = new ThreadLocalData;
323         m_tld->analysis.initSearch(*m_param, m_top->m_scalingList);
324         m_tld->analysis.create(NULL);
325         m_localTldIdx = 0;
326     }
327 
328     m_done.trigger();     /* signal that thread is initialized */
329     m_enable.wait();      /* Encoder::encode() triggers this event */
330 
331     while (m_threadActive)
332     {
333         if (m_param->bCTUInfo)
334         {
335             while (!m_frame->m_ctuInfo)
336                 m_frame->m_copied.wait();
337         }
338         if ((m_param->bAnalysisType == AVC_INFO) && !m_param->analysisSave && !m_param->analysisLoad && !(IS_X265_TYPE_I(m_frame->m_lowres.sliceType)))
339         {
340             while (((m_frame->m_analysisData.interData == NULL && m_frame->m_analysisData.intraData == NULL) || (uint32_t)m_frame->m_poc != m_frame->m_analysisData.poc))
341                 m_frame->m_copyMVType.wait();
342         }
343         compressFrame();
344         m_done.trigger(); /* FrameEncoder::getEncodedPicture() blocks for this event */
345         m_enable.wait();
346     }
347 }
348 
processTasks(int)349 void FrameEncoder::WeightAnalysis::processTasks(int /* workerThreadId */)
350 {
351     Frame* frame = master.m_frame;
352     weightAnalyse(*frame->m_encData->m_slice, *frame, *master.m_param);
353 }
354 
355 
/* Return the number of bits needed to write 'code' as a signed Exp-Golomb
 * value, se(v).
 *
 * The signed value is first mapped to an unsigned code number
 * (0, 1, -1, 2, -2, ... -> 0, 1, 2, 3, 4, ...). A code number k then takes
 * 2 * floor(log2(k + 1)) + 1 bits.
 *
 * The mapping is done in 64-bit unsigned arithmetic so the result is well
 * defined and exact for every int32_t input, including INT32_MIN. */
uint32_t getBsLength( int32_t code )
{
    const int64_t value = code;
    uint64_t codeNum = (value <= 0) ? ((uint64_t)(-value) << 1) : (((uint64_t)value << 1) - 1);

    /* k + 1 is at least 1, so the bit scan below always terminates on a set bit */
    ++codeNum;

    /* floor(log2(codeNum)): position of the highest set bit */
    uint32_t prefixBits = 0;
    while (codeNum >>= 1)
        ++prefixBits;

    return prefixBits * 2 + 1;
}
367 
writeToneMapInfo(x265_sei_payload * payload)368 bool FrameEncoder::writeToneMapInfo(x265_sei_payload *payload)
369 {
370     bool payloadChange = false;
371     if (m_top->m_prevTonemapPayload.payload != NULL && payload->payloadSize == m_top->m_prevTonemapPayload.payloadSize)
372     {
373         if (memcmp(m_top->m_prevTonemapPayload.payload, payload->payload, payload->payloadSize) != 0)
374             payloadChange = true;
375     }
376     else
377     {
378         payloadChange = true;
379         if (m_top->m_prevTonemapPayload.payload != NULL)
380             x265_free(m_top->m_prevTonemapPayload.payload);
381         m_top->m_prevTonemapPayload.payload = (uint8_t*)x265_malloc(sizeof(uint8_t)* payload->payloadSize);
382     }
383 
384     if (payloadChange)
385     {
386         m_top->m_prevTonemapPayload.payloadType = payload->payloadType;
387         m_top->m_prevTonemapPayload.payloadSize = payload->payloadSize;
388         memcpy(m_top->m_prevTonemapPayload.payload, payload->payload, payload->payloadSize);
389     }
390 
391     bool isIDR = m_frame->m_lowres.sliceType == X265_TYPE_IDR;
392     return (payloadChange || isIDR);
393 }
394 
writeTrailingSEIMessages()395 void FrameEncoder::writeTrailingSEIMessages()
396 {
397     Slice* slice = m_frame->m_encData->m_slice;
398     int planes = (m_param->internalCsp != X265_CSP_I400) ? 3 : 1;
399     int32_t payloadSize = 0;
400 
401     if (m_param->decodedPictureHashSEI == 1)
402     {
403         m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::MD5;
404         for (int i = 0; i < planes; i++)
405             MD5Final(&m_seiReconPictureDigest.m_state[i], m_seiReconPictureDigest.m_digest[i]);
406         payloadSize = 1 + 16 * planes;
407     }
408     else if (m_param->decodedPictureHashSEI == 2)
409     {
410         m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::CRC;
411         for (int i = 0; i < planes; i++)
412             crcFinish(m_seiReconPictureDigest.m_crc[i], m_seiReconPictureDigest.m_digest[i]);
413         payloadSize = 1 + 2 * planes;
414     }
415     else if (m_param->decodedPictureHashSEI == 3)
416     {
417         m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::CHECKSUM;
418         for (int i = 0; i < planes; i++)
419             checksumFinish(m_seiReconPictureDigest.m_checksum[i], m_seiReconPictureDigest.m_digest[i]);
420         payloadSize = 1 + 4 * planes;
421     }
422 
423     m_seiReconPictureDigest.setSize(payloadSize);
424     m_seiReconPictureDigest.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_SUFFIX_SEI, m_nalList, false);
425 }
426 
compressFrame()427 void FrameEncoder::compressFrame()
428 {
429     ProfileScopeEvent(frameThread);
430 
431     m_startCompressTime = x265_mdate();
432     m_totalActiveWorkerCount = 0;
433     m_activeWorkerCountSamples = 0;
434     m_totalWorkerElapsedTime = 0;
435     m_totalNoWorkerTime = 0;
436     m_countRowBlocks = 0;
437     m_allRowsAvailableTime = 0;
438     m_stallStartTime = 0;
439 
440     m_completionCount = 0;
441     m_bAllRowsStop = false;
442     m_vbvResetTriggerRow = -1;
443     m_rowSliceTotalBits[0] = 0;
444     m_rowSliceTotalBits[1] = 0;
445 
446     m_SSDY = m_SSDU = m_SSDV = 0;
447     m_ssim = 0;
448     m_ssimCnt = 0;
449     memset(&(m_frame->m_encData->m_frameStats), 0, sizeof(m_frame->m_encData->m_frameStats));
450 
451     if (!m_param->bHistBasedSceneCut && m_param->rc.aqMode != X265_AQ_EDGE && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
452     {
453         int height = m_frame->m_fencPic->m_picHeight;
454         int width = m_frame->m_fencPic->m_picWidth;
455         intptr_t stride = m_frame->m_fencPic->m_stride;
456 
457         if (!computeEdge(m_frame->m_edgeBitPic, m_frame->m_fencPic->m_picOrg[0], NULL, stride, height, width, false, 1))
458         {
459             x265_log(m_param, X265_LOG_ERROR, " Failed to compute edge !");
460         }
461     }
462 
463     /* Emit access unit delimiter unless this is the first frame and the user is
464      * not repeating headers (since AUD is supposed to be the first NAL in the access
465      * unit) */
466     Slice* slice = m_frame->m_encData->m_slice;
467 
468     if (m_param->bEnableAccessUnitDelimiters && (m_frame->m_poc || m_param->bRepeatHeaders))
469     {
470         m_bs.resetBits();
471         m_entropyCoder.setBitstream(&m_bs);
472         m_entropyCoder.codeAUD(*slice);
473         m_bs.writeByteAlignment();
474         m_nalList.serialize(NAL_UNIT_ACCESS_UNIT_DELIMITER, m_bs);
475         if (m_param->bSingleSeiNal)
476             m_bs.resetBits();
477     }
478     if (m_frame->m_lowres.bKeyframe && m_param->bRepeatHeaders)
479     {
480         if (m_param->bOptRefListLengthPPS)
481         {
482             ScopedLock refIdxLock(m_top->m_sliceRefIdxLock);
483             m_top->updateRefIdx();
484         }
485         if (m_top->m_param->rc.bStatRead  && m_top->m_param->bMultiPassOptRPS)
486         {
487             ScopedLock refIdxLock(m_top->m_rpsInSpsLock);
488             if (!m_top->computeSPSRPSIndex())
489             {
490                 x265_log(m_param, X265_LOG_ERROR, "compute commonly RPS failed!\n");
491                 m_top->m_aborted = true;
492             }
493             m_top->getStreamHeaders(m_nalList, m_entropyCoder, m_bs);
494         }
495         else
496             m_top->getStreamHeaders(m_nalList, m_entropyCoder, m_bs);
497     }
498 
499     if (m_top->m_param->rc.bStatRead && m_top->m_param->bMultiPassOptRPS)
500         m_frame->m_encData->m_slice->m_rpsIdx = (m_top->m_rateControl->m_rce2Pass + m_frame->m_encodeOrder)->rpsIdx;
501 
502     // Weighted Prediction parameters estimation.
503     bool bUseWeightP = slice->m_sliceType == P_SLICE && slice->m_pps->bUseWeightPred;
504     bool bUseWeightB = slice->m_sliceType == B_SLICE && slice->m_pps->bUseWeightedBiPred;
505 
506     WeightParam* reuseWP = NULL;
507     if (m_param->analysisLoad && (bUseWeightP || bUseWeightB))
508         reuseWP = (WeightParam*)m_frame->m_analysisData.wt;
509 
510     if (bUseWeightP || bUseWeightB)
511     {
512 #if DETAILED_CU_STATS
513         m_cuStats.countWeightAnalyze++;
514         ScopedElapsedTime time(m_cuStats.weightAnalyzeTime);
515 #endif
516         if (m_param->analysisLoad)
517         {
518             for (int list = 0; list < slice->isInterB() + 1; list++)
519             {
520                 for (int plane = 0; plane < (m_param->internalCsp != X265_CSP_I400 ? 3 : 1); plane++)
521                 {
522                     for (int ref = 1; ref < slice->m_numRefIdx[list]; ref++)
523                         SET_WEIGHT(slice->m_weightPredTable[list][ref][plane], false, 1 << reuseWP->log2WeightDenom, reuseWP->log2WeightDenom, 0);
524                     slice->m_weightPredTable[list][0][plane] = *(reuseWP++);
525                 }
526             }
527         }
528         else
529         {
530             WeightAnalysis wa(*this);
531             if (m_pool && wa.tryBondPeers(*this, 1))
532                 /* use an idle worker for weight analysis */
533                 wa.waitForExit();
534             else
535                 weightAnalyse(*slice, *m_frame, *m_param);
536         }
537     }
538     else
539         slice->disableWeights();
540 
541     if (m_param->analysisSave && (bUseWeightP || bUseWeightB))
542         reuseWP = (WeightParam*)m_frame->m_analysisData.wt;
543     // Generate motion references
544     int numPredDir = slice->isInterP() ? 1 : slice->isInterB() ? 2 : 0;
545     for (int l = 0; l < numPredDir; l++)
546     {
547         for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
548         {
549             WeightParam *w = NULL;
550             if ((bUseWeightP || bUseWeightB) && slice->m_weightPredTable[l][ref][0].wtPresent)
551                 w = slice->m_weightPredTable[l][ref];
552             slice->m_refReconPicList[l][ref] = slice->m_refFrameList[l][ref]->m_reconPic;
553             m_mref[l][ref].init(slice->m_refReconPicList[l][ref], w, *m_param);
554         }
555         if (m_param->analysisSave && (bUseWeightP || bUseWeightB))
556         {
557             for (int i = 0; i < (m_param->internalCsp != X265_CSP_I400 ? 3 : 1); i++)
558                 *(reuseWP++) = slice->m_weightPredTable[l][0][i];
559         }
560 
561     }
562 
563     int numTLD;
564     if (m_pool)
565         numTLD = m_param->bEnableWavefront ? m_pool->m_numWorkers : m_pool->m_numWorkers + m_pool->m_numProviders;
566     else
567         numTLD = 1;
568 
569     /* Get the QP for this frame from rate control. This call may block until
570      * frames ahead of it in encode order have called rateControlEnd() */
571     int qp = m_top->m_rateControl->rateControlStart(m_frame, &m_rce, m_top);
572     m_rce.newQp = qp;
573 
574     if (m_nr)
575     {
576         if (qp > QP_MAX_SPEC && m_frame->m_param->rc.vbvBufferSize)
577         {
578             for (int i = 0; i < numTLD; i++)
579             {
580                 m_tld[i].analysis.m_quant.m_frameNr[m_jpId].offset = m_top->m_offsetEmergency[qp - QP_MAX_SPEC - 1];
581                 m_tld[i].analysis.m_quant.m_frameNr[m_jpId].residualSum = m_top->m_residualSumEmergency;
582                 m_tld[i].analysis.m_quant.m_frameNr[m_jpId].count = m_top->m_countEmergency;
583             }
584         }
585         else
586         {
587             if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
588             {
589                 for (int i = 0; i < numTLD; i++)
590                 {
591                     m_tld[i].analysis.m_quant.m_frameNr[m_jpId].offset = m_tld[i].analysis.m_quant.m_frameNr[m_jpId].nrOffsetDenoise;
592                     m_tld[i].analysis.m_quant.m_frameNr[m_jpId].residualSum = m_tld[i].analysis.m_quant.m_frameNr[m_jpId].nrResidualSum;
593                     m_tld[i].analysis.m_quant.m_frameNr[m_jpId].count = m_tld[i].analysis.m_quant.m_frameNr[m_jpId].nrCount;
594                 }
595             }
596             else
597             {
598                 for (int i = 0; i < numTLD; i++)
599                     m_tld[i].analysis.m_quant.m_frameNr[m_jpId].offset = NULL;
600             }
601         }
602     }
603 
604     /* Clip slice QP to 0-51 spec range before encoding */
605     slice->m_sliceQp = x265_clip3(-QP_BD_OFFSET, QP_MAX_SPEC, qp);
606     if (m_param->bHDR10Opt)
607     {
608         int qpCb = x265_clip3(-12, 0, (int)floor((m_top->m_cB * ((-.46) * qp + 9.26)) + 0.5 ));
609         int qpCr = x265_clip3(-12, 0, (int)floor((m_top->m_cR * ((-.46) * qp + 9.26)) + 0.5 ));
610         slice->m_chromaQpOffset[0] = slice->m_pps->chromaQpOffset[0] + qpCb < -12 ? (qpCb + (-12 - (slice->m_pps->chromaQpOffset[0] + qpCb))) : qpCb;
611         slice->m_chromaQpOffset[1] = slice->m_pps->chromaQpOffset[1] + qpCr < -12 ? (qpCr + (-12 - (slice->m_pps->chromaQpOffset[1] + qpCr))) : qpCr;
612     }
613 
614     if (m_param->bOptQpPPS && m_param->bRepeatHeaders)
615     {
616         ScopedLock qpLock(m_top->m_sliceQpLock);
617         for (int i = 0; i < (QP_MAX_MAX + 1); i++)
618         {
619             int delta = slice->m_sliceQp - (i + 1);
620             int codeLength = getBsLength( delta );
621             m_top->m_iBitsCostSum[i] += codeLength;
622         }
623         m_top->m_iFrameNum++;
624     }
625     m_initSliceContext.resetEntropy(*slice);
626 
627     m_frameFilter.start(m_frame, m_initSliceContext);
628 
629     /* ensure all rows are blocked prior to initializing row CTU counters */
630     WaveFront::clearEnabledRowMask();
631 
632     /* reset entropy coders and compute slice id */
633     m_entropyCoder.load(m_initSliceContext);
634     for (uint32_t sliceId = 0; sliceId < m_param->maxSlices; sliceId++)
635         for (uint32_t row = m_sliceBaseRow[sliceId]; row < m_sliceBaseRow[sliceId + 1]; row++)
636             m_rows[row].init(m_initSliceContext, sliceId);
637 
638     // reset slice counter for rate control update
639     m_sliceCnt = 0;
640 
641     uint32_t numSubstreams = m_param->bEnableWavefront ? slice->m_sps->numCuInHeight : m_param->maxSlices;
642     X265_CHECK(m_param->bEnableWavefront || (m_param->maxSlices == 1), "Multiple slices without WPP unsupport now!");
643     if (!m_outStreams)
644     {
645         m_outStreams = new Bitstream[numSubstreams];
646         if (!m_param->bEnableWavefront)
647             m_backupStreams = new Bitstream[numSubstreams];
648         m_substreamSizes = X265_MALLOC(uint32_t, numSubstreams);
649         if (!slice->m_bUseSao)
650         {
651             for (uint32_t i = 0; i < numSubstreams; i++)
652                 m_rows[i].rowGoOnCoder.setBitstream(&m_outStreams[i]);
653         }
654     }
655     else
656     {
657         for (uint32_t i = 0; i < numSubstreams; i++)
658         {
659             m_outStreams[i].resetBits();
660             if (!slice->m_bUseSao)
661                 m_rows[i].rowGoOnCoder.setBitstream(&m_outStreams[i]);
662             else
663                 m_rows[i].rowGoOnCoder.setBitstream(NULL);
664         }
665     }
666 
667     m_rce.encodeOrder = m_frame->m_encodeOrder;
668     int prevBPSEI = m_rce.encodeOrder ? m_top->m_lastBPSEI : 0;
669 
670     if (m_frame->m_lowres.bKeyframe)
671     {
672         if (m_param->bEmitHRDSEI)
673         {
674             SEIBufferingPeriod* bpSei = &m_top->m_rateControl->m_bufPeriodSEI;
675 
676             // since the temporal layer HRD is not ready, we assumed it is fixed
677             bpSei->m_auCpbRemovalDelayDelta = 1;
678             bpSei->m_cpbDelayOffset = 0;
679             bpSei->m_dpbDelayOffset = 0;
680             bpSei->m_concatenationFlag = (m_param->bEnableHRDConcatFlag && !m_frame->m_poc) ? true : false;
681 
682             // hrdFullness() calculates the initial CPB removal delay and offset
683             m_top->m_rateControl->hrdFullness(bpSei);
684             bpSei->writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
685 
686             m_top->m_lastBPSEI = m_rce.encodeOrder;
687         }
688 
689         if (m_frame->m_lowres.sliceType == X265_TYPE_IDR && m_param->bEmitIDRRecoverySEI)
690         {
691             /* Recovery Point SEI require the SPS to be "activated" */
692             SEIRecoveryPoint sei;
693             sei.m_recoveryPocCnt = 0;
694             sei.m_exactMatchingFlag = true;
695             sei.m_brokenLinkFlag = false;
696             sei.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
697         }
698     }
699 
700     if ((m_param->bEmitHRDSEI || !!m_param->interlaceMode))
701     {
702         SEIPictureTiming *sei = m_rce.picTimingSEI;
703         const VUI *vui = &slice->m_sps->vuiParameters;
704         const HRDInfo *hrd = &vui->hrdParameters;
705         int poc = slice->m_poc;
706 
707         if (vui->frameFieldInfoPresentFlag)
708         {
709             if (m_param->interlaceMode > 0)
710             {
711                 if( m_param->interlaceMode == 2 )
712                 {
713                     // m_picStruct should be set to 3 or 4 when field feature is enabled
714                     if (m_param->bField)
715                         // 3: Top field, bottom field, in that order; 4: Bottom field, top field, in that order
716                         sei->m_picStruct = (slice->m_fieldNum == 1) ? 4 : 3;
717                     else
718                         sei->m_picStruct = (poc & 1) ? 1 /* top */ : 2 /* bottom */;
719                 }
720                 else if (m_param->interlaceMode == 1)
721                 {
722                     if (m_param->bField)
723                         sei->m_picStruct = (slice->m_fieldNum == 1) ? 3: 4;
724                     else
725                         sei->m_picStruct = (poc & 1) ? 2 /* bottom */ : 1 /* top */;
726                 }
727             }
728             else if (m_param->bEnableFrameDuplication)
729                 sei->m_picStruct = m_frame->m_picStruct;
730             else
731                 sei->m_picStruct = m_param->pictureStructure;
732 
733             sei->m_sourceScanType = m_param->interlaceMode ? 0 : 1;
734 
735             sei->m_duplicateFlag = false;
736         }
737 
738         if (vui->hrdParametersPresentFlag)
739         {
740             // The m_aucpbremoval delay specifies how many clock ticks the
741             // access unit associated with the picture timing SEI message has to
742             // wait after removal of the access unit with the most recent
743             // buffering period SEI message
744             sei->m_auCpbRemovalDelay = X265_MIN(X265_MAX(1, m_rce.encodeOrder - prevBPSEI), (1 << hrd->cpbRemovalDelayLength));
745             sei->m_picDpbOutputDelay = slice->m_sps->numReorderPics + poc - m_rce.encodeOrder;
746         }
747 
748         sei->writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
749     }
750 
751     if (m_param->preferredTransferCharacteristics > -1 && slice->isIRAP())
752     {
753         SEIAlternativeTC m_seiAlternativeTC;
754         m_seiAlternativeTC.m_preferredTransferCharacteristics = m_param->preferredTransferCharacteristics;
755         m_seiAlternativeTC.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
756     }
757 
758     /* Write user SEI */
759     for (int i = 0; i < m_frame->m_userSEI.numPayloads; i++)
760     {
761         x265_sei_payload *payload = &m_frame->m_userSEI.payloads[i];
762         if (payload->payloadType == USER_DATA_UNREGISTERED)
763         {
764             SEIuserDataUnregistered sei;
765             sei.m_userData = payload->payload;
766             sei.setSize(payload->payloadSize);
767             sei.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
768         }
769         else if (payload->payloadType == USER_DATA_REGISTERED_ITU_T_T35)
770         {
771             bool writeSei = m_param->bDhdr10opt ? writeToneMapInfo(payload) : true;
772             if (writeSei)
773             {
774                 SEIuserDataRegistered sei;
775                 sei.m_userData = payload->payload;
776                 sei.setSize(payload->payloadSize);
777                 sei.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
778             }
779         }
780         else
781             x265_log(m_param, X265_LOG_ERROR, "Unrecognized SEI type\n");
782     }
783 
784     bool isSei = ((m_frame->m_lowres.bKeyframe && m_param->bRepeatHeaders) || m_param->bEmitHRDSEI ||
785                  !!m_param->interlaceMode || (m_frame->m_lowres.sliceType == X265_TYPE_IDR && m_param->bEmitIDRRecoverySEI) ||
786                    m_frame->m_userSEI.numPayloads);
787 
788     if (isSei && m_param->bSingleSeiNal)
789     {
790         m_bs.writeByteAlignment();
791         m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs);
792     }
793     /* CQP and CRF (without capped VBV) doesn't use mid-frame statistics to
794      * tune RateControl parameters for other frames.
795      * Hence, for these modes, update m_startEndOrder and unlock RC for previous threads waiting in
796      * RateControlEnd here, after the slice contexts are initialized. For the rest - ABR
797      * and VBV, unlock only after rateControlUpdateStats of this frame is called */
798     if (m_param->rc.rateControlMode != X265_RC_ABR && !m_top->m_rateControl->m_isVbv)
799     {
800         m_top->m_rateControl->m_startEndOrder.incr();
801 
802         if (m_rce.encodeOrder < m_param->frameNumThreads - 1)
803             m_top->m_rateControl->m_startEndOrder.incr(); // faked rateControlEnd calls for negative frames
804     }
805 
806     if (m_param->bDynamicRefine)
807         computeAvgTrainingData();
808 
809     /* Analyze CTU rows, most of the hard work is done here.  Frame is
810      * compressed in a wave-front pattern if WPP is enabled. Row based loop
811      * filters runs behind the CTU compression and reconstruction */
812 
813     for (uint32_t sliceId = 0; sliceId < m_param->maxSlices; sliceId++)
814         m_rows[m_sliceBaseRow[sliceId]].active = true;
815 
816     if (m_param->bEnableWavefront)
817     {
818         int i = 0;
819         for (uint32_t rowInSlice = 0; rowInSlice < m_sliceGroupSize; rowInSlice++)
820         {
821             for (uint32_t sliceId = 0; sliceId < m_param->maxSlices; sliceId++)
822             {
823                 const uint32_t sliceStartRow = m_sliceBaseRow[sliceId];
824                 const uint32_t sliceEndRow = m_sliceBaseRow[sliceId + 1] - 1;
825                 const uint32_t row = sliceStartRow + rowInSlice;
826                 if (row > sliceEndRow)
827                     continue;
828                 m_row_to_idx[row] = i;
829                 m_idx_to_row[i] = row;
830                 i += 1;
831             }
832         }
833     }
834 
835     if (m_param->bEnableWavefront)
836     {
837         for (uint32_t rowInSlice = 0; rowInSlice < m_sliceGroupSize; rowInSlice++)
838         {
839             for (uint32_t sliceId = 0; sliceId < m_param->maxSlices; sliceId++)
840             {
841                 const uint32_t sliceStartRow = m_sliceBaseRow[sliceId];
842                 const uint32_t sliceEndRow = m_sliceBaseRow[sliceId + 1] - 1;
843                 const uint32_t row = sliceStartRow + rowInSlice;
844 
845                 X265_CHECK(row < m_numRows, "slices row fault was detected");
846 
847                 if (row > sliceEndRow)
848                     continue;
849 
850                 // block until all reference frames have reconstructed the rows we need
851                 for (int l = 0; l < numPredDir; l++)
852                 {
853                     for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
854                     {
855                         Frame *refpic = slice->m_refFrameList[l][ref];
856 
857                         // NOTE: we unnecessary wait row that beyond current slice boundary
858                         const int rowIdx = X265_MIN(sliceEndRow, (row + m_refLagRows));
859 
860                         while (refpic->m_reconRowFlag[rowIdx].get() == 0)
861                             refpic->m_reconRowFlag[rowIdx].waitForChange(0);
862 
863                         if ((bUseWeightP || bUseWeightB) && m_mref[l][ref].isWeighted)
864                             m_mref[l][ref].applyWeight(rowIdx, m_numRows, sliceEndRow, sliceId);
865                     }
866                 }
867 
868                 enableRowEncoder(m_row_to_idx[row]); /* clear external dependency for this row */
869                 if (!rowInSlice)
870                 {
871                     m_row0WaitTime = x265_mdate();
872                     enqueueRowEncoder(m_row_to_idx[row]); /* clear internal dependency, start wavefront */
873                 }
874                 tryWakeOne();
875             } // end of loop rowInSlice
876         } // end of loop sliceId
877 
878         m_allRowsAvailableTime = x265_mdate();
879         tryWakeOne(); /* ensure one thread is active or help-wanted flag is set prior to blocking */
880         static const int block_ms = 250;
881         while (m_completionEvent.timedWait(block_ms))
882             tryWakeOne();
883     }
884     else
885     {
886         for (uint32_t i = 0; i < m_numRows + m_filterRowDelay; i++)
887         {
888             // compress
889             if (i < m_numRows)
890             {
891                 // block until all reference frames have reconstructed the rows we need
892                 for (int l = 0; l < numPredDir; l++)
893                 {
894                     int list = l;
895                     for (int ref = 0; ref < slice->m_numRefIdx[list]; ref++)
896                     {
897                         Frame *refpic = slice->m_refFrameList[list][ref];
898 
899                         const int rowIdx = X265_MIN(m_numRows - 1, (i + m_refLagRows));
900                         while (refpic->m_reconRowFlag[rowIdx].get() == 0)
901                             refpic->m_reconRowFlag[rowIdx].waitForChange(0);
902 
903                         if ((bUseWeightP || bUseWeightB) && m_mref[l][ref].isWeighted)
904                             m_mref[list][ref].applyWeight(rowIdx, m_numRows, m_numRows, 0);
905                     }
906                 }
907 
908                 if (!i)
909                     m_row0WaitTime = x265_mdate();
910                 else if (i == m_numRows - 1)
911                     m_allRowsAvailableTime = x265_mdate();
912                 processRowEncoder(i, m_tld[m_localTldIdx]);
913             }
914 
915             // filter
916             if (i >= m_filterRowDelay)
917                 m_frameFilter.processRow(i - m_filterRowDelay);
918         }
919     }
920 #if ENABLE_LIBVMAF
921     vmafFrameLevelScore();
922 #endif
923 
924     if (m_param->maxSlices > 1)
925     {
926         PicYuv *reconPic = m_frame->m_reconPic;
927         uint32_t height = reconPic->m_picHeight;
928         initDecodedPictureHashSEI(0, 0, height);
929     }
930 
931     if (m_param->bDynamicRefine && m_top->m_startPoint <= m_frame->m_encodeOrder) //Avoid collecting data that will not be used by future frames.
932         collectDynDataFrame();
933 
934     if (m_param->rc.bStatWrite)
935     {
936         int totalI = 0, totalP = 0, totalSkip = 0;
937 
938         // accumulate intra,inter,skip cu count per frame for 2 pass
939         for (uint32_t i = 0; i < m_numRows; i++)
940         {
941             m_frame->m_encData->m_frameStats.mvBits    += m_rows[i].rowStats.mvBits;
942             m_frame->m_encData->m_frameStats.coeffBits += m_rows[i].rowStats.coeffBits;
943             m_frame->m_encData->m_frameStats.miscBits  += m_rows[i].rowStats.miscBits;
944             totalI                                     += m_rows[i].rowStats.intra8x8Cnt;
945             totalP                                     += m_rows[i].rowStats.inter8x8Cnt;
946             totalSkip                                  += m_rows[i].rowStats.skip8x8Cnt;
947         }
948         int totalCuCount = totalI + totalP + totalSkip;
949         m_frame->m_encData->m_frameStats.percent8x8Intra = (double)totalI / totalCuCount;
950         m_frame->m_encData->m_frameStats.percent8x8Inter = (double)totalP / totalCuCount;
951         m_frame->m_encData->m_frameStats.percent8x8Skip  = (double)totalSkip / totalCuCount;
952     }
953 
954     if (m_param->csvLogLevel >= 1)
955     {
956         for (uint32_t i = 0; i < m_numRows; i++)
957         {
958             m_frame->m_encData->m_frameStats.cntIntraNxN += m_rows[i].rowStats.cntIntraNxN;
959             m_frame->m_encData->m_frameStats.totalCu += m_rows[i].rowStats.totalCu;
960             m_frame->m_encData->m_frameStats.totalCtu += m_rows[i].rowStats.totalCtu;
961             m_frame->m_encData->m_frameStats.lumaDistortion += m_rows[i].rowStats.lumaDistortion;
962             m_frame->m_encData->m_frameStats.chromaDistortion += m_rows[i].rowStats.chromaDistortion;
963             m_frame->m_encData->m_frameStats.psyEnergy += m_rows[i].rowStats.psyEnergy;
964             m_frame->m_encData->m_frameStats.ssimEnergy += m_rows[i].rowStats.ssimEnergy;
965             m_frame->m_encData->m_frameStats.resEnergy += m_rows[i].rowStats.resEnergy;
966             for (uint32_t depth = 0; depth <= m_param->maxCUDepth; depth++)
967             {
968                 m_frame->m_encData->m_frameStats.cntSkipCu[depth] += m_rows[i].rowStats.cntSkipCu[depth];
969                 m_frame->m_encData->m_frameStats.cntMergeCu[depth] += m_rows[i].rowStats.cntMergeCu[depth];
970                 for (int m = 0; m < INTER_MODES; m++)
971                     m_frame->m_encData->m_frameStats.cuInterDistribution[depth][m] += m_rows[i].rowStats.cuInterDistribution[depth][m];
972                 for (int n = 0; n < INTRA_MODES; n++)
973                     m_frame->m_encData->m_frameStats.cuIntraDistribution[depth][n] += m_rows[i].rowStats.cuIntraDistribution[depth][n];
974             }
975         }
976         m_frame->m_encData->m_frameStats.percentIntraNxN = (double)(m_frame->m_encData->m_frameStats.cntIntraNxN * 100) / m_frame->m_encData->m_frameStats.totalCu;
977 
978         for (uint32_t depth = 0; depth <= m_param->maxCUDepth; depth++)
979         {
980             m_frame->m_encData->m_frameStats.percentSkipCu[depth] = (double)(m_frame->m_encData->m_frameStats.cntSkipCu[depth] * 100) / m_frame->m_encData->m_frameStats.totalCu;
981             m_frame->m_encData->m_frameStats.percentMergeCu[depth] = (double)(m_frame->m_encData->m_frameStats.cntMergeCu[depth] * 100) / m_frame->m_encData->m_frameStats.totalCu;
982             for (int n = 0; n < INTRA_MODES; n++)
983                 m_frame->m_encData->m_frameStats.percentIntraDistribution[depth][n] = (double)(m_frame->m_encData->m_frameStats.cuIntraDistribution[depth][n] * 100) / m_frame->m_encData->m_frameStats.totalCu;
984             uint64_t cuInterRectCnt = 0; // sum of Nx2N, 2NxN counts
985             cuInterRectCnt += m_frame->m_encData->m_frameStats.cuInterDistribution[depth][1] + m_frame->m_encData->m_frameStats.cuInterDistribution[depth][2];
986             m_frame->m_encData->m_frameStats.percentInterDistribution[depth][0] = (double)(m_frame->m_encData->m_frameStats.cuInterDistribution[depth][0] * 100) / m_frame->m_encData->m_frameStats.totalCu;
987             m_frame->m_encData->m_frameStats.percentInterDistribution[depth][1] = (double)(cuInterRectCnt * 100) / m_frame->m_encData->m_frameStats.totalCu;
988             m_frame->m_encData->m_frameStats.percentInterDistribution[depth][2] = (double)(m_frame->m_encData->m_frameStats.cuInterDistribution[depth][3] * 100) / m_frame->m_encData->m_frameStats.totalCu;
989         }
990     }
991 
992     if (m_param->csvLogLevel >= 2)
993     {
994         m_frame->m_encData->m_frameStats.avgLumaDistortion = (double)(m_frame->m_encData->m_frameStats.lumaDistortion) / m_frame->m_encData->m_frameStats.totalCtu;
995         m_frame->m_encData->m_frameStats.avgChromaDistortion = (double)(m_frame->m_encData->m_frameStats.chromaDistortion) / m_frame->m_encData->m_frameStats.totalCtu;
996         m_frame->m_encData->m_frameStats.avgPsyEnergy = (double)(m_frame->m_encData->m_frameStats.psyEnergy) / m_frame->m_encData->m_frameStats.totalCtu;
997         m_frame->m_encData->m_frameStats.avgSsimEnergy = (double)(m_frame->m_encData->m_frameStats.ssimEnergy) / m_frame->m_encData->m_frameStats.totalCtu;
998         m_frame->m_encData->m_frameStats.avgResEnergy = (double)(m_frame->m_encData->m_frameStats.resEnergy) / m_frame->m_encData->m_frameStats.totalCtu;
999     }
1000 
1001     m_bs.resetBits();
1002     m_entropyCoder.load(m_initSliceContext);
1003     m_entropyCoder.setBitstream(&m_bs);
1004 
1005     // finish encode of each CTU row, only required when SAO is enabled
1006     if (slice->m_bUseSao)
1007         encodeSlice(0);
1008 
1009     m_entropyCoder.setBitstream(&m_bs);
1010 
1011     if (m_param->maxSlices > 1)
1012     {
1013         uint32_t nextSliceRow = 0;
1014 
1015         for(uint32_t sliceId = 0; sliceId < m_param->maxSlices; sliceId++)
1016         {
1017             m_bs.resetBits();
1018 
1019             const uint32_t sliceAddr = nextSliceRow * m_numCols;
1020             if (m_param->bOptRefListLengthPPS)
1021             {
1022                 ScopedLock refIdxLock(m_top->m_sliceRefIdxLock);
1023                 m_top->analyseRefIdx(slice->m_numRefIdx);
1024             }
1025             m_entropyCoder.codeSliceHeader(*slice, *m_frame->m_encData, sliceAddr, m_sliceAddrBits, slice->m_sliceQp);
1026 
1027             // Find rows of current slice
1028             const uint32_t prevSliceRow = nextSliceRow;
1029             while(nextSliceRow < m_numRows && m_rows[nextSliceRow].sliceId == sliceId)
1030                 nextSliceRow++;
1031 
1032             // serialize each row, record final lengths in slice header
1033             uint32_t maxStreamSize = m_nalList.serializeSubstreams(&m_substreamSizes[prevSliceRow], (nextSliceRow - prevSliceRow), &m_outStreams[prevSliceRow]);
1034 
1035             // complete the slice header by writing WPP row-starts
1036             m_entropyCoder.setBitstream(&m_bs);
1037             if (slice->m_pps->bEntropyCodingSyncEnabled)
1038                 m_entropyCoder.codeSliceHeaderWPPEntryPoints(&m_substreamSizes[prevSliceRow], (nextSliceRow - prevSliceRow - 1), maxStreamSize);
1039 
1040             m_bs.writeByteAlignment();
1041 
1042             m_nalList.serialize(slice->m_nalUnitType, m_bs);
1043         }
1044     }
1045     else
1046     {
1047         if (m_param->bOptRefListLengthPPS)
1048         {
1049             ScopedLock refIdxLock(m_top->m_sliceRefIdxLock);
1050             m_top->analyseRefIdx(slice->m_numRefIdx);
1051         }
1052         m_entropyCoder.codeSliceHeader(*slice, *m_frame->m_encData, 0, 0, slice->m_sliceQp);
1053 
1054         // serialize each row, record final lengths in slice header
1055         uint32_t maxStreamSize = m_nalList.serializeSubstreams(m_substreamSizes, numSubstreams, m_outStreams);
1056 
1057         // complete the slice header by writing WPP row-starts
1058         m_entropyCoder.setBitstream(&m_bs);
1059         if (slice->m_pps->bEntropyCodingSyncEnabled)
1060             m_entropyCoder.codeSliceHeaderWPPEntryPoints(m_substreamSizes, (slice->m_sps->numCuInHeight - 1), maxStreamSize);
1061         m_bs.writeByteAlignment();
1062 
1063         m_nalList.serialize(slice->m_nalUnitType, m_bs);
1064     }
1065 
1066     if (m_param->decodedPictureHashSEI)
1067         writeTrailingSEIMessages();
1068 
1069     uint64_t bytes = 0;
1070     for (uint32_t i = 0; i < m_nalList.m_numNal; i++)
1071     {
1072         int type = m_nalList.m_nal[i].type;
1073 
1074         // exclude SEI
1075         if (type != NAL_UNIT_PREFIX_SEI && type != NAL_UNIT_SUFFIX_SEI)
1076         {
1077             bytes += m_nalList.m_nal[i].sizeBytes;
1078             // and exclude start code prefix
1079             bytes -= (!i || type == NAL_UNIT_SPS || type == NAL_UNIT_PPS) ? 4 : 3;
1080         }
1081     }
1082     m_accessUnitBits = bytes << 3;
1083 
1084     int filler = 0;
1085     /* rateControlEnd may also block for earlier frames to call rateControlUpdateStats */
1086     if (m_top->m_rateControl->rateControlEnd(m_frame, m_accessUnitBits, &m_rce, &filler) < 0)
1087         m_top->m_aborted = true;
1088 
1089     if (filler > 0)
1090     {
1091         filler = (filler - FILLER_OVERHEAD * 8) >> 3;
1092         m_bs.resetBits();
1093         while (filler > 0)
1094         {
1095             m_bs.write(0xff, 8);
1096             filler--;
1097         }
1098         m_bs.writeByteAlignment();
1099         m_nalList.serialize(NAL_UNIT_FILLER_DATA, m_bs);
1100         bytes += m_nalList.m_nal[m_nalList.m_numNal - 1].sizeBytes;
1101         bytes -= 3; //exclude start code prefix
1102         m_accessUnitBits = bytes << 3;
1103     }
1104 
1105     if (m_frame->m_rpu.payloadSize)
1106     {
1107         m_bs.resetBits();
1108         for (int i = 0; i < m_frame->m_rpu.payloadSize; i++)
1109             m_bs.write(m_frame->m_rpu.payload[i], 8);
1110         m_nalList.serialize(NAL_UNIT_UNSPECIFIED, m_bs);
1111     }
1112 
1113     m_endCompressTime = x265_mdate();
1114 
1115     /* Decrement referenced frame reference counts, allow them to be recycled */
1116     for (int l = 0; l < numPredDir; l++)
1117     {
1118         for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
1119         {
1120             Frame *refpic = slice->m_refFrameList[l][ref];
1121             ATOMIC_DEC(&refpic->m_countRefEncoders);
1122         }
1123     }
1124 
1125     if (m_nr)
1126     {
1127         bool nrEnabled = (m_rce.newQp < QP_MAX_SPEC || !m_param->rc.vbvBufferSize) && (m_param->noiseReductionIntra || m_param->noiseReductionInter);
1128 
1129         if (nrEnabled)
1130         {
1131             /* Accumulate NR statistics from all worker threads */
1132             for (int i = 0; i < numTLD; i++)
1133             {
1134                 NoiseReduction* nr = &m_tld[i].analysis.m_quant.m_frameNr[m_jpId];
1135                 for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++)
1136                 {
1137                     for (int coeff = 0; coeff < MAX_NUM_TR_COEFFS; coeff++)
1138                         m_nr->nrResidualSum[cat][coeff] += nr->nrResidualSum[cat][coeff];
1139 
1140                     m_nr->nrCount[cat] += nr->nrCount[cat];
1141                 }
1142             }
1143 
1144             noiseReductionUpdate();
1145 
1146             /* Copy updated NR coefficients back to all worker threads */
1147             for (int i = 0; i < numTLD; i++)
1148             {
1149                 NoiseReduction* nr = &m_tld[i].analysis.m_quant.m_frameNr[m_jpId];
1150                 memcpy(nr->nrOffsetDenoise, m_nr->nrOffsetDenoise, sizeof(uint16_t)* MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
1151                 memset(nr->nrCount, 0, sizeof(uint32_t)* MAX_NUM_TR_CATEGORIES);
1152                 memset(nr->nrResidualSum, 0, sizeof(uint32_t)* MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
1153             }
1154         }
1155     }
1156 
1157 #if DETAILED_CU_STATS
1158     /* Accumulate CU statistics from each worker thread, we could report
1159      * per-frame stats here, but currently we do not. */
1160     for (int i = 0; i < numTLD; i++)
1161         m_cuStats.accumulate(m_tld[i].analysis.m_stats[m_jpId], *m_param);
1162 #endif
1163 
1164     m_endFrameTime = x265_mdate();
1165 }
1166 
initDecodedPictureHashSEI(int row,int cuAddr,int height)1167 void FrameEncoder::initDecodedPictureHashSEI(int row, int cuAddr, int height)
1168 {
1169     PicYuv *reconPic = m_frame->m_reconPic;
1170     uint32_t width = reconPic->m_picWidth;
1171     intptr_t stride = reconPic->m_stride;
1172     uint32_t maxCUHeight = m_param->maxCUSize;
1173 
1174     const uint32_t hChromaShift = CHROMA_H_SHIFT(m_param->internalCsp);
1175     const uint32_t vChromaShift = CHROMA_V_SHIFT(m_param->internalCsp);
1176 
1177     if (m_param->decodedPictureHashSEI == 1)
1178     {
1179         if (!row)
1180             MD5Init(&m_seiReconPictureDigest.m_state[0]);
1181 
1182         updateMD5Plane(m_seiReconPictureDigest.m_state[0], reconPic->getLumaAddr(cuAddr), width, height, stride);
1183         if (m_param->internalCsp != X265_CSP_I400)
1184         {
1185             if (!row)
1186             {
1187                 MD5Init(&m_seiReconPictureDigest.m_state[1]);
1188                 MD5Init(&m_seiReconPictureDigest.m_state[2]);
1189             }
1190 
1191             width >>= hChromaShift;
1192             height >>= vChromaShift;
1193             stride = reconPic->m_strideC;
1194 
1195             updateMD5Plane(m_seiReconPictureDigest.m_state[1], reconPic->getCbAddr(cuAddr), width, height, stride);
1196             updateMD5Plane(m_seiReconPictureDigest.m_state[2], reconPic->getCrAddr(cuAddr), width, height, stride);
1197         }
1198     }
1199     else if (m_param->decodedPictureHashSEI == 2)
1200     {
1201 
1202         if (!row)
1203             m_seiReconPictureDigest.m_crc[0] = 0xffff;
1204 
1205         updateCRC(reconPic->getLumaAddr(cuAddr), m_seiReconPictureDigest.m_crc[0], height, width, stride);
1206         if (m_param->internalCsp != X265_CSP_I400)
1207         {
1208             width >>= hChromaShift;
1209             height >>= vChromaShift;
1210             stride = reconPic->m_strideC;
1211             m_seiReconPictureDigest.m_crc[1] = m_seiReconPictureDigest.m_crc[2] = 0xffff;
1212 
1213             updateCRC(reconPic->getCbAddr(cuAddr), m_seiReconPictureDigest.m_crc[1], height, width, stride);
1214             updateCRC(reconPic->getCrAddr(cuAddr), m_seiReconPictureDigest.m_crc[2], height, width, stride);
1215         }
1216     }
1217     else if (m_param->decodedPictureHashSEI == 3)
1218     {
1219         if (!row)
1220             m_seiReconPictureDigest.m_checksum[0] = 0;
1221 
1222         updateChecksum(reconPic->m_picOrg[0], m_seiReconPictureDigest.m_checksum[0], height, width, stride, row, maxCUHeight);
1223         if (m_param->internalCsp != X265_CSP_I400)
1224         {
1225             width >>= hChromaShift;
1226             height >>= vChromaShift;
1227             stride = reconPic->m_strideC;
1228             maxCUHeight >>= vChromaShift;
1229 
1230             if (!row)
1231                 m_seiReconPictureDigest.m_checksum[1] = m_seiReconPictureDigest.m_checksum[2] = 0;
1232 
1233             updateChecksum(reconPic->m_picOrg[1], m_seiReconPictureDigest.m_checksum[1], height, width, stride, row, maxCUHeight);
1234             updateChecksum(reconPic->m_picOrg[2], m_seiReconPictureDigest.m_checksum[2], height, width, stride, row, maxCUHeight);
1235         }
1236     }
1237 }
1238 
encodeSlice(uint32_t sliceAddr)1239 void FrameEncoder::encodeSlice(uint32_t sliceAddr)
1240 {
1241     Slice* slice = m_frame->m_encData->m_slice;
1242     const uint32_t widthInLCUs = slice->m_sps->numCuInWidth;
1243     const uint32_t lastCUAddr = (slice->m_endCUAddr + m_param->num4x4Partitions - 1) / m_param->num4x4Partitions;
1244     const uint32_t numSubstreams = m_param->bEnableWavefront ? slice->m_sps->numCuInHeight : 1;
1245 
1246     SAOParam* saoParam = slice->m_sps->bUseSAO && slice->m_bUseSao ? m_frame->m_encData->m_saoParam : NULL;
1247     for (uint32_t cuAddr = sliceAddr; cuAddr < lastCUAddr; cuAddr++)
1248     {
1249         uint32_t col = cuAddr % widthInLCUs;
1250         uint32_t row = cuAddr / widthInLCUs;
1251         uint32_t subStrm = row % numSubstreams;
1252         CUData* ctu = m_frame->m_encData->getPicCTU(cuAddr);
1253 
1254         m_entropyCoder.setBitstream(&m_outStreams[subStrm]);
1255 
1256         // Synchronize cabac probabilities with upper-right CTU if it's available and we're at the start of a line.
1257         if (m_param->bEnableWavefront && !col && row)
1258         {
1259             m_entropyCoder.copyState(m_initSliceContext);
1260             m_entropyCoder.loadContexts(m_rows[row - 1].bufferedEntropy);
1261         }
1262 
1263         // Initialize slice context
1264         if (ctu->m_bFirstRowInSlice && !col)
1265             m_entropyCoder.load(m_initSliceContext);
1266 
1267         if (saoParam)
1268         {
1269             if (saoParam->bSaoFlag[0] || saoParam->bSaoFlag[1])
1270             {
1271                 int mergeLeft = col && saoParam->ctuParam[0][cuAddr].mergeMode == SAO_MERGE_LEFT;
1272                 int mergeUp = !ctu->m_bFirstRowInSlice && saoParam->ctuParam[0][cuAddr].mergeMode == SAO_MERGE_UP;
1273                 if (col)
1274                     m_entropyCoder.codeSaoMerge(mergeLeft);
1275                 if (!ctu->m_bFirstRowInSlice && !mergeLeft)
1276                     m_entropyCoder.codeSaoMerge(mergeUp);
1277                 if (!mergeLeft && !mergeUp)
1278                 {
1279                     if (saoParam->bSaoFlag[0])
1280                         m_entropyCoder.codeSaoOffset(saoParam->ctuParam[0][cuAddr], 0);
1281                     if (saoParam->bSaoFlag[1])
1282                     {
1283                         m_entropyCoder.codeSaoOffset(saoParam->ctuParam[1][cuAddr], 1);
1284                         m_entropyCoder.codeSaoOffset(saoParam->ctuParam[2][cuAddr], 2);
1285                     }
1286                 }
1287             }
1288             else
1289             {
1290                 for (int i = 0; i < (m_param->internalCsp != X265_CSP_I400 ? 3 : 1); i++)
1291                     saoParam->ctuParam[i][cuAddr].reset();
1292             }
1293         }
1294 
1295         // final coding (bitstream generation) for this CU
1296         m_entropyCoder.encodeCTU(*ctu, m_cuGeoms[m_ctuGeomMap[cuAddr]]);
1297 
1298         if (m_param->bEnableWavefront)
1299         {
1300             if (col == 1)
1301                 // Store probabilities of second CTU in line into buffer
1302                 m_rows[row].bufferedEntropy.loadContexts(m_entropyCoder);
1303 
1304             if (col == widthInLCUs - 1)
1305                 m_entropyCoder.finishSlice();
1306         }
1307     }
1308 
1309     if (!m_param->bEnableWavefront)
1310         m_entropyCoder.finishSlice();
1311 }
1312 
processRow(int row,int threadId)1313 void FrameEncoder::processRow(int row, int threadId)
1314 {
1315     int64_t startTime = x265_mdate();
1316     if (ATOMIC_INC(&m_activeWorkerCount) == 1 && m_stallStartTime)
1317         m_totalNoWorkerTime += x265_mdate() - m_stallStartTime;
1318 
1319     const uint32_t realRow = m_idx_to_row[row >> 1];
1320     const uint32_t typeNum = m_idx_to_row[row & 1];
1321 
1322     if (!typeNum)
1323         processRowEncoder(realRow, m_tld[threadId]);
1324     else
1325     {
1326         m_frameFilter.processRow(realRow);
1327 
1328         // NOTE: Active next row
1329         if (realRow != m_sliceBaseRow[m_rows[realRow].sliceId + 1] - 1)
1330             enqueueRowFilter(m_row_to_idx[realRow + 1]);
1331     }
1332 
1333     if (ATOMIC_DEC(&m_activeWorkerCount) == 0)
1334         m_stallStartTime = x265_mdate();
1335 
1336     m_totalWorkerElapsedTime += x265_mdate() - startTime; // not thread safe, but good enough
1337 }
1338 
1339 // Called by worker threads
processRowEncoder(int intRow,ThreadLocalData & tld)1340 void FrameEncoder::processRowEncoder(int intRow, ThreadLocalData& tld)
1341 {
1342     const uint32_t row = (uint32_t)intRow;
1343     CTURow& curRow = m_rows[row];
1344 
1345     if (m_param->bEnableWavefront)
1346     {
1347         ScopedLock self(curRow.lock);
1348         if (!curRow.active)
1349             /* VBV restart is in progress, exit out */
1350             return;
1351         if (curRow.busy)
1352         {
1353             /* On multi-socket Windows servers, we have seen problems with
1354              * ATOMIC_CAS which resulted in multiple worker threads processing
1355              * the same CU row, which often resulted in bad pointer accesses. We
1356              * believe the problem is fixed, but are leaving this check in place
1357              * to prevent crashes in case it is not */
1358             x265_log(m_param, X265_LOG_WARNING,
1359                      "internal error - simultaneous row access detected. Please report HW to x265-devel@videolan.org\n");
1360             return;
1361         }
1362         curRow.busy = true;
1363     }
1364 
1365     /* When WPP is enabled, every row has its own row coder instance. Otherwise
1366      * they share row 0 */
1367     Entropy& rowCoder = m_param->bEnableWavefront ? curRow.rowGoOnCoder : m_rows[0].rowGoOnCoder;
1368     FrameData& curEncData = *m_frame->m_encData;
1369     Slice *slice = curEncData.m_slice;
1370 
1371     const uint32_t numCols = m_numCols;
1372     const uint32_t lineStartCUAddr = row * numCols;
1373     bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0;
1374 
1375     const uint32_t sliceId = curRow.sliceId;
1376     uint32_t maxBlockCols = (m_frame->m_fencPic->m_picWidth + (16 - 1)) / 16;
1377     uint32_t noOfBlocks = m_param->maxCUSize / 16;
1378     const uint32_t bFirstRowInSlice = ((row == 0) || (m_rows[row - 1].sliceId != curRow.sliceId)) ? 1 : 0;
1379     const uint32_t bLastRowInSlice = ((row == m_numRows - 1) || (m_rows[row + 1].sliceId != curRow.sliceId)) ? 1 : 0;
1380     const uint32_t endRowInSlicePlus1 = m_sliceBaseRow[sliceId + 1];
1381     const uint32_t rowInSlice = row - m_sliceBaseRow[sliceId];
1382 
1383     // Load SBAC coder context from previous row and initialize row state.
1384     if (bFirstRowInSlice && !curRow.completed)
1385         rowCoder.load(m_initSliceContext);
1386 
1387     // calculate mean QP for consistent deltaQP signalling calculation
1388     if (m_param->bOptCUDeltaQP)
1389     {
1390         ScopedLock self(curRow.lock);
1391         if (!curRow.avgQPComputed)
1392         {
1393             if (m_param->bEnableWavefront || !row)
1394             {
1395                 double meanQPOff = 0;
1396                 bool isReferenced = IS_REFERENCED(m_frame);
1397                 double *qpoffs = (isReferenced && m_param->rc.cuTree) ? m_frame->m_lowres.qpCuTreeOffset : m_frame->m_lowres.qpAqOffset;
1398                 if (qpoffs)
1399                 {
1400                     uint32_t loopIncr = (m_param->rc.qgSize == 8) ? 8 : 16;
1401 
1402                     uint32_t cuYStart = 0, height = m_frame->m_fencPic->m_picHeight;
1403                     if (m_param->bEnableWavefront)
1404                     {
1405                         cuYStart = intRow * m_param->maxCUSize;
1406                         height = cuYStart + m_param->maxCUSize;
1407                     }
1408 
1409                     uint32_t qgSize = m_param->rc.qgSize, width = m_frame->m_fencPic->m_picWidth;
1410                     uint32_t maxOffsetCols = (m_frame->m_fencPic->m_picWidth + (loopIncr - 1)) / loopIncr;
1411                     uint32_t count = 0;
1412                     for (uint32_t cuY = cuYStart; cuY < height && (cuY < m_frame->m_fencPic->m_picHeight); cuY += qgSize)
1413                     {
1414                         for (uint32_t cuX = 0; cuX < width; cuX += qgSize)
1415                         {
1416                             double qp_offset = 0;
1417                             uint32_t cnt = 0;
1418 
1419                             for (uint32_t block_yy = cuY; block_yy < cuY + qgSize && block_yy < m_frame->m_fencPic->m_picHeight; block_yy += loopIncr)
1420                             {
1421                                 for (uint32_t block_xx = cuX; block_xx < cuX + qgSize && block_xx < width; block_xx += loopIncr)
1422                                 {
1423                                     int idx = ((block_yy / loopIncr) * (maxOffsetCols)) + (block_xx / loopIncr);
1424                                     qp_offset += qpoffs[idx];
1425                                     cnt++;
1426                                 }
1427                             }
1428                             qp_offset /= cnt;
1429                             meanQPOff += qp_offset;
1430                             count++;
1431                         }
1432                     }
1433                     meanQPOff /= count;
1434                 }
1435                 rowCoder.m_meanQP = slice->m_sliceQp + meanQPOff;
1436             }
1437             else
1438             {
1439                 rowCoder.m_meanQP = m_rows[0].rowGoOnCoder.m_meanQP;
1440             }
1441             curRow.avgQPComputed = 1;
1442         }
1443     }
1444 
1445     // Initialize restrict on MV range in slices
1446     tld.analysis.m_sliceMinY = -(int32_t)(rowInSlice * m_param->maxCUSize * 4) + 3 * 4;
1447     tld.analysis.m_sliceMaxY = (int32_t)((endRowInSlicePlus1 - 1 - row) * (m_param->maxCUSize * 4) - 4 * 4);
1448 
1449     // Handle single row slice
1450     if (tld.analysis.m_sliceMaxY < tld.analysis.m_sliceMinY)
1451         tld.analysis.m_sliceMaxY = tld.analysis.m_sliceMinY = 0;
1452 
1453 
1454     while (curRow.completed < numCols)
1455     {
1456         ProfileScopeEvent(encodeCTU);
1457 
1458         const uint32_t col = curRow.completed;
1459         const uint32_t cuAddr = lineStartCUAddr + col;
1460         CUData* ctu = curEncData.getPicCTU(cuAddr);
1461         const uint32_t bLastCuInSlice = (bLastRowInSlice & (col == numCols - 1)) ? 1 : 0;
1462         ctu->initCTU(*m_frame, cuAddr, slice->m_sliceQp, bFirstRowInSlice, bLastRowInSlice, bLastCuInSlice);
1463 
1464         if (bIsVbv)
1465         {
1466             if (col == 0 && !m_param->bEnableWavefront)
1467             {
1468                 m_backupStreams[0].copyBits(&m_outStreams[0]);
1469                 curRow.bufferedEntropy.copyState(rowCoder);
1470                 curRow.bufferedEntropy.loadContexts(rowCoder);
1471             }
1472             if (bFirstRowInSlice && m_vbvResetTriggerRow != intRow)
1473             {
1474                 curEncData.m_rowStat[row].rowQp = curEncData.m_avgQpRc;
1475                 curEncData.m_rowStat[row].rowQpScale = x265_qp2qScale(curEncData.m_avgQpRc);
1476             }
1477 
1478             FrameData::RCStatCU& cuStat = curEncData.m_cuStat[cuAddr];
1479             if (m_param->bEnableWavefront && rowInSlice >= col && !bFirstRowInSlice && m_vbvResetTriggerRow != intRow)
1480                 cuStat.baseQp = curEncData.m_cuStat[cuAddr - numCols + 1].baseQp;
1481             else if (!m_param->bEnableWavefront && !bFirstRowInSlice && m_vbvResetTriggerRow != intRow)
1482                 cuStat.baseQp = curEncData.m_rowStat[row - 1].rowQp;
1483             else
1484                 cuStat.baseQp = curEncData.m_rowStat[row].rowQp;
1485 
1486             /* TODO: use defines from slicetype.h for lowres block size */
1487             uint32_t block_y = (ctu->m_cuPelY >> m_param->maxLog2CUSize) * noOfBlocks;
1488             uint32_t block_x = (ctu->m_cuPelX >> m_param->maxLog2CUSize) * noOfBlocks;
1489             if (!m_param->analysisLoad || !m_param->bDisableLookahead)
1490             {
1491                 cuStat.vbvCost = 0;
1492                 cuStat.intraVbvCost = 0;
1493 
1494                 for (uint32_t h = 0; h < noOfBlocks && block_y < m_sliceMaxBlockRow[sliceId + 1]; h++, block_y++)
1495                 {
1496                     uint32_t idx = block_x + (block_y * maxBlockCols);
1497 
1498                     for (uint32_t w = 0; w < noOfBlocks && (block_x + w) < maxBlockCols; w++, idx++)
1499                     {
1500                         cuStat.vbvCost += m_frame->m_lowres.lowresCostForRc[idx] & LOWRES_COST_MASK;
1501                         cuStat.intraVbvCost += m_frame->m_lowres.intraCost[idx];
1502                     }
1503                 }
1504             }
1505         }
1506         else
1507             curEncData.m_cuStat[cuAddr].baseQp = curEncData.m_avgQpRc;
1508 
1509         if (m_param->bEnableWavefront && !col && !bFirstRowInSlice)
1510         {
1511             // Load SBAC coder context from previous row and initialize row state.
1512             rowCoder.copyState(m_initSliceContext);
1513             rowCoder.loadContexts(m_rows[row - 1].bufferedEntropy);
1514         }
1515         if (m_param->dynamicRd && (int32_t)(m_rce.qpaRc - m_rce.qpNoVbv) > 0)
1516             ctu->m_vbvAffected = true;
1517 
1518         // Does all the CU analysis, returns best top level mode decision
1519         Mode& best = tld.analysis.compressCTU(*ctu, *m_frame, m_cuGeoms[m_ctuGeomMap[cuAddr]], rowCoder);
1520 
1521         /* startPoint > encodeOrder is true when the start point changes for
1522         a new GOP but few frames from the previous GOP is still incomplete.
1523         The data of frames in this interval will not be used by any future frames. */
1524         if (m_param->bDynamicRefine && m_top->m_startPoint <= m_frame->m_encodeOrder)
1525             collectDynDataRow(*ctu, &curRow.rowStats);
1526 
1527         // take a sample of the current active worker count
1528         ATOMIC_ADD(&m_totalActiveWorkerCount, m_activeWorkerCount);
1529         ATOMIC_INC(&m_activeWorkerCountSamples);
1530 
1531         /* advance top-level row coder to include the context of this CTU.
1532          * if SAO is disabled, rowCoder writes the final CTU bitstream */
1533         rowCoder.encodeCTU(*ctu, m_cuGeoms[m_ctuGeomMap[cuAddr]]);
1534 
1535         if (m_param->bEnableWavefront && col == 1)
1536             // Save CABAC state for next row
1537             curRow.bufferedEntropy.loadContexts(rowCoder);
1538 
1539         /* SAO parameter estimation using non-deblocked pixels for CTU bottom and right boundary areas */
1540         if (slice->m_bUseSao && m_param->bSaoNonDeblocked)
1541             m_frameFilter.m_parallelFilter[row].m_sao.calcSaoStatsCu_BeforeDblk(m_frame, col, row);
1542 
1543         /* Deblock with idle threading */
1544         if (m_param->bEnableLoopFilter | slice->m_bUseSao)
1545         {
1546             // NOTE: in VBV mode, we may reencode anytime, so we can't do Deblock stage-Horizon and SAO
1547             if (!bIsVbv)
1548             {
1549                 // Delay one row to avoid intra prediction conflict
1550                 if (m_pool && !bFirstRowInSlice)
1551                 {
1552                     int allowCol = col;
1553 
1554                     // avoid race condition on last column
1555                     if (rowInSlice >= 2)
1556                     {
1557                         allowCol = X265_MIN(((col == numCols - 1) ? m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.get()
1558                                                                   : m_frameFilter.m_parallelFilter[row - 2].m_lastCol.get()), (int)col);
1559                     }
1560                     m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(allowCol);
1561                 }
1562 
1563                 // Last Row may start early
1564                 if (m_pool && bLastRowInSlice)
1565                 {
1566                     // Deblocking last row
1567                     int allowCol = col;
1568 
1569                     // avoid race condition on last column
1570                     if (rowInSlice >= 2)
1571                     {
1572                         allowCol = X265_MIN(((col == numCols - 1) ? m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.get()
1573                                                                   : m_frameFilter.m_parallelFilter[row - 1].m_lastCol.get()), (int)col);
1574                     }
1575                     m_frameFilter.m_parallelFilter[row].m_allowedCol.set(allowCol);
1576                 }
1577             } // end of !bIsVbv
1578         }
1579         // Both Loopfilter and SAO Disabled
1580         else
1581         {
1582             m_frameFilter.m_parallelFilter[row].processPostCu(col);
1583         }
1584 
1585         // Completed CU processing
1586         curRow.completed++;
1587 
1588         FrameStats frameLog;
1589         curEncData.m_rowStat[row].sumQpAq += collectCTUStatistics(*ctu, &frameLog);
1590 
1591         // copy number of intra, inter cu per row into frame stats for 2 pass
1592         if (m_param->rc.bStatWrite)
1593         {
1594             curRow.rowStats.mvBits    += best.mvBits;
1595             curRow.rowStats.coeffBits += best.coeffBits;
1596             curRow.rowStats.miscBits  += best.totalBits - (best.mvBits + best.coeffBits);
1597 
1598             for (uint32_t depth = 0; depth <= m_param->maxCUDepth; depth++)
1599             {
1600                 /* 1 << shift == number of 8x8 blocks at current depth */
1601                 int shift = 2 * (m_param->maxCUDepth - depth);
1602                 int cuSize = m_param->maxCUSize >> depth;
1603 
1604                 curRow.rowStats.intra8x8Cnt += (cuSize == 8) ? (int)(frameLog.cntIntra[depth] + frameLog.cntIntraNxN) :
1605                                                                (int)(frameLog.cntIntra[depth] << shift);
1606 
1607                 curRow.rowStats.inter8x8Cnt += (int)(frameLog.cntInter[depth] << shift);
1608                 curRow.rowStats.skip8x8Cnt += (int)((frameLog.cntSkipCu[depth] + frameLog.cntMergeCu[depth]) << shift);
1609             }
1610         }
1611         curRow.rowStats.totalCtu++;
1612         curRow.rowStats.lumaDistortion   += best.lumaDistortion;
1613         curRow.rowStats.chromaDistortion += best.chromaDistortion;
1614         curRow.rowStats.psyEnergy        += best.psyEnergy;
1615         curRow.rowStats.ssimEnergy       += best.ssimEnergy;
1616         curRow.rowStats.resEnergy        += best.resEnergy;
1617         curRow.rowStats.cntIntraNxN      += frameLog.cntIntraNxN;
1618         curRow.rowStats.totalCu          += frameLog.totalCu;
1619         for (uint32_t depth = 0; depth <= m_param->maxCUDepth; depth++)
1620         {
1621             curRow.rowStats.cntSkipCu[depth] += frameLog.cntSkipCu[depth];
1622             curRow.rowStats.cntMergeCu[depth] += frameLog.cntMergeCu[depth];
1623             for (int m = 0; m < INTER_MODES; m++)
1624                 curRow.rowStats.cuInterDistribution[depth][m] += frameLog.cuInterDistribution[depth][m];
1625             for (int n = 0; n < INTRA_MODES; n++)
1626                 curRow.rowStats.cuIntraDistribution[depth][n] += frameLog.cuIntraDistribution[depth][n];
1627         }
1628 
1629         curEncData.m_cuStat[cuAddr].totalBits = best.totalBits;
1630         x265_emms();
1631 
1632         if (bIsVbv)
1633         {
1634             // Update encoded bits, satdCost, baseQP for each CU if tune grain is disabled
1635             FrameData::RCStatCU& cuStat = curEncData.m_cuStat[cuAddr];
1636             if ((m_param->bEnableWavefront && ((cuAddr == m_sliceBaseRow[sliceId] * numCols) || !m_param->rc.bEnableConstVbv)) || !m_param->bEnableWavefront)
1637             {
1638                 curEncData.m_rowStat[row].rowSatd += cuStat.vbvCost;
1639                 curEncData.m_rowStat[row].rowIntraSatd += cuStat.intraVbvCost;
1640                 curEncData.m_rowStat[row].encodedBits += cuStat.totalBits;
1641                 curEncData.m_rowStat[row].sumQpRc += cuStat.baseQp;
1642                 curEncData.m_rowStat[row].numEncodedCUs = cuAddr;
1643             }
1644 
1645             // If current block is at row end checkpoint, call vbv ratecontrol.
1646             if (!m_param->bEnableWavefront && col == numCols - 1)
1647             {
1648                 double qpBase = curEncData.m_cuStat[cuAddr].baseQp;
1649                 curRow.reEncode = m_top->m_rateControl->rowVbvRateControl(m_frame, row, &m_rce, qpBase, m_sliceBaseRow, sliceId);
1650                 qpBase = x265_clip3((double)m_param->rc.qpMin, (double)m_param->rc.qpMax, qpBase);
1651                 curEncData.m_rowStat[row].rowQp = qpBase;
1652                 curEncData.m_rowStat[row].rowQpScale = x265_qp2qScale(qpBase);
1653                 if (curRow.reEncode < 0)
1654                 {
1655                     x265_log(m_param, X265_LOG_DEBUG, "POC %d row %d - encode restart required for VBV, to %.2f from %.2f\n",
1656                         m_frame->m_poc, row, qpBase, curEncData.m_cuStat[cuAddr].baseQp);
1657 
1658                     m_vbvResetTriggerRow = row;
1659                     m_outStreams[0].copyBits(&m_backupStreams[0]);
1660 
1661                     rowCoder.copyState(curRow.bufferedEntropy);
1662                     rowCoder.loadContexts(curRow.bufferedEntropy);
1663 
1664                     curRow.completed = 0;
1665                     memset(&curRow.rowStats, 0, sizeof(curRow.rowStats));
1666                     curEncData.m_rowStat[row].numEncodedCUs = 0;
1667                     curEncData.m_rowStat[row].encodedBits = 0;
1668                     curEncData.m_rowStat[row].rowSatd = 0;
1669                     curEncData.m_rowStat[row].rowIntraSatd = 0;
1670                     curEncData.m_rowStat[row].sumQpRc = 0;
1671                     curEncData.m_rowStat[row].sumQpAq = 0;
1672                 }
1673             }
1674             // If current block is at row diagonal checkpoint, call vbv ratecontrol.
1675             else if (m_param->bEnableWavefront && rowInSlice == col && !bFirstRowInSlice)
1676             {
1677                 if (m_param->rc.bEnableConstVbv)
1678                 {
1679                     uint32_t startCuAddr = numCols * row;
1680                     uint32_t EndCuAddr = startCuAddr + col;
1681 
1682                     for (int32_t r = row; r >= (int32_t)m_sliceBaseRow[sliceId]; r--)
1683                     {
1684                         for (uint32_t c = startCuAddr; c <= EndCuAddr && c <= numCols * (r + 1) - 1; c++)
1685                         {
1686                             curEncData.m_rowStat[r].rowSatd += curEncData.m_cuStat[c].vbvCost;
1687                             curEncData.m_rowStat[r].rowIntraSatd += curEncData.m_cuStat[c].intraVbvCost;
1688                             curEncData.m_rowStat[r].encodedBits += curEncData.m_cuStat[c].totalBits;
1689                             curEncData.m_rowStat[r].sumQpRc += curEncData.m_cuStat[c].baseQp;
1690                             curEncData.m_rowStat[r].numEncodedCUs = c;
1691                         }
1692                         if (curRow.reEncode < 0)
1693                             break;
1694                         startCuAddr = EndCuAddr - numCols;
1695                         EndCuAddr = startCuAddr + 1;
1696                     }
1697                 }
1698                 double qpBase = curEncData.m_cuStat[cuAddr].baseQp;
1699                 curRow.reEncode = m_top->m_rateControl->rowVbvRateControl(m_frame, row, &m_rce, qpBase, m_sliceBaseRow, sliceId);
1700                 qpBase = x265_clip3((double)m_param->rc.qpMin, (double)m_param->rc.qpMax, qpBase);
1701                 curEncData.m_rowStat[row].rowQp = qpBase;
1702                 curEncData.m_rowStat[row].rowQpScale = x265_qp2qScale(qpBase);
1703 
1704                 if (curRow.reEncode < 0)
1705                 {
1706                     x265_log(m_param, X265_LOG_DEBUG, "POC %d row %d - encode restart required for VBV, to %.2f from %.2f\n",
1707                              m_frame->m_poc, row, qpBase, curEncData.m_cuStat[cuAddr].baseQp);
1708 
1709                     // prevent the WaveFront::findJob() method from providing new jobs
1710                     m_vbvResetTriggerRow = row;
1711                     m_bAllRowsStop = true;
1712 
1713                     for (uint32_t r = m_sliceBaseRow[sliceId + 1] - 1; r >= row; r--)
1714                     {
1715                         CTURow& stopRow = m_rows[r];
1716 
1717                         if (r != row)
1718                         {
1719                             /* if row was active (ready to be run) clear active bit and bitmap bit for this row */
1720                             stopRow.lock.acquire();
1721                             while (stopRow.active)
1722                             {
1723                                 if (dequeueRow(r * 2))
1724                                     stopRow.active = false;
1725                                 else
1726                                 {
1727                                     /* we must release the row lock to allow the thread to exit */
1728                                     stopRow.lock.release();
1729                                     GIVE_UP_TIME();
1730                                     stopRow.lock.acquire();
1731                                 }
1732                             }
1733                             stopRow.lock.release();
1734 
1735                             bool bRowBusy = true;
1736                             do
1737                             {
1738                                 stopRow.lock.acquire();
1739                                 bRowBusy = stopRow.busy;
1740                                 stopRow.lock.release();
1741 
1742                                 if (bRowBusy)
1743                                 {
1744                                     GIVE_UP_TIME();
1745                                 }
1746                             }
1747                             while (bRowBusy);
1748                         }
1749 
1750                         m_outStreams[r].resetBits();
1751                         stopRow.completed = 0;
1752                         memset(&stopRow.rowStats, 0, sizeof(stopRow.rowStats));
1753                         curEncData.m_rowStat[r].numEncodedCUs = 0;
1754                         curEncData.m_rowStat[r].encodedBits = 0;
1755                         curEncData.m_rowStat[r].rowSatd = 0;
1756                         curEncData.m_rowStat[r].rowIntraSatd = 0;
1757                         curEncData.m_rowStat[r].sumQpRc = 0;
1758                         curEncData.m_rowStat[r].sumQpAq = 0;
1759                     }
1760 
1761                     m_bAllRowsStop = false;
1762                 }
1763             }
1764         }
1765 
1766         if (m_param->bEnableWavefront && curRow.completed >= 2 && !bLastRowInSlice &&
1767             (!m_bAllRowsStop || intRow + 1 < m_vbvResetTriggerRow))
1768         {
1769             /* activate next row */
1770             ScopedLock below(m_rows[row + 1].lock);
1771 
1772             if (m_rows[row + 1].active == false &&
1773                 m_rows[row + 1].completed + 2 <= curRow.completed)
1774             {
1775                 m_rows[row + 1].active = true;
1776                 enqueueRowEncoder(m_row_to_idx[row + 1]);
1777                 tryWakeOne(); /* wake up a sleeping thread or set the help wanted flag */
1778             }
1779         }
1780 
1781         ScopedLock self(curRow.lock);
1782         if ((m_bAllRowsStop && intRow > m_vbvResetTriggerRow) ||
1783             (!bFirstRowInSlice && ((curRow.completed < numCols - 1) || (m_rows[row - 1].completed < numCols)) && m_rows[row - 1].completed < curRow.completed + 2))
1784         {
1785             curRow.active = false;
1786             curRow.busy = false;
1787             ATOMIC_INC(&m_countRowBlocks);
1788             return;
1789         }
1790     }
1791 
1792     /* this row of CTUs has been compressed */
1793     if (m_param->bEnableWavefront && m_param->rc.bEnableConstVbv)
1794     {
1795         if (bLastRowInSlice)
1796         {
1797             for (uint32_t r = m_sliceBaseRow[sliceId]; r < m_sliceBaseRow[sliceId + 1]; r++)
1798             {
1799                 for (uint32_t c = curEncData.m_rowStat[r].numEncodedCUs + 1; c < numCols * (r + 1); c++)
1800                 {
1801                     curEncData.m_rowStat[r].rowSatd += curEncData.m_cuStat[c].vbvCost;
1802                     curEncData.m_rowStat[r].rowIntraSatd += curEncData.m_cuStat[c].intraVbvCost;
1803                     curEncData.m_rowStat[r].encodedBits += curEncData.m_cuStat[c].totalBits;
1804                     curEncData.m_rowStat[r].sumQpRc += curEncData.m_cuStat[c].baseQp;
1805                     curEncData.m_rowStat[r].numEncodedCUs = c;
1806                 }
1807             }
1808         }
1809     }
1810 
    /* If encoding with ABR, update bits and complexity in rate control
     * after a number of rows so the next frame's rateControlStart has more
     * accurate data for estimation. At the start of the encode we update stats
     * after half the frame is encoded, but after this initial period we update
     * after refLagRows (the number of rows reference frames must have completed
     * before the frames referencing them may begin encoding) */
1817     if (m_param->rc.rateControlMode == X265_RC_ABR || bIsVbv)
1818     {
1819         uint32_t rowCount = 0;
1820         uint32_t maxRows = m_sliceBaseRow[sliceId + 1] - m_sliceBaseRow[sliceId];
1821 
1822         if (!m_rce.encodeOrder)
1823             rowCount = maxRows - 1;
1824         else if ((uint32_t)m_rce.encodeOrder <= 2 * (m_param->fpsNum / m_param->fpsDenom))
1825             rowCount = X265_MIN((maxRows + 1) / 2, maxRows - 1);
1826         else
1827             rowCount = X265_MIN(m_refLagRows / m_param->maxSlices, maxRows - 1);
1828 
1829         if (rowInSlice == rowCount)
1830         {
1831             m_rowSliceTotalBits[sliceId] = 0;
1832             if (bIsVbv && !(m_param->rc.bEnableConstVbv && m_param->bEnableWavefront))
1833             {
1834                 for (uint32_t i = m_sliceBaseRow[sliceId]; i < rowCount + m_sliceBaseRow[sliceId]; i++)
1835                     m_rowSliceTotalBits[sliceId] += curEncData.m_rowStat[i].encodedBits;
1836             }
1837             else
1838             {
1839                 uint32_t startAddr = m_sliceBaseRow[sliceId] * numCols;
1840                 uint32_t finishAddr = startAddr + rowCount * numCols;
1841 
1842                 for (uint32_t cuAddr = startAddr; cuAddr < finishAddr; cuAddr++)
1843                     m_rowSliceTotalBits[sliceId] += curEncData.m_cuStat[cuAddr].totalBits;
1844             }
1845 
1846             if (ATOMIC_INC(&m_sliceCnt) == (int)m_param->maxSlices)
1847             {
1848                 m_rce.rowTotalBits = 0;
1849                 for (uint32_t i = 0; i < m_param->maxSlices; i++)
1850                     m_rce.rowTotalBits += m_rowSliceTotalBits[i];
1851                 m_top->m_rateControl->rateControlUpdateStats(&m_rce);
1852             }
1853         }
1854     }
1855 
1856     /* flush row bitstream (if WPP and no SAO) or flush frame if no WPP and no SAO */
1857     /* end_of_sub_stream_one_bit / end_of_slice_segment_flag */
1858        if (!slice->m_bUseSao && (m_param->bEnableWavefront || bLastRowInSlice))
1859                rowCoder.finishSlice();
1860 
1861 
1862     /* Processing left Deblock block with current threading */
1863     if ((m_param->bEnableLoopFilter | slice->m_bUseSao) & (rowInSlice >= 2))
1864     {
1865         /* Check conditional to start previous row process with current threading */
1866         if (m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.get() == (int)numCols)
1867         {
1868             /* stop threading on current row and restart it */
1869             m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(numCols);
1870             m_frameFilter.m_parallelFilter[row - 1].processTasks(-1);
1871         }
1872     }
1873 
1874     /* trigger row-wise loop filters */
1875     if (m_param->bEnableWavefront)
1876     {
1877         if (rowInSlice >= m_filterRowDelay)
1878         {
1879             enableRowFilter(m_row_to_idx[row - m_filterRowDelay]);
1880 
1881             /* NOTE: Activate filter if first row (row 0) */
1882             if (rowInSlice == m_filterRowDelay)
1883                 enqueueRowFilter(m_row_to_idx[row - m_filterRowDelay]);
1884             tryWakeOne();
1885         }
1886 
1887         if (bLastRowInSlice)
1888         {
1889             for (uint32_t i = endRowInSlicePlus1 - m_filterRowDelay; i < endRowInSlicePlus1; i++)
1890             {
1891                 enableRowFilter(m_row_to_idx[i]);
1892             }
1893             tryWakeOne();
1894         }
1895 
1896         // handle specially case - single row slice
1897         if  (bFirstRowInSlice & bLastRowInSlice)
1898         {
1899             enqueueRowFilter(m_row_to_idx[row]);
1900             tryWakeOne();
1901         }
1902     }
1903 
1904     curRow.busy = false;
1905 
1906     // CHECK_ME: Does it always FALSE condition?
1907     if (ATOMIC_INC(&m_completionCount) == 2 * (int)m_numRows)
1908         m_completionEvent.trigger();
1909 }
1910 
collectDynDataRow(CUData & ctu,FrameStats * rowStats)1911 void FrameEncoder::collectDynDataRow(CUData& ctu, FrameStats* rowStats)
1912 {
1913     for (uint32_t i = 0; i < X265_REFINE_INTER_LEVELS; i++)
1914     {
1915         for (uint32_t depth = 0; depth < m_param->maxCUDepth; depth++)
1916         {
1917             int offset = (depth * X265_REFINE_INTER_LEVELS) + i;
1918             if (ctu.m_collectCUCount[offset])
1919             {
1920                 rowStats->rowVarDyn[offset] += ctu.m_collectCUVariance[offset];
1921                 rowStats->rowRdDyn[offset] += ctu.m_collectCURd[offset];
1922                 rowStats->rowCntDyn[offset] += ctu.m_collectCUCount[offset];
1923             }
1924         }
1925     }
1926 }
1927 
collectDynDataFrame()1928 void FrameEncoder::collectDynDataFrame()
1929 {
1930     for (uint32_t row = 0; row < m_numRows; row++)
1931     {
1932         for (uint32_t refLevel = 0; refLevel < X265_REFINE_INTER_LEVELS; refLevel++)
1933         {
1934             for (uint32_t depth = 0; depth < m_param->maxCUDepth; depth++)
1935             {
1936                 int offset = (depth * X265_REFINE_INTER_LEVELS) + refLevel;
1937                 int curFrameIndex = m_frame->m_encodeOrder - m_top->m_startPoint;
1938                 int index = (curFrameIndex * X265_REFINE_INTER_LEVELS * m_param->maxCUDepth) + offset;
1939                 if (m_rows[row].rowStats.rowCntDyn[offset])
1940                 {
1941                     m_top->m_variance[index] += m_rows[row].rowStats.rowVarDyn[offset];
1942                     m_top->m_rdCost[index] += m_rows[row].rowStats.rowRdDyn[offset];
1943                     m_top->m_trainingCount[index] += m_rows[row].rowStats.rowCntDyn[offset];
1944                 }
1945             }
1946         }
1947     }
1948 }
1949 
computeAvgTrainingData()1950 void FrameEncoder::computeAvgTrainingData()
1951 {
1952     if (m_frame->m_lowres.bScenecut || m_frame->m_lowres.bKeyframe)
1953     {
1954         m_top->m_startPoint = m_frame->m_encodeOrder;
1955         int size = (m_param->keyframeMax + m_param->lookaheadDepth) * m_param->maxCUDepth * X265_REFINE_INTER_LEVELS;
1956         memset(m_top->m_variance, 0, size * sizeof(uint64_t));
1957         memset(m_top->m_rdCost, 0, size * sizeof(uint64_t));
1958         memset(m_top->m_trainingCount, 0, size * sizeof(uint32_t));
1959     }
1960     if (m_frame->m_encodeOrder - m_top->m_startPoint < 2 * m_param->frameNumThreads)
1961         m_frame->m_classifyFrame = false;
1962     else
1963         m_frame->m_classifyFrame = true;
1964 
1965     int size = m_param->maxCUDepth * X265_REFINE_INTER_LEVELS;
1966     memset(m_frame->m_classifyRd, 0, size * sizeof(uint64_t));
1967     memset(m_frame->m_classifyVariance, 0, size * sizeof(uint64_t));
1968     memset(m_frame->m_classifyCount, 0, size * sizeof(uint32_t));
1969     if (m_frame->m_classifyFrame)
1970     {
1971         uint32_t limit = m_frame->m_encodeOrder - m_top->m_startPoint - m_param->frameNumThreads;
1972         for (uint32_t i = 1; i < limit; i++)
1973         {
1974             for (uint32_t j = 0; j < X265_REFINE_INTER_LEVELS; j++)
1975             {
1976                 for (uint32_t depth = 0; depth < m_param->maxCUDepth; depth++)
1977                 {
1978                     int offset = (depth * X265_REFINE_INTER_LEVELS) + j;
1979                     int index = (i* X265_REFINE_INTER_LEVELS * m_param->maxCUDepth) + offset;
1980                     if (m_top->m_trainingCount[index])
1981                     {
1982                         m_frame->m_classifyRd[offset] += m_top->m_rdCost[index] / m_top->m_trainingCount[index];
1983                         m_frame->m_classifyVariance[offset] += m_top->m_variance[index] / m_top->m_trainingCount[index];
1984                         m_frame->m_classifyCount[offset] += m_top->m_trainingCount[index];
1985                     }
1986                 }
1987             }
1988         }
1989         /* Calculates the average feature values of historic frames that are being considered for the current frame */
1990         int historyCount = m_frame->m_encodeOrder - m_param->frameNumThreads - m_top->m_startPoint - 1;
1991         if (historyCount)
1992         {
1993             for (uint32_t j = 0; j < X265_REFINE_INTER_LEVELS; j++)
1994             {
1995                 for (uint32_t depth = 0; depth < m_param->maxCUDepth; depth++)
1996                 {
1997                     int offset = (depth * X265_REFINE_INTER_LEVELS) + j;
1998                     m_frame->m_classifyRd[offset] /= historyCount;
1999                     m_frame->m_classifyVariance[offset] /= historyCount;
2000                 }
2001             }
2002         }
2003     }
2004 }
2005 
2006 /* collect statistics about CU coding decisions, return total QP */
collectCTUStatistics(const CUData & ctu,FrameStats * log)2007 int FrameEncoder::collectCTUStatistics(const CUData& ctu, FrameStats* log)
2008 {
2009     int totQP = 0;
2010     uint32_t depth = 0;
2011     for (uint32_t absPartIdx = 0; absPartIdx < ctu.m_numPartitions; absPartIdx += ctu.m_numPartitions >> (depth * 2))
2012     {
2013         depth = ctu.m_cuDepth[absPartIdx];
2014         totQP += ctu.m_qp[absPartIdx] * (ctu.m_numPartitions >> (depth * 2));
2015     }
2016 
2017     if (m_param->csvLogLevel >= 1 || m_param->rc.bStatWrite)
2018     {
2019         if (ctu.m_slice->m_sliceType == I_SLICE)
2020         {
2021             depth = 0;
2022             for (uint32_t absPartIdx = 0; absPartIdx < ctu.m_numPartitions; absPartIdx += ctu.m_numPartitions >> (depth * 2))
2023             {
2024                 depth = ctu.m_cuDepth[absPartIdx];
2025 
2026                 log->totalCu++;
2027                 log->cntIntra[depth]++;
2028 
2029                 if (ctu.m_predMode[absPartIdx] == MODE_NONE)
2030                 {
2031                     log->totalCu--;
2032                     log->cntIntra[depth]--;
2033                 }
2034                 else if (ctu.m_partSize[absPartIdx] != SIZE_2Nx2N)
2035                 {
2036                     /* TODO: log intra modes at absPartIdx +0 to +3 */
2037                     X265_CHECK(ctu.m_log2CUSize[absPartIdx] == 3 && ctu.m_slice->m_sps->quadtreeTULog2MinSize < 3, "Intra NxN found at improbable depth\n");
2038                     log->cntIntraNxN++;
2039                     log->cntIntra[depth]--;
2040                 }
2041                 else if (ctu.m_lumaIntraDir[absPartIdx] > 1)
2042                     log->cuIntraDistribution[depth][ANGULAR_MODE_ID]++;
2043                 else
2044                     log->cuIntraDistribution[depth][ctu.m_lumaIntraDir[absPartIdx]]++;
2045             }
2046         }
2047         else
2048         {
2049             depth = 0;
2050             for (uint32_t absPartIdx = 0; absPartIdx < ctu.m_numPartitions; absPartIdx += ctu.m_numPartitions >> (depth * 2))
2051             {
2052                 depth = ctu.m_cuDepth[absPartIdx];
2053 
2054                 log->totalCu++;
2055 
2056                 if (ctu.m_predMode[absPartIdx] == MODE_NONE)
2057                     log->totalCu--;
2058                 else if (ctu.isSkipped(absPartIdx))
2059                 {
2060                     if (ctu.m_mergeFlag[0])
2061                         log->cntMergeCu[depth]++;
2062                     else
2063                         log->cntSkipCu[depth]++;
2064                 }
2065                 else if (ctu.isInter(absPartIdx))
2066                 {
2067                     log->cntInter[depth]++;
2068 
2069                     if (ctu.m_partSize[absPartIdx] < AMP_ID)
2070                         log->cuInterDistribution[depth][ctu.m_partSize[absPartIdx]]++;
2071                     else
2072                         log->cuInterDistribution[depth][AMP_ID]++;
2073                 }
2074                 else if (ctu.isIntra(absPartIdx))
2075                 {
2076                     log->cntIntra[depth]++;
2077 
2078                     if (ctu.m_partSize[absPartIdx] != SIZE_2Nx2N)
2079                     {
2080                         X265_CHECK(ctu.m_log2CUSize[absPartIdx] == 3 && ctu.m_slice->m_sps->quadtreeTULog2MinSize < 3, "Intra NxN found at improbable depth\n");
2081                         log->cntIntraNxN++;
2082                         log->cntIntra[depth]--;
2083                         /* TODO: log intra modes at absPartIdx +0 to +3 */
2084                     }
2085                     else if (ctu.m_lumaIntraDir[absPartIdx] > 1)
2086                         log->cuIntraDistribution[depth][ANGULAR_MODE_ID]++;
2087                     else
2088                         log->cuIntraDistribution[depth][ctu.m_lumaIntraDir[absPartIdx]]++;
2089                 }
2090             }
2091         }
2092     }
2093 
2094     return totQP;
2095 }
2096 
2097 /* DCT-domain noise reduction / adaptive deadzone from libavcodec */
noiseReductionUpdate()2098 void FrameEncoder::noiseReductionUpdate()
2099 {
2100     static const uint32_t maxBlocksPerTrSize[4] = {1 << 18, 1 << 16, 1 << 14, 1 << 12};
2101 
2102     for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++)
2103     {
2104         int trSize = cat & 3;
2105         int coefCount = 1 << ((trSize + 2) * 2);
2106 
2107         if (m_nr->nrCount[cat] > maxBlocksPerTrSize[trSize])
2108         {
2109             for (int i = 0; i < coefCount; i++)
2110                 m_nr->nrResidualSum[cat][i] >>= 1;
2111             m_nr->nrCount[cat] >>= 1;
2112         }
2113 
2114         int nrStrength = cat < 8 ? m_param->noiseReductionIntra : m_param->noiseReductionInter;
2115         uint64_t scaledCount = (uint64_t)nrStrength * m_nr->nrCount[cat];
2116 
2117         for (int i = 0; i < coefCount; i++)
2118         {
2119             uint64_t value = scaledCount + m_nr->nrResidualSum[cat][i] / 2;
2120             uint64_t denom = m_nr->nrResidualSum[cat][i] + 1;
2121             m_nr->nrOffsetDenoise[cat][i] = (uint16_t)(value / denom);
2122         }
2123 
2124         // Don't denoise DC coefficients
2125         m_nr->nrOffsetDenoise[cat][0] = 0;
2126     }
2127 }
2128 #if ENABLE_LIBVMAF
vmafFrameLevelScore()2129 void FrameEncoder::vmafFrameLevelScore()
2130 {
2131     PicYuv *fenc = m_frame->m_fencPic;
2132     PicYuv *recon = m_frame->m_reconPic;
2133 
2134     x265_vmaf_framedata *vmafframedata = (x265_vmaf_framedata*)x265_malloc(sizeof(x265_vmaf_framedata));
2135     if (!vmafframedata)
2136     {
2137         x265_log(NULL, X265_LOG_ERROR, "vmaf frame data alloc failed\n");
2138     }
2139 
2140     vmafframedata->height = fenc->m_picHeight;
2141     vmafframedata->width = fenc->m_picWidth;
2142     vmafframedata->frame_set = 0;
2143     vmafframedata->internalBitDepth = m_param->internalBitDepth;
2144     vmafframedata->reference_frame = fenc;
2145     vmafframedata->distorted_frame = recon;
2146 
2147     fenc->m_vmafScore = x265_calculate_vmaf_framelevelscore(vmafframedata);
2148 
2149     if (vmafframedata)
2150     x265_free(vmafframedata);
2151 }
2152 #endif
2153 
getEncodedPicture(NALList & output)2154 Frame *FrameEncoder::getEncodedPicture(NALList& output)
2155 {
2156     if (m_frame)
2157     {
2158         /* block here until worker thread completes */
2159         m_done.wait();
2160 
2161         Frame *ret = m_frame;
2162         m_frame = NULL;
2163         output.takeContents(m_nalList);
2164         m_prevOutputTime = x265_mdate();
2165         return ret;
2166     }
2167 
2168     return NULL;
2169 }
2170 }
2171