1 /*****************************************************************************
2 * Copyright (C) 2013-2020 MulticoreWare, Inc
3 *
4 * Authors: Chung Shin Yee <shinyee@multicorewareinc.com>
5 * Min Chen <chenm003@163.com>
6 * Steve Borho <steve@borho.org>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21 *
22 * This program is also available under a commercial proprietary license.
23 * For more information, contact us at license @ x265.com.
24 *****************************************************************************/
25
26 #include "common.h"
27 #include "frame.h"
28 #include "framedata.h"
29 #include "wavefront.h"
30 #include "param.h"
31
32 #include "encoder.h"
33 #include "frameencoder.h"
34 #include "common.h"
35 #include "slicetype.h"
36 #include "nal.h"
37
38 namespace X265_NS {
39 void weightAnalyse(Slice& slice, Frame& frame, x265_param& param);
40
FrameEncoder()41 FrameEncoder::FrameEncoder()
42 {
43 m_prevOutputTime = x265_mdate();
44 m_reconfigure = false;
45 m_isFrameEncoder = true;
46 m_threadActive = true;
47 m_slicetypeWaitTime = 0;
48 m_activeWorkerCount = 0;
49 m_completionCount = 0;
50 m_bAllRowsStop = false;
51 m_vbvResetTriggerRow = -1;
52 m_outStreams = NULL;
53 m_backupStreams = NULL;
54 m_substreamSizes = NULL;
55 m_nr = NULL;
56 m_tld = NULL;
57 m_rows = NULL;
58 m_top = NULL;
59 m_param = NULL;
60 m_frame = NULL;
61 m_cuGeoms = NULL;
62 m_ctuGeomMap = NULL;
63 m_localTldIdx = 0;
64 memset(&m_rce, 0, sizeof(RateControlEntry));
65 }
66
destroy()67 void FrameEncoder::destroy()
68 {
69 if (m_pool)
70 {
71 if (!m_jpId)
72 {
73 int numTLD = m_pool->m_numWorkers;
74 if (!m_param->bEnableWavefront)
75 numTLD += m_pool->m_numProviders;
76 for (int i = 0; i < numTLD; i++)
77 m_tld[i].destroy();
78 delete [] m_tld;
79 }
80 }
81 else
82 {
83 m_tld->destroy();
84 delete m_tld;
85 }
86
87 delete[] m_rows;
88 delete[] m_outStreams;
89 delete[] m_backupStreams;
90 X265_FREE(m_sliceBaseRow);
91 X265_FREE(m_sliceMaxBlockRow);
92 X265_FREE(m_cuGeoms);
93 X265_FREE(m_ctuGeomMap);
94 X265_FREE(m_substreamSizes);
95 X265_FREE(m_nr);
96
97 m_frameFilter.destroy();
98
99 if (m_param->bEmitHRDSEI || !!m_param->interlaceMode)
100 {
101 delete m_rce.picTimingSEI;
102 delete m_rce.hrdTiming;
103 }
104 }
105
init(Encoder * top,int numRows,int numCols)106 bool FrameEncoder::init(Encoder *top, int numRows, int numCols)
107 {
108 m_top = top;
109 m_param = top->m_param;
110 m_numRows = numRows;
111 m_numCols = numCols;
112 m_reconfigure = false;
113 m_filterRowDelay = ((m_param->bEnableSAO && m_param->bSaoNonDeblocked)
114 || (!m_param->bEnableLoopFilter && m_param->bEnableSAO)) ?
115 2 : (m_param->bEnableSAO || m_param->bEnableLoopFilter ? 1 : 0);
116 m_filterRowDelayCus = m_filterRowDelay * numCols;
117 m_rows = new CTURow[m_numRows];
118 bool ok = !!m_numRows;
119
120 m_sliceBaseRow = X265_MALLOC(uint32_t, m_param->maxSlices + 1);
121 ok &= !!m_sliceBaseRow;
122 m_sliceGroupSize = (uint16_t)(m_numRows + m_param->maxSlices - 1) / m_param->maxSlices;
123 uint32_t sliceGroupSizeAccu = (m_numRows << 8) / m_param->maxSlices;
124 uint32_t rowSum = sliceGroupSizeAccu;
125 uint32_t sidx = 0;
126 for (uint32_t i = 0; i < m_numRows; i++)
127 {
128 const uint32_t rowRange = (rowSum >> 8);
129 if ((i >= rowRange) & (sidx != m_param->maxSlices - 1))
130 {
131 rowSum += sliceGroupSizeAccu;
132 m_sliceBaseRow[++sidx] = i;
133 }
134 }
135 X265_CHECK(sidx < m_param->maxSlices, "sliceID check failed!");
136 m_sliceBaseRow[0] = 0;
137 m_sliceBaseRow[m_param->maxSlices] = m_numRows;
138
139 m_sliceMaxBlockRow = X265_MALLOC(uint32_t, m_param->maxSlices + 1);
140 ok &= !!m_sliceMaxBlockRow;
141 uint32_t maxBlockRows = (m_param->sourceHeight + (16 - 1)) / 16;
142 sliceGroupSizeAccu = (maxBlockRows << 8) / m_param->maxSlices;
143 rowSum = sliceGroupSizeAccu;
144 sidx = 0;
145 for (uint32_t i = 0; i < maxBlockRows; i++)
146 {
147 const uint32_t rowRange = (rowSum >> 8);
148 if ((i >= rowRange) & (sidx != m_param->maxSlices - 1))
149 {
150 rowSum += sliceGroupSizeAccu;
151 m_sliceMaxBlockRow[++sidx] = i;
152 }
153 }
154 m_sliceMaxBlockRow[0] = 0;
155 m_sliceMaxBlockRow[m_param->maxSlices] = maxBlockRows;
156
157 /* determine full motion search range */
158 int range = m_param->searchRange; /* fpel search */
159 range += !!(m_param->searchMethod < 2); /* diamond/hex range check lag */
160 range += NTAPS_LUMA / 2; /* subpel filter half-length */
161 range += 2 + (MotionEstimate::hpelIterationCount(m_param->subpelRefine) + 1) / 2; /* subpel refine steps */
162 m_refLagRows = /*(m_param->maxSlices > 1 ? 1 : 0) +*/ 1 + ((range + m_param->maxCUSize - 1) / m_param->maxCUSize);
163
164 // NOTE: 2 times of numRows because both Encoder and Filter in same queue
165 if (!WaveFront::init(m_numRows * 2))
166 {
167 x265_log(m_param, X265_LOG_ERROR, "unable to initialize wavefront queue\n");
168 m_pool = NULL;
169 }
170
171 m_frameFilter.init(top, this, numRows, numCols);
172
173 // initialize HRD parameters of SPS
174 if (m_param->bEmitHRDSEI || !!m_param->interlaceMode)
175 {
176 m_rce.picTimingSEI = new SEIPictureTiming;
177 m_rce.hrdTiming = new HRDTiming;
178
179 ok &= m_rce.picTimingSEI && m_rce.hrdTiming;
180 }
181
182 if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
183 m_nr = X265_MALLOC(NoiseReduction, 1);
184 if (m_nr)
185 memset(m_nr, 0, sizeof(NoiseReduction));
186 else
187 m_param->noiseReductionIntra = m_param->noiseReductionInter = 0;
188
189 // 7.4.7.1 - Ceil( Log2( PicSizeInCtbsY ) ) bits
190 {
191 unsigned long tmp;
192 CLZ(tmp, (numRows * numCols - 1));
193 m_sliceAddrBits = (uint16_t)(tmp + 1);
194 }
195
196 return ok;
197 }
198
199 /* Generate a complete list of unique geom sets for the current picture dimensions */
initializeGeoms()200 bool FrameEncoder::initializeGeoms()
201 {
202 /* Geoms only vary between CTUs in the presence of picture edges */
203 int maxCUSize = m_param->maxCUSize;
204 int minCUSize = m_param->minCUSize;
205 int heightRem = m_param->sourceHeight & (maxCUSize - 1);
206 int widthRem = m_param->sourceWidth & (maxCUSize - 1);
207 int allocGeoms = 1; // body
208 if (heightRem && widthRem)
209 allocGeoms = 4; // body, right, bottom, corner
210 else if (heightRem || widthRem)
211 allocGeoms = 2; // body, right or bottom
212
213 m_ctuGeomMap = X265_MALLOC(uint32_t, m_numRows * m_numCols);
214 m_cuGeoms = X265_MALLOC(CUGeom, allocGeoms * CUGeom::MAX_GEOMS);
215 if (!m_cuGeoms || !m_ctuGeomMap)
216 return false;
217
218 // body
219 CUData::calcCTUGeoms(maxCUSize, maxCUSize, maxCUSize, minCUSize, m_cuGeoms);
220 memset(m_ctuGeomMap, 0, sizeof(uint32_t) * m_numRows * m_numCols);
221 if (allocGeoms == 1)
222 return true;
223
224 int countGeoms = 1;
225 if (widthRem)
226 {
227 // right
228 CUData::calcCTUGeoms(widthRem, maxCUSize, maxCUSize, minCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS);
229 for (uint32_t i = 0; i < m_numRows; i++)
230 {
231 uint32_t ctuAddr = m_numCols * (i + 1) - 1;
232 m_ctuGeomMap[ctuAddr] = countGeoms * CUGeom::MAX_GEOMS;
233 }
234 countGeoms++;
235 }
236 if (heightRem)
237 {
238 // bottom
239 CUData::calcCTUGeoms(maxCUSize, heightRem, maxCUSize, minCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS);
240 for (uint32_t i = 0; i < m_numCols; i++)
241 {
242 uint32_t ctuAddr = m_numCols * (m_numRows - 1) + i;
243 m_ctuGeomMap[ctuAddr] = countGeoms * CUGeom::MAX_GEOMS;
244 }
245 countGeoms++;
246
247 if (widthRem)
248 {
249 // corner
250 CUData::calcCTUGeoms(widthRem, heightRem, maxCUSize, minCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS);
251
252 uint32_t ctuAddr = m_numCols * m_numRows - 1;
253 m_ctuGeomMap[ctuAddr] = countGeoms * CUGeom::MAX_GEOMS;
254 countGeoms++;
255 }
256 X265_CHECK(countGeoms == allocGeoms, "geometry match check failure\n");
257 }
258
259 return true;
260 }
261
startCompressFrame(Frame * curFrame)262 bool FrameEncoder::startCompressFrame(Frame* curFrame)
263 {
264 m_slicetypeWaitTime = x265_mdate() - m_prevOutputTime;
265 m_frame = curFrame;
266 m_sliceType = curFrame->m_lowres.sliceType;
267 curFrame->m_encData->m_frameEncoderID = m_jpId;
268 curFrame->m_encData->m_jobProvider = this;
269 curFrame->m_encData->m_slice->m_mref = m_mref;
270
271 if (!m_cuGeoms)
272 {
273 if (!initializeGeoms())
274 return false;
275 }
276
277 m_enable.trigger();
278 return true;
279 }
280
threadMain()281 void FrameEncoder::threadMain()
282 {
283 THREAD_NAME("Frame", m_jpId);
284
285 if (m_pool)
286 {
287 m_pool->setCurrentThreadAffinity();
288
289 /* the first FE on each NUMA node is responsible for allocating thread
290 * local data for all worker threads in that pool. If WPP is disabled, then
291 * each FE also needs a TLD instance */
292 if (!m_jpId)
293 {
294 int numTLD = m_pool->m_numWorkers;
295 if (!m_param->bEnableWavefront)
296 numTLD += m_pool->m_numProviders;
297
298 m_tld = new ThreadLocalData[numTLD];
299 for (int i = 0; i < numTLD; i++)
300 {
301 m_tld[i].analysis.initSearch(*m_param, m_top->m_scalingList);
302 m_tld[i].analysis.create(m_tld);
303 }
304
305 for (int i = 0; i < m_pool->m_numProviders; i++)
306 {
307 if (m_pool->m_jpTable[i]->m_isFrameEncoder) /* ugh; over-allocation and other issues here */
308 {
309 FrameEncoder *peer = dynamic_cast<FrameEncoder*>(m_pool->m_jpTable[i]);
310 peer->m_tld = m_tld;
311 }
312 }
313 }
314
315 if (m_param->bEnableWavefront)
316 m_localTldIdx = -1; // cause exception if used
317 else
318 m_localTldIdx = m_pool->m_numWorkers + m_jpId;
319 }
320 else
321 {
322 m_tld = new ThreadLocalData;
323 m_tld->analysis.initSearch(*m_param, m_top->m_scalingList);
324 m_tld->analysis.create(NULL);
325 m_localTldIdx = 0;
326 }
327
328 m_done.trigger(); /* signal that thread is initialized */
329 m_enable.wait(); /* Encoder::encode() triggers this event */
330
331 while (m_threadActive)
332 {
333 if (m_param->bCTUInfo)
334 {
335 while (!m_frame->m_ctuInfo)
336 m_frame->m_copied.wait();
337 }
338 if ((m_param->bAnalysisType == AVC_INFO) && !m_param->analysisSave && !m_param->analysisLoad && !(IS_X265_TYPE_I(m_frame->m_lowres.sliceType)))
339 {
340 while (((m_frame->m_analysisData.interData == NULL && m_frame->m_analysisData.intraData == NULL) || (uint32_t)m_frame->m_poc != m_frame->m_analysisData.poc))
341 m_frame->m_copyMVType.wait();
342 }
343 compressFrame();
344 m_done.trigger(); /* FrameEncoder::getEncodedPicture() blocks for this event */
345 m_enable.wait();
346 }
347 }
348
processTasks(int)349 void FrameEncoder::WeightAnalysis::processTasks(int /* workerThreadId */)
350 {
351 Frame* frame = master.m_frame;
352 weightAnalyse(*frame->m_encData->m_slice, *frame, *master.m_param);
353 }
354
355
getBsLength(int32_t code)356 uint32_t getBsLength( int32_t code )
357 {
358 uint32_t ucode = (code <= 0) ? -code << 1 : (code << 1) - 1;
359
360 ++ucode;
361 unsigned long idx;
362 CLZ( idx, ucode );
363 uint32_t length = (uint32_t)idx * 2 + 1;
364
365 return length;
366 }
367
writeToneMapInfo(x265_sei_payload * payload)368 bool FrameEncoder::writeToneMapInfo(x265_sei_payload *payload)
369 {
370 bool payloadChange = false;
371 if (m_top->m_prevTonemapPayload.payload != NULL && payload->payloadSize == m_top->m_prevTonemapPayload.payloadSize)
372 {
373 if (memcmp(m_top->m_prevTonemapPayload.payload, payload->payload, payload->payloadSize) != 0)
374 payloadChange = true;
375 }
376 else
377 {
378 payloadChange = true;
379 if (m_top->m_prevTonemapPayload.payload != NULL)
380 x265_free(m_top->m_prevTonemapPayload.payload);
381 m_top->m_prevTonemapPayload.payload = (uint8_t*)x265_malloc(sizeof(uint8_t)* payload->payloadSize);
382 }
383
384 if (payloadChange)
385 {
386 m_top->m_prevTonemapPayload.payloadType = payload->payloadType;
387 m_top->m_prevTonemapPayload.payloadSize = payload->payloadSize;
388 memcpy(m_top->m_prevTonemapPayload.payload, payload->payload, payload->payloadSize);
389 }
390
391 bool isIDR = m_frame->m_lowres.sliceType == X265_TYPE_IDR;
392 return (payloadChange || isIDR);
393 }
394
writeTrailingSEIMessages()395 void FrameEncoder::writeTrailingSEIMessages()
396 {
397 Slice* slice = m_frame->m_encData->m_slice;
398 int planes = (m_param->internalCsp != X265_CSP_I400) ? 3 : 1;
399 int32_t payloadSize = 0;
400
401 if (m_param->decodedPictureHashSEI == 1)
402 {
403 m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::MD5;
404 for (int i = 0; i < planes; i++)
405 MD5Final(&m_seiReconPictureDigest.m_state[i], m_seiReconPictureDigest.m_digest[i]);
406 payloadSize = 1 + 16 * planes;
407 }
408 else if (m_param->decodedPictureHashSEI == 2)
409 {
410 m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::CRC;
411 for (int i = 0; i < planes; i++)
412 crcFinish(m_seiReconPictureDigest.m_crc[i], m_seiReconPictureDigest.m_digest[i]);
413 payloadSize = 1 + 2 * planes;
414 }
415 else if (m_param->decodedPictureHashSEI == 3)
416 {
417 m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::CHECKSUM;
418 for (int i = 0; i < planes; i++)
419 checksumFinish(m_seiReconPictureDigest.m_checksum[i], m_seiReconPictureDigest.m_digest[i]);
420 payloadSize = 1 + 4 * planes;
421 }
422
423 m_seiReconPictureDigest.setSize(payloadSize);
424 m_seiReconPictureDigest.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_SUFFIX_SEI, m_nalList, false);
425 }
426
compressFrame()427 void FrameEncoder::compressFrame()
428 {
429 ProfileScopeEvent(frameThread);
430
431 m_startCompressTime = x265_mdate();
432 m_totalActiveWorkerCount = 0;
433 m_activeWorkerCountSamples = 0;
434 m_totalWorkerElapsedTime = 0;
435 m_totalNoWorkerTime = 0;
436 m_countRowBlocks = 0;
437 m_allRowsAvailableTime = 0;
438 m_stallStartTime = 0;
439
440 m_completionCount = 0;
441 m_bAllRowsStop = false;
442 m_vbvResetTriggerRow = -1;
443 m_rowSliceTotalBits[0] = 0;
444 m_rowSliceTotalBits[1] = 0;
445
446 m_SSDY = m_SSDU = m_SSDV = 0;
447 m_ssim = 0;
448 m_ssimCnt = 0;
449 memset(&(m_frame->m_encData->m_frameStats), 0, sizeof(m_frame->m_encData->m_frameStats));
450
451 if (!m_param->bHistBasedSceneCut && m_param->rc.aqMode != X265_AQ_EDGE && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
452 {
453 int height = m_frame->m_fencPic->m_picHeight;
454 int width = m_frame->m_fencPic->m_picWidth;
455 intptr_t stride = m_frame->m_fencPic->m_stride;
456
457 if (!computeEdge(m_frame->m_edgeBitPic, m_frame->m_fencPic->m_picOrg[0], NULL, stride, height, width, false, 1))
458 {
459 x265_log(m_param, X265_LOG_ERROR, " Failed to compute edge !");
460 }
461 }
462
463 /* Emit access unit delimiter unless this is the first frame and the user is
464 * not repeating headers (since AUD is supposed to be the first NAL in the access
465 * unit) */
466 Slice* slice = m_frame->m_encData->m_slice;
467
468 if (m_param->bEnableAccessUnitDelimiters && (m_frame->m_poc || m_param->bRepeatHeaders))
469 {
470 m_bs.resetBits();
471 m_entropyCoder.setBitstream(&m_bs);
472 m_entropyCoder.codeAUD(*slice);
473 m_bs.writeByteAlignment();
474 m_nalList.serialize(NAL_UNIT_ACCESS_UNIT_DELIMITER, m_bs);
475 if (m_param->bSingleSeiNal)
476 m_bs.resetBits();
477 }
478 if (m_frame->m_lowres.bKeyframe && m_param->bRepeatHeaders)
479 {
480 if (m_param->bOptRefListLengthPPS)
481 {
482 ScopedLock refIdxLock(m_top->m_sliceRefIdxLock);
483 m_top->updateRefIdx();
484 }
485 if (m_top->m_param->rc.bStatRead && m_top->m_param->bMultiPassOptRPS)
486 {
487 ScopedLock refIdxLock(m_top->m_rpsInSpsLock);
488 if (!m_top->computeSPSRPSIndex())
489 {
490 x265_log(m_param, X265_LOG_ERROR, "compute commonly RPS failed!\n");
491 m_top->m_aborted = true;
492 }
493 m_top->getStreamHeaders(m_nalList, m_entropyCoder, m_bs);
494 }
495 else
496 m_top->getStreamHeaders(m_nalList, m_entropyCoder, m_bs);
497 }
498
499 if (m_top->m_param->rc.bStatRead && m_top->m_param->bMultiPassOptRPS)
500 m_frame->m_encData->m_slice->m_rpsIdx = (m_top->m_rateControl->m_rce2Pass + m_frame->m_encodeOrder)->rpsIdx;
501
502 // Weighted Prediction parameters estimation.
503 bool bUseWeightP = slice->m_sliceType == P_SLICE && slice->m_pps->bUseWeightPred;
504 bool bUseWeightB = slice->m_sliceType == B_SLICE && slice->m_pps->bUseWeightedBiPred;
505
506 WeightParam* reuseWP = NULL;
507 if (m_param->analysisLoad && (bUseWeightP || bUseWeightB))
508 reuseWP = (WeightParam*)m_frame->m_analysisData.wt;
509
510 if (bUseWeightP || bUseWeightB)
511 {
512 #if DETAILED_CU_STATS
513 m_cuStats.countWeightAnalyze++;
514 ScopedElapsedTime time(m_cuStats.weightAnalyzeTime);
515 #endif
516 if (m_param->analysisLoad)
517 {
518 for (int list = 0; list < slice->isInterB() + 1; list++)
519 {
520 for (int plane = 0; plane < (m_param->internalCsp != X265_CSP_I400 ? 3 : 1); plane++)
521 {
522 for (int ref = 1; ref < slice->m_numRefIdx[list]; ref++)
523 SET_WEIGHT(slice->m_weightPredTable[list][ref][plane], false, 1 << reuseWP->log2WeightDenom, reuseWP->log2WeightDenom, 0);
524 slice->m_weightPredTable[list][0][plane] = *(reuseWP++);
525 }
526 }
527 }
528 else
529 {
530 WeightAnalysis wa(*this);
531 if (m_pool && wa.tryBondPeers(*this, 1))
532 /* use an idle worker for weight analysis */
533 wa.waitForExit();
534 else
535 weightAnalyse(*slice, *m_frame, *m_param);
536 }
537 }
538 else
539 slice->disableWeights();
540
541 if (m_param->analysisSave && (bUseWeightP || bUseWeightB))
542 reuseWP = (WeightParam*)m_frame->m_analysisData.wt;
543 // Generate motion references
544 int numPredDir = slice->isInterP() ? 1 : slice->isInterB() ? 2 : 0;
545 for (int l = 0; l < numPredDir; l++)
546 {
547 for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
548 {
549 WeightParam *w = NULL;
550 if ((bUseWeightP || bUseWeightB) && slice->m_weightPredTable[l][ref][0].wtPresent)
551 w = slice->m_weightPredTable[l][ref];
552 slice->m_refReconPicList[l][ref] = slice->m_refFrameList[l][ref]->m_reconPic;
553 m_mref[l][ref].init(slice->m_refReconPicList[l][ref], w, *m_param);
554 }
555 if (m_param->analysisSave && (bUseWeightP || bUseWeightB))
556 {
557 for (int i = 0; i < (m_param->internalCsp != X265_CSP_I400 ? 3 : 1); i++)
558 *(reuseWP++) = slice->m_weightPredTable[l][0][i];
559 }
560
561 }
562
563 int numTLD;
564 if (m_pool)
565 numTLD = m_param->bEnableWavefront ? m_pool->m_numWorkers : m_pool->m_numWorkers + m_pool->m_numProviders;
566 else
567 numTLD = 1;
568
569 /* Get the QP for this frame from rate control. This call may block until
570 * frames ahead of it in encode order have called rateControlEnd() */
571 int qp = m_top->m_rateControl->rateControlStart(m_frame, &m_rce, m_top);
572 m_rce.newQp = qp;
573
574 if (m_nr)
575 {
576 if (qp > QP_MAX_SPEC && m_frame->m_param->rc.vbvBufferSize)
577 {
578 for (int i = 0; i < numTLD; i++)
579 {
580 m_tld[i].analysis.m_quant.m_frameNr[m_jpId].offset = m_top->m_offsetEmergency[qp - QP_MAX_SPEC - 1];
581 m_tld[i].analysis.m_quant.m_frameNr[m_jpId].residualSum = m_top->m_residualSumEmergency;
582 m_tld[i].analysis.m_quant.m_frameNr[m_jpId].count = m_top->m_countEmergency;
583 }
584 }
585 else
586 {
587 if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
588 {
589 for (int i = 0; i < numTLD; i++)
590 {
591 m_tld[i].analysis.m_quant.m_frameNr[m_jpId].offset = m_tld[i].analysis.m_quant.m_frameNr[m_jpId].nrOffsetDenoise;
592 m_tld[i].analysis.m_quant.m_frameNr[m_jpId].residualSum = m_tld[i].analysis.m_quant.m_frameNr[m_jpId].nrResidualSum;
593 m_tld[i].analysis.m_quant.m_frameNr[m_jpId].count = m_tld[i].analysis.m_quant.m_frameNr[m_jpId].nrCount;
594 }
595 }
596 else
597 {
598 for (int i = 0; i < numTLD; i++)
599 m_tld[i].analysis.m_quant.m_frameNr[m_jpId].offset = NULL;
600 }
601 }
602 }
603
604 /* Clip slice QP to 0-51 spec range before encoding */
605 slice->m_sliceQp = x265_clip3(-QP_BD_OFFSET, QP_MAX_SPEC, qp);
606 if (m_param->bHDR10Opt)
607 {
608 int qpCb = x265_clip3(-12, 0, (int)floor((m_top->m_cB * ((-.46) * qp + 9.26)) + 0.5 ));
609 int qpCr = x265_clip3(-12, 0, (int)floor((m_top->m_cR * ((-.46) * qp + 9.26)) + 0.5 ));
610 slice->m_chromaQpOffset[0] = slice->m_pps->chromaQpOffset[0] + qpCb < -12 ? (qpCb + (-12 - (slice->m_pps->chromaQpOffset[0] + qpCb))) : qpCb;
611 slice->m_chromaQpOffset[1] = slice->m_pps->chromaQpOffset[1] + qpCr < -12 ? (qpCr + (-12 - (slice->m_pps->chromaQpOffset[1] + qpCr))) : qpCr;
612 }
613
614 if (m_param->bOptQpPPS && m_param->bRepeatHeaders)
615 {
616 ScopedLock qpLock(m_top->m_sliceQpLock);
617 for (int i = 0; i < (QP_MAX_MAX + 1); i++)
618 {
619 int delta = slice->m_sliceQp - (i + 1);
620 int codeLength = getBsLength( delta );
621 m_top->m_iBitsCostSum[i] += codeLength;
622 }
623 m_top->m_iFrameNum++;
624 }
625 m_initSliceContext.resetEntropy(*slice);
626
627 m_frameFilter.start(m_frame, m_initSliceContext);
628
629 /* ensure all rows are blocked prior to initializing row CTU counters */
630 WaveFront::clearEnabledRowMask();
631
632 /* reset entropy coders and compute slice id */
633 m_entropyCoder.load(m_initSliceContext);
634 for (uint32_t sliceId = 0; sliceId < m_param->maxSlices; sliceId++)
635 for (uint32_t row = m_sliceBaseRow[sliceId]; row < m_sliceBaseRow[sliceId + 1]; row++)
636 m_rows[row].init(m_initSliceContext, sliceId);
637
638 // reset slice counter for rate control update
639 m_sliceCnt = 0;
640
641 uint32_t numSubstreams = m_param->bEnableWavefront ? slice->m_sps->numCuInHeight : m_param->maxSlices;
642 X265_CHECK(m_param->bEnableWavefront || (m_param->maxSlices == 1), "Multiple slices without WPP unsupport now!");
643 if (!m_outStreams)
644 {
645 m_outStreams = new Bitstream[numSubstreams];
646 if (!m_param->bEnableWavefront)
647 m_backupStreams = new Bitstream[numSubstreams];
648 m_substreamSizes = X265_MALLOC(uint32_t, numSubstreams);
649 if (!slice->m_bUseSao)
650 {
651 for (uint32_t i = 0; i < numSubstreams; i++)
652 m_rows[i].rowGoOnCoder.setBitstream(&m_outStreams[i]);
653 }
654 }
655 else
656 {
657 for (uint32_t i = 0; i < numSubstreams; i++)
658 {
659 m_outStreams[i].resetBits();
660 if (!slice->m_bUseSao)
661 m_rows[i].rowGoOnCoder.setBitstream(&m_outStreams[i]);
662 else
663 m_rows[i].rowGoOnCoder.setBitstream(NULL);
664 }
665 }
666
667 m_rce.encodeOrder = m_frame->m_encodeOrder;
668 int prevBPSEI = m_rce.encodeOrder ? m_top->m_lastBPSEI : 0;
669
670 if (m_frame->m_lowres.bKeyframe)
671 {
672 if (m_param->bEmitHRDSEI)
673 {
674 SEIBufferingPeriod* bpSei = &m_top->m_rateControl->m_bufPeriodSEI;
675
676 // since the temporal layer HRD is not ready, we assumed it is fixed
677 bpSei->m_auCpbRemovalDelayDelta = 1;
678 bpSei->m_cpbDelayOffset = 0;
679 bpSei->m_dpbDelayOffset = 0;
680 bpSei->m_concatenationFlag = (m_param->bEnableHRDConcatFlag && !m_frame->m_poc) ? true : false;
681
682 // hrdFullness() calculates the initial CPB removal delay and offset
683 m_top->m_rateControl->hrdFullness(bpSei);
684 bpSei->writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
685
686 m_top->m_lastBPSEI = m_rce.encodeOrder;
687 }
688
689 if (m_frame->m_lowres.sliceType == X265_TYPE_IDR && m_param->bEmitIDRRecoverySEI)
690 {
691 /* Recovery Point SEI require the SPS to be "activated" */
692 SEIRecoveryPoint sei;
693 sei.m_recoveryPocCnt = 0;
694 sei.m_exactMatchingFlag = true;
695 sei.m_brokenLinkFlag = false;
696 sei.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
697 }
698 }
699
700 if ((m_param->bEmitHRDSEI || !!m_param->interlaceMode))
701 {
702 SEIPictureTiming *sei = m_rce.picTimingSEI;
703 const VUI *vui = &slice->m_sps->vuiParameters;
704 const HRDInfo *hrd = &vui->hrdParameters;
705 int poc = slice->m_poc;
706
707 if (vui->frameFieldInfoPresentFlag)
708 {
709 if (m_param->interlaceMode > 0)
710 {
711 if( m_param->interlaceMode == 2 )
712 {
713 // m_picStruct should be set to 3 or 4 when field feature is enabled
714 if (m_param->bField)
715 // 3: Top field, bottom field, in that order; 4: Bottom field, top field, in that order
716 sei->m_picStruct = (slice->m_fieldNum == 1) ? 4 : 3;
717 else
718 sei->m_picStruct = (poc & 1) ? 1 /* top */ : 2 /* bottom */;
719 }
720 else if (m_param->interlaceMode == 1)
721 {
722 if (m_param->bField)
723 sei->m_picStruct = (slice->m_fieldNum == 1) ? 3: 4;
724 else
725 sei->m_picStruct = (poc & 1) ? 2 /* bottom */ : 1 /* top */;
726 }
727 }
728 else if (m_param->bEnableFrameDuplication)
729 sei->m_picStruct = m_frame->m_picStruct;
730 else
731 sei->m_picStruct = m_param->pictureStructure;
732
733 sei->m_sourceScanType = m_param->interlaceMode ? 0 : 1;
734
735 sei->m_duplicateFlag = false;
736 }
737
738 if (vui->hrdParametersPresentFlag)
739 {
740 // The m_aucpbremoval delay specifies how many clock ticks the
741 // access unit associated with the picture timing SEI message has to
742 // wait after removal of the access unit with the most recent
743 // buffering period SEI message
744 sei->m_auCpbRemovalDelay = X265_MIN(X265_MAX(1, m_rce.encodeOrder - prevBPSEI), (1 << hrd->cpbRemovalDelayLength));
745 sei->m_picDpbOutputDelay = slice->m_sps->numReorderPics + poc - m_rce.encodeOrder;
746 }
747
748 sei->writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
749 }
750
751 if (m_param->preferredTransferCharacteristics > -1 && slice->isIRAP())
752 {
753 SEIAlternativeTC m_seiAlternativeTC;
754 m_seiAlternativeTC.m_preferredTransferCharacteristics = m_param->preferredTransferCharacteristics;
755 m_seiAlternativeTC.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
756 }
757
758 /* Write user SEI */
759 for (int i = 0; i < m_frame->m_userSEI.numPayloads; i++)
760 {
761 x265_sei_payload *payload = &m_frame->m_userSEI.payloads[i];
762 if (payload->payloadType == USER_DATA_UNREGISTERED)
763 {
764 SEIuserDataUnregistered sei;
765 sei.m_userData = payload->payload;
766 sei.setSize(payload->payloadSize);
767 sei.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
768 }
769 else if (payload->payloadType == USER_DATA_REGISTERED_ITU_T_T35)
770 {
771 bool writeSei = m_param->bDhdr10opt ? writeToneMapInfo(payload) : true;
772 if (writeSei)
773 {
774 SEIuserDataRegistered sei;
775 sei.m_userData = payload->payload;
776 sei.setSize(payload->payloadSize);
777 sei.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
778 }
779 }
780 else
781 x265_log(m_param, X265_LOG_ERROR, "Unrecognized SEI type\n");
782 }
783
784 bool isSei = ((m_frame->m_lowres.bKeyframe && m_param->bRepeatHeaders) || m_param->bEmitHRDSEI ||
785 !!m_param->interlaceMode || (m_frame->m_lowres.sliceType == X265_TYPE_IDR && m_param->bEmitIDRRecoverySEI) ||
786 m_frame->m_userSEI.numPayloads);
787
788 if (isSei && m_param->bSingleSeiNal)
789 {
790 m_bs.writeByteAlignment();
791 m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs);
792 }
793 /* CQP and CRF (without capped VBV) doesn't use mid-frame statistics to
794 * tune RateControl parameters for other frames.
795 * Hence, for these modes, update m_startEndOrder and unlock RC for previous threads waiting in
796 * RateControlEnd here, after the slice contexts are initialized. For the rest - ABR
797 * and VBV, unlock only after rateControlUpdateStats of this frame is called */
798 if (m_param->rc.rateControlMode != X265_RC_ABR && !m_top->m_rateControl->m_isVbv)
799 {
800 m_top->m_rateControl->m_startEndOrder.incr();
801
802 if (m_rce.encodeOrder < m_param->frameNumThreads - 1)
803 m_top->m_rateControl->m_startEndOrder.incr(); // faked rateControlEnd calls for negative frames
804 }
805
806 if (m_param->bDynamicRefine)
807 computeAvgTrainingData();
808
809 /* Analyze CTU rows, most of the hard work is done here. Frame is
810 * compressed in a wave-front pattern if WPP is enabled. Row based loop
811 * filters runs behind the CTU compression and reconstruction */
812
813 for (uint32_t sliceId = 0; sliceId < m_param->maxSlices; sliceId++)
814 m_rows[m_sliceBaseRow[sliceId]].active = true;
815
816 if (m_param->bEnableWavefront)
817 {
818 int i = 0;
819 for (uint32_t rowInSlice = 0; rowInSlice < m_sliceGroupSize; rowInSlice++)
820 {
821 for (uint32_t sliceId = 0; sliceId < m_param->maxSlices; sliceId++)
822 {
823 const uint32_t sliceStartRow = m_sliceBaseRow[sliceId];
824 const uint32_t sliceEndRow = m_sliceBaseRow[sliceId + 1] - 1;
825 const uint32_t row = sliceStartRow + rowInSlice;
826 if (row > sliceEndRow)
827 continue;
828 m_row_to_idx[row] = i;
829 m_idx_to_row[i] = row;
830 i += 1;
831 }
832 }
833 }
834
835 if (m_param->bEnableWavefront)
836 {
837 for (uint32_t rowInSlice = 0; rowInSlice < m_sliceGroupSize; rowInSlice++)
838 {
839 for (uint32_t sliceId = 0; sliceId < m_param->maxSlices; sliceId++)
840 {
841 const uint32_t sliceStartRow = m_sliceBaseRow[sliceId];
842 const uint32_t sliceEndRow = m_sliceBaseRow[sliceId + 1] - 1;
843 const uint32_t row = sliceStartRow + rowInSlice;
844
845 X265_CHECK(row < m_numRows, "slices row fault was detected");
846
847 if (row > sliceEndRow)
848 continue;
849
850 // block until all reference frames have reconstructed the rows we need
851 for (int l = 0; l < numPredDir; l++)
852 {
853 for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
854 {
855 Frame *refpic = slice->m_refFrameList[l][ref];
856
857 // NOTE: we unnecessary wait row that beyond current slice boundary
858 const int rowIdx = X265_MIN(sliceEndRow, (row + m_refLagRows));
859
860 while (refpic->m_reconRowFlag[rowIdx].get() == 0)
861 refpic->m_reconRowFlag[rowIdx].waitForChange(0);
862
863 if ((bUseWeightP || bUseWeightB) && m_mref[l][ref].isWeighted)
864 m_mref[l][ref].applyWeight(rowIdx, m_numRows, sliceEndRow, sliceId);
865 }
866 }
867
868 enableRowEncoder(m_row_to_idx[row]); /* clear external dependency for this row */
869 if (!rowInSlice)
870 {
871 m_row0WaitTime = x265_mdate();
872 enqueueRowEncoder(m_row_to_idx[row]); /* clear internal dependency, start wavefront */
873 }
874 tryWakeOne();
875 } // end of loop rowInSlice
876 } // end of loop sliceId
877
878 m_allRowsAvailableTime = x265_mdate();
879 tryWakeOne(); /* ensure one thread is active or help-wanted flag is set prior to blocking */
880 static const int block_ms = 250;
881 while (m_completionEvent.timedWait(block_ms))
882 tryWakeOne();
883 }
884 else
885 {
886 for (uint32_t i = 0; i < m_numRows + m_filterRowDelay; i++)
887 {
888 // compress
889 if (i < m_numRows)
890 {
891 // block until all reference frames have reconstructed the rows we need
892 for (int l = 0; l < numPredDir; l++)
893 {
894 int list = l;
895 for (int ref = 0; ref < slice->m_numRefIdx[list]; ref++)
896 {
897 Frame *refpic = slice->m_refFrameList[list][ref];
898
899 const int rowIdx = X265_MIN(m_numRows - 1, (i + m_refLagRows));
900 while (refpic->m_reconRowFlag[rowIdx].get() == 0)
901 refpic->m_reconRowFlag[rowIdx].waitForChange(0);
902
903 if ((bUseWeightP || bUseWeightB) && m_mref[l][ref].isWeighted)
904 m_mref[list][ref].applyWeight(rowIdx, m_numRows, m_numRows, 0);
905 }
906 }
907
908 if (!i)
909 m_row0WaitTime = x265_mdate();
910 else if (i == m_numRows - 1)
911 m_allRowsAvailableTime = x265_mdate();
912 processRowEncoder(i, m_tld[m_localTldIdx]);
913 }
914
915 // filter
916 if (i >= m_filterRowDelay)
917 m_frameFilter.processRow(i - m_filterRowDelay);
918 }
919 }
920 #if ENABLE_LIBVMAF
921 vmafFrameLevelScore();
922 #endif
923
924 if (m_param->maxSlices > 1)
925 {
926 PicYuv *reconPic = m_frame->m_reconPic;
927 uint32_t height = reconPic->m_picHeight;
928 initDecodedPictureHashSEI(0, 0, height);
929 }
930
931 if (m_param->bDynamicRefine && m_top->m_startPoint <= m_frame->m_encodeOrder) //Avoid collecting data that will not be used by future frames.
932 collectDynDataFrame();
933
934 if (m_param->rc.bStatWrite)
935 {
936 int totalI = 0, totalP = 0, totalSkip = 0;
937
938 // accumulate intra,inter,skip cu count per frame for 2 pass
939 for (uint32_t i = 0; i < m_numRows; i++)
940 {
941 m_frame->m_encData->m_frameStats.mvBits += m_rows[i].rowStats.mvBits;
942 m_frame->m_encData->m_frameStats.coeffBits += m_rows[i].rowStats.coeffBits;
943 m_frame->m_encData->m_frameStats.miscBits += m_rows[i].rowStats.miscBits;
944 totalI += m_rows[i].rowStats.intra8x8Cnt;
945 totalP += m_rows[i].rowStats.inter8x8Cnt;
946 totalSkip += m_rows[i].rowStats.skip8x8Cnt;
947 }
948 int totalCuCount = totalI + totalP + totalSkip;
949 m_frame->m_encData->m_frameStats.percent8x8Intra = (double)totalI / totalCuCount;
950 m_frame->m_encData->m_frameStats.percent8x8Inter = (double)totalP / totalCuCount;
951 m_frame->m_encData->m_frameStats.percent8x8Skip = (double)totalSkip / totalCuCount;
952 }
953
954 if (m_param->csvLogLevel >= 1)
955 {
956 for (uint32_t i = 0; i < m_numRows; i++)
957 {
958 m_frame->m_encData->m_frameStats.cntIntraNxN += m_rows[i].rowStats.cntIntraNxN;
959 m_frame->m_encData->m_frameStats.totalCu += m_rows[i].rowStats.totalCu;
960 m_frame->m_encData->m_frameStats.totalCtu += m_rows[i].rowStats.totalCtu;
961 m_frame->m_encData->m_frameStats.lumaDistortion += m_rows[i].rowStats.lumaDistortion;
962 m_frame->m_encData->m_frameStats.chromaDistortion += m_rows[i].rowStats.chromaDistortion;
963 m_frame->m_encData->m_frameStats.psyEnergy += m_rows[i].rowStats.psyEnergy;
964 m_frame->m_encData->m_frameStats.ssimEnergy += m_rows[i].rowStats.ssimEnergy;
965 m_frame->m_encData->m_frameStats.resEnergy += m_rows[i].rowStats.resEnergy;
966 for (uint32_t depth = 0; depth <= m_param->maxCUDepth; depth++)
967 {
968 m_frame->m_encData->m_frameStats.cntSkipCu[depth] += m_rows[i].rowStats.cntSkipCu[depth];
969 m_frame->m_encData->m_frameStats.cntMergeCu[depth] += m_rows[i].rowStats.cntMergeCu[depth];
970 for (int m = 0; m < INTER_MODES; m++)
971 m_frame->m_encData->m_frameStats.cuInterDistribution[depth][m] += m_rows[i].rowStats.cuInterDistribution[depth][m];
972 for (int n = 0; n < INTRA_MODES; n++)
973 m_frame->m_encData->m_frameStats.cuIntraDistribution[depth][n] += m_rows[i].rowStats.cuIntraDistribution[depth][n];
974 }
975 }
976 m_frame->m_encData->m_frameStats.percentIntraNxN = (double)(m_frame->m_encData->m_frameStats.cntIntraNxN * 100) / m_frame->m_encData->m_frameStats.totalCu;
977
978 for (uint32_t depth = 0; depth <= m_param->maxCUDepth; depth++)
979 {
980 m_frame->m_encData->m_frameStats.percentSkipCu[depth] = (double)(m_frame->m_encData->m_frameStats.cntSkipCu[depth] * 100) / m_frame->m_encData->m_frameStats.totalCu;
981 m_frame->m_encData->m_frameStats.percentMergeCu[depth] = (double)(m_frame->m_encData->m_frameStats.cntMergeCu[depth] * 100) / m_frame->m_encData->m_frameStats.totalCu;
982 for (int n = 0; n < INTRA_MODES; n++)
983 m_frame->m_encData->m_frameStats.percentIntraDistribution[depth][n] = (double)(m_frame->m_encData->m_frameStats.cuIntraDistribution[depth][n] * 100) / m_frame->m_encData->m_frameStats.totalCu;
984 uint64_t cuInterRectCnt = 0; // sum of Nx2N, 2NxN counts
985 cuInterRectCnt += m_frame->m_encData->m_frameStats.cuInterDistribution[depth][1] + m_frame->m_encData->m_frameStats.cuInterDistribution[depth][2];
986 m_frame->m_encData->m_frameStats.percentInterDistribution[depth][0] = (double)(m_frame->m_encData->m_frameStats.cuInterDistribution[depth][0] * 100) / m_frame->m_encData->m_frameStats.totalCu;
987 m_frame->m_encData->m_frameStats.percentInterDistribution[depth][1] = (double)(cuInterRectCnt * 100) / m_frame->m_encData->m_frameStats.totalCu;
988 m_frame->m_encData->m_frameStats.percentInterDistribution[depth][2] = (double)(m_frame->m_encData->m_frameStats.cuInterDistribution[depth][3] * 100) / m_frame->m_encData->m_frameStats.totalCu;
989 }
990 }
991
992 if (m_param->csvLogLevel >= 2)
993 {
994 m_frame->m_encData->m_frameStats.avgLumaDistortion = (double)(m_frame->m_encData->m_frameStats.lumaDistortion) / m_frame->m_encData->m_frameStats.totalCtu;
995 m_frame->m_encData->m_frameStats.avgChromaDistortion = (double)(m_frame->m_encData->m_frameStats.chromaDistortion) / m_frame->m_encData->m_frameStats.totalCtu;
996 m_frame->m_encData->m_frameStats.avgPsyEnergy = (double)(m_frame->m_encData->m_frameStats.psyEnergy) / m_frame->m_encData->m_frameStats.totalCtu;
997 m_frame->m_encData->m_frameStats.avgSsimEnergy = (double)(m_frame->m_encData->m_frameStats.ssimEnergy) / m_frame->m_encData->m_frameStats.totalCtu;
998 m_frame->m_encData->m_frameStats.avgResEnergy = (double)(m_frame->m_encData->m_frameStats.resEnergy) / m_frame->m_encData->m_frameStats.totalCtu;
999 }
1000
1001 m_bs.resetBits();
1002 m_entropyCoder.load(m_initSliceContext);
1003 m_entropyCoder.setBitstream(&m_bs);
1004
1005 // finish encode of each CTU row, only required when SAO is enabled
1006 if (slice->m_bUseSao)
1007 encodeSlice(0);
1008
1009 m_entropyCoder.setBitstream(&m_bs);
1010
1011 if (m_param->maxSlices > 1)
1012 {
1013 uint32_t nextSliceRow = 0;
1014
1015 for(uint32_t sliceId = 0; sliceId < m_param->maxSlices; sliceId++)
1016 {
1017 m_bs.resetBits();
1018
1019 const uint32_t sliceAddr = nextSliceRow * m_numCols;
1020 if (m_param->bOptRefListLengthPPS)
1021 {
1022 ScopedLock refIdxLock(m_top->m_sliceRefIdxLock);
1023 m_top->analyseRefIdx(slice->m_numRefIdx);
1024 }
1025 m_entropyCoder.codeSliceHeader(*slice, *m_frame->m_encData, sliceAddr, m_sliceAddrBits, slice->m_sliceQp);
1026
1027 // Find rows of current slice
1028 const uint32_t prevSliceRow = nextSliceRow;
1029 while(nextSliceRow < m_numRows && m_rows[nextSliceRow].sliceId == sliceId)
1030 nextSliceRow++;
1031
1032 // serialize each row, record final lengths in slice header
1033 uint32_t maxStreamSize = m_nalList.serializeSubstreams(&m_substreamSizes[prevSliceRow], (nextSliceRow - prevSliceRow), &m_outStreams[prevSliceRow]);
1034
1035 // complete the slice header by writing WPP row-starts
1036 m_entropyCoder.setBitstream(&m_bs);
1037 if (slice->m_pps->bEntropyCodingSyncEnabled)
1038 m_entropyCoder.codeSliceHeaderWPPEntryPoints(&m_substreamSizes[prevSliceRow], (nextSliceRow - prevSliceRow - 1), maxStreamSize);
1039
1040 m_bs.writeByteAlignment();
1041
1042 m_nalList.serialize(slice->m_nalUnitType, m_bs);
1043 }
1044 }
1045 else
1046 {
1047 if (m_param->bOptRefListLengthPPS)
1048 {
1049 ScopedLock refIdxLock(m_top->m_sliceRefIdxLock);
1050 m_top->analyseRefIdx(slice->m_numRefIdx);
1051 }
1052 m_entropyCoder.codeSliceHeader(*slice, *m_frame->m_encData, 0, 0, slice->m_sliceQp);
1053
1054 // serialize each row, record final lengths in slice header
1055 uint32_t maxStreamSize = m_nalList.serializeSubstreams(m_substreamSizes, numSubstreams, m_outStreams);
1056
1057 // complete the slice header by writing WPP row-starts
1058 m_entropyCoder.setBitstream(&m_bs);
1059 if (slice->m_pps->bEntropyCodingSyncEnabled)
1060 m_entropyCoder.codeSliceHeaderWPPEntryPoints(m_substreamSizes, (slice->m_sps->numCuInHeight - 1), maxStreamSize);
1061 m_bs.writeByteAlignment();
1062
1063 m_nalList.serialize(slice->m_nalUnitType, m_bs);
1064 }
1065
1066 if (m_param->decodedPictureHashSEI)
1067 writeTrailingSEIMessages();
1068
1069 uint64_t bytes = 0;
1070 for (uint32_t i = 0; i < m_nalList.m_numNal; i++)
1071 {
1072 int type = m_nalList.m_nal[i].type;
1073
1074 // exclude SEI
1075 if (type != NAL_UNIT_PREFIX_SEI && type != NAL_UNIT_SUFFIX_SEI)
1076 {
1077 bytes += m_nalList.m_nal[i].sizeBytes;
1078 // and exclude start code prefix
1079 bytes -= (!i || type == NAL_UNIT_SPS || type == NAL_UNIT_PPS) ? 4 : 3;
1080 }
1081 }
1082 m_accessUnitBits = bytes << 3;
1083
1084 int filler = 0;
1085 /* rateControlEnd may also block for earlier frames to call rateControlUpdateStats */
1086 if (m_top->m_rateControl->rateControlEnd(m_frame, m_accessUnitBits, &m_rce, &filler) < 0)
1087 m_top->m_aborted = true;
1088
1089 if (filler > 0)
1090 {
1091 filler = (filler - FILLER_OVERHEAD * 8) >> 3;
1092 m_bs.resetBits();
1093 while (filler > 0)
1094 {
1095 m_bs.write(0xff, 8);
1096 filler--;
1097 }
1098 m_bs.writeByteAlignment();
1099 m_nalList.serialize(NAL_UNIT_FILLER_DATA, m_bs);
1100 bytes += m_nalList.m_nal[m_nalList.m_numNal - 1].sizeBytes;
1101 bytes -= 3; //exclude start code prefix
1102 m_accessUnitBits = bytes << 3;
1103 }
1104
1105 if (m_frame->m_rpu.payloadSize)
1106 {
1107 m_bs.resetBits();
1108 for (int i = 0; i < m_frame->m_rpu.payloadSize; i++)
1109 m_bs.write(m_frame->m_rpu.payload[i], 8);
1110 m_nalList.serialize(NAL_UNIT_UNSPECIFIED, m_bs);
1111 }
1112
1113 m_endCompressTime = x265_mdate();
1114
1115 /* Decrement referenced frame reference counts, allow them to be recycled */
1116 for (int l = 0; l < numPredDir; l++)
1117 {
1118 for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
1119 {
1120 Frame *refpic = slice->m_refFrameList[l][ref];
1121 ATOMIC_DEC(&refpic->m_countRefEncoders);
1122 }
1123 }
1124
1125 if (m_nr)
1126 {
1127 bool nrEnabled = (m_rce.newQp < QP_MAX_SPEC || !m_param->rc.vbvBufferSize) && (m_param->noiseReductionIntra || m_param->noiseReductionInter);
1128
1129 if (nrEnabled)
1130 {
1131 /* Accumulate NR statistics from all worker threads */
1132 for (int i = 0; i < numTLD; i++)
1133 {
1134 NoiseReduction* nr = &m_tld[i].analysis.m_quant.m_frameNr[m_jpId];
1135 for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++)
1136 {
1137 for (int coeff = 0; coeff < MAX_NUM_TR_COEFFS; coeff++)
1138 m_nr->nrResidualSum[cat][coeff] += nr->nrResidualSum[cat][coeff];
1139
1140 m_nr->nrCount[cat] += nr->nrCount[cat];
1141 }
1142 }
1143
1144 noiseReductionUpdate();
1145
1146 /* Copy updated NR coefficients back to all worker threads */
1147 for (int i = 0; i < numTLD; i++)
1148 {
1149 NoiseReduction* nr = &m_tld[i].analysis.m_quant.m_frameNr[m_jpId];
1150 memcpy(nr->nrOffsetDenoise, m_nr->nrOffsetDenoise, sizeof(uint16_t)* MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
1151 memset(nr->nrCount, 0, sizeof(uint32_t)* MAX_NUM_TR_CATEGORIES);
1152 memset(nr->nrResidualSum, 0, sizeof(uint32_t)* MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
1153 }
1154 }
1155 }
1156
1157 #if DETAILED_CU_STATS
1158 /* Accumulate CU statistics from each worker thread, we could report
1159 * per-frame stats here, but currently we do not. */
1160 for (int i = 0; i < numTLD; i++)
1161 m_cuStats.accumulate(m_tld[i].analysis.m_stats[m_jpId], *m_param);
1162 #endif
1163
1164 m_endFrameTime = x265_mdate();
1165 }
1166
initDecodedPictureHashSEI(int row,int cuAddr,int height)1167 void FrameEncoder::initDecodedPictureHashSEI(int row, int cuAddr, int height)
1168 {
1169 PicYuv *reconPic = m_frame->m_reconPic;
1170 uint32_t width = reconPic->m_picWidth;
1171 intptr_t stride = reconPic->m_stride;
1172 uint32_t maxCUHeight = m_param->maxCUSize;
1173
1174 const uint32_t hChromaShift = CHROMA_H_SHIFT(m_param->internalCsp);
1175 const uint32_t vChromaShift = CHROMA_V_SHIFT(m_param->internalCsp);
1176
1177 if (m_param->decodedPictureHashSEI == 1)
1178 {
1179 if (!row)
1180 MD5Init(&m_seiReconPictureDigest.m_state[0]);
1181
1182 updateMD5Plane(m_seiReconPictureDigest.m_state[0], reconPic->getLumaAddr(cuAddr), width, height, stride);
1183 if (m_param->internalCsp != X265_CSP_I400)
1184 {
1185 if (!row)
1186 {
1187 MD5Init(&m_seiReconPictureDigest.m_state[1]);
1188 MD5Init(&m_seiReconPictureDigest.m_state[2]);
1189 }
1190
1191 width >>= hChromaShift;
1192 height >>= vChromaShift;
1193 stride = reconPic->m_strideC;
1194
1195 updateMD5Plane(m_seiReconPictureDigest.m_state[1], reconPic->getCbAddr(cuAddr), width, height, stride);
1196 updateMD5Plane(m_seiReconPictureDigest.m_state[2], reconPic->getCrAddr(cuAddr), width, height, stride);
1197 }
1198 }
1199 else if (m_param->decodedPictureHashSEI == 2)
1200 {
1201
1202 if (!row)
1203 m_seiReconPictureDigest.m_crc[0] = 0xffff;
1204
1205 updateCRC(reconPic->getLumaAddr(cuAddr), m_seiReconPictureDigest.m_crc[0], height, width, stride);
1206 if (m_param->internalCsp != X265_CSP_I400)
1207 {
1208 width >>= hChromaShift;
1209 height >>= vChromaShift;
1210 stride = reconPic->m_strideC;
1211 m_seiReconPictureDigest.m_crc[1] = m_seiReconPictureDigest.m_crc[2] = 0xffff;
1212
1213 updateCRC(reconPic->getCbAddr(cuAddr), m_seiReconPictureDigest.m_crc[1], height, width, stride);
1214 updateCRC(reconPic->getCrAddr(cuAddr), m_seiReconPictureDigest.m_crc[2], height, width, stride);
1215 }
1216 }
1217 else if (m_param->decodedPictureHashSEI == 3)
1218 {
1219 if (!row)
1220 m_seiReconPictureDigest.m_checksum[0] = 0;
1221
1222 updateChecksum(reconPic->m_picOrg[0], m_seiReconPictureDigest.m_checksum[0], height, width, stride, row, maxCUHeight);
1223 if (m_param->internalCsp != X265_CSP_I400)
1224 {
1225 width >>= hChromaShift;
1226 height >>= vChromaShift;
1227 stride = reconPic->m_strideC;
1228 maxCUHeight >>= vChromaShift;
1229
1230 if (!row)
1231 m_seiReconPictureDigest.m_checksum[1] = m_seiReconPictureDigest.m_checksum[2] = 0;
1232
1233 updateChecksum(reconPic->m_picOrg[1], m_seiReconPictureDigest.m_checksum[1], height, width, stride, row, maxCUHeight);
1234 updateChecksum(reconPic->m_picOrg[2], m_seiReconPictureDigest.m_checksum[2], height, width, stride, row, maxCUHeight);
1235 }
1236 }
1237 }
1238
encodeSlice(uint32_t sliceAddr)1239 void FrameEncoder::encodeSlice(uint32_t sliceAddr)
1240 {
1241 Slice* slice = m_frame->m_encData->m_slice;
1242 const uint32_t widthInLCUs = slice->m_sps->numCuInWidth;
1243 const uint32_t lastCUAddr = (slice->m_endCUAddr + m_param->num4x4Partitions - 1) / m_param->num4x4Partitions;
1244 const uint32_t numSubstreams = m_param->bEnableWavefront ? slice->m_sps->numCuInHeight : 1;
1245
1246 SAOParam* saoParam = slice->m_sps->bUseSAO && slice->m_bUseSao ? m_frame->m_encData->m_saoParam : NULL;
1247 for (uint32_t cuAddr = sliceAddr; cuAddr < lastCUAddr; cuAddr++)
1248 {
1249 uint32_t col = cuAddr % widthInLCUs;
1250 uint32_t row = cuAddr / widthInLCUs;
1251 uint32_t subStrm = row % numSubstreams;
1252 CUData* ctu = m_frame->m_encData->getPicCTU(cuAddr);
1253
1254 m_entropyCoder.setBitstream(&m_outStreams[subStrm]);
1255
1256 // Synchronize cabac probabilities with upper-right CTU if it's available and we're at the start of a line.
1257 if (m_param->bEnableWavefront && !col && row)
1258 {
1259 m_entropyCoder.copyState(m_initSliceContext);
1260 m_entropyCoder.loadContexts(m_rows[row - 1].bufferedEntropy);
1261 }
1262
1263 // Initialize slice context
1264 if (ctu->m_bFirstRowInSlice && !col)
1265 m_entropyCoder.load(m_initSliceContext);
1266
1267 if (saoParam)
1268 {
1269 if (saoParam->bSaoFlag[0] || saoParam->bSaoFlag[1])
1270 {
1271 int mergeLeft = col && saoParam->ctuParam[0][cuAddr].mergeMode == SAO_MERGE_LEFT;
1272 int mergeUp = !ctu->m_bFirstRowInSlice && saoParam->ctuParam[0][cuAddr].mergeMode == SAO_MERGE_UP;
1273 if (col)
1274 m_entropyCoder.codeSaoMerge(mergeLeft);
1275 if (!ctu->m_bFirstRowInSlice && !mergeLeft)
1276 m_entropyCoder.codeSaoMerge(mergeUp);
1277 if (!mergeLeft && !mergeUp)
1278 {
1279 if (saoParam->bSaoFlag[0])
1280 m_entropyCoder.codeSaoOffset(saoParam->ctuParam[0][cuAddr], 0);
1281 if (saoParam->bSaoFlag[1])
1282 {
1283 m_entropyCoder.codeSaoOffset(saoParam->ctuParam[1][cuAddr], 1);
1284 m_entropyCoder.codeSaoOffset(saoParam->ctuParam[2][cuAddr], 2);
1285 }
1286 }
1287 }
1288 else
1289 {
1290 for (int i = 0; i < (m_param->internalCsp != X265_CSP_I400 ? 3 : 1); i++)
1291 saoParam->ctuParam[i][cuAddr].reset();
1292 }
1293 }
1294
1295 // final coding (bitstream generation) for this CU
1296 m_entropyCoder.encodeCTU(*ctu, m_cuGeoms[m_ctuGeomMap[cuAddr]]);
1297
1298 if (m_param->bEnableWavefront)
1299 {
1300 if (col == 1)
1301 // Store probabilities of second CTU in line into buffer
1302 m_rows[row].bufferedEntropy.loadContexts(m_entropyCoder);
1303
1304 if (col == widthInLCUs - 1)
1305 m_entropyCoder.finishSlice();
1306 }
1307 }
1308
1309 if (!m_param->bEnableWavefront)
1310 m_entropyCoder.finishSlice();
1311 }
1312
processRow(int row,int threadId)1313 void FrameEncoder::processRow(int row, int threadId)
1314 {
1315 int64_t startTime = x265_mdate();
1316 if (ATOMIC_INC(&m_activeWorkerCount) == 1 && m_stallStartTime)
1317 m_totalNoWorkerTime += x265_mdate() - m_stallStartTime;
1318
1319 const uint32_t realRow = m_idx_to_row[row >> 1];
1320 const uint32_t typeNum = m_idx_to_row[row & 1];
1321
1322 if (!typeNum)
1323 processRowEncoder(realRow, m_tld[threadId]);
1324 else
1325 {
1326 m_frameFilter.processRow(realRow);
1327
1328 // NOTE: Active next row
1329 if (realRow != m_sliceBaseRow[m_rows[realRow].sliceId + 1] - 1)
1330 enqueueRowFilter(m_row_to_idx[realRow + 1]);
1331 }
1332
1333 if (ATOMIC_DEC(&m_activeWorkerCount) == 0)
1334 m_stallStartTime = x265_mdate();
1335
1336 m_totalWorkerElapsedTime += x265_mdate() - startTime; // not thread safe, but good enough
1337 }
1338
1339 // Called by worker threads
processRowEncoder(int intRow,ThreadLocalData & tld)1340 void FrameEncoder::processRowEncoder(int intRow, ThreadLocalData& tld)
1341 {
1342 const uint32_t row = (uint32_t)intRow;
1343 CTURow& curRow = m_rows[row];
1344
1345 if (m_param->bEnableWavefront)
1346 {
1347 ScopedLock self(curRow.lock);
1348 if (!curRow.active)
1349 /* VBV restart is in progress, exit out */
1350 return;
1351 if (curRow.busy)
1352 {
1353 /* On multi-socket Windows servers, we have seen problems with
1354 * ATOMIC_CAS which resulted in multiple worker threads processing
1355 * the same CU row, which often resulted in bad pointer accesses. We
1356 * believe the problem is fixed, but are leaving this check in place
1357 * to prevent crashes in case it is not */
1358 x265_log(m_param, X265_LOG_WARNING,
1359 "internal error - simultaneous row access detected. Please report HW to x265-devel@videolan.org\n");
1360 return;
1361 }
1362 curRow.busy = true;
1363 }
1364
1365 /* When WPP is enabled, every row has its own row coder instance. Otherwise
1366 * they share row 0 */
1367 Entropy& rowCoder = m_param->bEnableWavefront ? curRow.rowGoOnCoder : m_rows[0].rowGoOnCoder;
1368 FrameData& curEncData = *m_frame->m_encData;
1369 Slice *slice = curEncData.m_slice;
1370
1371 const uint32_t numCols = m_numCols;
1372 const uint32_t lineStartCUAddr = row * numCols;
1373 bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0;
1374
1375 const uint32_t sliceId = curRow.sliceId;
1376 uint32_t maxBlockCols = (m_frame->m_fencPic->m_picWidth + (16 - 1)) / 16;
1377 uint32_t noOfBlocks = m_param->maxCUSize / 16;
1378 const uint32_t bFirstRowInSlice = ((row == 0) || (m_rows[row - 1].sliceId != curRow.sliceId)) ? 1 : 0;
1379 const uint32_t bLastRowInSlice = ((row == m_numRows - 1) || (m_rows[row + 1].sliceId != curRow.sliceId)) ? 1 : 0;
1380 const uint32_t endRowInSlicePlus1 = m_sliceBaseRow[sliceId + 1];
1381 const uint32_t rowInSlice = row - m_sliceBaseRow[sliceId];
1382
1383 // Load SBAC coder context from previous row and initialize row state.
1384 if (bFirstRowInSlice && !curRow.completed)
1385 rowCoder.load(m_initSliceContext);
1386
1387 // calculate mean QP for consistent deltaQP signalling calculation
1388 if (m_param->bOptCUDeltaQP)
1389 {
1390 ScopedLock self(curRow.lock);
1391 if (!curRow.avgQPComputed)
1392 {
1393 if (m_param->bEnableWavefront || !row)
1394 {
1395 double meanQPOff = 0;
1396 bool isReferenced = IS_REFERENCED(m_frame);
1397 double *qpoffs = (isReferenced && m_param->rc.cuTree) ? m_frame->m_lowres.qpCuTreeOffset : m_frame->m_lowres.qpAqOffset;
1398 if (qpoffs)
1399 {
1400 uint32_t loopIncr = (m_param->rc.qgSize == 8) ? 8 : 16;
1401
1402 uint32_t cuYStart = 0, height = m_frame->m_fencPic->m_picHeight;
1403 if (m_param->bEnableWavefront)
1404 {
1405 cuYStart = intRow * m_param->maxCUSize;
1406 height = cuYStart + m_param->maxCUSize;
1407 }
1408
1409 uint32_t qgSize = m_param->rc.qgSize, width = m_frame->m_fencPic->m_picWidth;
1410 uint32_t maxOffsetCols = (m_frame->m_fencPic->m_picWidth + (loopIncr - 1)) / loopIncr;
1411 uint32_t count = 0;
1412 for (uint32_t cuY = cuYStart; cuY < height && (cuY < m_frame->m_fencPic->m_picHeight); cuY += qgSize)
1413 {
1414 for (uint32_t cuX = 0; cuX < width; cuX += qgSize)
1415 {
1416 double qp_offset = 0;
1417 uint32_t cnt = 0;
1418
1419 for (uint32_t block_yy = cuY; block_yy < cuY + qgSize && block_yy < m_frame->m_fencPic->m_picHeight; block_yy += loopIncr)
1420 {
1421 for (uint32_t block_xx = cuX; block_xx < cuX + qgSize && block_xx < width; block_xx += loopIncr)
1422 {
1423 int idx = ((block_yy / loopIncr) * (maxOffsetCols)) + (block_xx / loopIncr);
1424 qp_offset += qpoffs[idx];
1425 cnt++;
1426 }
1427 }
1428 qp_offset /= cnt;
1429 meanQPOff += qp_offset;
1430 count++;
1431 }
1432 }
1433 meanQPOff /= count;
1434 }
1435 rowCoder.m_meanQP = slice->m_sliceQp + meanQPOff;
1436 }
1437 else
1438 {
1439 rowCoder.m_meanQP = m_rows[0].rowGoOnCoder.m_meanQP;
1440 }
1441 curRow.avgQPComputed = 1;
1442 }
1443 }
1444
1445 // Initialize restrict on MV range in slices
1446 tld.analysis.m_sliceMinY = -(int32_t)(rowInSlice * m_param->maxCUSize * 4) + 3 * 4;
1447 tld.analysis.m_sliceMaxY = (int32_t)((endRowInSlicePlus1 - 1 - row) * (m_param->maxCUSize * 4) - 4 * 4);
1448
1449 // Handle single row slice
1450 if (tld.analysis.m_sliceMaxY < tld.analysis.m_sliceMinY)
1451 tld.analysis.m_sliceMaxY = tld.analysis.m_sliceMinY = 0;
1452
1453
1454 while (curRow.completed < numCols)
1455 {
1456 ProfileScopeEvent(encodeCTU);
1457
1458 const uint32_t col = curRow.completed;
1459 const uint32_t cuAddr = lineStartCUAddr + col;
1460 CUData* ctu = curEncData.getPicCTU(cuAddr);
1461 const uint32_t bLastCuInSlice = (bLastRowInSlice & (col == numCols - 1)) ? 1 : 0;
1462 ctu->initCTU(*m_frame, cuAddr, slice->m_sliceQp, bFirstRowInSlice, bLastRowInSlice, bLastCuInSlice);
1463
1464 if (bIsVbv)
1465 {
1466 if (col == 0 && !m_param->bEnableWavefront)
1467 {
1468 m_backupStreams[0].copyBits(&m_outStreams[0]);
1469 curRow.bufferedEntropy.copyState(rowCoder);
1470 curRow.bufferedEntropy.loadContexts(rowCoder);
1471 }
1472 if (bFirstRowInSlice && m_vbvResetTriggerRow != intRow)
1473 {
1474 curEncData.m_rowStat[row].rowQp = curEncData.m_avgQpRc;
1475 curEncData.m_rowStat[row].rowQpScale = x265_qp2qScale(curEncData.m_avgQpRc);
1476 }
1477
1478 FrameData::RCStatCU& cuStat = curEncData.m_cuStat[cuAddr];
1479 if (m_param->bEnableWavefront && rowInSlice >= col && !bFirstRowInSlice && m_vbvResetTriggerRow != intRow)
1480 cuStat.baseQp = curEncData.m_cuStat[cuAddr - numCols + 1].baseQp;
1481 else if (!m_param->bEnableWavefront && !bFirstRowInSlice && m_vbvResetTriggerRow != intRow)
1482 cuStat.baseQp = curEncData.m_rowStat[row - 1].rowQp;
1483 else
1484 cuStat.baseQp = curEncData.m_rowStat[row].rowQp;
1485
1486 /* TODO: use defines from slicetype.h for lowres block size */
1487 uint32_t block_y = (ctu->m_cuPelY >> m_param->maxLog2CUSize) * noOfBlocks;
1488 uint32_t block_x = (ctu->m_cuPelX >> m_param->maxLog2CUSize) * noOfBlocks;
1489 if (!m_param->analysisLoad || !m_param->bDisableLookahead)
1490 {
1491 cuStat.vbvCost = 0;
1492 cuStat.intraVbvCost = 0;
1493
1494 for (uint32_t h = 0; h < noOfBlocks && block_y < m_sliceMaxBlockRow[sliceId + 1]; h++, block_y++)
1495 {
1496 uint32_t idx = block_x + (block_y * maxBlockCols);
1497
1498 for (uint32_t w = 0; w < noOfBlocks && (block_x + w) < maxBlockCols; w++, idx++)
1499 {
1500 cuStat.vbvCost += m_frame->m_lowres.lowresCostForRc[idx] & LOWRES_COST_MASK;
1501 cuStat.intraVbvCost += m_frame->m_lowres.intraCost[idx];
1502 }
1503 }
1504 }
1505 }
1506 else
1507 curEncData.m_cuStat[cuAddr].baseQp = curEncData.m_avgQpRc;
1508
1509 if (m_param->bEnableWavefront && !col && !bFirstRowInSlice)
1510 {
1511 // Load SBAC coder context from previous row and initialize row state.
1512 rowCoder.copyState(m_initSliceContext);
1513 rowCoder.loadContexts(m_rows[row - 1].bufferedEntropy);
1514 }
1515 if (m_param->dynamicRd && (int32_t)(m_rce.qpaRc - m_rce.qpNoVbv) > 0)
1516 ctu->m_vbvAffected = true;
1517
1518 // Does all the CU analysis, returns best top level mode decision
1519 Mode& best = tld.analysis.compressCTU(*ctu, *m_frame, m_cuGeoms[m_ctuGeomMap[cuAddr]], rowCoder);
1520
1521 /* startPoint > encodeOrder is true when the start point changes for
1522 a new GOP but few frames from the previous GOP is still incomplete.
1523 The data of frames in this interval will not be used by any future frames. */
1524 if (m_param->bDynamicRefine && m_top->m_startPoint <= m_frame->m_encodeOrder)
1525 collectDynDataRow(*ctu, &curRow.rowStats);
1526
1527 // take a sample of the current active worker count
1528 ATOMIC_ADD(&m_totalActiveWorkerCount, m_activeWorkerCount);
1529 ATOMIC_INC(&m_activeWorkerCountSamples);
1530
1531 /* advance top-level row coder to include the context of this CTU.
1532 * if SAO is disabled, rowCoder writes the final CTU bitstream */
1533 rowCoder.encodeCTU(*ctu, m_cuGeoms[m_ctuGeomMap[cuAddr]]);
1534
1535 if (m_param->bEnableWavefront && col == 1)
1536 // Save CABAC state for next row
1537 curRow.bufferedEntropy.loadContexts(rowCoder);
1538
1539 /* SAO parameter estimation using non-deblocked pixels for CTU bottom and right boundary areas */
1540 if (slice->m_bUseSao && m_param->bSaoNonDeblocked)
1541 m_frameFilter.m_parallelFilter[row].m_sao.calcSaoStatsCu_BeforeDblk(m_frame, col, row);
1542
1543 /* Deblock with idle threading */
1544 if (m_param->bEnableLoopFilter | slice->m_bUseSao)
1545 {
1546 // NOTE: in VBV mode, we may reencode anytime, so we can't do Deblock stage-Horizon and SAO
1547 if (!bIsVbv)
1548 {
1549 // Delay one row to avoid intra prediction conflict
1550 if (m_pool && !bFirstRowInSlice)
1551 {
1552 int allowCol = col;
1553
1554 // avoid race condition on last column
1555 if (rowInSlice >= 2)
1556 {
1557 allowCol = X265_MIN(((col == numCols - 1) ? m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.get()
1558 : m_frameFilter.m_parallelFilter[row - 2].m_lastCol.get()), (int)col);
1559 }
1560 m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(allowCol);
1561 }
1562
1563 // Last Row may start early
1564 if (m_pool && bLastRowInSlice)
1565 {
1566 // Deblocking last row
1567 int allowCol = col;
1568
1569 // avoid race condition on last column
1570 if (rowInSlice >= 2)
1571 {
1572 allowCol = X265_MIN(((col == numCols - 1) ? m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.get()
1573 : m_frameFilter.m_parallelFilter[row - 1].m_lastCol.get()), (int)col);
1574 }
1575 m_frameFilter.m_parallelFilter[row].m_allowedCol.set(allowCol);
1576 }
1577 } // end of !bIsVbv
1578 }
1579 // Both Loopfilter and SAO Disabled
1580 else
1581 {
1582 m_frameFilter.m_parallelFilter[row].processPostCu(col);
1583 }
1584
1585 // Completed CU processing
1586 curRow.completed++;
1587
1588 FrameStats frameLog;
1589 curEncData.m_rowStat[row].sumQpAq += collectCTUStatistics(*ctu, &frameLog);
1590
1591 // copy number of intra, inter cu per row into frame stats for 2 pass
1592 if (m_param->rc.bStatWrite)
1593 {
1594 curRow.rowStats.mvBits += best.mvBits;
1595 curRow.rowStats.coeffBits += best.coeffBits;
1596 curRow.rowStats.miscBits += best.totalBits - (best.mvBits + best.coeffBits);
1597
1598 for (uint32_t depth = 0; depth <= m_param->maxCUDepth; depth++)
1599 {
1600 /* 1 << shift == number of 8x8 blocks at current depth */
1601 int shift = 2 * (m_param->maxCUDepth - depth);
1602 int cuSize = m_param->maxCUSize >> depth;
1603
1604 curRow.rowStats.intra8x8Cnt += (cuSize == 8) ? (int)(frameLog.cntIntra[depth] + frameLog.cntIntraNxN) :
1605 (int)(frameLog.cntIntra[depth] << shift);
1606
1607 curRow.rowStats.inter8x8Cnt += (int)(frameLog.cntInter[depth] << shift);
1608 curRow.rowStats.skip8x8Cnt += (int)((frameLog.cntSkipCu[depth] + frameLog.cntMergeCu[depth]) << shift);
1609 }
1610 }
1611 curRow.rowStats.totalCtu++;
1612 curRow.rowStats.lumaDistortion += best.lumaDistortion;
1613 curRow.rowStats.chromaDistortion += best.chromaDistortion;
1614 curRow.rowStats.psyEnergy += best.psyEnergy;
1615 curRow.rowStats.ssimEnergy += best.ssimEnergy;
1616 curRow.rowStats.resEnergy += best.resEnergy;
1617 curRow.rowStats.cntIntraNxN += frameLog.cntIntraNxN;
1618 curRow.rowStats.totalCu += frameLog.totalCu;
1619 for (uint32_t depth = 0; depth <= m_param->maxCUDepth; depth++)
1620 {
1621 curRow.rowStats.cntSkipCu[depth] += frameLog.cntSkipCu[depth];
1622 curRow.rowStats.cntMergeCu[depth] += frameLog.cntMergeCu[depth];
1623 for (int m = 0; m < INTER_MODES; m++)
1624 curRow.rowStats.cuInterDistribution[depth][m] += frameLog.cuInterDistribution[depth][m];
1625 for (int n = 0; n < INTRA_MODES; n++)
1626 curRow.rowStats.cuIntraDistribution[depth][n] += frameLog.cuIntraDistribution[depth][n];
1627 }
1628
1629 curEncData.m_cuStat[cuAddr].totalBits = best.totalBits;
1630 x265_emms();
1631
1632 if (bIsVbv)
1633 {
1634 // Update encoded bits, satdCost, baseQP for each CU if tune grain is disabled
1635 FrameData::RCStatCU& cuStat = curEncData.m_cuStat[cuAddr];
1636 if ((m_param->bEnableWavefront && ((cuAddr == m_sliceBaseRow[sliceId] * numCols) || !m_param->rc.bEnableConstVbv)) || !m_param->bEnableWavefront)
1637 {
1638 curEncData.m_rowStat[row].rowSatd += cuStat.vbvCost;
1639 curEncData.m_rowStat[row].rowIntraSatd += cuStat.intraVbvCost;
1640 curEncData.m_rowStat[row].encodedBits += cuStat.totalBits;
1641 curEncData.m_rowStat[row].sumQpRc += cuStat.baseQp;
1642 curEncData.m_rowStat[row].numEncodedCUs = cuAddr;
1643 }
1644
1645 // If current block is at row end checkpoint, call vbv ratecontrol.
1646 if (!m_param->bEnableWavefront && col == numCols - 1)
1647 {
1648 double qpBase = curEncData.m_cuStat[cuAddr].baseQp;
1649 curRow.reEncode = m_top->m_rateControl->rowVbvRateControl(m_frame, row, &m_rce, qpBase, m_sliceBaseRow, sliceId);
1650 qpBase = x265_clip3((double)m_param->rc.qpMin, (double)m_param->rc.qpMax, qpBase);
1651 curEncData.m_rowStat[row].rowQp = qpBase;
1652 curEncData.m_rowStat[row].rowQpScale = x265_qp2qScale(qpBase);
1653 if (curRow.reEncode < 0)
1654 {
1655 x265_log(m_param, X265_LOG_DEBUG, "POC %d row %d - encode restart required for VBV, to %.2f from %.2f\n",
1656 m_frame->m_poc, row, qpBase, curEncData.m_cuStat[cuAddr].baseQp);
1657
1658 m_vbvResetTriggerRow = row;
1659 m_outStreams[0].copyBits(&m_backupStreams[0]);
1660
1661 rowCoder.copyState(curRow.bufferedEntropy);
1662 rowCoder.loadContexts(curRow.bufferedEntropy);
1663
1664 curRow.completed = 0;
1665 memset(&curRow.rowStats, 0, sizeof(curRow.rowStats));
1666 curEncData.m_rowStat[row].numEncodedCUs = 0;
1667 curEncData.m_rowStat[row].encodedBits = 0;
1668 curEncData.m_rowStat[row].rowSatd = 0;
1669 curEncData.m_rowStat[row].rowIntraSatd = 0;
1670 curEncData.m_rowStat[row].sumQpRc = 0;
1671 curEncData.m_rowStat[row].sumQpAq = 0;
1672 }
1673 }
1674 // If current block is at row diagonal checkpoint, call vbv ratecontrol.
1675 else if (m_param->bEnableWavefront && rowInSlice == col && !bFirstRowInSlice)
1676 {
1677 if (m_param->rc.bEnableConstVbv)
1678 {
1679 uint32_t startCuAddr = numCols * row;
1680 uint32_t EndCuAddr = startCuAddr + col;
1681
1682 for (int32_t r = row; r >= (int32_t)m_sliceBaseRow[sliceId]; r--)
1683 {
1684 for (uint32_t c = startCuAddr; c <= EndCuAddr && c <= numCols * (r + 1) - 1; c++)
1685 {
1686 curEncData.m_rowStat[r].rowSatd += curEncData.m_cuStat[c].vbvCost;
1687 curEncData.m_rowStat[r].rowIntraSatd += curEncData.m_cuStat[c].intraVbvCost;
1688 curEncData.m_rowStat[r].encodedBits += curEncData.m_cuStat[c].totalBits;
1689 curEncData.m_rowStat[r].sumQpRc += curEncData.m_cuStat[c].baseQp;
1690 curEncData.m_rowStat[r].numEncodedCUs = c;
1691 }
1692 if (curRow.reEncode < 0)
1693 break;
1694 startCuAddr = EndCuAddr - numCols;
1695 EndCuAddr = startCuAddr + 1;
1696 }
1697 }
1698 double qpBase = curEncData.m_cuStat[cuAddr].baseQp;
1699 curRow.reEncode = m_top->m_rateControl->rowVbvRateControl(m_frame, row, &m_rce, qpBase, m_sliceBaseRow, sliceId);
1700 qpBase = x265_clip3((double)m_param->rc.qpMin, (double)m_param->rc.qpMax, qpBase);
1701 curEncData.m_rowStat[row].rowQp = qpBase;
1702 curEncData.m_rowStat[row].rowQpScale = x265_qp2qScale(qpBase);
1703
1704 if (curRow.reEncode < 0)
1705 {
1706 x265_log(m_param, X265_LOG_DEBUG, "POC %d row %d - encode restart required for VBV, to %.2f from %.2f\n",
1707 m_frame->m_poc, row, qpBase, curEncData.m_cuStat[cuAddr].baseQp);
1708
1709 // prevent the WaveFront::findJob() method from providing new jobs
1710 m_vbvResetTriggerRow = row;
1711 m_bAllRowsStop = true;
1712
1713 for (uint32_t r = m_sliceBaseRow[sliceId + 1] - 1; r >= row; r--)
1714 {
1715 CTURow& stopRow = m_rows[r];
1716
1717 if (r != row)
1718 {
1719 /* if row was active (ready to be run) clear active bit and bitmap bit for this row */
1720 stopRow.lock.acquire();
1721 while (stopRow.active)
1722 {
1723 if (dequeueRow(r * 2))
1724 stopRow.active = false;
1725 else
1726 {
1727 /* we must release the row lock to allow the thread to exit */
1728 stopRow.lock.release();
1729 GIVE_UP_TIME();
1730 stopRow.lock.acquire();
1731 }
1732 }
1733 stopRow.lock.release();
1734
1735 bool bRowBusy = true;
1736 do
1737 {
1738 stopRow.lock.acquire();
1739 bRowBusy = stopRow.busy;
1740 stopRow.lock.release();
1741
1742 if (bRowBusy)
1743 {
1744 GIVE_UP_TIME();
1745 }
1746 }
1747 while (bRowBusy);
1748 }
1749
1750 m_outStreams[r].resetBits();
1751 stopRow.completed = 0;
1752 memset(&stopRow.rowStats, 0, sizeof(stopRow.rowStats));
1753 curEncData.m_rowStat[r].numEncodedCUs = 0;
1754 curEncData.m_rowStat[r].encodedBits = 0;
1755 curEncData.m_rowStat[r].rowSatd = 0;
1756 curEncData.m_rowStat[r].rowIntraSatd = 0;
1757 curEncData.m_rowStat[r].sumQpRc = 0;
1758 curEncData.m_rowStat[r].sumQpAq = 0;
1759 }
1760
1761 m_bAllRowsStop = false;
1762 }
1763 }
1764 }
1765
1766 if (m_param->bEnableWavefront && curRow.completed >= 2 && !bLastRowInSlice &&
1767 (!m_bAllRowsStop || intRow + 1 < m_vbvResetTriggerRow))
1768 {
1769 /* activate next row */
1770 ScopedLock below(m_rows[row + 1].lock);
1771
1772 if (m_rows[row + 1].active == false &&
1773 m_rows[row + 1].completed + 2 <= curRow.completed)
1774 {
1775 m_rows[row + 1].active = true;
1776 enqueueRowEncoder(m_row_to_idx[row + 1]);
1777 tryWakeOne(); /* wake up a sleeping thread or set the help wanted flag */
1778 }
1779 }
1780
1781 ScopedLock self(curRow.lock);
1782 if ((m_bAllRowsStop && intRow > m_vbvResetTriggerRow) ||
1783 (!bFirstRowInSlice && ((curRow.completed < numCols - 1) || (m_rows[row - 1].completed < numCols)) && m_rows[row - 1].completed < curRow.completed + 2))
1784 {
1785 curRow.active = false;
1786 curRow.busy = false;
1787 ATOMIC_INC(&m_countRowBlocks);
1788 return;
1789 }
1790 }
1791
1792 /* this row of CTUs has been compressed */
1793 if (m_param->bEnableWavefront && m_param->rc.bEnableConstVbv)
1794 {
1795 if (bLastRowInSlice)
1796 {
1797 for (uint32_t r = m_sliceBaseRow[sliceId]; r < m_sliceBaseRow[sliceId + 1]; r++)
1798 {
1799 for (uint32_t c = curEncData.m_rowStat[r].numEncodedCUs + 1; c < numCols * (r + 1); c++)
1800 {
1801 curEncData.m_rowStat[r].rowSatd += curEncData.m_cuStat[c].vbvCost;
1802 curEncData.m_rowStat[r].rowIntraSatd += curEncData.m_cuStat[c].intraVbvCost;
1803 curEncData.m_rowStat[r].encodedBits += curEncData.m_cuStat[c].totalBits;
1804 curEncData.m_rowStat[r].sumQpRc += curEncData.m_cuStat[c].baseQp;
1805 curEncData.m_rowStat[r].numEncodedCUs = c;
1806 }
1807 }
1808 }
1809 }
1810
1811 /* If encoding with ABR, update update bits and complexity in rate control
1812 * after a number of rows so the next frame's rateControlStart has more
1813 * accurate data for estimation. At the start of the encode we update stats
1814 * after half the frame is encoded, but after this initial period we update
1815 * after refLagRows (the number of rows reference frames must have completed
1816 * before referencees may begin encoding) */
1817 if (m_param->rc.rateControlMode == X265_RC_ABR || bIsVbv)
1818 {
1819 uint32_t rowCount = 0;
1820 uint32_t maxRows = m_sliceBaseRow[sliceId + 1] - m_sliceBaseRow[sliceId];
1821
1822 if (!m_rce.encodeOrder)
1823 rowCount = maxRows - 1;
1824 else if ((uint32_t)m_rce.encodeOrder <= 2 * (m_param->fpsNum / m_param->fpsDenom))
1825 rowCount = X265_MIN((maxRows + 1) / 2, maxRows - 1);
1826 else
1827 rowCount = X265_MIN(m_refLagRows / m_param->maxSlices, maxRows - 1);
1828
1829 if (rowInSlice == rowCount)
1830 {
1831 m_rowSliceTotalBits[sliceId] = 0;
1832 if (bIsVbv && !(m_param->rc.bEnableConstVbv && m_param->bEnableWavefront))
1833 {
1834 for (uint32_t i = m_sliceBaseRow[sliceId]; i < rowCount + m_sliceBaseRow[sliceId]; i++)
1835 m_rowSliceTotalBits[sliceId] += curEncData.m_rowStat[i].encodedBits;
1836 }
1837 else
1838 {
1839 uint32_t startAddr = m_sliceBaseRow[sliceId] * numCols;
1840 uint32_t finishAddr = startAddr + rowCount * numCols;
1841
1842 for (uint32_t cuAddr = startAddr; cuAddr < finishAddr; cuAddr++)
1843 m_rowSliceTotalBits[sliceId] += curEncData.m_cuStat[cuAddr].totalBits;
1844 }
1845
1846 if (ATOMIC_INC(&m_sliceCnt) == (int)m_param->maxSlices)
1847 {
1848 m_rce.rowTotalBits = 0;
1849 for (uint32_t i = 0; i < m_param->maxSlices; i++)
1850 m_rce.rowTotalBits += m_rowSliceTotalBits[i];
1851 m_top->m_rateControl->rateControlUpdateStats(&m_rce);
1852 }
1853 }
1854 }
1855
1856 /* flush row bitstream (if WPP and no SAO) or flush frame if no WPP and no SAO */
1857 /* end_of_sub_stream_one_bit / end_of_slice_segment_flag */
1858 if (!slice->m_bUseSao && (m_param->bEnableWavefront || bLastRowInSlice))
1859 rowCoder.finishSlice();
1860
1861
1862 /* Processing left Deblock block with current threading */
1863 if ((m_param->bEnableLoopFilter | slice->m_bUseSao) & (rowInSlice >= 2))
1864 {
1865 /* Check conditional to start previous row process with current threading */
1866 if (m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.get() == (int)numCols)
1867 {
1868 /* stop threading on current row and restart it */
1869 m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(numCols);
1870 m_frameFilter.m_parallelFilter[row - 1].processTasks(-1);
1871 }
1872 }
1873
1874 /* trigger row-wise loop filters */
1875 if (m_param->bEnableWavefront)
1876 {
1877 if (rowInSlice >= m_filterRowDelay)
1878 {
1879 enableRowFilter(m_row_to_idx[row - m_filterRowDelay]);
1880
1881 /* NOTE: Activate filter if first row (row 0) */
1882 if (rowInSlice == m_filterRowDelay)
1883 enqueueRowFilter(m_row_to_idx[row - m_filterRowDelay]);
1884 tryWakeOne();
1885 }
1886
1887 if (bLastRowInSlice)
1888 {
1889 for (uint32_t i = endRowInSlicePlus1 - m_filterRowDelay; i < endRowInSlicePlus1; i++)
1890 {
1891 enableRowFilter(m_row_to_idx[i]);
1892 }
1893 tryWakeOne();
1894 }
1895
1896 // handle specially case - single row slice
1897 if (bFirstRowInSlice & bLastRowInSlice)
1898 {
1899 enqueueRowFilter(m_row_to_idx[row]);
1900 tryWakeOne();
1901 }
1902 }
1903
1904 curRow.busy = false;
1905
1906 // CHECK_ME: Does it always FALSE condition?
1907 if (ATOMIC_INC(&m_completionCount) == 2 * (int)m_numRows)
1908 m_completionEvent.trigger();
1909 }
1910
collectDynDataRow(CUData & ctu,FrameStats * rowStats)1911 void FrameEncoder::collectDynDataRow(CUData& ctu, FrameStats* rowStats)
1912 {
1913 for (uint32_t i = 0; i < X265_REFINE_INTER_LEVELS; i++)
1914 {
1915 for (uint32_t depth = 0; depth < m_param->maxCUDepth; depth++)
1916 {
1917 int offset = (depth * X265_REFINE_INTER_LEVELS) + i;
1918 if (ctu.m_collectCUCount[offset])
1919 {
1920 rowStats->rowVarDyn[offset] += ctu.m_collectCUVariance[offset];
1921 rowStats->rowRdDyn[offset] += ctu.m_collectCURd[offset];
1922 rowStats->rowCntDyn[offset] += ctu.m_collectCUCount[offset];
1923 }
1924 }
1925 }
1926 }
1927
collectDynDataFrame()1928 void FrameEncoder::collectDynDataFrame()
1929 {
1930 for (uint32_t row = 0; row < m_numRows; row++)
1931 {
1932 for (uint32_t refLevel = 0; refLevel < X265_REFINE_INTER_LEVELS; refLevel++)
1933 {
1934 for (uint32_t depth = 0; depth < m_param->maxCUDepth; depth++)
1935 {
1936 int offset = (depth * X265_REFINE_INTER_LEVELS) + refLevel;
1937 int curFrameIndex = m_frame->m_encodeOrder - m_top->m_startPoint;
1938 int index = (curFrameIndex * X265_REFINE_INTER_LEVELS * m_param->maxCUDepth) + offset;
1939 if (m_rows[row].rowStats.rowCntDyn[offset])
1940 {
1941 m_top->m_variance[index] += m_rows[row].rowStats.rowVarDyn[offset];
1942 m_top->m_rdCost[index] += m_rows[row].rowStats.rowRdDyn[offset];
1943 m_top->m_trainingCount[index] += m_rows[row].rowStats.rowCntDyn[offset];
1944 }
1945 }
1946 }
1947 }
1948 }
1949
computeAvgTrainingData()1950 void FrameEncoder::computeAvgTrainingData()
1951 {
1952 if (m_frame->m_lowres.bScenecut || m_frame->m_lowres.bKeyframe)
1953 {
1954 m_top->m_startPoint = m_frame->m_encodeOrder;
1955 int size = (m_param->keyframeMax + m_param->lookaheadDepth) * m_param->maxCUDepth * X265_REFINE_INTER_LEVELS;
1956 memset(m_top->m_variance, 0, size * sizeof(uint64_t));
1957 memset(m_top->m_rdCost, 0, size * sizeof(uint64_t));
1958 memset(m_top->m_trainingCount, 0, size * sizeof(uint32_t));
1959 }
1960 if (m_frame->m_encodeOrder - m_top->m_startPoint < 2 * m_param->frameNumThreads)
1961 m_frame->m_classifyFrame = false;
1962 else
1963 m_frame->m_classifyFrame = true;
1964
1965 int size = m_param->maxCUDepth * X265_REFINE_INTER_LEVELS;
1966 memset(m_frame->m_classifyRd, 0, size * sizeof(uint64_t));
1967 memset(m_frame->m_classifyVariance, 0, size * sizeof(uint64_t));
1968 memset(m_frame->m_classifyCount, 0, size * sizeof(uint32_t));
1969 if (m_frame->m_classifyFrame)
1970 {
1971 uint32_t limit = m_frame->m_encodeOrder - m_top->m_startPoint - m_param->frameNumThreads;
1972 for (uint32_t i = 1; i < limit; i++)
1973 {
1974 for (uint32_t j = 0; j < X265_REFINE_INTER_LEVELS; j++)
1975 {
1976 for (uint32_t depth = 0; depth < m_param->maxCUDepth; depth++)
1977 {
1978 int offset = (depth * X265_REFINE_INTER_LEVELS) + j;
1979 int index = (i* X265_REFINE_INTER_LEVELS * m_param->maxCUDepth) + offset;
1980 if (m_top->m_trainingCount[index])
1981 {
1982 m_frame->m_classifyRd[offset] += m_top->m_rdCost[index] / m_top->m_trainingCount[index];
1983 m_frame->m_classifyVariance[offset] += m_top->m_variance[index] / m_top->m_trainingCount[index];
1984 m_frame->m_classifyCount[offset] += m_top->m_trainingCount[index];
1985 }
1986 }
1987 }
1988 }
1989 /* Calculates the average feature values of historic frames that are being considered for the current frame */
1990 int historyCount = m_frame->m_encodeOrder - m_param->frameNumThreads - m_top->m_startPoint - 1;
1991 if (historyCount)
1992 {
1993 for (uint32_t j = 0; j < X265_REFINE_INTER_LEVELS; j++)
1994 {
1995 for (uint32_t depth = 0; depth < m_param->maxCUDepth; depth++)
1996 {
1997 int offset = (depth * X265_REFINE_INTER_LEVELS) + j;
1998 m_frame->m_classifyRd[offset] /= historyCount;
1999 m_frame->m_classifyVariance[offset] /= historyCount;
2000 }
2001 }
2002 }
2003 }
2004 }
2005
2006 /* collect statistics about CU coding decisions, return total QP */
collectCTUStatistics(const CUData & ctu,FrameStats * log)2007 int FrameEncoder::collectCTUStatistics(const CUData& ctu, FrameStats* log)
2008 {
2009 int totQP = 0;
2010 uint32_t depth = 0;
2011 for (uint32_t absPartIdx = 0; absPartIdx < ctu.m_numPartitions; absPartIdx += ctu.m_numPartitions >> (depth * 2))
2012 {
2013 depth = ctu.m_cuDepth[absPartIdx];
2014 totQP += ctu.m_qp[absPartIdx] * (ctu.m_numPartitions >> (depth * 2));
2015 }
2016
2017 if (m_param->csvLogLevel >= 1 || m_param->rc.bStatWrite)
2018 {
2019 if (ctu.m_slice->m_sliceType == I_SLICE)
2020 {
2021 depth = 0;
2022 for (uint32_t absPartIdx = 0; absPartIdx < ctu.m_numPartitions; absPartIdx += ctu.m_numPartitions >> (depth * 2))
2023 {
2024 depth = ctu.m_cuDepth[absPartIdx];
2025
2026 log->totalCu++;
2027 log->cntIntra[depth]++;
2028
2029 if (ctu.m_predMode[absPartIdx] == MODE_NONE)
2030 {
2031 log->totalCu--;
2032 log->cntIntra[depth]--;
2033 }
2034 else if (ctu.m_partSize[absPartIdx] != SIZE_2Nx2N)
2035 {
2036 /* TODO: log intra modes at absPartIdx +0 to +3 */
2037 X265_CHECK(ctu.m_log2CUSize[absPartIdx] == 3 && ctu.m_slice->m_sps->quadtreeTULog2MinSize < 3, "Intra NxN found at improbable depth\n");
2038 log->cntIntraNxN++;
2039 log->cntIntra[depth]--;
2040 }
2041 else if (ctu.m_lumaIntraDir[absPartIdx] > 1)
2042 log->cuIntraDistribution[depth][ANGULAR_MODE_ID]++;
2043 else
2044 log->cuIntraDistribution[depth][ctu.m_lumaIntraDir[absPartIdx]]++;
2045 }
2046 }
2047 else
2048 {
2049 depth = 0;
2050 for (uint32_t absPartIdx = 0; absPartIdx < ctu.m_numPartitions; absPartIdx += ctu.m_numPartitions >> (depth * 2))
2051 {
2052 depth = ctu.m_cuDepth[absPartIdx];
2053
2054 log->totalCu++;
2055
2056 if (ctu.m_predMode[absPartIdx] == MODE_NONE)
2057 log->totalCu--;
2058 else if (ctu.isSkipped(absPartIdx))
2059 {
2060 if (ctu.m_mergeFlag[0])
2061 log->cntMergeCu[depth]++;
2062 else
2063 log->cntSkipCu[depth]++;
2064 }
2065 else if (ctu.isInter(absPartIdx))
2066 {
2067 log->cntInter[depth]++;
2068
2069 if (ctu.m_partSize[absPartIdx] < AMP_ID)
2070 log->cuInterDistribution[depth][ctu.m_partSize[absPartIdx]]++;
2071 else
2072 log->cuInterDistribution[depth][AMP_ID]++;
2073 }
2074 else if (ctu.isIntra(absPartIdx))
2075 {
2076 log->cntIntra[depth]++;
2077
2078 if (ctu.m_partSize[absPartIdx] != SIZE_2Nx2N)
2079 {
2080 X265_CHECK(ctu.m_log2CUSize[absPartIdx] == 3 && ctu.m_slice->m_sps->quadtreeTULog2MinSize < 3, "Intra NxN found at improbable depth\n");
2081 log->cntIntraNxN++;
2082 log->cntIntra[depth]--;
2083 /* TODO: log intra modes at absPartIdx +0 to +3 */
2084 }
2085 else if (ctu.m_lumaIntraDir[absPartIdx] > 1)
2086 log->cuIntraDistribution[depth][ANGULAR_MODE_ID]++;
2087 else
2088 log->cuIntraDistribution[depth][ctu.m_lumaIntraDir[absPartIdx]]++;
2089 }
2090 }
2091 }
2092 }
2093
2094 return totQP;
2095 }
2096
2097 /* DCT-domain noise reduction / adaptive deadzone from libavcodec */
noiseReductionUpdate()2098 void FrameEncoder::noiseReductionUpdate()
2099 {
2100 static const uint32_t maxBlocksPerTrSize[4] = {1 << 18, 1 << 16, 1 << 14, 1 << 12};
2101
2102 for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++)
2103 {
2104 int trSize = cat & 3;
2105 int coefCount = 1 << ((trSize + 2) * 2);
2106
2107 if (m_nr->nrCount[cat] > maxBlocksPerTrSize[trSize])
2108 {
2109 for (int i = 0; i < coefCount; i++)
2110 m_nr->nrResidualSum[cat][i] >>= 1;
2111 m_nr->nrCount[cat] >>= 1;
2112 }
2113
2114 int nrStrength = cat < 8 ? m_param->noiseReductionIntra : m_param->noiseReductionInter;
2115 uint64_t scaledCount = (uint64_t)nrStrength * m_nr->nrCount[cat];
2116
2117 for (int i = 0; i < coefCount; i++)
2118 {
2119 uint64_t value = scaledCount + m_nr->nrResidualSum[cat][i] / 2;
2120 uint64_t denom = m_nr->nrResidualSum[cat][i] + 1;
2121 m_nr->nrOffsetDenoise[cat][i] = (uint16_t)(value / denom);
2122 }
2123
2124 // Don't denoise DC coefficients
2125 m_nr->nrOffsetDenoise[cat][0] = 0;
2126 }
2127 }
2128 #if ENABLE_LIBVMAF
vmafFrameLevelScore()2129 void FrameEncoder::vmafFrameLevelScore()
2130 {
2131 PicYuv *fenc = m_frame->m_fencPic;
2132 PicYuv *recon = m_frame->m_reconPic;
2133
2134 x265_vmaf_framedata *vmafframedata = (x265_vmaf_framedata*)x265_malloc(sizeof(x265_vmaf_framedata));
2135 if (!vmafframedata)
2136 {
2137 x265_log(NULL, X265_LOG_ERROR, "vmaf frame data alloc failed\n");
2138 }
2139
2140 vmafframedata->height = fenc->m_picHeight;
2141 vmafframedata->width = fenc->m_picWidth;
2142 vmafframedata->frame_set = 0;
2143 vmafframedata->internalBitDepth = m_param->internalBitDepth;
2144 vmafframedata->reference_frame = fenc;
2145 vmafframedata->distorted_frame = recon;
2146
2147 fenc->m_vmafScore = x265_calculate_vmaf_framelevelscore(vmafframedata);
2148
2149 if (vmafframedata)
2150 x265_free(vmafframedata);
2151 }
2152 #endif
2153
getEncodedPicture(NALList & output)2154 Frame *FrameEncoder::getEncodedPicture(NALList& output)
2155 {
2156 if (m_frame)
2157 {
2158 /* block here until worker thread completes */
2159 m_done.wait();
2160
2161 Frame *ret = m_frame;
2162 m_frame = NULL;
2163 output.takeContents(m_nalList);
2164 m_prevOutputTime = x265_mdate();
2165 return ret;
2166 }
2167
2168 return NULL;
2169 }
2170 }
2171