1 /* -----------------------------------------------------------------------------
2 The copyright in this software is being made available under the BSD
3 License, included below. No patent rights, trademark rights and/or
4 other Intellectual Property Rights other than the copyrights concerning
5 the Software are granted under this license.
6 
7 For any license concerning other Intellectual Property rights than the software,
8 especially patent licenses, a separate Agreement needs to be closed.
9 For more information please contact:
10 
11 Fraunhofer Heinrich Hertz Institute
12 Einsteinufer 37
13 10587 Berlin, Germany
14 www.hhi.fraunhofer.de/vvc
15 vvc@hhi.fraunhofer.de
16 
17 Copyright (c) 2018-2021, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V.
18 All rights reserved.
19 
20 Redistribution and use in source and binary forms, with or without
21 modification, are permitted provided that the following conditions are met:
22 
23  * Redistributions of source code must retain the above copyright notice,
24    this list of conditions and the following disclaimer.
25  * Redistributions in binary form must reproduce the above copyright notice,
26    this list of conditions and the following disclaimer in the documentation
27    and/or other materials provided with the distribution.
28  * Neither the name of Fraunhofer nor the names of its contributors may
29    be used to endorse or promote products derived from this software without
30    specific prior written permission.
31 
32 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
33 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
34 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
35 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
36 BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
37 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
38 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
39 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
40 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
41 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
42 THE POSSIBILITY OF SUCH DAMAGE.
43 
44 
45 ------------------------------------------------------------------------------------------- */
46 
47 /** \file     DecLibRecon.cpp
48     \brief    decoder class
49 */
50 
51 #include "DecLib.h"
52 
53 #include "CommonLib/TrQuant.h"
54 #if ENABLE_SIMD_TCOEFF_OPS
55 #include "CommonLib/TrQuant_EMT.h"
56 #endif
57 #include "CommonLib/InterPrediction.h"
58 #include "CommonLib/IntraPrediction.h"
59 #include "CommonLib/Unit.h"
60 #include "CommonLib/Buffer.h"
61 #include "CommonLib/UnitTools.h"
62 
63 #include "CommonLib/dtrace_next.h"
64 #include "CommonLib/dtrace_buffer.h"
65 
66 namespace vvdec
67 {
68 
#ifdef TRACE_ENABLE_ITT
// ITT tracing objects used to annotate the tasks in this file.
// Everything below is only declared here (extern); the definitions are not part of this file.
extern __itt_domain*              itt_domain_dec;
extern std::vector<__itt_domain*> itt_domain_decInst;   // one domain per DecLibRecon instance, indexed by the instance id

// string handles naming the individual task types
extern __itt_string_handle* itt_handle_alf;
extern __itt_string_handle* itt_handle_presao;
extern __itt_string_handle* itt_handle_sao;
extern __itt_string_handle* itt_handle_lfl;
extern __itt_string_handle* itt_handle_intra;
extern __itt_string_handle* itt_handle_inter;
extern __itt_string_handle* itt_handle_mider;
extern __itt_string_handle* itt_handle_lfcl;
extern __itt_string_handle* itt_handle_ext;
extern __itt_string_handle* itt_handle_dmvr;
extern __itt_string_handle* itt_handle_rsp;

// string handles for scheduling tasks and for waiting on them
extern __itt_string_handle* itt_handle_schedTasks;
extern __itt_string_handle* itt_handle_waitTasks;

// global domain for DecLib (declaration only)
extern __itt_domain* itt_domain_glb;
// global frame counter (declaration only)
extern __itt_counter itt_frame_counter;

// ITT_TASKSTART begins a task named by handle t in domain d.
// ITT_TASKEND ends the current task in domain d; its second argument is accepted but not used.
#define ITT_TASKSTART( d, t ) __itt_task_begin( ( d ), __itt_null, __itt_null, ( t ) )
#define ITT_TASKEND( d, t )   __itt_task_end  ( ( d ) )
#else
// tracing disabled: both macros expand to nothing
#define ITT_TASKSTART( d, t )
#define ITT_TASKEND( d, t )
#endif
99 
100 //! \ingroup DecoderLib
101 //! \{
102 
reset(CodingStructure & cs,TaskType ctuStartState,int tasksPerLine,bool doALF)103 void CommonTaskParam::reset( CodingStructure& cs, TaskType ctuStartState, int tasksPerLine, bool doALF )
104 {
105   this->cs = &cs;
106 
107   const int heightInCtus = cs.pcv->heightInCtus;
108   CHECKD( !ctuStates.empty() && std::any_of( ctuStates.begin(), ctuStates.end(), []( CtuState& s ) { return s != DONE; } ), "some CTUs of previous pic not done" );
109   ctuStates = std::vector<CtuState>( heightInCtus * tasksPerLine );
110   for( auto& ctu: ctuStates )
111   {
112     ctu.store( ctuStartState );
113   }
114 
115 
116   this->perLineMiHist = std::vector<MotionHist>( heightInCtus );
117   this->dmvrTriggers  = std::vector<Barrier>   ( heightInCtus );
118 
119   this->doALF        = doALF;
120   this->alfPrepared.lock();
121 }
122 
DecLibRecon()123 DecLibRecon::DecLibRecon()
124 {
125 #if ENABLE_SIMD_OPT_BUFFER && defined( TARGET_SIMD_X86 )
126   g_pelBufOP.initPelBufOpsX86();
127 #endif
128 #if ENABLE_SIMD_TCOEFF_OPS && defined( TARGET_SIMD_X86 )
129   g_tCoeffOps.initTCoeffOpsX86();
130 #endif
131 
132 }
133 
create(ThreadPool * threadPool,unsigned instanceId)134 void DecLibRecon::create( ThreadPool* threadPool, unsigned instanceId )
135 {
136   // run constructor again to ensure all variables, especially in DecLibParser have been reset
137   this->~DecLibRecon();
138   new( this ) DecLibRecon;
139 
140 
141 #if TRACE_ENABLE_ITT
142   if( itt_domain_decInst.size() < instanceId + 1 )
143   {
144     std::string name( "DecLibRecon " + std::to_string( instanceId ) );
145     itt_domain_decInst.push_back( __itt_domain_create( name.c_str() ) );
146     itt_domain_decInst.back()->flags = 1;
147 
148     CHECK( itt_domain_decInst.back() != itt_domain_decInst[instanceId], "current decLibRecon ITT-Domain is not the last in vector. Instances created in the wrong order?" );
149   }
150   m_itt_decInst = itt_domain_decInst[instanceId];
151 #endif
152 
153   m_decodeThreadPool = threadPool;
154   m_numDecThreads    = std::max( 1, threadPool ? threadPool->numThreads() : 1 );
155 
156   m_cIntraPred = new IntraPrediction[m_numDecThreads];
157   m_cInterPred = new InterPrediction[m_numDecThreads];
158   m_cTrQuant   = new TrQuant        [m_numDecThreads];
159   m_cCuDecoder = new DecCu          [m_numDecThreads];
160   m_cReshaper  = new Reshape        [m_numDecThreads];
161 }
162 
destroy()163 void DecLibRecon::destroy()
164 {
165   m_decodeThreadPool = nullptr;
166 
167   delete[] m_cIntraPred; m_cIntraPred = nullptr;
168   delete[] m_cInterPred; m_cInterPred = nullptr;
169   delete[] m_cTrQuant;   m_cTrQuant   = nullptr;
170   delete[] m_cCuDecoder; m_cCuDecoder = nullptr;
171   delete[] m_cReshaper;  m_cReshaper  = nullptr;
172 }
173 
borderExtPic(Picture * pic)174 void DecLibRecon::borderExtPic( Picture* pic )
175 {
176   pic->borderExtStarted = true;
177 
178   const bool wrapAround = pic->cs->sps->getUseWrapAround();
179   if( wrapAround )
180   {
181     // copy reconstruction buffer to wrapAround buffer. All other border-extension tasks depend on this task.
182     static auto copyTask = []( int, Picture* picture ) {
183       ITT_TASKSTART( itt_domain_dec, itt_handle_ext );
184       picture->getRecoBuf( true ).copyFrom( picture->getRecoBuf() );
185       ITT_TASKEND( itt_domain_dec, itt_handle_ext );
186       return true;
187     };
188     pic->m_copyWrapBufDone.lock();
189     m_decodeThreadPool->addBarrierTask<Picture>( copyTask,
190                                                  pic,
191                                                  nullptr,
192                                                  &pic->m_copyWrapBufDone,
193                                                  { &pic->done} );
194   }
195 
196   // start actual border extension tasks
197   {
198     static auto task = []( int, Picture* picture ) {
199       ITT_TASKSTART( itt_domain_dec, itt_handle_ext );
200       picture->extendPicBorder( true, false, false, false );
201       ITT_TASKEND( itt_domain_dec, itt_handle_ext );
202       return true;
203     };
204     m_decodeThreadPool->addBarrierTask<Picture>( task,
205                                                  pic,
206                                                  &pic->m_borderExtTaskCounter,
207                                                  nullptr,
208                                                  { wrapAround ? &pic->m_copyWrapBufDone : &pic->done} );
209   }
210 
211   {
212     static auto task = []( int, Picture* picture ) {
213       ITT_TASKSTART( itt_domain_dec, itt_handle_ext );
214       picture->extendPicBorder( false, true, false, false );
215       ITT_TASKEND( itt_domain_dec, itt_handle_ext );
216       return true;
217     };
218     m_decodeThreadPool->addBarrierTask<Picture>( task,
219                                                  pic,
220                                                  &pic->m_borderExtTaskCounter,
221                                                  nullptr,
222                                                  { wrapAround ? &pic->m_copyWrapBufDone : &pic->done} );
223   }
224 
225   {
226     static auto task = []( int, Picture* picture ) {
227       ITT_TASKSTART( itt_domain_dec, itt_handle_ext );
228       picture->extendPicBorder( false, false, true, false, CH_L );
229       ITT_TASKEND( itt_domain_dec, itt_handle_ext );
230       return true;
231     };
232     m_decodeThreadPool->addBarrierTask<Picture>( task,
233                                                  pic,
234                                                  &pic->m_borderExtTaskCounter,
235                                                  nullptr,
236                                                  { wrapAround ? &pic->m_copyWrapBufDone : &pic->done} );
237   }
238   {
239     static auto task = []( int, Picture* picture ) {
240       ITT_TASKSTART( itt_domain_dec, itt_handle_ext );
241       picture->extendPicBorder( false, false, false, true, CH_L );
242       ITT_TASKEND( itt_domain_dec, itt_handle_ext );
243       return true;
244     };
245     m_decodeThreadPool->addBarrierTask<Picture>( task,
246                                                  pic,
247                                                  &pic->m_borderExtTaskCounter,
248                                                  nullptr,
249                                                  { wrapAround ? &pic->m_copyWrapBufDone : &pic->done} );
250   }
251 
252   {
253     static auto task = []( int, Picture* picture ) {
254       ITT_TASKSTART( itt_domain_dec, itt_handle_ext );
255       picture->extendPicBorder( false, false, true, false, CH_C );
256       ITT_TASKEND( itt_domain_dec, itt_handle_ext );
257       return true;
258     };
259     m_decodeThreadPool->addBarrierTask<Picture>( task,
260                                                  pic,
261                                                  &pic->m_borderExtTaskCounter,
262                                                  nullptr,
263                                                  { wrapAround ? &pic->m_copyWrapBufDone : &pic->done} );
264   }
265   {
266     static auto task = []( int, Picture* picture ) {
267       ITT_TASKSTART( itt_domain_dec, itt_handle_ext );
268       picture->extendPicBorder( false, false, false, true, CH_C );
269       ITT_TASKEND( itt_domain_dec, itt_handle_ext );
270       return true;
271     };
272     m_decodeThreadPool->addBarrierTask<Picture>( task,
273                                                  pic,
274                                                  &pic->m_borderExtTaskCounter,
275                                                  nullptr,
276                                                  { wrapAround ? &pic->m_copyWrapBufDone : &pic->done} );
277   }
278 }
279 
createSubPicRefBufs(Picture * pic)280 void DecLibRecon::createSubPicRefBufs( Picture* pic )
281 {
282   pic->subPicExtStarted = true;
283 
284   const PPS* pps       = pic->cs->pps.get();
285   const SPS* sps       = pic->cs->sps.get();
286   const int  numSubPic = pps->getNumSubPics();
287 
288   pic->m_subPicRefBufs.resize( numSubPic );
289   for( int i = 0; i < numSubPic; ++i )
290   {
291     const SubPic& currSubPic = pps->getSubPic( i );
292     const Area    subPicArea( currSubPic.getSubPicLeft(),
293                               currSubPic.getSubPicTop(),
294                               currSubPic.getSubPicWidthInLumaSample(),
295                               currSubPic.getSubPicHeightInLumaSample() );
296 
297     pic->m_subPicRefBufs[i].create( pic->chromaFormat, Size( subPicArea ), sps->getMaxCUWidth(), pic->margin, MEMORY_ALIGN_DEF_SIZE );
298 
299     static auto task = []( int, SubPicExtTask* t ) {
300       t->subPicBuf->copyFrom( t->picture->getRecoBuf().subBuf( t->subPicArea ) );
301       t->picture->extendPicBorderBuf( *t->subPicBuf );
302       return true;
303     };
304     m_subPicExtTasks.emplace_back( SubPicExtTask{ pic, &pic->m_subPicRefBufs[i], subPicArea } );
305     m_decodeThreadPool->addBarrierTask<SubPicExtTask>( task, &m_subPicExtTasks.back(), &pic->m_borderExtTaskCounter, nullptr, { &pic->done } );
306   }
307 }
308 
/** Prepares the per-thread tools for the given picture and schedules its whole reconstruction on the
    decoding thread pool: border extension of the reference pictures (where not started yet), the CTU
    tasks, the final "done" task and, for referenced pictures, the per-line DMVR motion info tasks.

    The function does not wait for the scheduled tasks; waitForPrevDecompressedPic() does that for the
    picture stored in m_currDecompPic.

    \param pcPic  picture to reconstruct; marked inProgress and stored in m_currDecompPic
*/
void DecLibRecon::decompressPicture( Picture* pcPic )
{
  CodingStructure& cs = *pcPic->cs;

  pcPic->inProgress = true;

#ifdef TRACE_ENABLE_ITT
  // mark start of frame
    pcPic->m_itt_decLibInst = m_itt_decInst;
  __itt_frame_begin_v3( pcPic->m_itt_decLibInst, nullptr );
#endif

  // Initialise the various objects for the new set of settings
  const SPS * sps = cs.sps.get();
  const PPS * pps = cs.pps.get();

  // one set of tools per decoding thread
  for( int i = 0; i < m_numDecThreads; i++ )
  {
    if( sps->getUseReshaper() )
    {
      m_cReshaper[i].createDec( sps->getBitDepth( CHANNEL_TYPE_LUMA ) );
      m_cReshaper[i].initSlice( pcPic->slices[0]->getNalUnitLayerId(), *pcPic->slices[0]->getPicHeader(), *pcPic->slices[0]->getVPS() );
    }

    m_cIntraPred[i].init( sps->getChromaFormatIdc(), sps->getBitDepth( CHANNEL_TYPE_LUMA ) );
    m_cInterPred[i].init( &m_cRdCost, sps->getChromaFormatIdc(), sps->getMaxCUHeight() );

    // Recursive structure
    m_cTrQuant[i]  .init( pcPic );
    m_cCuDecoder[i].init( &m_cIntraPred[i], &m_cInterPred[i], &m_cReshaper[i], &m_cTrQuant[i] );
  }

  // SAO offset scale: number of bits the bit depth exceeds MAX_SAO_TRUNCATED_BITDEPTH by (never negative)
  const uint32_t  log2SaoOffsetScaleLuma   = (uint32_t) std::max(0, sps->getBitDepth(CHANNEL_TYPE_LUMA  ) - MAX_SAO_TRUNCATED_BITDEPTH);
  const uint32_t  log2SaoOffsetScaleChroma = (uint32_t) std::max(0, sps->getBitDepth(CHANNEL_TYPE_CHROMA) - MAX_SAO_TRUNCATED_BITDEPTH);
  const int maxDepth = getLog2(sps->getMaxCUWidth()) - pps->pcv->minCUWidthLog2;
  m_cSAO.create( pps->getPicWidthInLumaSamples(),
                 pps->getPicHeightInLumaSamples(),
                 sps->getChromaFormatIdc(),
                 sps->getMaxCUWidth(),
                 sps->getMaxCUHeight(),
                 maxDepth,
                 log2SaoOffsetScaleLuma,
                 log2SaoOffsetScaleChroma
               );

  if( sps->getUseALF() )
  {
    m_cALF.create( cs.picHeader, sps, pps, m_numDecThreads );
  }

  const int widthInCtus = cs.pcv->widthInCtus;
  const int heightInCtus = cs.pcv->heightInCtus;

  if( sps->getIBCFlag() )
  {
    cs.initVIbcBuf( heightInCtus, sps->getChromaFormatIdc(), sps->getMaxCUHeight() );
  }
  pcPic->startProcessingTimer();

  // the scheduling phase is only traced when the pool has worker threads (matching ITT_TASKEND at the end)
  if( m_decodeThreadPool->numThreads() > 0 )
  {
    ITT_TASKSTART( itt_domain_dec, itt_handle_schedTasks );
  }

  // picBarriers: barriers every CTU task of this picture has to wait for.
  // picExtBarriers: border-extension barriers of the reference pictures. With ALLOW_MIDER_LF_DURING_PICEXT
  // they are collected separately and handed to the picture (refPicExtDepBarriers, checked in the INTER
  // stage of ctuTask()); otherwise they are simply part of picBarriers.
  picBarriers.clear();
#if ALLOW_MIDER_LF_DURING_PICEXT
  CBarrierVec  picExtBarriers;
#else
  CBarrierVec &picExtBarriers = picBarriers;
#endif

  const int numSubPic = cs.pps->getNumSubPics();
  if( numSubPic > 1 )
  {
    // createSubPicRefBufs() hands out pointers to elements of m_subPicExtTasks,
    // so the capacity is reserved up front to avoid reallocation
    m_subPicExtTasks.clear();
    m_subPicExtTasks.reserve( pcPic->slices.size() * MAX_NUM_REF_PICS * numSubPic );
  }

  // collect every distinct reference picture used by the non-intra slices of this picture
  std::vector<Picture*> borderExtRefPics;
  for( const Slice* slice : pcPic->slices )
  {
    if( slice->isIntra() )
    {
      continue;
    }

    for( int iDir = REF_PIC_LIST_0; iDir < NUM_REF_PIC_LIST_01; ++iDir )
    {
      for( int iRefIdx = 0; iRefIdx < slice->getNumRefIdx( ( RefPicList ) iDir ); iRefIdx++ )
      {
        Picture* refPic = slice->getNoConstRefPic( ( RefPicList ) iDir, iRefIdx );

        if( std::find( borderExtRefPics.cbegin(), borderExtRefPics.cend(), refPic ) == borderExtRefPics.cend() )
        {
          borderExtRefPics.push_back( refPic );
        }
      }
    }
  }

  // start the border extension of the reference pictures where necessary and
  // remember the barriers this picture has to wait for
  for( Picture* refPic : borderExtRefPics )
  {
    if( !refPic->borderExtStarted )
    {
      // TODO: (GH) Can we bypass this border extension, when all subpics (>1) are treated as pics?
      borderExtPic( refPic );
    }

    if( !refPic->subPicExtStarted && numSubPic > 1 && refPic->m_subPicRefBufs.size() != numSubPic )
    {
      CHECK( !refPic->m_subPicRefBufs.empty(), "Wrong number of subpics already present in reference picture" );
      CHECK( cs.sps->getUseWrapAround(), "Wraparound + subpics not implemented" );

      createSubPicRefBufs( refPic );
    }

    // only barriers that are still blocked are added, and each of them only once
    if( refPic->m_borderExtTaskCounter.isBlocked() &&
        std::find( picExtBarriers.cbegin(), picExtBarriers.cend(), refPic->m_borderExtTaskCounter.donePtr() ) == picExtBarriers.cend() )
    {
      picExtBarriers.push_back( refPic->m_borderExtTaskCounter.donePtr() );
    }

    if( refPic->m_dmvrTaskCounter.isBlocked() &&
        std::find( picBarriers.cbegin(), picBarriers.cend(), refPic->m_dmvrTaskCounter.donePtr() ) == picBarriers.cend() )
    {
      picBarriers.push_back( refPic->m_dmvrTaskCounter.donePtr() );
    }
  }

  // without worker threads, process the pending tasks right here if any collected barrier is still blocked
  if( m_decodeThreadPool->numThreads() == 0 && (
       std::any_of( picExtBarriers.cbegin(), picExtBarriers.cend(), []( const Barrier* b ) { return b->isBlocked(); } ) ||
       std::any_of( picBarriers   .cbegin(), picBarriers   .cend(), []( const Barrier* b ) { return b->isBlocked(); } ) ) )
  {
    m_decodeThreadPool->processTasksOnMainThread();
  }

  const bool isIntra = std::all_of( pcPic->slices.begin(), pcPic->slices.end(), []( const Slice* pcSlice ) { return pcSlice->isIntra(); } );

  // Number of CTU columns handled by one task: the line width divided by the number of threads (twice the
  // number of threads for all-intra pictures), plus one for non-intra pictures, clamped to [1, widthInCtus].
  const int numColPerTask = std::max( std::min( widthInCtus, ( widthInCtus / std::max( m_numDecThreads * ( isIntra ? 2 : 1 ), 1 ) ) + ( isIntra ? 0 : 1 ) ), 1 );
  // widthInCtus / numColPerTask, rounded up
  const int numTasksPerLine = widthInCtus / numColPerTask + !!( widthInCtus % numColPerTask );

#if ALLOW_MIDER_LF_DURING_PICEXT
  pcPic->refPicExtDepBarriers = std::move( picExtBarriers );
#endif
#if !RECO_WHILE_PARSE
  // without reconstruction during parsing, all CTU tasks wait for the parseDone barrier of cs.slice
  picBarriers.push_back( &cs.slice->parseDone );

#endif
  const TaskType ctuStartState = MIDER;
  const bool     doALF         = cs.sps->getUseALF() && !AdaptiveLoopFilter::getAlfSkipPic( cs );
  commonTaskParam.reset( cs, ctuStartState, numTasksPerLine, doALF );

  // line and col are filled in below, right before the respective task is scheduled
  tasksDMVR = std::vector<LineTaskParam>( heightInCtus, LineTaskParam{ commonTaskParam, -1 } );
  tasksCtu  = std::vector<CtuTaskParam >( heightInCtus * numTasksPerLine, CtuTaskParam{ commonTaskParam, -1, -1, {} } );

  // unlocked by the "done" task scheduled below
  pcPic->done.lock();

#if 0
  // schedule in raster scan order
  for( int line = 0; line < heightInCtus; ++line )
  {
    for( int col = 0; col < widthInCtus;  ++col )
    {
#else
  // schedule in zig-zag scan order
  // i enumerates the anti-diagonals (col + line == i); positions outside the task grid are skipped below
  for( int i = 0; i < numTasksPerLine + heightInCtus; ++i )
  {
    int line = 0;
    for( int col = i; col >= 0; --col, ++line )
    {
#endif
      if( line < heightInCtus && col < numTasksPerLine )
      {
        CBarrierVec ctuBarriesrs = picBarriers;

#if RECO_WHILE_PARSE
        // additionally wait until every CTU covered by this task has been parsed
        const int ctuStart = col * numColPerTask;
        const int ctuEnd   = std::min( ctuStart + numColPerTask, widthInCtus );
        for( int ctu = ctuStart; ctu < ctuEnd; ctu++ )
        {
          ctuBarriesrs.push_back( &pcPic->ctuParsedBarrier[line * widthInCtus + ctu] );
        }
#endif
        CtuTaskParam* param = &tasksCtu[line * numTasksPerLine + col];
        param->line         = line;
        param->col          = col;
        param->numColPerTask   = numColPerTask;
        param->numTasksPerLine = numTasksPerLine;

        // ctuTask<false> does the work, ctuTask<true> only reports whether the next stage could run
        m_decodeThreadPool->addBarrierTask<CtuTaskParam>( ctuTask<false>,
                                                          param,
                                                          &pcPic->m_ctuTaskCounter,
                                                          nullptr,
                                                          std::move( ctuBarriesrs ),
                                                          ctuTask<true> );
      }
    }
  }

  // The ALF stage of ctuTask() does not start while alfPrepared is blocked, so the picture-level
  // preparation can be done here after the CTU tasks have been scheduled.
  if( commonTaskParam.doALF )
  {
    AdaptiveLoopFilter::preparePic( cs );
    commonTaskParam.alfPrepared.unlock();
  }

  {
    // final task of the picture: runs once all CTU tasks are finished and unlocks pcPic->done
    static auto doneTask = []( int, Picture* picture )
    {
      CodingStructure& cs = *picture->cs;
      if( cs.sps->getUseALF() && !AdaptiveLoopFilter::getAlfSkipPic( cs ) )
      {
        AdaptiveLoopFilter::swapBufs( cs );
      }

      picture->reconstructed = true;
#ifdef TRACE_ENABLE_ITT
      // mark end of frame
      __itt_frame_end_v3( picture->m_itt_decLibInst, nullptr );
#endif
      picture->stopProcessingTimer();

      return true;
    };
    m_decodeThreadPool->addBarrierTask<Picture>( doneTask, pcPic, nullptr, &pcPic->done, { pcPic->m_ctuTaskCounter.donePtr() } );
  }

  if( pcPic->referenced )
  {
    // one DMVR motion info task per CTU line, covering all CTUs of that line
    static auto task = []( int tid, LineTaskParam* param )
    {
      ITT_TASKSTART( itt_domain_dec, itt_handle_dmvr );
      auto& cs = *param->common.cs;
      for( int col = 0; col < cs.pcv->widthInCtus; col++ )
      {
        param->common.decLib.m_cCuDecoder[tid].TaskDeriveDMVRMotionInfo( cs, getCtuArea( cs, col, param->line, true ) );
      }
      ITT_TASKEND( itt_domain_dec, itt_handle_dmvr );
      return true;
    };

    for( int taskLineDMVR = 0; taskLineDMVR < heightInCtus; taskLineDMVR++ )
    {
      auto param  = &tasksDMVR[taskLineDMVR];
      param->line = taskLineDMVR;
      // a line waits for its dmvrTrigger (unlocked in the PRESAO stage of ctuTask()) and for pcPic->parseDone
      m_decodeThreadPool->addBarrierTask<LineTaskParam>( task,
                                                         param,
                                                         &pcPic->m_dmvrTaskCounter,
                                                         nullptr,
                                                         { &commonTaskParam.dmvrTriggers[taskLineDMVR], &pcPic->parseDone } );
    }

    {
      // dummy task to propagate exceptions from the ctu-decoding tasks to the dmvrTaskCounter
      static auto dummyTask = []( int, void* ) { return true; };
      m_decodeThreadPool->addBarrierTask<void>( dummyTask, nullptr, &pcPic->m_dmvrTaskCounter, nullptr, { pcPic->m_ctuTaskCounter.donePtr() } );
    }
  }

  if( m_decodeThreadPool->numThreads() == 0 )
  {
    // nothing to do: the matching ITT_TASKSTART above is only issued when the pool has worker threads
  }
  else
  {
    ITT_TASKEND( itt_domain_dec, itt_handle_schedTasks );
  }

  // remembered for waitForPrevDecompressedPic()
  m_currDecompPic = pcPic;
}
577 
578 Picture* DecLibRecon::waitForPrevDecompressedPic()
579 {
580   if( !m_currDecompPic )
581     return nullptr;
582 
583   ITT_TASKSTART( itt_domain_dec, itt_handle_waitTasks );
584   if( m_decodeThreadPool->numThreads() == 0 )
585   {
586     m_decodeThreadPool->processTasksOnMainThread();
587     CHECK( m_currDecompPic->m_dmvrTaskCounter.isBlocked() || m_currDecompPic->done.isBlocked(), "can't make progress. some dependecy has not been finished" );
588   }
589   m_currDecompPic->m_dmvrTaskCounter.wait();
590   m_currDecompPic->done.wait();
591   ITT_TASKEND( itt_domain_dec, itt_handle_waitTasks );
592 
593   m_currDecompPic->inProgress = false;
594   return std::exchange( m_currDecompPic, nullptr );
595 }
596 
/** Task function reconstructing a run of consecutive CTUs of one CTU line (one "CTU task").

    Each task owns a state (an entry of CommonTaskParam::ctuStates) that walks through the stages
    MIDER, LF_INIT, INTER, INTRA, RSP, LF_V, LF_H, PRESAO, SAO and ALF. The switch below enters at the
    current stage and falls through from stage to stage; after a stage has been executed the state is
    advanced to the next one. A stage whose preconditions (mostly the states of neighbouring tasks) are
    not yet fulfilled makes the function return false, leaving the state at that stage.

    \tparam onlyCheckReadyState  if true no work is done: the function only reports whether the
                                 preconditions of the current stage are fulfilled
    \param  tid    index selecting the per-thread tools (m_cCuDecoder, m_cReshaper) and passed to ALF
    \param  param  position of the task (line, col), its size in CTUs and the shared picture parameters
    \return false if the current stage has to wait for other tasks, true otherwise
            (presumably a task returning false is run again later by the thread pool - confirm in ThreadPool)

    Exceptions thrown inside are forwarded to all dmvrTriggers barriers and then rethrown.
*/
template<bool onlyCheckReadyState>
bool DecLibRecon::ctuTask( int tid, CtuTaskParam* param )
{
  const int       col          = param->col;
  const int       line         = param->line;

  auto&           cs           = *param->common.cs;
  auto&           decLib       = param->common.decLib;
  // NOTE: despite its name this is the number of tasks per line, i.e. col and widthInCtus count tasks, not CTUs
  const int       widthInCtus  = param->numTasksPerLine;
  const int       heightInCtus = cs.pcv->heightInCtus;

  // states of this task and of the task lines around it;
  // lineAbove / lineBelow are only dereferenced after checking line > 0 / line + 1 < heightInCtus
  CtuState&       thisCtuState =  param->common.ctuStates[line * widthInCtus + col];
  const CtuState* thisLine     = &param->common.ctuStates[line * widthInCtus];
  const CtuState* lineAbove    = thisLine - widthInCtus;
  const CtuState* lineBelow    = thisLine + widthInCtus;

  // range of CTU columns [ctuStart, ctuEnd) covered by this task; the last task of a line may be shorter
  const int       ctuStart     = col * param->numColPerTask;
  const int       ctuEnd       = std::min<int>( ctuStart + param->numColPerTask, cs.pcv->widthInCtus );

  try
  {
    // an exception already recorded in the picture's CTU task counter is rethrown here instead of doing any work
    if( cs.picture->m_ctuTaskCounter.hasException() )
    {
      std::rethrow_exception( cs.picture->m_ctuTaskCounter.getException() );
    }

    switch( thisCtuState.load() )
    {
      // all case statements fall through to continue with next task, unless they return false due to unsatisfied preconditions

    case MIDER:
    {
      // the task to the left and the task above-right (above for the last column) must have finished MIDER
      if( col > 0 && thisLine[col - 1] <= MIDER )
        return false;
      if( line > 0 && lineAbove[std::min( col + 1, widthInCtus - 1 )] <= MIDER )
        return false;
      if( onlyCheckReadyState )
        return true;

      ITT_TASKSTART( itt_domain_dec, itt_handle_mider );

      for( int ctu = ctuStart; ctu < ctuEnd; ctu++ )
      {
        CtuData& ctuData = cs.getCtuData( ctu, line );
        GCC_WARNING_DISABLE_class_memaccess
        memset( ctuData.motion, 0, sizeof( CtuData::motion ) );
        GCC_WARNING_RESET

        // motion info is only derived for CTUs of non-intra slices or when IBC is enabled
        if( !ctuData.slice->isIntra() || cs.sps->getIBCFlag() )
        {
          const UnitArea ctuArea = getCtuArea( cs, ctu, line, true );
          decLib.m_cCuDecoder[tid].TaskDeriveCtuMotionInfo( cs, ctuArea, param->common.perLineMiHist[line] );
        }
      }
      thisCtuState = ( TaskType )( MIDER + 1 );

      ITT_TASKEND( itt_domain_dec, itt_handle_mider );
    }

    case LF_INIT:
    {
      // no dependencies on other tasks: clear the loop filter parameters and calculate the filter strengths
      if( onlyCheckReadyState )
        return true;

      ITT_TASKSTART( itt_domain_dec, itt_handle_lfcl );

      for( int ctu = ctuStart; ctu < ctuEnd; ctu++ )
      {
        CtuData& ctuData = cs.getCtuData( ctu, line );
        memset( ctuData.lfParam, 0, sizeof( CtuData::lfParam ) );

        const UnitArea  ctuArea  = getCtuArea( cs, ctu, line, true );
        decLib.m_cLoopFilter.calcFilterStrengthsCTU( cs, ctuArea );
      }

      thisCtuState = ( TaskType )( LF_INIT + 1 );

      ITT_TASKEND( itt_domain_dec, itt_handle_lfcl );
    }

    case INTER:
    {
      // pictures consisting only of intra slices: additionally wait for the task two to the left and the task above
      if( std::all_of( cs.picture->slices.begin(), cs.picture->slices.end(), []( const Slice* pcSlice ) { return pcSlice->isIntra(); } ) )
      {
        // not really necessary, but only for optimizing the wave-fronts
        if( col > 1 && thisLine[col - 2] <= INTER )
          return false;
        if( line > 0 && lineAbove[col] <= INTER )
          return false;
      }

      // wait until none of the barriers in refPicExtDepBarriers is blocked any more
      if( std::any_of( cs.picture->refPicExtDepBarriers.cbegin(), cs.picture->refPicExtDepBarriers.cend(), []( const Barrier* b ) { return b->isBlocked(); } ) )
      {
        return false;
      }

      if( onlyCheckReadyState )
        return true;

      ITT_TASKSTART( itt_domain_dec, itt_handle_inter );

      for( int ctu = ctuStart; ctu < ctuEnd; ctu++ )
      {
        const CtuData& ctuData = cs.getCtuData( ctu, line );
        const UnitArea ctuArea = getCtuArea( cs, ctu, line, true );

        // TaskTrafoCtu runs for every CTU, TaskInterCtu only for CTUs of non-intra slices
        decLib.m_cCuDecoder[tid].TaskTrafoCtu( cs, ctuArea );

        if( !ctuData.slice->isIntra() )
        {
          decLib.m_cCuDecoder[tid].TaskInterCtu( cs, ctuArea );
        }
      }

      thisCtuState = ( TaskType )( INTER + 1 );

      ITT_TASKEND( itt_domain_dec, itt_handle_inter );
    }

    case INTRA:
    {
      // the task to the left and the task above-right (above for the last column) must have finished INTRA
      if( col > 0 && thisLine[col - 1] <= INTRA )
        return false;
      if( line > 0 && lineAbove[std::min( col + 1, widthInCtus - 1 )] <= INTRA )
        return false;
      if( onlyCheckReadyState )
        return true;

      ITT_TASKSTART( itt_domain_dec, itt_handle_intra );

      for( int ctu = ctuStart; ctu < ctuEnd; ctu++ )
      {
        const UnitArea  ctuArea = getCtuArea( cs, ctu, line, true );
        decLib.m_cCuDecoder[tid].TaskCriticalIntraKernel( cs, ctuArea );
      }

      thisCtuState = ( TaskType )( INTRA + 1 );

      ITT_TASKEND( itt_domain_dec, itt_handle_intra );
    }

    case RSP:
    {
      // RIRZIIIII
      // IIIIIXXXX
      //
      // - Z can be reshaped when it is no more an intra prediction source for X in the next line

      // Only the first applicable check is evaluated: the task below-right, else the task below,
      // else (in the last line) the task to the right must have reached RSP, i.e. finished INTRA.
      if     ( line + 1 < heightInCtus && col + 1 < widthInCtus && lineBelow[col + 1] < RSP )
        return false;
      else if( line + 1 < heightInCtus &&                          lineBelow[col]     < RSP )
        return false;
      else if(                            col + 1 < widthInCtus && thisLine [col + 1] < RSP ) // need this for the last line
        return false;

      if( onlyCheckReadyState )
        return true;

      ITT_TASKSTART( itt_domain_dec, itt_handle_rsp );

      for( int ctu = ctuStart; ctu < ctuEnd; ctu++ )
      {
        decLib.m_cReshaper[tid].rspCtu( cs, ctu, line, 0 );
      }

      ITT_TASKEND( itt_domain_dec, itt_handle_rsp );

      thisCtuState = ( TaskType )( RSP + 1 );
    }

    case LF_V:
    {
      // loop filter, EDGE_VER pass: the task to the left must have reached LF_V
      if( col > 0 && thisLine[col - 1] < LF_V )
        return false;
      if( onlyCheckReadyState )
        return true;

      ITT_TASKSTART( itt_domain_dec, itt_handle_lfl );

      for( int ctu = ctuStart; ctu < ctuEnd; ctu++ )
      {
        decLib.m_cLoopFilter.loopFilterCTU( cs, MAX_NUM_CHANNEL_TYPE, ctu, line, 0, EDGE_VER );
      }

      thisCtuState = ( TaskType )( LF_V + 1 );

      ITT_TASKEND( itt_domain_dec, itt_handle_lfl );
    }

    case LF_H:
    {
      // loop filter, EDGE_HOR pass: the tasks above, above-right and to the right must have finished LF_V
      if( line > 0 && lineAbove[col] < LF_H )
        return false;

      if( line > 0 && col + 1 < widthInCtus && lineAbove[col + 1] < LF_H )
        return false;

      if( col + 1 < widthInCtus && thisLine[col + 1] < LF_H )
        return false;

      if( onlyCheckReadyState )
        return true;

      ITT_TASKSTART( itt_domain_dec, itt_handle_lfl );

      for( int ctu = ctuStart; ctu < ctuEnd; ctu++ )
      {
        decLib.m_cLoopFilter.loopFilterCTU( cs, MAX_NUM_CHANNEL_TYPE, ctu, line, 0, EDGE_HOR );
      }

      thisCtuState = ( TaskType )( LF_H + 1 );

      ITT_TASKEND( itt_domain_dec, itt_handle_lfl );
    }

    case PRESAO:
    {
      // only the last task of the line processes the full line
      if( col == widthInCtus - 1 )
      {
        // the last task of the line above must have finished PRESAO
        if( line > 0 && lineAbove[col] <= PRESAO )
          return false;

        // every task of this line and of the line below must have reached PRESAO, i.e. finished LF_H
        for( int c = 0; c < widthInCtus; ++c )
        {
          if( thisLine[c] < PRESAO )
            return false;

          if( line + 1 < heightInCtus && lineBelow[c] < PRESAO )
            return false;
        }
        if( onlyCheckReadyState )
          return true;

        ITT_TASKSTART( itt_domain_dec, itt_handle_presao );

        if( cs.sps->getUseSAO() )
        {
          decLib.m_cSAO.SAOPrepareCTULine( cs, getLineArea( cs, line, true ) );
        }
        // releases the barrier the DMVR task of this line waits for
        param->common.dmvrTriggers[line].unlock();

        ITT_TASKEND( itt_domain_dec, itt_handle_presao );
      }
      else if( thisLine[widthInCtus - 1] <= PRESAO )   // wait for last CTU to finish PRESAO
      {
        return false;
      }
      if( onlyCheckReadyState )
        return true;

      thisCtuState = ( TaskType )( PRESAO + 1 );
    }

    case SAO:
    {
      if( onlyCheckReadyState )
        return true;

      // no dependencies on other tasks: SAO (if enabled in the SPS) for the CTUs of this task ...
      if( cs.sps->getUseSAO() )
      {
        ITT_TASKSTART( itt_domain_dec, itt_handle_sao );

        for( int ctu = ctuStart; ctu < ctuEnd; ctu++ )
        {
          const UnitArea  ctuArea = getCtuArea( cs, ctu, line, true );
          decLib.m_cSAO.SAOProcessCTU( cs, ctuArea );
        }

        ITT_TASKEND( itt_domain_dec, itt_handle_sao );
      }
      // ... followed by the per-CTU preparation for ALF
      if( param->common.doALF )
      {
        ITT_TASKSTART( itt_domain_dec, itt_handle_alf );

        for( int ctu = ctuStart; ctu < ctuEnd; ctu++ )
        {
          AdaptiveLoopFilter::prepareCTU( cs, ctu, line );
        }

        ITT_TASKEND( itt_domain_dec, itt_handle_alf );
      }

      thisCtuState = ( TaskType )( SAO + 1 );
    }

    case ALF:
    {
      if( param->common.doALF )
      {
        // which neighbours exist: a = line above, b = line below, c = column to the left, d = column to the right
        const bool a = line > 0;
        const bool b = line + 1 < heightInCtus;
        const bool c = col > 0;
        const bool d = col + 1 < widthInCtus;

        // the picture-level ALF preparation must be finished
        if( param->common.alfPrepared.isBlocked() )
          return false;

        // all existing neighbours (including the diagonal ones) must have reached ALF, i.e. finished SAO
        if( a )
        {
          if( c && lineAbove[col - 1] < ALF ) return false;
          if(      lineAbove[col    ] < ALF ) return false;
          if( d && lineAbove[col + 1] < ALF ) return false;
        }

        if( b )
        {
          if( c && lineBelow[col - 1] < ALF ) return false;
          if(      lineBelow[col    ] < ALF ) return false;
          if( d && lineBelow[col + 1] < ALF ) return false;
        }

        if( c && thisLine[col - 1] < ALF ) return false;
        if( d && thisLine[col + 1] < ALF ) return false;

        if( onlyCheckReadyState )
          return true;

        ITT_TASKSTART( itt_domain_dec, itt_handle_alf );
        for( int ctu = ctuStart; ctu < ctuEnd; ctu++ )
        {
          decLib.m_cALF.processCTU( cs, ctu, line, tid );
        }
        ITT_TASKEND( itt_domain_dec, itt_handle_alf );
      }
      else if( onlyCheckReadyState )
        return true;

      thisCtuState = ( TaskType )( ALF + 1 );
    }

    default:
      // reached after the ALF stage or when the task had no stage left to execute
      CHECKD( thisCtuState != DONE, "Wrong CTU state" );
    }   // end switch
  }
  catch( ... )
  {
    // hand the exception to all DMVR trigger barriers before passing it on to the caller
    for( auto& t: param->common.dmvrTriggers )
    {
      t.setException( std::current_exception() );
    }
    std::rethrow_exception( std::current_exception() );
  }

  return true;
}
945 
946 }
947