1 /*!
2  * \copy
3  *     Copyright (c)  2010-2013, Cisco Systems
4  *     All rights reserved.
5  *
6  *     Redistribution and use in source and binary forms, with or without
7  *     modification, are permitted provided that the following conditions
8  *     are met:
9  *
10  *        * Redistributions of source code must retain the above copyright
11  *          notice, this list of conditions and the following disclaimer.
12  *
13  *        * Redistributions in binary form must reproduce the above copyright
14  *          notice, this list of conditions and the following disclaimer in
15  *          the documentation and/or other materials provided with the
16  *          distribution.
17  *
18  *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21  *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22  *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23  *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24  *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25  *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26  *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28  *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  *     POSSIBILITY OF SUCH DAMAGE.
30  *
31  *
32  * \file    slice_multi_threading.h
33  *
34  * \brief   pSlice based multiple threading
35  *
36  * \date    04/16/2010 Created
37  *
38  *************************************************************************************
39  */
40 
41 
42 #include <assert.h>
43 #if !defined(_WIN32)
44 #include <semaphore.h>
45 #include <unistd.h>
46 #endif//!_WIN32
47 #ifndef SEM_NAME_MAX
48 // length of semaphore name should be system constrained at least on mac 10.7
49 #define  SEM_NAME_MAX 32
50 #endif//SEM_NAME_MAX
51 #include "slice_multi_threading.h"
52 #include "mt_defs.h"
53 #include "nal_encap.h"
54 #include "utils.h"
55 #include "encoder.h"
56 #include "svc_encode_slice.h"
57 #include "deblocking.h"
58 #include "svc_enc_golomb.h"
59 #include "crt_util_safe_x.h" // for safe crt like calls
60 #include "rc.h"
61 
62 #include "cpu.h"
63 
64 #include "measure_time.h"
65 #include "wels_task_management.h"
66 
// Trace helper for the multi-threading code paths.
// With ENABLE_TRACE_MT defined every call is forwarded to WelsLog(); otherwise the
// call expands to nothing, so its arguments are not evaluated at all.
// Both variants share one parameter list: (log context, log level, format, args...).
#if defined(ENABLE_TRACE_MT)
#define MT_TRACE_LOG(pLog, x, ...) WelsLog(pLog, x, __VA_ARGS__)
#else
#define MT_TRACE_LOG(pLog, x, ...)
#endif
72 
73 namespace WelsEnc {
UpdateMbListNeighborParallel(SDqLayer * pCurDq,SMB * pMbList,const int32_t uiSliceIdc)74 void UpdateMbListNeighborParallel (SDqLayer* pCurDq,
75                                    SMB* pMbList,
76                                    const int32_t uiSliceIdc) {
77   SSliceCtx* pSliceCtx           = &pCurDq->sSliceEncCtx;
78   const int32_t kiMbWidth        = pSliceCtx->iMbWidth;
79   int32_t iIdx                   = pCurDq->pFirstMbIdxOfSlice[uiSliceIdc];
80   const int32_t kiEndMbInSlice   = iIdx + pCurDq->pCountMbNumInSlice[uiSliceIdc] - 1;
81 
82   do {
83     UpdateMbNeighbor (pCurDq, &pMbList[iIdx], kiMbWidth, uiSliceIdc);
84     ++ iIdx;
85   } while (iIdx <= kiEndMbInSlice);
86 }
87 
CalcSliceComplexRatio(SDqLayer * pCurDq)88 void CalcSliceComplexRatio (SDqLayer* pCurDq) {
89   SSliceCtx* pSliceCtx          = &pCurDq->sSliceEncCtx;
90   SSlice** ppSliceInLayer       = pCurDq->ppSliceInLayer;
91   int32_t iSumAv                = 0;
92   const int32_t kiSliceCount    = pSliceCtx->iSliceNumInFrame;
93   int32_t iSliceIdx             = 0;
94   int32_t iAvI[MAX_SLICES_NUM];
95 
96   assert (kiSliceCount <= MAX_SLICES_NUM);
97   WelsEmms();
98 
99   while (iSliceIdx < kiSliceCount) {
100     iAvI[iSliceIdx] = WELS_DIV_ROUND (INT_MULTIPLY * ppSliceInLayer[iSliceIdx]->iCountMbNumInSlice,
101                                       ppSliceInLayer[iSliceIdx]->uiSliceConsumeTime);
102     MT_TRACE_LOG (NULL, WELS_LOG_DEBUG, "[MT] CalcSliceComplexRatio(), uiSliceConsumeTime[%d]= %d us, slice_run= %d",
103                   iSliceIdx,
104                   ppSliceInLayer[iSliceIdx]->uiSliceConsumeTime, ppSliceInLayer[iSliceIdx]->iCountMbNumInSlice);
105     iSumAv += iAvI[iSliceIdx];
106 
107     ++ iSliceIdx;
108   }
109   while (-- iSliceIdx >= 0) {
110     ppSliceInLayer[iSliceIdx]->iSliceComplexRatio = WELS_DIV_ROUND (INT_MULTIPLY * iAvI[iSliceIdx], iSumAv);
111   }
112 }
113 
NeedDynamicAdjust(SSlice ** ppSliceInLayer,const int32_t iSliceNum)114 int32_t NeedDynamicAdjust (SSlice** ppSliceInLayer, const int32_t iSliceNum) {
115   if (NULL == ppSliceInLayer) {
116     return false;
117   }
118 
119   uint32_t uiTotalConsume       = 0;
120   int32_t iSliceIdx             = 0;
121   int32_t iNeedAdj              = false;
122 
123   WelsEmms();
124 
125   while (iSliceIdx < iSliceNum) {
126     if (NULL == ppSliceInLayer[iSliceIdx]) {
127       return false;
128     }
129 
130     uiTotalConsume += ppSliceInLayer[iSliceIdx]->uiSliceConsumeTime;
131     iSliceIdx ++;
132   }
133   if (uiTotalConsume == 0) {
134     MT_TRACE_LOG (NULL, WELS_LOG_DEBUG,
135                   "[MT] NeedDynamicAdjust(), herein do no adjust due first picture, iCountSliceNum= %d",
136                   iSliceNum);
137     return false;
138   }
139 
140   iSliceIdx = 0;
141   float fThr                    = EPSN; // threshold for various cores cases
142   float fRmse                   = .0f;  // root mean square error of pSlice consume ratios
143   const float kfMeanRatio       = 1.0f / iSliceNum;
144   do {
145     const float fRatio = 1.0f * ppSliceInLayer[iSliceIdx]->uiSliceConsumeTime / uiTotalConsume;
146     const float fDiffRatio = fRatio - kfMeanRatio;
147     fRmse += (fDiffRatio * fDiffRatio);
148     ++ iSliceIdx;
149   } while (iSliceIdx + 1 < iSliceNum);
150   fRmse = sqrtf (fRmse / iSliceNum);
151   if (iSliceNum >= 8) {
152     fThr += THRESHOLD_RMSE_CORE8;
153   } else if (iSliceNum >= 4) {
154     fThr += THRESHOLD_RMSE_CORE4;
155   } else if (iSliceNum >= 2) {
156     fThr += THRESHOLD_RMSE_CORE2;
157   } else
158     fThr = 1.0f;
159   if (fRmse > fThr)
160     iNeedAdj = true;
161   MT_TRACE_LOG (NULL, WELS_LOG_DEBUG,
162                 "[MT] NeedDynamicAdjust(), herein adjustment decision is made (iNeedAdj= %d) by: fRmse of pSlice complexity ratios %.6f, the corresponding threshold %.6f, iCountSliceNum %d",
163                 iNeedAdj, fRmse, fThr, iSliceNum);
164 
165   return iNeedAdj;
166 }
167 
DynamicAdjustSlicing(sWelsEncCtx * pCtx,SDqLayer * pCurDqLayer,int32_t iCurDid)168 void DynamicAdjustSlicing (sWelsEncCtx* pCtx,
169                            SDqLayer* pCurDqLayer,
170                            int32_t iCurDid) {
171   SSliceCtx* pSliceCtx          = &pCurDqLayer->sSliceEncCtx;
172   SSlice** ppSliceInLayer       = pCurDqLayer->ppSliceInLayer;
173   const int32_t kiCountSliceNum = pSliceCtx->iSliceNumInFrame;
174   const int32_t kiCountNumMb    = pSliceCtx->iMbNumInFrame;
175   int32_t iMinimalMbNum         =
176     pSliceCtx->iMbWidth;  // in theory we need only 1 SMB, here let it as one SMB row required
177   int32_t iMaximalMbNum         = 0;    // dynamically assign later
178   int32_t iMbNumLeft            = kiCountNumMb;
179   int32_t iRunLen[MAX_THREADS_NUM] = {0};
180   int32_t iSliceIdx             = 0;
181 
182   int32_t iNumMbInEachGom = 0;
183   SWelsSvcRc* pWelsSvcRc = &pCtx->pWelsSvcRc[iCurDid];
184   if (pCtx->pSvcParam->iRCMode != RC_OFF_MODE) {
185     iNumMbInEachGom = pWelsSvcRc->iNumberMbGom;
186 
187     if (iNumMbInEachGom <= 0) {
188       WelsLog (& (pCtx->sLogCtx), WELS_LOG_ERROR,
189                "[MT] DynamicAdjustSlicing(), invalid iNumMbInEachGom= %d from RC, iDid= %d, iCountNumMb= %d", iNumMbInEachGom,
190                iCurDid, kiCountNumMb);
191       return;
192     }
193 
194     // do not adjust in case no extra iNumMbInEachGom based left for slicing adjustment,
195     // extra MB of non integrated GOM assigned at the last pSlice in default, keep up on early initial result.
196     if (iNumMbInEachGom * kiCountSliceNum >= kiCountNumMb) {
197       return;
198     }
199     iMinimalMbNum = iNumMbInEachGom;
200   }
201 
202   if (kiCountSliceNum < 2 || (kiCountSliceNum & 0x01)) // we need suppose uiSliceNum is even for multiple threading
203     return;
204 
205   iMaximalMbNum = kiCountNumMb - (kiCountSliceNum - 1) * iMinimalMbNum;
206 
207   WelsEmms();
208 
209   MT_TRACE_LOG (& (pCtx->sLogCtx), WELS_LOG_DEBUG, "[MT] DynamicAdjustSlicing(), iDid= %d, iCountNumMb= %d", iCurDid,
210                 kiCountNumMb);
211 
212   iSliceIdx = 0;
213   while (iSliceIdx + 1 < kiCountSliceNum) {
214     int32_t iNumMbAssigning = WELS_DIV_ROUND (kiCountNumMb * ppSliceInLayer[iSliceIdx]->iSliceComplexRatio, INT_MULTIPLY);
215 
216     // GOM boundary aligned
217     if (pCtx->pSvcParam->iRCMode != RC_OFF_MODE) {
218       iNumMbAssigning = iNumMbAssigning / iNumMbInEachGom * iNumMbInEachGom;
219     }
220 
221     // make sure one GOM at least in each pSlice for safe
222     if (iNumMbAssigning < iMinimalMbNum)
223       iNumMbAssigning = iMinimalMbNum;
224     else if (iNumMbAssigning > iMaximalMbNum)
225       iNumMbAssigning = iMaximalMbNum;
226 
227     assert (iNumMbAssigning > 0);
228 
229     iMbNumLeft -= iNumMbAssigning;
230     if (iMbNumLeft <= 0) { // error due to we can not support slice_skip now yet, do not adjust this time
231       assert (0);
232       return;
233     }
234     iRunLen[iSliceIdx] = iNumMbAssigning;
235     MT_TRACE_LOG (& (pCtx->sLogCtx), WELS_LOG_DEBUG,
236                   "[MT] DynamicAdjustSlicing(), iSliceIdx= %d, iSliceComplexRatio= %.2f, slice_run_org= %d, slice_run_adj= %d",
237                   iSliceIdx, ppSliceInLayer[iSliceIdx]->iSliceComplexRatio * 1.0f / INT_MULTIPLY,
238                   ppSliceInLayer[iSliceIdx]->iCountMbNumInSlice,
239                   iNumMbAssigning);
240     ++ iSliceIdx;
241     iMaximalMbNum = iMbNumLeft - (kiCountSliceNum - iSliceIdx - 1) * iMinimalMbNum; // get maximal num_mb in left parts
242   }
243   iRunLen[iSliceIdx] = iMbNumLeft;
244   MT_TRACE_LOG (& (pCtx->sLogCtx), WELS_LOG_DEBUG,
245                 "[MT] DynamicAdjustSlicing(), iSliceIdx= %d, pSliceComplexRatio= %.2f, slice_run_org= %d, slice_run_adj= %d",
246                 iSliceIdx, ppSliceInLayer[iSliceIdx]->iSliceComplexRatio * 1.0f / INT_MULTIPLY,
247                 ppSliceInLayer[iSliceIdx]->iCountMbNumInSlice, iMbNumLeft);
248   pCurDqLayer->bNeedAdjustingSlicing = !DynamicAdjustSlicePEncCtxAll (pCurDqLayer, iRunLen);
249 }
250 
RequestMtResource(sWelsEncCtx ** ppCtx,SWelsSvcCodingParam * pCodingParam,const int32_t iCountBsLen,const int32_t iMaxSliceBufferSize,bool bDynamicSlice)251 int32_t RequestMtResource (sWelsEncCtx** ppCtx, SWelsSvcCodingParam* pCodingParam, const int32_t iCountBsLen,
252                            const int32_t iMaxSliceBufferSize, bool bDynamicSlice) {
253   CMemoryAlign* pMa             = NULL;
254   SWelsSvcCodingParam* pPara    = NULL;
255   SSliceThreading* pSmt         = NULL;
256   int32_t iNumSpatialLayers     = 0;
257   int32_t iThreadNum            = 0;
258   int32_t iIdx                  = 0;
259   int32_t iReturn               = ENC_RETURN_SUCCESS;
260 
261   if (NULL == ppCtx || NULL == pCodingParam || NULL == *ppCtx || iCountBsLen <= 0)
262     return 1;
263 #if defined(ENABLE_TRACE_MT)
264   SLogContext* pLogCtx = & ((*ppCtx)->sLogCtx);
265 #endif
266   pMa                  = (*ppCtx)->pMemAlign;
267   pPara                = pCodingParam;
268   iNumSpatialLayers    = pPara->iSpatialLayerNum;
269   iThreadNum           = pPara->iMultipleThreadIdc;
270 
271   assert (iThreadNum > 0);
272 
273   pSmt = (SSliceThreading*)pMa->WelsMalloc (sizeof (SSliceThreading), "SSliceThreading");
274   WELS_VERIFY_RETURN_IF (1, (NULL == pSmt))
275   memset (pSmt, 0, sizeof (SSliceThreading));
276   (*ppCtx)->pSliceThreading = pSmt;
277   pSmt->pThreadPEncCtx = (SSliceThreadPrivateData*)pMa->WelsMalloc (sizeof (SSliceThreadPrivateData) * iThreadNum,
278                          "pThreadPEncCtx");
279   WELS_VERIFY_RETURN_IF (1, (NULL == pSmt->pThreadPEncCtx))
280 
281 #ifdef _WIN32
282   // Dummy event namespace, the windows events don't actually use this
283   WelsSnprintf (pSmt->eventNamespace, sizeof (pSmt->eventNamespace), "%p", (void*) *ppCtx);
284 #else
285   WelsSnprintf (pSmt->eventNamespace, sizeof (pSmt->eventNamespace), "%p%x", (void*) *ppCtx, getpid());
286 #endif//!_WIN32
287 
288 #ifdef MT_DEBUG
289   // file handle for MT debug
290   pSmt->pFSliceDiff = NULL;
291 
292   if (pSmt->pFSliceDiff) {
293     fclose (pSmt->pFSliceDiff);
294     pSmt->pFSliceDiff = NULL;
295   }
296   pSmt->pFSliceDiff = fopen ("slice_time.txt", "wt+");
297 #endif//MT_DEBUG
298 
299   MT_TRACE_LOG (pLogCtx, WELS_LOG_INFO, "encpEncCtx= 0x%p", (void*) *ppCtx);
300 
301   char name[SEM_NAME_MAX] = {0};
302   WELS_GCC_UNUSED WELS_THREAD_ERROR_CODE err = 0;
303 
304   iIdx = 0;
305   while (iIdx < iThreadNum) {
306     pSmt->pThreadPEncCtx[iIdx].pWelsPEncCtx   = (void*) *ppCtx;
307     pSmt->pThreadPEncCtx[iIdx].iSliceIndex    = iIdx;
308     pSmt->pThreadPEncCtx[iIdx].iThreadIndex   = iIdx;
309     pSmt->pThreadHandles[iIdx]                = 0;
310 
311     // length of semaphore name should be system constrained at least on mac 10.7
312     WelsSnprintf (name, SEM_NAME_MAX, "ud%d%s", iIdx, pSmt->eventNamespace);
313     err = WelsEventOpen (&pSmt->pUpdateMbListEvent[iIdx], name);
314     MT_TRACE_LOG (pLogCtx, WELS_LOG_INFO, "[MT] Open pUpdateMbListEvent%d named(%s) ret%d err%d", iIdx, name, err, errno);
315     WelsSnprintf (name, SEM_NAME_MAX, "fu%d%s", iIdx, pSmt->eventNamespace);
316     err = WelsEventOpen (&pSmt->pFinUpdateMbListEvent[iIdx], name);
317     MT_TRACE_LOG (pLogCtx, WELS_LOG_INFO, "[MT] Open pFinUpdateMbListEvent%d named(%s) ret%d err%d", iIdx, name, err,
318                   errno);
319     WelsSnprintf (name, SEM_NAME_MAX, "sc%d%s", iIdx, pSmt->eventNamespace);
320     err = WelsEventOpen (&pSmt->pSliceCodedEvent[iIdx], name);
321     MT_TRACE_LOG (pLogCtx, WELS_LOG_INFO, "[MT] Open pSliceCodedEvent%d named(%s) ret%d err%d", iIdx, name, err, errno);
322     WelsSnprintf (name, SEM_NAME_MAX, "rc%d%s", iIdx, pSmt->eventNamespace);
323     err = WelsEventOpen (&pSmt->pReadySliceCodingEvent[iIdx], name);
324     MT_TRACE_LOG (pLogCtx, WELS_LOG_INFO, "[MT] Open pReadySliceCodingEvent%d = 0x%p named(%s) ret%d err%d", iIdx,
325                   (void*)pSmt->pReadySliceCodingEvent[iIdx], name, err, errno);
326     ++ iIdx;
327   }
328 
329   WelsSnprintf (name, SEM_NAME_MAX, "scm%s", pSmt->eventNamespace);
330   err = WelsEventOpen (&pSmt->pSliceCodedMasterEvent, name);
331   MT_TRACE_LOG (pLogCtx, WELS_LOG_INFO, "[MT] Open pSliceCodedMasterEvent named(%s) ret%d err%d", name, err, errno);
332 
333   iReturn = WelsMutexInit (&pSmt->mutexSliceNumUpdate);
334   WELS_VERIFY_RETURN_IF (1, (WELS_THREAD_ERROR_OK != iReturn))
335 
336   (*ppCtx)->pTaskManage = IWelsTaskManage::CreateTaskManage (*ppCtx, iNumSpatialLayers, bDynamicSlice);
337   WELS_VERIFY_RETURN_IF (1, (NULL == (*ppCtx)->pTaskManage))
338 
339   int32_t iThreadBufferNum = WELS_MIN ((*ppCtx)->pTaskManage->GetThreadPoolThreadNum(), MAX_THREADS_NUM);
340 
341   for (iIdx = 0; iIdx < iThreadBufferNum; iIdx++) {
342     pSmt->pThreadBsBuffer[iIdx] = (uint8_t*)pMa->WelsMallocz (iCountBsLen, "pSmt->pThreadBsBuffer");
343     WELS_VERIFY_RETURN_IF (1, (NULL == pSmt->pThreadBsBuffer[iIdx]))
344   }
345   iReturn = WelsMutexInit (&pSmt->mutexThreadBsBufferUsage);
346   WELS_VERIFY_RETURN_PROC_IF (1, (WELS_THREAD_ERROR_OK != iReturn), FreeMemorySvc (ppCtx))
347 
348   iReturn = WelsMutexInit (&pSmt->mutexEvent);
349   WELS_VERIFY_RETURN_PROC_IF (1, (WELS_THREAD_ERROR_OK != iReturn), FreeMemorySvc (ppCtx));
350 
351   iReturn = WelsMutexInit (&pSmt->mutexThreadSlcBuffReallocate);
352   WELS_VERIFY_RETURN_PROC_IF (1, (WELS_THREAD_ERROR_OK != iReturn), FreeMemorySvc (ppCtx))
353 
354   iReturn = WelsMutexInit (& (*ppCtx)->mutexEncoderError);
355   WELS_VERIFY_RETURN_IF (1, (WELS_THREAD_ERROR_OK != iReturn))
356 
357   MT_TRACE_LOG (pLogCtx, WELS_LOG_INFO, "RequestMtResource(), iThreadNum=%d, iMultipleThreadIdc= %d",
358                 pPara->iMultipleThreadIdc,
359                 (*ppCtx)->iMaxSliceCount);
360   return 0;
361 }
362 
ReleaseMtResource(sWelsEncCtx ** ppCtx)363 void ReleaseMtResource (sWelsEncCtx** ppCtx) {
364   SSliceThreading* pSmt                 = NULL;
365   CMemoryAlign* pMa                     = NULL;
366   int32_t iIdx                          = 0;
367   int32_t iThreadNum                    = 0;
368 
369   if (NULL == ppCtx || NULL == *ppCtx)
370     return;
371 
372   pMa           = (*ppCtx)->pMemAlign;
373   iThreadNum    = (*ppCtx)->pSvcParam->iMultipleThreadIdc;
374   pSmt          = (*ppCtx)->pSliceThreading;
375 
376   if (NULL == pSmt)
377     return;
378 
379   char ename[SEM_NAME_MAX] = {0};
380   while (iIdx < iThreadNum) {
381     // length of semaphore name should be system constrained at least on mac 10.7
382     WelsSnprintf (ename, SEM_NAME_MAX, "sc%d%s", iIdx, pSmt->eventNamespace);
383     WelsEventClose (&pSmt->pSliceCodedEvent[iIdx], ename);
384     WelsSnprintf (ename, SEM_NAME_MAX, "rc%d%s", iIdx, pSmt->eventNamespace);
385     WelsEventClose (&pSmt->pReadySliceCodingEvent[iIdx], ename);
386     WelsSnprintf (ename, SEM_NAME_MAX, "ud%d%s", iIdx, pSmt->eventNamespace);
387     WelsEventClose (&pSmt->pUpdateMbListEvent[iIdx], ename);
388     WelsSnprintf (ename, SEM_NAME_MAX, "fu%d%s", iIdx, pSmt->eventNamespace);
389     WelsEventClose (&pSmt->pFinUpdateMbListEvent[iIdx], ename);
390 
391     ++ iIdx;
392   }
393   WelsSnprintf (ename, SEM_NAME_MAX, "scm%s", pSmt->eventNamespace);
394   WelsEventClose (&pSmt->pSliceCodedMasterEvent, ename);
395 
396   WelsMutexDestroy (&pSmt->mutexSliceNumUpdate);
397   WelsMutexDestroy (&pSmt->mutexThreadBsBufferUsage);
398   WelsMutexDestroy (&pSmt->mutexThreadSlcBuffReallocate);
399   WelsMutexDestroy (& ((*ppCtx)->mutexEncoderError));
400   WelsMutexDestroy (&pSmt->mutexEvent);
401   if (pSmt->pThreadPEncCtx != NULL) {
402     pMa->WelsFree (pSmt->pThreadPEncCtx, "pThreadPEncCtx");
403     pSmt->pThreadPEncCtx = NULL;
404   }
405 
406   for (int i = 0; i < MAX_THREADS_NUM; i++) {
407     if (pSmt->pThreadBsBuffer[i]) {
408       pMa->WelsFree (pSmt->pThreadBsBuffer[i], "pSmt->pThreadBsBuffer");
409       pSmt->pThreadBsBuffer[i] = NULL;
410     }
411   }
412   memset (&pSmt->bThreadBsBufferUsage, 0, MAX_THREADS_NUM * sizeof (bool));
413 
414   if ((*ppCtx)->pTaskManage != NULL) {
415     WELS_DELETE_OP ((*ppCtx)->pTaskManage);
416   }
417 
418 #ifdef MT_DEBUG
419   // file handle for debug
420   if (pSmt->pFSliceDiff) {
421     fclose (pSmt->pFSliceDiff);
422     pSmt->pFSliceDiff = NULL;
423   }
424 #endif//MT_DEBUG
425   pMa->WelsFree ((*ppCtx)->pSliceThreading, "SSliceThreading");
426   (*ppCtx)->pSliceThreading = NULL;
427 }
428 
AppendSliceToFrameBs(sWelsEncCtx * pCtx,SLayerBSInfo * pLbi,const int32_t iSliceCount)429 int32_t AppendSliceToFrameBs (sWelsEncCtx* pCtx, SLayerBSInfo* pLbi, const int32_t iSliceCount) {
430   SSlice** ppSliceInlayer = pCtx->pCurDqLayer->ppSliceInLayer;
431   SWelsSliceBs* pSliceBs  = NULL;
432   int32_t iLayerSize      = 0;
433   int32_t iNalIdxBase     = pLbi->iNalCount;
434   int32_t iSliceIdx       = 0;
435 
436   iNalIdxBase  = pLbi->iNalCount = 0;
437   while (iSliceIdx < iSliceCount) {
438     pSliceBs    = &ppSliceInlayer[iSliceIdx]->sSliceBs;
439     if (pSliceBs != NULL && pSliceBs->uiBsPos > 0) {
440       int32_t iNalIdx = 0;
441       const int32_t iCountNal = pSliceBs->iNalIndex;
442 
443 #if MT_DEBUG_BS_WR
444       assert (pSliceBs->bSliceCodedFlag);
445 #endif//MT_DEBUG_BS_WR
446 
447       memmove (pCtx->pFrameBs + pCtx->iPosBsBuffer, pSliceBs->pBs, pSliceBs->uiBsPos); // confirmed_safe_unsafe_usage
448       pCtx->iPosBsBuffer += pSliceBs->uiBsPos;
449 
450       iLayerSize += pSliceBs->uiBsPos;
451 
452       while (iNalIdx < iCountNal) {
453         pLbi->pNalLengthInByte[iNalIdxBase + iNalIdx] = pSliceBs->iNalLen[iNalIdx];
454         ++ iNalIdx;
455       }
456       pLbi->iNalCount += iCountNal;
457       iNalIdxBase     += iCountNal;
458     }
459     ++ iSliceIdx;
460   }
461 
462   return iLayerSize;
463 }
464 
WriteSliceBs(sWelsEncCtx * pCtx,SWelsSliceBs * pSliceBs,const int32_t iSliceIdx,int32_t & iSliceSize)465 int32_t WriteSliceBs (sWelsEncCtx* pCtx, SWelsSliceBs* pSliceBs, const int32_t iSliceIdx, int32_t& iSliceSize) {
466   const int32_t kiNalCnt        = pSliceBs->iNalIndex;
467   int32_t iNalIdx               = 0;
468   int32_t iNalSize              = 0;
469   int32_t iReturn               = ENC_RETURN_SUCCESS;
470   int32_t iTotalLeftLength      = pSliceBs->uiSize - pSliceBs->uiBsPos;
471   SNalUnitHeaderExt* pNalHdrExt = &pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt;
472   uint8_t* pDst                 = pSliceBs->pBs;
473 
474   assert (kiNalCnt <= 2);
475   if (kiNalCnt > 2)
476     return 0;
477 
478   iSliceSize = 0;
479   while (iNalIdx < kiNalCnt) {
480     iNalSize = 0;
481     iReturn = WelsEncodeNal (&pSliceBs->sNalList[iNalIdx], pNalHdrExt, iTotalLeftLength - iSliceSize,
482                              pDst, &iNalSize);
483     WELS_VERIFY_RETURN_IFNEQ (iReturn, ENC_RETURN_SUCCESS)
484 
485     pSliceBs->iNalLen[iNalIdx] = iNalSize;
486     iSliceSize                += iNalSize;
487     pDst                      += iNalSize;
488     ++ iNalIdx;
489   }
490   pSliceBs->uiBsPos = iSliceSize;
491 
492   return iReturn;
493 }
494 
495 // thread process for coding one pSlice
DynamicDetectCpuCores()496 int32_t DynamicDetectCpuCores() {
497   WelsLogicalProcessInfo  info;
498   WelsQueryLogicalProcessInfo (&info);
499   return info.ProcessorCount;
500 }
501 
AdjustBaseLayer(sWelsEncCtx * pCtx)502 int32_t AdjustBaseLayer (sWelsEncCtx* pCtx) {
503   SDqLayer* pCurDq      = pCtx->ppDqLayerList[0];
504   int32_t iNeedAdj      = 1;
505 #ifdef MT_DEBUG
506   int64_t iT0 = WelsTime();
507 #endif//MT_DEBUG
508 
509   pCtx->pCurDqLayer = pCurDq;
510 
511   // do not need adjust due to not different at both slices of consumed time
512   iNeedAdj = NeedDynamicAdjust (pCtx->ppDqLayerList[0]->ppSliceInLayer
513                                 , pCurDq->sSliceEncCtx.iSliceNumInFrame);
514   if (iNeedAdj)
515     DynamicAdjustSlicing (pCtx,
516                           pCurDq,
517                           0);
518 #ifdef MT_DEBUG
519   iT0 = WelsTime() - iT0;
520   if (pCtx->pSliceThreading->pFSliceDiff) {
521     fprintf (pCtx->pSliceThreading->pFSliceDiff,
522              "%6" PRId64" us adjust time at base spatial layer, iNeedAdj %d, DynamicAdjustSlicing()\n",
523              iT0, iNeedAdj);
524   }
525 #endif//MT_DEBUG
526 
527   return iNeedAdj;
528 }
529 
AdjustEnhanceLayer(sWelsEncCtx * pCtx,int32_t iCurDid)530 int32_t AdjustEnhanceLayer (sWelsEncCtx* pCtx, int32_t iCurDid) {
531 #ifdef MT_DEBUG
532   int64_t iT1 = WelsTime();
533 #endif//MT_DEBUG
534   int32_t iNeedAdj = 1;
535   // uiSliceMode of referencing spatial should be SM_FIXEDSLCNUM_SLICE
536   // if using spatial base layer for complexity estimation
537 
538   const bool kbModelingFromSpatial = (pCtx->pCurDqLayer->pRefLayer != NULL && iCurDid > 0)
539                                      && (pCtx->pSvcParam->sSpatialLayers[iCurDid - 1].sSliceArgument.uiSliceMode == SM_FIXEDSLCNUM_SLICE
540                                          && pCtx->pSvcParam->iMultipleThreadIdc >= pCtx->pSvcParam->sSpatialLayers[iCurDid -
541                                              1].sSliceArgument.uiSliceNum);
542 
543   if (kbModelingFromSpatial) { // using spatial base layer for complexity estimation
544     // do not need adjust due to not different at both slices of consumed time
545     iNeedAdj = NeedDynamicAdjust (pCtx->ppDqLayerList[iCurDid - 1]->ppSliceInLayer,
546                                   pCtx->pCurDqLayer->sSliceEncCtx.iSliceNumInFrame);
547     if (iNeedAdj)
548       DynamicAdjustSlicing (pCtx,
549                             pCtx->pCurDqLayer,
550                             iCurDid
551                            );
552   } else { // use temporal layer for complexity estimation
553     // do not need adjust due to not different at both slices of consumed time
554     iNeedAdj = NeedDynamicAdjust (pCtx->ppDqLayerList[iCurDid]->ppSliceInLayer,
555                                   pCtx->pCurDqLayer->sSliceEncCtx.iSliceNumInFrame);
556     if (iNeedAdj)
557       DynamicAdjustSlicing (pCtx,
558                             pCtx->pCurDqLayer,
559                             iCurDid
560                            );
561   }
562 
563 #ifdef MT_DEBUG
564   iT1 = WelsTime() - iT1;
565   if (pCtx->pSliceThreading->pFSliceDiff) {
566     fprintf (pCtx->pSliceThreading->pFSliceDiff,
567              "%6" PRId64" us adjust time at spatial layer %d, iNeedAdj %d, DynamicAdjustSlicing()\n",
568              iT1, iCurDid, iNeedAdj);
569   }
570 #endif//MT_DEBUG
571 
572   return iNeedAdj;
573 }
574 
575 
576 
#if defined(MT_DEBUG)
/*!
 * \brief   MT_DEBUG only: write the complexity ratio of every slice of the current layer
 *          to the slice debug file (pSliceThreading->pFSliceDiff).
 *
 * \param   pCtx     encoder context (current layer and debug file handle)
 * \param   iCurDid  dependency id, printed alongside each ratio
 *
 * \note    Nothing is written when the threading block or the debug file is absent.
 *          The stored ratio is converted with "* 1.0f / INT_MULTIPLY" before being
 *          printed through the floating point format, the same way the trace logs of
 *          DynamicAdjustSlicing() print it.
 */
void TrackSliceComplexities (sWelsEncCtx* pCtx, const int32_t iCurDid) {
  const int32_t kiCountSliceNum = pCtx->pCurDqLayer->sSliceEncCtx.iSliceNumInFrame;
  SSlice** ppSliceInLayer = pCtx->pCurDqLayer->ppSliceInLayer;

  if (NULL == pCtx->pSliceThreading || NULL == pCtx->pSliceThreading->pFSliceDiff)
    return;

  for (int32_t iSliceIdx = 0; iSliceIdx < kiCountSliceNum; ++iSliceIdx) {
    fprintf (pCtx->pSliceThreading->pFSliceDiff, "%6.3f complexity pRatio at iDid %d pSlice %d\n",
             ppSliceInLayer[iSliceIdx]->iSliceComplexRatio * 1.0f / INT_MULTIPLY, iCurDid, iSliceIdx);
  }
}
#endif
591 
#if defined(MT_DEBUG)
/*!
 * \brief   MT_DEBUG only: dump the consumed time of every slice, and the slowest slice,
 *          for each listed spatial layer to the slice debug file.
 *
 * A layer is reported only when the debug file is open, its slice mode is
 * SM_FIXEDSLCNUM_SLICE or SM_SIZELIMITED_SLICE, iMultipleThreadIdc is above 1 and
 * not smaller than the slice count of the layer.
 *
 * \param   pCtx         encoder context
 * \param   pDidList     dependency ids of the layers to report
 * \param   iSpatialNum  number of entries of pDidList; nothing is done when it exceeds
 *                       MAX_DEPENDENCY_LAYER
 */
void TrackSliceConsumeTime (sWelsEncCtx* pCtx, int32_t* pDidList, const int32_t iSpatialNum) {
  if (iSpatialNum > MAX_DEPENDENCY_LAYER)
    return;

  SWelsSvcCodingParam* pPara = pCtx->pSvcParam;

  for (int32_t iSpatialIdx = 0; iSpatialIdx < iSpatialNum; ++iSpatialIdx) {
    const int32_t kiDid             = pDidList[iSpatialIdx];
    SSliceArgument* pSliceArgument  = &pPara->sSpatialLayers[kiDid].sSliceArgument;
    SDqLayer* pCurDq                = pCtx->ppDqLayerList[kiDid];
    SSlice** ppSliceInLayer         = pCurDq->ppSliceInLayer;
    const uint32_t kuiCountSliceNum = pCurDq->sSliceEncCtx.iSliceNumInFrame;

    if (!pCtx->pSliceThreading)
      continue;

    const bool kbReportLayer = pCtx->pSliceThreading->pFSliceDiff
                               && ((pSliceArgument->uiSliceMode == SM_FIXEDSLCNUM_SLICE)
                                   || (pSliceArgument->uiSliceMode == SM_SIZELIMITED_SLICE))
                               && pPara->iMultipleThreadIdc > 1
                               && pPara->iMultipleThreadIdc >= kuiCountSliceNum;
    if (!kbReportLayer)
      continue;

    // one line per slice, remembering the slowest one on the way
    uint32_t uiSlowestTime = 0;
    int32_t iSlowestSlice  = 0;
    for (uint32_t uiSlice = 0; uiSlice < kuiCountSliceNum; ++uiSlice) {
      fprintf (pCtx->pSliceThreading->pFSliceDiff, "%6d us consume_time coding_idx %d iDid %d pSlice %d\n",
               ppSliceInLayer[uiSlice]->uiSliceConsumeTime, pCtx->iCodingIndex, kiDid, uiSlice);
      if (ppSliceInLayer[uiSlice]->uiSliceConsumeTime > uiSlowestTime) {
        uiSlowestTime = ppSliceInLayer[uiSlice]->uiSliceConsumeTime;
        iSlowestSlice = uiSlice;
      }
    }
    fprintf (pCtx->pSliceThreading->pFSliceDiff, "%6d us consume_time_max coding_idx %d iDid %d pSlice %d\n",
             uiSlowestTime, pCtx->iCodingIndex, kiDid, iSlowestSlice);
  }
}
#endif//#if defined(MT_DEBUG)
633 
SetOneSliceBsBufferUnderMultithread(sWelsEncCtx * pCtx,const int32_t kiThreadIdx,SSlice * pSlice)634 void SetOneSliceBsBufferUnderMultithread (sWelsEncCtx* pCtx, const int32_t kiThreadIdx, SSlice* pSlice) {
635   SWelsSliceBs* pSliceBs  = &pSlice->sSliceBs;
636   pSliceBs->pBsBuffer     = pCtx->pSliceThreading->pThreadBsBuffer[kiThreadIdx];
637   pSliceBs->uiBsPos       = 0;
638 }
639 }
640 
641