/*
* Copyright(c) 2018 Intel Corporation
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
5 
6 #include "EbDefinitions.h"
7 #include "EbModeDecisionProcess.h"
8 #include "EbTransforms.h"
9 #include "EbFullLoop.h"
10 #include "EbRateDistortionCost.h"
11 #include "EbErrorCodes.h"
12 #include "EbErrorHandling.h"
13 
14 static const EB_U64 depth0Th[2][MAX_HIERARCHICAL_LEVEL][MAX_TEMPORAL_LAYERS] = {
15 	{
16 		{ 1000 },
17 		{ 1000, 4000 },
18 		{ 1000, 4000, 9500 },
19 		{ 1000, 4000, 9500, 3000 },
20 		{ 1000, 4000, 9500, 3000, 3000 },
21 		{ 1000, 4000, 9500, 3000, 3000, 3000 }
22 	},
23 
24 	{
25 		{ 0 },
26 		{ 0, 1000 },
27 		{ 0, 1000, 7000 },
28 		{ 0, 1000, 7000, 9500 },
29 		{ 0, 1000, 7000, 9500, 9500 },
30 		{ 0, 1000, 7000, 9500, 9500, 9500 }
31 	}
32 };
33 static const EB_U64 depth1Th[2][MAX_HIERARCHICAL_LEVEL][MAX_TEMPORAL_LAYERS] = {
34 	{
35 		{ 0 },
36 		{ 0, 2000 },
37 		{ 0, 2000, 5500 },
38 		{ 0, 2000, 5500, 9500 },
39 		{ 0, 2000, 5500, 9500, 9500 },
40 		{ 0, 2000, 5500, 9500, 9500, 9500 }
41 	},
42 
43 	{
44 		{ 0 },
45 		{ 0, 1500 },
46 		{ 0, 1500, 1500 },
47 		{ 0, 1500, 1500, 1500 },
48 		{ 0, 1500, 1500, 1500, 1500 },
49 		{ 0, 1500, 1500, 1500, 1500, 1500 }
50 	}
51 };
52 static const EB_U64 depth2Th[2][MAX_HIERARCHICAL_LEVEL][MAX_TEMPORAL_LAYERS] = {
53 	{
54 		{ 0 },
55 		{ 0, 500 },
56 		{ 0, 500, 2000 },
57 		{ 0, 500, 2000, 2500 },
58 		{ 0, 500, 2000, 2500, 2500 },
59 		{ 0, 500, 2000, 2500, 2500, 2500 }
60 	},
61 
62 	{
63 		{ 0 },
64 		{ 0, 1500 },
65 		{ 0, 1500, 1000 },
66 		{ 0, 1500, 1000, 4500 },
67 		{ 0, 1500, 1000, 4500, 4500 },
68 		{ 0, 1500, 1000, 4500, 4500, 4500 }
69 	}
70 };
71 
72 /*********************************************************************
73  * UnifiedQuantizeInvQuantize
74  *
75  *  Unified Quant +iQuant
76  *********************************************************************/
ProductUnifiedQuantizeInvQuantizeMd(PictureControlSet_t * pictureControlSetPtr,EB_S16 * coeff,const EB_U32 coeffStride,EB_S16 * quantCoeff,EB_S16 * reconCoeff,EB_U32 qp,EB_U32 areaSize,EB_U32 * yCountNonZeroCoeffs,EB_PF_MODE pfMode,EB_U8 enableContouringQCUpdateFlag,EB_U32 componentType,EB_RDOQ_PMCORE_TYPE rdoqPmCoreMethod,CabacEncodeContext_t * cabacEncodeCtxPtr,EB_U64 lambda,EB_MODETYPE type,EB_U32 intraLumaMode,EB_U32 intraChromaMode,CabacCost_t * CabacCost)77 void ProductUnifiedQuantizeInvQuantizeMd(
78 	PictureControlSet_t  *pictureControlSetPtr,
79 	EB_S16               *coeff,
80 	const EB_U32          coeffStride,
81 	EB_S16               *quantCoeff,
82 	EB_S16               *reconCoeff,
83 	EB_U32                qp,
84 	EB_U32                areaSize,
85 	EB_U32               *yCountNonZeroCoeffs,
86 	EB_PF_MODE		      pfMode,
87 	EB_U8                 enableContouringQCUpdateFlag,
88 	EB_U32                componentType,
89     EB_RDOQ_PMCORE_TYPE   rdoqPmCoreMethod,
90 	CabacEncodeContext_t *cabacEncodeCtxPtr,
91 	EB_U64                lambda,
92 	EB_MODETYPE           type,
93 	EB_U32                intraLumaMode,
94 	EB_U32                intraChromaMode,
95 	CabacCost_t          *CabacCost)
96 
97 {
98     EB_PICTURE          sliceType                          = pictureControlSetPtr->sliceType;
99     EB_U32            temporalLayerIndex                 = pictureControlSetPtr->temporalLayerIndex;
100 	//for the Quant
101 	const EB_S32 qpRem = (EB_S32)QpModSix[qp]; //the output is between 0-5
102 	const EB_S32 qpPer = (EB_S32)QpDivSix[qp] + TRANS_BIT_INCREMENT; //the output is between 0 and 8+TRANS_BIT_INCREMENT   (CHKN TRANS_BIT_INCREMENT =   0)
103 	const EB_U32 qFunc = QFunc[qpRem]; // 15 bits
104 
105 	const EB_U32 transformShiftNum = 7 - Log2f(areaSize);
106 	const EB_S32 shiftedQBits = QUANT_SHIFT + qpPer + transformShiftNum;
107 	const EB_U32 q_offset = ((sliceType == EB_I_PICTURE || sliceType == EB_IDR_PICTURE) ? QUANT_OFFSET_I : QUANT_OFFSET_P) << (shiftedQBits - 9);
108 
109 	//for the iQuant
110 	const EB_S32 shiftedFFunc = (qpPer > 8) ? (EB_S32)FFunc[qpRem] << (qpPer - 2) : (EB_S32)FFunc[qpRem] << qpPer; // this is 6+8+TRANS_BIT_INCREMENT
111 	const EB_S32 shiftNum = (qpPer > 8) ? QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShiftNum - 2 : QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShiftNum;
112 	const EB_S32 iq_offset = 1 << (shiftNum - 1);
113 
114 
115 	if (pfMode == PF_N2) {
116 		areaSize = areaSize >> 1;
117 	} else if (pfMode == PF_N4) {
118 		areaSize = areaSize >> 2;
119 	}
120 
121 	if (rdoqPmCoreMethod){
122 		DecoupledQuantizeInvQuantizeLoops(
123 			coeff,
124 			coeffStride,
125 			quantCoeff,
126 			reconCoeff,
127 			cabacEncodeCtxPtr,
128 			lambda,
129 			type,
130 			intraLumaMode,
131 			intraChromaMode,
132 			componentType,
133 			pictureControlSetPtr->temporalLayerIndex,
134 			pictureControlSetPtr->ParentPcsPtr->isUsedAsReferenceFlag,
135             (EB_U8) 0,
136             (EB_U16)qp,
137             (EB_U32)EB_8BIT,
138 			CabacCost,
139 			qFunc,
140 			q_offset,
141 			shiftedQBits,
142 			shiftedFFunc,
143 			iq_offset,
144 			shiftNum,
145 			areaSize,
146 			&(*yCountNonZeroCoeffs),
147             rdoqPmCoreMethod);
148 	}
149 	else{
150 
151 	    QiQ_funcPtrArray[!!(ASM_TYPES & AVX2_MASK)][areaSize >> 3](
152 		    coeff,
153 		    coeffStride,
154 		    quantCoeff,
155 		    reconCoeff,
156 		    qFunc,
157 		    q_offset,
158 		    shiftedQBits,
159 		    shiftedFFunc,
160 		    iq_offset,
161 		    shiftNum,
162 		    areaSize,
163 		    &(*yCountNonZeroCoeffs));
164 
165 	    UpdateQiQCoef(
166 		    quantCoeff,
167 		    reconCoeff,
168 		    coeffStride,
169 		    shiftedFFunc,
170 		    iq_offset,
171 		    shiftNum,
172 		    areaSize,
173 		    &(*yCountNonZeroCoeffs),
174 		    componentType,
175 		    sliceType,
176 		    temporalLayerIndex,
177 		    0,
178 		    enableContouringQCUpdateFlag);
179 	}
180 }
181 
182 /****************************************
183  ************  Full loop ****************
184  ****************************************/
ProductFullLoop(EbPictureBufferDesc_t * inputPicturePtr,EB_U32 inputOriginIndex,ModeDecisionCandidateBuffer_t * candidateBuffer,ModeDecisionContext_t * contextPtr,const CodedUnitStats_t * cuStatsPtr,PictureControlSet_t * pictureControlSetPtr,EB_U32 qp,EB_U32 * yCountNonZeroCoeffs,EB_U64 * yCoeffBits,EB_U64 * yFullDistortion)185 void ProductFullLoop(
186 	EbPictureBufferDesc_t         *inputPicturePtr,
187 	EB_U32                         inputOriginIndex,
188 	ModeDecisionCandidateBuffer_t  *candidateBuffer,
189 	ModeDecisionContext_t          *contextPtr,
190 	const CodedUnitStats_t         *cuStatsPtr,
191 	PictureControlSet_t            *pictureControlSetPtr,
192 	EB_U32                          qp,
193 	EB_U32						   *yCountNonZeroCoeffs,
194 	EB_U64                         *yCoeffBits,
195 	EB_U64                         *yFullDistortion)
196 {
197 	EB_U32                       tuOriginIndex;
198 
199 	EB_U32   currentTuIndex,tuIt;
200 	EB_U64   yTuCoeffBits;
201 	EB_U64   tuFullDistortion[3][DIST_CALC_TOTAL];
202 	candidateBuffer->yDc[0] = 0;
203 	candidateBuffer->yDc[1] = 0;
204 	candidateBuffer->yDc[2] = 0;
205 	candidateBuffer->yDc[3] = 0;
206 	candidateBuffer->yCountNonZeroCoeffs[0] = 0;
207 	candidateBuffer->yCountNonZeroCoeffs[1] = 0;
208 	candidateBuffer->yCountNonZeroCoeffs[2] = 0;
209 	candidateBuffer->yCountNonZeroCoeffs[3] = 0;
210 
211 	if (cuStatsPtr->size == MAX_LCU_SIZE){
212 
213 		for (tuIt = 0; tuIt < 4; tuIt++)
214 		{
215 
216 			tuOriginIndex = ((tuIt & 1) << 5) + ((tuIt>1) << 11);
217 			currentTuIndex  = tuIt + 1;
218 			yTuCoeffBits = 0;
219 			EstimateTransform(
220 				&(((EB_S16*)candidateBuffer->residualQuantCoeffPtr->bufferY)[tuOriginIndex]),
221 				MAX_LCU_SIZE,
222 				&(((EB_S16*)contextPtr->transQuantBuffersPtr->tuTransCoeffNxNPtr->bufferY)[tuOriginIndex]),
223 				MAX_LCU_SIZE,
224 				32,
225 				contextPtr->transformInnerArrayPtr,
226 				0,
227 				EB_FALSE,
228 				contextPtr->pfMdMode);
229 
230 
231 			ProductUnifiedQuantizeInvQuantizeMd(
232 				pictureControlSetPtr,
233 				&(((EB_S16*)contextPtr->transQuantBuffersPtr->tuTransCoeffNxNPtr->bufferY)[tuOriginIndex]),
234 				MAX_LCU_SIZE,
235 				&(((EB_S16*)candidateBuffer->residualQuantCoeffPtr->bufferY)[tuOriginIndex]),
236 				&(((EB_S16*)candidateBuffer->reconCoeffPtr->bufferY)[tuOriginIndex]),
237 				qp,
238 				32,
239 				&(yCountNonZeroCoeffs[currentTuIndex]),
240 				contextPtr->pfMdMode,
241 				0,
242 				COMPONENT_LUMA,
243                 contextPtr->rdoqPmCoreMethod,
244 				(CabacEncodeContext_t*)contextPtr->coeffEstEntropyCoderPtr->cabacEncodeContextPtr,
245 				contextPtr->fullLambda,
246 				candidateBuffer->candidatePtr->type,                 // Input: CU type (INTRA, INTER)
247 				candidateBuffer->candidatePtr->intraLumaMode,
248 				EB_INTRA_CHROMA_DM,
249 				pictureControlSetPtr->cabacCost);
250 
251 			PictureFullDistortionLuma(
252 				contextPtr->transQuantBuffersPtr->tuTransCoeffNxNPtr,
253 				tuOriginIndex,
254 				candidateBuffer->reconCoeffPtr,
255 				tuOriginIndex,
256 				(32 >> contextPtr->pfMdMode),
257 				tuFullDistortion[0],
258 				yCountNonZeroCoeffs[currentTuIndex],
259 				candidateBuffer->candidatePtr->type);
260 
261 
262 			tuFullDistortion[0][DIST_CALC_RESIDUAL] = (tuFullDistortion[0][DIST_CALC_RESIDUAL] + 8) >> 4;
263 			tuFullDistortion[0][DIST_CALC_PREDICTION] = (tuFullDistortion[0][DIST_CALC_PREDICTION] + 8) >> 4;
264 
265 			TuEstimateCoeffBitsLuma(
266 				tuOriginIndex,
267 				contextPtr->coeffEstEntropyCoderPtr,
268 				candidateBuffer->residualQuantCoeffPtr,
269 				yCountNonZeroCoeffs[currentTuIndex],
270 				&yTuCoeffBits,
271 				32,
272 				candidateBuffer->candidatePtr->type,
273 				candidateBuffer->candidatePtr->intraLumaMode,
274 				contextPtr->pfMdMode,
275 				contextPtr->coeffCabacUpdate,
276 				&(candidateBuffer->candBuffCoeffCtxModel),
277 				contextPtr->CabacCost);
278 
279 			TuCalcCostLuma(
280 				MAX_LCU_SIZE,
281 				candidateBuffer->candidatePtr,
282 				currentTuIndex,
283 				32,
284 				yCountNonZeroCoeffs[currentTuIndex],
285 				tuFullDistortion[0],
286 				&yTuCoeffBits,
287 				contextPtr->qp,
288 				contextPtr->fullLambda,
289 				contextPtr->fullChromaLambda);
290 
291 			(*yCoeffBits)                         += yTuCoeffBits;
292 			yFullDistortion[DIST_CALC_RESIDUAL]   += tuFullDistortion[0][DIST_CALC_RESIDUAL];
293 			yFullDistortion[DIST_CALC_PREDICTION] += tuFullDistortion[0][DIST_CALC_PREDICTION];
294 			candidateBuffer->yDc[tuIt] = ABS(((EB_S16*)candidateBuffer->residualQuantCoeffPtr->bufferY)[tuOriginIndex]);
295 			candidateBuffer->yCountNonZeroCoeffs[tuIt] = (EB_U16)yCountNonZeroCoeffs[currentTuIndex];
296 
297 		}
298 
299 	}else{
300 
301 		tuOriginIndex = cuStatsPtr->originX + (cuStatsPtr->originY<<6);
302 		yTuCoeffBits    = 0;
303 		EstimateTransform(
304 			&(((EB_S16*)candidateBuffer->residualQuantCoeffPtr->bufferY)[tuOriginIndex]),
305 			MAX_LCU_SIZE,
306 			&(((EB_S16*)contextPtr->transQuantBuffersPtr->tuTransCoeff2Nx2NPtr->bufferY)[tuOriginIndex]),
307 			MAX_LCU_SIZE,
308 			cuStatsPtr->size,
309 			contextPtr->transformInnerArrayPtr,
310 			0,
311 			EB_FALSE,
312 			contextPtr->pfMdMode);
313 
314 		ProductUnifiedQuantizeInvQuantizeMd(
315 			pictureControlSetPtr,
316 			&(((EB_S16*)contextPtr->transQuantBuffersPtr->tuTransCoeff2Nx2NPtr->bufferY)[tuOriginIndex]),
317 			MAX_LCU_SIZE,
318 			&(((EB_S16*)candidateBuffer->residualQuantCoeffPtr->bufferY)[tuOriginIndex]),
319 			&(((EB_S16*)candidateBuffer->reconCoeffPtr->bufferY)[tuOriginIndex]),
320 			qp,
321 			cuStatsPtr->size,
322 			&(yCountNonZeroCoeffs[0]),
323 			contextPtr->pfMdMode,
324 			0,
325 			COMPONENT_LUMA,
326             contextPtr->rdoqPmCoreMethod,
327 			(CabacEncodeContext_t*)contextPtr->coeffEstEntropyCoderPtr->cabacEncodeContextPtr,
328 			contextPtr->fullLambda,
329 			candidateBuffer->candidatePtr->type,                 // Input: CU type (INTRA, INTER)
330 			candidateBuffer->candidatePtr->intraLumaMode,
331 			EB_INTRA_CHROMA_DM,
332 			pictureControlSetPtr->cabacCost);
333 
334 		if (contextPtr->spatialSseFullLoop == EB_TRUE) {
335 
336 			if (yCountNonZeroCoeffs[0]) {
337 				//since we are missing PF-N2 version for 16x16 and 8x8 iT, do zero out.
338 				if (cuStatsPtr->size < 32 && contextPtr->pfMdMode == PF_N2) {
339 					PfZeroOutUselessQuadrants(
340 						&(((EB_S16*)candidateBuffer->reconCoeffPtr->bufferY)[tuOriginIndex]),
341 						candidateBuffer->reconCoeffPtr->strideY,
342 						(cuStatsPtr->size >> 1));
343 				}
344 
345 				EstimateInvTransform(
346 					&(((EB_S16*)candidateBuffer->reconCoeffPtr->bufferY)[tuOriginIndex]),
347 					candidateBuffer->reconCoeffPtr->strideY,
348 					&(((EB_S16*)contextPtr->transQuantBuffersPtr->tuTransCoeff2Nx2NPtr->bufferY)[tuOriginIndex]),
349 					contextPtr->transQuantBuffersPtr->tuTransCoeff2Nx2NPtr->strideY,
350 					cuStatsPtr->size,
351 					contextPtr->transformInnerArrayPtr,
352 					BIT_INCREMENT_8BIT,
353 					EB_FALSE,
354 					cuStatsPtr->size < 32 ? PF_OFF : contextPtr->pfMdMode);
355 
356                 if ((cuStatsPtr->size >> 3) < 9)
357 				    AdditionKernel_funcPtrArray[!!(ASM_TYPES & PREAVX2_MASK)][cuStatsPtr->size >> 3](
358 					    &(candidateBuffer->predictionPtr->bufferY[tuOriginIndex]),
359 					    64,
360 					    &(((EB_S16*)(contextPtr->transQuantBuffersPtr->tuTransCoeff2Nx2NPtr->bufferY))[tuOriginIndex]),
361 					    contextPtr->transQuantBuffersPtr->tuTransCoeff2Nx2NPtr->strideY,
362 					    &(candidateBuffer->reconPtr->bufferY[tuOriginIndex]),
363 					    candidateBuffer->reconPtr->strideY,
364 					    cuStatsPtr->size,
365 					    cuStatsPtr->size);
366 
367 			}
368 			else {
369 
370 				PictureCopy8Bit(
371 					candidateBuffer->predictionPtr,
372 					tuOriginIndex,
373 					0,
374 					candidateBuffer->reconPtr,
375 					tuOriginIndex,
376 					0,
377 					cuStatsPtr->size,
378 					cuStatsPtr->size,
379 					0,
380 					0,
381 					PICTURE_BUFFER_DESC_Y_FLAG);
382 			}
383 
384 			tuFullDistortion[0][DIST_CALC_RESIDUAL] = SpatialFullDistortionKernel_funcPtrArray[!!(ASM_TYPES & PREAVX2_MASK)][Log2f(cuStatsPtr->size) - 2](
385 				&(inputPicturePtr->bufferY[inputOriginIndex]),
386 				inputPicturePtr->strideY,
387 				&(candidateBuffer->reconPtr->bufferY[tuOriginIndex]),
388 				candidateBuffer->reconPtr->strideY,
389 				cuStatsPtr->size,
390 				cuStatsPtr->size);
391 
392 			tuFullDistortion[0][DIST_CALC_PREDICTION] = SpatialFullDistortionKernel_funcPtrArray[!!(ASM_TYPES & PREAVX2_MASK)][Log2f(cuStatsPtr->size) - 2](
393 				&(inputPicturePtr->bufferY[inputOriginIndex]),
394 				inputPicturePtr->strideY,
395 				&(candidateBuffer->predictionPtr->bufferY[tuOriginIndex]),
396 				candidateBuffer->predictionPtr->strideY,
397 				cuStatsPtr->size,
398 				cuStatsPtr->size);
399 		}
400 		else {
401 
402 			PictureFullDistortionLuma(
403 				contextPtr->transQuantBuffersPtr->tuTransCoeff2Nx2NPtr,
404 				tuOriginIndex,
405 				candidateBuffer->reconCoeffPtr,
406 				tuOriginIndex,
407 				(contextPtr->cuStats->size >> contextPtr->pfMdMode),
408 				tuFullDistortion[0],
409 				yCountNonZeroCoeffs[0],
410 				candidateBuffer->candidatePtr->type);
411 
412 			const EB_U32 lumaShift = 2 * (7 - Log2f(cuStatsPtr->size));
413 			tuFullDistortion[0][DIST_CALC_RESIDUAL] = (tuFullDistortion[0][DIST_CALC_RESIDUAL] + (EB_U64)(1 << (lumaShift - 1))) >> lumaShift;
414 			tuFullDistortion[0][DIST_CALC_PREDICTION] = (tuFullDistortion[0][DIST_CALC_PREDICTION] + (EB_U64)(1 << (lumaShift - 1))) >> lumaShift;
415 		}
416 
417 		TuEstimateCoeffBitsLuma(
418 			tuOriginIndex,
419 			contextPtr->coeffEstEntropyCoderPtr,
420 			candidateBuffer->residualQuantCoeffPtr,
421 			yCountNonZeroCoeffs[0],
422 			&yTuCoeffBits,
423 			contextPtr->cuStats->size,
424 			candidateBuffer->candidatePtr->type,
425 			candidateBuffer->candidatePtr->intraLumaMode,
426 			contextPtr->pfMdMode,
427 			contextPtr->coeffCabacUpdate,
428 			&(candidateBuffer->candBuffCoeffCtxModel),
429 			contextPtr->CabacCost);
430 
431 		TuCalcCostLuma(
432 			cuStatsPtr->size,
433 			candidateBuffer->candidatePtr,
434 			0,
435 			cuStatsPtr->size,
436 			yCountNonZeroCoeffs[0],
437 			tuFullDistortion[0],
438 			&yTuCoeffBits,
439 			contextPtr->qp,
440 			contextPtr->fullLambda,
441 			contextPtr->fullChromaLambda);
442 
443 		(*yCoeffBits)  += yTuCoeffBits;
444 		yFullDistortion[DIST_CALC_RESIDUAL]   = tuFullDistortion[0][DIST_CALC_RESIDUAL];
445 		yFullDistortion[DIST_CALC_PREDICTION] = tuFullDistortion[0][DIST_CALC_PREDICTION];
446 		candidateBuffer->yDc[0] = ABS(((EB_S16*)candidateBuffer->residualQuantCoeffPtr->bufferY)[tuOriginIndex]);
447 		candidateBuffer->yCountNonZeroCoeffs[0] = (EB_U16)yCountNonZeroCoeffs[0];
448 	}
449 }
450 
451 
UnifiedQuantizeInvQuantize_R(EB_S16 * coeff,const EB_U32 coeffStride,EB_S16 * quantCoeff,EB_S16 * reconCoeff,EB_U32 qp,EB_U32 bitDepth,EB_U32 areaSize,EB_PICTURE sliceType,EB_U32 * yCountNonZeroCoeffs,EB_S8 mdNonZeroCoeff,EB_PF_MODE pfMode,EB_U32 tuOriginX,EB_U32 tuOriginY,EB_U32 lcuOriginY,EB_U32 enableCbflag,EB_U8 enableContouringQCUpdateFlag,EB_MODETYPE type,EB_U32 componentType,EB_U32 temporalLayerIndex,EB_BOOL encDecFlag,EB_U32 dZoffset,EB_RDOQ_PMCORE_TYPE rdoqPmCoreMethod,CabacEncodeContext_t * cabacEncodeCtxPtr,EB_U64 lambda,EB_U32 intraLumaMode,EB_U32 intraChromaMode,CabacCost_t * CabacCost)452 void UnifiedQuantizeInvQuantize_R(
453 	EB_S16                *coeff,
454 	const EB_U32           coeffStride,
455 	EB_S16                *quantCoeff,
456 	EB_S16                *reconCoeff,
457 	EB_U32                 qp,
458 	EB_U32                 bitDepth,
459 	EB_U32                 areaSize,
460 	EB_PICTURE               sliceType,
461 	EB_U32                *yCountNonZeroCoeffs,
462 	EB_S8                  mdNonZeroCoeff,
463 	EB_PF_MODE		       pfMode,
464 	EB_U32                 tuOriginX,
465 	EB_U32                 tuOriginY,
466 	EB_U32                 lcuOriginY,
467 	EB_U32                 enableCbflag,
468 	EB_U8                  enableContouringQCUpdateFlag,
469 	EB_MODETYPE		       type,
470 	EB_U32                 componentType,
471 	EB_U32                 temporalLayerIndex,
472 	EB_BOOL                encDecFlag,
473 	EB_U32                 dZoffset,
474     EB_RDOQ_PMCORE_TYPE    rdoqPmCoreMethod,
475     CabacEncodeContext_t  *cabacEncodeCtxPtr,
476 	EB_U64                 lambda,
477 	EB_U32                 intraLumaMode,
478 	EB_U32                 intraChromaMode,
479 	CabacCost_t           *CabacCost)
480 
481 {
482 
483 	//for the Quant
484 	const EB_S32 qpRem = (EB_S32)QpModSix[qp]; //the output is between 0-5
485 	const EB_S32 qpPer = (EB_S32)QpDivSix[qp] + TRANS_BIT_INCREMENT; //the output is between 0 and 8+TRANS_BIT_INCREMENT   (CHKN TRANS_BIT_INCREMENT =   0)
486 	const EB_U32 qFunc = QFunc[qpRem]; // 15 bits
487 
488 	const EB_U32 internalBitDepth = (EB_U32)bitDepth + TRANS_BIT_INCREMENT;  //CHKN always 8 for 8 bit
489 
490 	const EB_U32 transformShiftNum = MAX_TR_DYNAMIC_RANGE - internalBitDepth - Log2f(areaSize);
491 	const EB_S32 shiftedQBits = QUANT_SHIFT + qpPer + transformShiftNum;
492 	const EB_U32 q_offset = ((sliceType == EB_I_PICTURE || sliceType == EB_IDR_PICTURE) ? QUANT_OFFSET_I : QUANT_OFFSET_P) << (shiftedQBits - 9);
493 
494 	//for the iQuant
495 	const EB_S32 shiftedFFunc = (qpPer > 8) ? (EB_S32)FFunc[qpRem] << (qpPer - 2) : (EB_S32)FFunc[qpRem] << qpPer; // this is 6+8+TRANS_BIT_INCREMENT
496 	const EB_S32 shiftNum = (qpPer > 8) ? QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShiftNum - 2 : QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShiftNum;
497 	const EB_S32 iq_offset = 1 << (shiftNum - 1);
498 	EB_U32 adptive_qp_offset;
499 
500 	adptive_qp_offset = q_offset;
501 
502 	(void)(encDecFlag);
503 	(void)(mdNonZeroCoeff);
504 	adptive_qp_offset = dZoffset ? (dZoffset * (1 << shiftedQBits) / 20) : adptive_qp_offset;
505 
506 	if (pfMode == PF_N2) {
507 		areaSize = areaSize >> 1;
508 	} else if (pfMode == PF_N4) {
509 		areaSize = areaSize >> 2;
510 	}
511 
512 	if (rdoqPmCoreMethod){
513 
514 		DecoupledQuantizeInvQuantizeLoops(
515 			coeff,
516 			coeffStride,
517 			quantCoeff,
518 			reconCoeff,
519 			cabacEncodeCtxPtr,
520 			lambda,
521 			type,
522 			intraLumaMode,
523 			intraChromaMode,
524 			componentType,
525             (EB_U8)temporalLayerIndex,
526 			temporalLayerIndex < 3 ? EB_TRUE : EB_FALSE,
527             (EB_U8)0,
528             (EB_U16)qp,
529             bitDepth,
530 			CabacCost,
531 			qFunc,
532 			q_offset,
533 			shiftedQBits,
534 			shiftedFFunc,
535 			iq_offset,
536 			shiftNum,
537 			areaSize,
538 			&(*yCountNonZeroCoeffs),
539             rdoqPmCoreMethod);
540 	}else{
541 
542 		QiQ_funcPtrArray[!!(ASM_TYPES & AVX2_MASK)][areaSize >> 3](
543 			coeff,
544 			coeffStride,
545 			quantCoeff,
546 			reconCoeff,
547 			qFunc,
548 			adptive_qp_offset,
549 			shiftedQBits,
550 			shiftedFFunc,
551 			iq_offset,
552 			shiftNum,
553 			areaSize,
554 			&(*yCountNonZeroCoeffs));
555 
556 		UpdateQiQCoef_R(
557 			quantCoeff,
558 			reconCoeff,
559 			coeffStride,
560 			shiftedFFunc,
561 			iq_offset,
562 			shiftNum,
563 			areaSize,
564 			&(*yCountNonZeroCoeffs),
565 			componentType,
566 			sliceType,
567 			temporalLayerIndex,
568 			enableCbflag,
569 			enableContouringQCUpdateFlag);
570 	}
571 
572 	(void)tuOriginX;
573 	(void)tuOriginY;
574 	(void)lcuOriginY;
575     (void)type;
576 }
577 /****************************************
578  ************  Full loop ****************
579 ****************************************/
FullLoop_R(LargestCodingUnit_t * lcuPtr,ModeDecisionCandidateBuffer_t * candidateBuffer,ModeDecisionContext_t * contextPtr,const CodedUnitStats_t * cuStatsPtr,EbPictureBufferDesc_t * inputPicturePtr,PictureControlSet_t * pictureControlSetPtr,EB_U32 componentMask,EB_U32 cbQp,EB_U32 crQp,EB_U32 * cbCountNonZeroCoeffs,EB_U32 * crCountNonZeroCoeffs)580 void FullLoop_R (
581 	LargestCodingUnit_t           *lcuPtr,
582    ModeDecisionCandidateBuffer_t  *candidateBuffer,
583    ModeDecisionContext_t          *contextPtr,
584    const CodedUnitStats_t         *cuStatsPtr,
585    EbPictureBufferDesc_t          *inputPicturePtr,
586    PictureControlSet_t            *pictureControlSetPtr,
587    EB_U32                          componentMask,
588    EB_U32                          cbQp,
589    EB_U32                          crQp,
590    EB_U32						  *cbCountNonZeroCoeffs,
591    EB_U32						  *crCountNonZeroCoeffs)
592 {
593 	(void)lcuPtr;
594 
595     EB_S16                *chromaResidualPtr;
596     EB_U32                 tuIndex;
597     EB_U32                 tuOriginIndex;
598     EB_U32                 tuCbOriginIndex;
599 	EB_U32                 tuCrOriginIndex;
600 	EB_U32                 tuCount;
601     const TransformUnitStats_t  *tuStatPtr;
602     EB_U32                 tuItr;
603     EB_U32                 tuSize;
604     EB_U32                 chromatTuSize;
605     EB_U32                 tuOriginX;
606     EB_U32                 tuOriginY;
607 
608     EbPictureBufferDesc_t         * tuTransCoeffTmpPtr;
609     EbPictureBufferDesc_t         * tuQuantCoeffTmpPtr;
610 
611     if (cuStatsPtr->size == MAX_LCU_SIZE) {
612         tuCount = 4;
613         tuIndex = 1;
614         tuTransCoeffTmpPtr = contextPtr->transQuantBuffersPtr->tuTransCoeffNxNPtr;
615         tuQuantCoeffTmpPtr = candidateBuffer->residualQuantCoeffPtr;
616 
617     }
618     else {
619         tuCount = 1;
620         tuIndex = 0;
621         tuTransCoeffTmpPtr = contextPtr->transQuantBuffersPtr->tuTransCoeff2Nx2NPtr;
622         tuQuantCoeffTmpPtr = candidateBuffer->residualQuantCoeffPtr;
623     }
624 
625     tuItr = 0;
626     do {
627         tuStatPtr = GetTransformUnitStats(tuIndex);
628         tuOriginX = TU_ORIGIN_ADJUST(cuStatsPtr->originX, cuStatsPtr->size, tuStatPtr->offsetX);
629         tuOriginY = TU_ORIGIN_ADJUST(cuStatsPtr->originY, cuStatsPtr->size, tuStatPtr->offsetY);
630         tuSize = cuStatsPtr->size >> tuStatPtr->depth;
631         chromatTuSize = tuSize == 4 ? tuSize : (tuSize >> 1);
632         tuOriginIndex = tuOriginX + tuOriginY * candidateBuffer->residualQuantCoeffPtr->strideY;
633         tuCbOriginIndex = tuSize == 4 ?
634             tuOriginIndex :
635             ((tuOriginX + tuOriginY * candidateBuffer->residualQuantCoeffPtr->strideCb) >> 1);
636         tuCrOriginIndex = tuSize == 4 ?
637             tuOriginIndex :
638             ((tuOriginX + tuOriginY * candidateBuffer->residualQuantCoeffPtr->strideCr) >> 1);
639 
640         //    This function replaces the previous Intra Chroma mode if the LM fast
641         //    cost is better.
642         //    *Note - this might require that we have inv transform in the loop
643         EB_PF_MODE    correctedPFMode = contextPtr->pfMdMode;
644 
645         if (chromatTuSize == 4)
646             correctedPFMode = PF_OFF;
647         else if (chromatTuSize == 8 && contextPtr->pfMdMode == PF_N4)
648             correctedPFMode = PF_N2;
649 
650         if (componentMask & PICTURE_BUFFER_DESC_Cb_FLAG) {
651             // Configure the Chroma Residual Ptr
652             chromaResidualPtr = //(candidateBuffer->candidatePtr->type  == INTRA_MODE )?
653                  //&(((EB_S16*) candidateBuffer->intraChromaResidualPtr->bufferCb)[tuChromaOriginIndex]):
654                 &(((EB_S16*)candidateBuffer->residualQuantCoeffPtr->bufferCb)[tuCbOriginIndex]);
655 
656             // Cb Transform
657             EstimateTransform(
658                 chromaResidualPtr,
659                 candidateBuffer->residualQuantCoeffPtr->strideCb,
660                 &(((EB_S16*)tuTransCoeffTmpPtr->bufferCb)[tuCbOriginIndex]),
661                 tuTransCoeffTmpPtr->strideCb,
662                 chromatTuSize,
663                 contextPtr->transformInnerArrayPtr,
664                 0,
665                 EB_FALSE,
666                 correctedPFMode);
667 
668 			UnifiedQuantizeInvQuantize_R(
669 				&(((EB_S16*)tuTransCoeffTmpPtr->bufferCb)[tuCbOriginIndex]),
670 				tuTransCoeffTmpPtr->strideCb,
671 				&(((EB_S16*)tuQuantCoeffTmpPtr->bufferCb)[tuCbOriginIndex]),
672 				&(((EB_S16*)candidateBuffer->reconCoeffPtr->bufferCb)[tuCbOriginIndex]),
673 				cbQp,
674 				inputPicturePtr->bitDepth,
675 				chromatTuSize,
676 				pictureControlSetPtr->sliceType,
677 				&(cbCountNonZeroCoeffs[tuIndex]),
678 				-1,
679 				correctedPFMode,
680 				0,
681 				0,
682 				0,
683 				0,
684 				0,
685 				candidateBuffer->candidatePtr->type,
686 				COMPONENT_CHROMA,
687 				pictureControlSetPtr->temporalLayerIndex,
688 				EB_FALSE,
689 				0,
690                 contextPtr->rdoqPmCoreMethod,
691 				(CabacEncodeContext_t*)contextPtr->coeffEstEntropyCoderPtr->cabacEncodeContextPtr,
692 				contextPtr->fullLambda,
693 				candidateBuffer->candidatePtr->intraLumaMode,
694 				EB_INTRA_CHROMA_DM,
695 				pictureControlSetPtr->cabacCost);
696 
697             if (contextPtr->spatialSseFullLoop == EB_TRUE) {
698                 if (cbCountNonZeroCoeffs[tuIndex]) {
699 
700                     EB_PF_MODE    correctedPFMode = contextPtr->pfMdMode;
701                     EB_U32 chromatTuSize = (tuSize >> 1);
702                     if (chromatTuSize == 4)
703                         correctedPFMode = PF_OFF;
704                     else if (chromatTuSize == 8 && contextPtr->pfMdMode == PF_N4)
705                         correctedPFMode = PF_N2;
706 
707                     if (correctedPFMode) {
708                         PfZeroOutUselessQuadrants(
709                             &(((EB_S16*)candidateBuffer->reconCoeffPtr->bufferCb)[tuCbOriginIndex]),
710                             candidateBuffer->reconCoeffPtr->strideCb,
711                             (chromatTuSize >> 1));
712                     }
713 
714                     EstimateInvTransform(
715                         &(((EB_S16*)candidateBuffer->reconCoeffPtr->bufferCb)[tuCbOriginIndex]),
716                         candidateBuffer->reconCoeffPtr->strideCb,
717                         &(((EB_S16*)contextPtr->transQuantBuffersPtr->tuTransCoeff2Nx2NPtr->bufferCb)[tuCbOriginIndex]),
718                         contextPtr->transQuantBuffersPtr->tuTransCoeff2Nx2NPtr->strideCb,
719                         chromatTuSize,
720                         contextPtr->transformInnerArrayPtr,
721                         BIT_INCREMENT_8BIT,
722                         EB_FALSE,
723                         EB_FALSE);
724 
725                     PictureAddition(
726                         &(candidateBuffer->predictionPtr->bufferCb[tuCbOriginIndex]),
727                         candidateBuffer->predictionPtr->strideCb,
728                         &(((EB_S16*)(contextPtr->transQuantBuffersPtr->tuTransCoeff2Nx2NPtr->bufferCb))[tuCbOriginIndex]),
729                         contextPtr->transQuantBuffersPtr->tuTransCoeff2Nx2NPtr->strideCb,
730                         &(candidateBuffer->reconPtr->bufferCb[tuCbOriginIndex]),
731                         candidateBuffer->reconPtr->strideCb,
732                         chromatTuSize,
733                         chromatTuSize);
734 
735                 }
736                 else {
737 
738                     PictureCopy8Bit(
739                         candidateBuffer->predictionPtr,
740                         tuOriginIndex,
741                         tuCbOriginIndex,
742                         candidateBuffer->reconPtr,
743                         tuOriginIndex,
744                         tuCbOriginIndex,
745                         tuSize,
746                         tuSize,
747                         chromatTuSize,
748                         chromatTuSize,
749                         PICTURE_BUFFER_DESC_Cb_FLAG);
750                 }
751             }
752 
753         }
754 
755 
756          if (componentMask & PICTURE_BUFFER_DESC_Cr_FLAG) {
757              // Configure the Chroma Residual Ptr
758              chromaResidualPtr = //(candidateBuffer->candidatePtr->type  == INTRA_MODE )?
759                   //&(((EB_S16*) candidateBuffer->intraChromaResidualPtr->bufferCr)[tuChromaOriginIndex]):
760                  &(((EB_S16*)candidateBuffer->residualQuantCoeffPtr->bufferCr)[tuCrOriginIndex]);
761 
762              // Cr Transform
763              EstimateTransform(
764                  chromaResidualPtr,
765                  candidateBuffer->residualQuantCoeffPtr->strideCr,
766                  &(((EB_S16*)tuTransCoeffTmpPtr->bufferCr)[tuCrOriginIndex]),
767                  tuTransCoeffTmpPtr->strideCr,
768                  chromatTuSize,
769                  contextPtr->transformInnerArrayPtr,
770                  0,
771                  EB_FALSE,
772                  correctedPFMode);
773 
774              UnifiedQuantizeInvQuantize_R(
775                  &(((EB_S16*)tuTransCoeffTmpPtr->bufferCr)[tuCrOriginIndex]),
776                  tuTransCoeffTmpPtr->strideCr,
777                  &(((EB_S16*)tuQuantCoeffTmpPtr->bufferCr)[tuCrOriginIndex]),
778                  &(((EB_S16*)candidateBuffer->reconCoeffPtr->bufferCr)[tuCrOriginIndex]),
779                  crQp,
780                  inputPicturePtr->bitDepth,
781                  chromatTuSize,
782                  pictureControlSetPtr->sliceType,
783                  &(crCountNonZeroCoeffs[tuIndex]),
784                  -1,
785                  correctedPFMode,
786                  0,
787                  0,
788                  0,
789                  0,
790                  0,
791                  candidateBuffer->candidatePtr->type,
792                  COMPONENT_CHROMA,
793                  pictureControlSetPtr->temporalLayerIndex,
794                  EB_FALSE,
795                  0,
796                  contextPtr->rdoqPmCoreMethod,
797 				 (CabacEncodeContext_t*)contextPtr->coeffEstEntropyCoderPtr->cabacEncodeContextPtr,
798 				 contextPtr->fullLambda,
799 				 candidateBuffer->candidatePtr->intraLumaMode,
800 				 EB_INTRA_CHROMA_DM,
801 				 pictureControlSetPtr->cabacCost);
802 
803              if (contextPtr->spatialSseFullLoop == EB_TRUE) {
804                  if (crCountNonZeroCoeffs[tuIndex]) {
805 
806                      EB_PF_MODE    correctedPFMode = contextPtr->pfMdMode;
807                      EB_U32 chromatTuSize = (tuSize >> 1);
808                      if (chromatTuSize == 4)
809                          correctedPFMode = PF_OFF;
810                      else if (chromatTuSize == 8 && contextPtr->pfMdMode == PF_N4)
811                          correctedPFMode = PF_N2;
812 
813                      if (correctedPFMode) {
814                          PfZeroOutUselessQuadrants(
815                              &(((EB_S16*)candidateBuffer->reconCoeffPtr->bufferCr)[tuCbOriginIndex]),
816                              candidateBuffer->reconCoeffPtr->strideCr,
817                              (chromatTuSize >> 1));
818                      }
819 
820                      EstimateInvTransform(
821                          &(((EB_S16*)candidateBuffer->reconCoeffPtr->bufferCr)[tuCbOriginIndex]),
822                          candidateBuffer->reconCoeffPtr->strideCr,
823                          &(((EB_S16*)contextPtr->transQuantBuffersPtr->tuTransCoeff2Nx2NPtr->bufferCr)[tuCbOriginIndex]),
824                          contextPtr->transQuantBuffersPtr->tuTransCoeff2Nx2NPtr->strideCr,
825                          chromatTuSize,
826                          contextPtr->transformInnerArrayPtr,
827                          BIT_INCREMENT_8BIT,
828                          EB_FALSE,
829                          EB_FALSE);
830 
831                      PictureAddition(
832                          &(candidateBuffer->predictionPtr->bufferCr[tuCbOriginIndex]),
833                          candidateBuffer->predictionPtr->strideCr,
834                          &(((EB_S16*)(contextPtr->transQuantBuffersPtr->tuTransCoeff2Nx2NPtr->bufferCr))[tuCbOriginIndex]),
835                          contextPtr->transQuantBuffersPtr->tuTransCoeff2Nx2NPtr->strideCr,
836                          &(candidateBuffer->reconPtr->bufferCr[tuCbOriginIndex]),
837                          candidateBuffer->reconPtr->strideCr,
838                          chromatTuSize,
839                          chromatTuSize);
840 
841                  }
842                  else {
843 
844                      PictureCopy8Bit(
845                          candidateBuffer->predictionPtr,
846                          tuOriginIndex,
847                          tuCbOriginIndex,
848                          candidateBuffer->reconPtr,
849                          tuOriginIndex,
850                          tuCbOriginIndex,
851                          tuSize,
852                          tuSize,
853                          chromatTuSize,
854                          chromatTuSize,
855                          PICTURE_BUFFER_DESC_Cr_FLAG);
856                  }
857              }
858 
859          }
860 
861         ++tuItr;
862 		tuIndex = tuIndexList[tuStatPtr->depth][tuItr];
863 
864     } while (tuItr < tuCount);
865 
866 }
867 
868 //****************************************
869 // ************ CuFullDistortionFastTuMode ****************
870 //****************************************/
/*
 * CuFullDistortionFastTuMode_R
 *
 * Walks the transform units (TUs) of one coding unit and, when componentMask
 * contains a chroma bit, accumulates per-TU chroma results into the caller's
 * totals:
 *   - cbFullDistortion / crFullDistortion (DIST_CALC_RESIDUAL and
 *     DIST_CALC_PREDICTION entries) are incremented, never reset here;
 *   - *cbCoeffBits / *crCoeffBits are incremented by the estimated TU bits.
 *
 * A CU of MAX_LCU_SIZE is processed as 4 TUs starting at TU index 1 using the
 * NxN transform-coefficient buffer; any other CU is processed as the single
 * TU index 0 using the 2Nx2N buffer.
 *
 * When contextPtr->spatialSseFullLoop is EB_TRUE the distortion is measured
 * between the input picture and the recon / prediction pictures; otherwise it
 * is measured on coefficients via PictureFullDistortion_R and then scaled
 * down with rounding.
 *
 * lcuPtr is unused.
 */
void CuFullDistortionFastTuMode_R (
    EbPictureBufferDesc_t          *inputPicturePtr,
    EB_U32                          inputCbOriginIndex,
    LargestCodingUnit_t            *lcuPtr,
    ModeDecisionCandidateBuffer_t  *candidateBuffer,
    ModeDecisionContext_t          *contextPtr,
    ModeDecisionCandidate_t        *candidatePtr,
    const CodedUnitStats_t         *cuStatsPtr,
    EB_U64                          cbFullDistortion[DIST_CALC_TOTAL],
    EB_U64                          crFullDistortion[DIST_CALC_TOTAL],
    EB_U32                          countNonZeroCoeffs[3][MAX_NUM_OF_TU_PER_CU],
    EB_U32                          componentMask,
    EB_U64                         *cbCoeffBits,
    EB_U64                         *crCoeffBits)
{
    EbPictureBufferDesc_t          *coeffBuffer;
    const TransformUnitStats_t     *tuStats;
    EB_U64                          tuDistortion[3][DIST_CALC_TOTAL];
    EB_U32                          tuIndex;
    EB_U32                          tuCount;
    EB_U32                          tuCounter = 0;

    (void)lcuPtr;

    // Select the starting TU index, the TU count and the coefficient buffer
    if (cuStatsPtr->size != MAX_LCU_SIZE) {
        tuIndex     = 0;
        tuCount     = 1;
        coeffBuffer = contextPtr->transQuantBuffersPtr->tuTransCoeff2Nx2NPtr;
    }
    else {
        tuIndex     = 1;
        tuCount     = 4;
        coeffBuffer = contextPtr->transQuantBuffersPtr->tuTransCoeffNxNPtr;
    }

    // tuCount is always >= 1, so the body runs at least once
    while (tuCounter < tuCount) {

        EB_U64 lumaBits = 0;
        EB_U64 cbBits   = 0;
        EB_U64 crBits   = 0;
        EB_U32 originX;
        EB_U32 originY;
        EB_U32 lumaSize;
        EB_U32 chromaSize;
        EB_U32 lumaOffset;
        EB_U32 chromaOffset;

        tuStats = GetTransformUnitStats(tuIndex);

        originX    = TU_ORIGIN_ADJUST(cuStatsPtr->originX, cuStatsPtr->size, tuStats->offsetX);
        originY    = TU_ORIGIN_ADJUST(cuStatsPtr->originY, cuStatsPtr->size, tuStats->offsetY);
        lumaSize   = cuStatsPtr->size >> tuStats->depth;
        lumaOffset = originX + originY * candidateBuffer->residualQuantCoeffPtr->strideY;

        // A 4x4 luma TU keeps the luma size and offset for chroma;
        // otherwise chroma uses half the size and a halved Cb-stride offset.
        if (lumaSize == 4) {
            chromaSize   = lumaSize;
            chromaOffset = lumaOffset;
        }
        else {
            chromaSize   = lumaSize >> 1;
            chromaOffset = (originX + originY * candidateBuffer->residualQuantCoeffPtr->strideCb) >> 1;
        }

        if (componentMask & PICTURE_BUFFER_DESC_CHROMA_MASK) {

            EB_U32 nonZeroCounts[3] = {
                countNonZeroCoeffs[0][tuIndex],
                countNonZeroCoeffs[1][tuIndex],
                countNonZeroCoeffs[2][tuIndex]
            };

            // Partial-frequency mode adjusted for the chroma TU size
            EB_PF_MODE chromaPfMode;
            if (chromaSize == 4) {
                chromaPfMode = PF_OFF;
            }
            else if (chromaSize == 8 && contextPtr->pfMdMode == PF_N4) {
                chromaPfMode = PF_N2;
            }
            else {
                chromaPfMode = contextPtr->pfMdMode;
            }

            if (contextPtr->spatialSseFullLoop != EB_TRUE) {

                EB_U32 chromaShift;
                EB_U64 roundingOffset;
                EB_U32 plane;

                // *Full Distortion (SSE)
                // *Note - there are known issues with how this distortion metric is currently
                //    calculated.  The amount of scaling between the two arrays is not
                //    equivalent.
                PictureFullDistortion_R(
                    coeffBuffer,
                    lumaOffset,
                    chromaOffset,
                    candidateBuffer->reconCoeffPtr,
                    (lumaSize >> contextPtr->pfMdMode),
                    (chromaSize >> chromaPfMode),
                    PICTURE_BUFFER_DESC_CHROMA_MASK,//componentMask,
                    tuDistortion[0],
                    tuDistortion[1],
                    tuDistortion[2],
                    nonZeroCounts,
                    candidateBuffer->candidatePtr->type);

                // Scale the Cb (1) and Cr (2) distortions down with rounding
                chromaShift    = 2 * (7 - Log2f(chromaSize));
                roundingOffset = (EB_U64)(1 << (chromaShift - 1));

                for (plane = 1; plane <= 2; ++plane) {
                    tuDistortion[plane][DIST_CALC_RESIDUAL] =
                        (tuDistortion[plane][DIST_CALC_RESIDUAL] + roundingOffset) >> chromaShift;
                    tuDistortion[plane][DIST_CALC_PREDICTION] =
                        (tuDistortion[plane][DIST_CALC_PREDICTION] + roundingOffset) >> chromaShift;
                }
            }
            else {

                // Kernel table indices: ASM flavour and log2(size) - 2
                const EB_U32 kernelAsmIdx  = !!(ASM_TYPES & PREAVX2_MASK);
                const EB_U32 kernelSizeIdx = Log2f(chromaSize) - 2;

                // Cb: input vs recon
                tuDistortion[1][DIST_CALC_RESIDUAL] = SpatialFullDistortionKernel_funcPtrArray[kernelAsmIdx][kernelSizeIdx](
                    &(inputPicturePtr->bufferCb[inputCbOriginIndex]),
                    inputPicturePtr->strideCb,
                    &(candidateBuffer->reconPtr->bufferCb[chromaOffset]),
                    candidateBuffer->reconPtr->strideCb,
                    chromaSize,
                    chromaSize);

                // Cb: input vs prediction
                tuDistortion[1][DIST_CALC_PREDICTION] = SpatialFullDistortionKernel_funcPtrArray[kernelAsmIdx][kernelSizeIdx](
                    &(inputPicturePtr->bufferCb[inputCbOriginIndex]),
                    inputPicturePtr->strideCb,
                    &(candidateBuffer->predictionPtr->bufferCb[chromaOffset]),
                    candidateBuffer->predictionPtr->strideCb,
                    chromaSize,
                    chromaSize);

                // Cr: input vs recon
                tuDistortion[2][DIST_CALC_RESIDUAL] = SpatialFullDistortionKernel_funcPtrArray[kernelAsmIdx][kernelSizeIdx](
                    &(inputPicturePtr->bufferCr[inputCbOriginIndex]),
                    inputPicturePtr->strideCr,
                    &(candidateBuffer->reconPtr->bufferCr[chromaOffset]),
                    candidateBuffer->reconPtr->strideCr,
                    chromaSize,
                    chromaSize);

                // Cr: input vs prediction
                tuDistortion[2][DIST_CALC_PREDICTION] = SpatialFullDistortionKernel_funcPtrArray[kernelAsmIdx][kernelSizeIdx](
                    &(inputPicturePtr->bufferCr[inputCbOriginIndex]),
                    inputPicturePtr->strideCr,
                    &(candidateBuffer->predictionPtr->bufferCr[chromaOffset]),
                    candidateBuffer->predictionPtr->strideCr,
                    chromaSize,
                    chromaSize);
            }

            // Estimate the chroma coefficient bits of this TU
            TuEstimateCoeffBits_R(
                lumaOffset,
                chromaOffset,
                PICTURE_BUFFER_DESC_CHROMA_MASK,//componentMask,
                contextPtr->coeffEstEntropyCoderPtr,
                candidateBuffer->residualQuantCoeffPtr,
                countNonZeroCoeffs[0][tuIndex],
                countNonZeroCoeffs[1][tuIndex],
                countNonZeroCoeffs[2][tuIndex],
                &lumaBits,
                &cbBits,
                &crBits,
                candidateBuffer->candidatePtr->transformSize,
                candidateBuffer->candidatePtr->transformChromaSize,
                candidateBuffer->candidatePtr->type,
                candidateBuffer->candidatePtr->intraLumaMode,
                EB_INTRA_CHROMA_DM,
                chromaPfMode,
                contextPtr->coeffCabacUpdate,
                &(candidateBuffer->candBuffCoeffCtxModel),
                contextPtr->CabacCost);

            TuCalcCost(
                contextPtr->cuSize,
                candidatePtr,
                tuIndex,
                lumaSize,
                chromaSize,
                countNonZeroCoeffs[0][tuIndex],
                countNonZeroCoeffs[1][tuIndex],
                countNonZeroCoeffs[2][tuIndex],
                tuDistortion[0],
                tuDistortion[1],
                tuDistortion[2],
                PICTURE_BUFFER_DESC_CHROMA_MASK,//componentMask,
                &lumaBits,
                &cbBits,
                &crBits,
                contextPtr->qp,
                contextPtr->fullLambda,
                contextPtr->fullChromaLambda);

            // Fold this TU into the CU-level totals
            *cbCoeffBits += cbBits;
            *crCoeffBits += crBits;
            cbFullDistortion[DIST_CALC_RESIDUAL]   += tuDistortion[1][DIST_CALC_RESIDUAL];
            crFullDistortion[DIST_CALC_RESIDUAL]   += tuDistortion[2][DIST_CALC_RESIDUAL];
            cbFullDistortion[DIST_CALC_PREDICTION] += tuDistortion[1][DIST_CALC_PREDICTION];
            crFullDistortion[DIST_CALC_PREDICTION] += tuDistortion[2][DIST_CALC_PREDICTION];
        }

        // Advance to the next TU of the same depth
        ++tuCounter;
        tuIndex = tuIndexList[tuStats->depth][tuCounter];
    }
}
1068 
1069 
/*
 * Copies the left/top neighbour mode and depth stored for the local CU entry
 * srcCuIndex onto the entry dstCuIndex.
 */
static void ExitInterDepthInheritNeighborInfo(
    ModeDecisionContext_t          *contextPtr,
    EB_U32                          dstCuIndex,
    EB_U32                          srcCuIndex)
{
    contextPtr->mdLocalCuUnit[dstCuIndex].leftNeighborMode  = contextPtr->mdLocalCuUnit[srcCuIndex].leftNeighborMode;
    contextPtr->mdLocalCuUnit[dstCuIndex].leftNeighborDepth = contextPtr->mdLocalCuUnit[srcCuIndex].leftNeighborDepth;
    contextPtr->mdLocalCuUnit[dstCuIndex].topNeighborMode   = contextPtr->mdLocalCuUnit[srcCuIndex].topNeighborMode;
    contextPtr->mdLocalCuUnit[dstCuIndex].topNeighborDepth  = contextPtr->mdLocalCuUnit[srcCuIndex].topNeighborDepth;
}

/*
 * ExitInterDepthDecision
 *
 * Called for the leaf CU at leafIndex:
 *   1. adds the "no split" flag rate to the leaf's cost,
 *   2. updates the 8x8 / 16x16 group counters according to the leaf depth,
 *   3. runs up to three merge stages (depth 2 vs 3, depth 1 vs 2, depth 0 vs 1).
 *      Each stage compares the parent CU cost (plus its no-split rate) with the
 *      sum of its four children's costs (plus the split rate). When the parent
 *      is not more expensive, its splitFlag is cleared and it becomes the
 *      returned index.
 *
 * A parent whose testedCuFlag is EB_FALSE gets MAX_CU_COST before comparison.
 * Stages 0 and 1 store the winning cost on the parent; stage 2 does not.
 *
 * Returns the index of the last CU kept after merging (leafIndex when no
 * merge happened). lcuAddr is unused.
 */
EB_U32 ExitInterDepthDecision(
    ModeDecisionContext_t          *contextPtr,
    EB_U32                          leafIndex,
    LargestCodingUnit_t            *tbPtr,
    EB_U32                          lcuAddr,
    EB_U32                          tbOriginX,
    EB_U32                          tbOriginY,
    EB_U64                          fullLambda,
    MdRateEstimationContext_t      *mdRateEstimationPtr,
    PictureControlSet_t            *pictureControlSetPtr)
{
    SequenceControlSet_t      *sequenceControlSetPtr = (SequenceControlSet_t*)pictureControlSetPtr->sequenceControlSetWrapperPtr->objectPtr;
    EncodeContext_t           *encodeContextPtr = sequenceControlSetPtr->encodeContextPtr;
    EB_U32                     tbMaxDepth = sequenceControlSetPtr->maxLcuDepth;

    const CodedUnitStats_t    *leafStats;
    const CodedUnitStats_t    *depthTwoStats;
    const CodedUnitStats_t    *depthOneStats;
    const CodedUnitStats_t    *depthZeroStats;

    EB_U32                     keptCuIndex = leafIndex;
    EB_U32                     depthTwoIdx = leafIndex;
    EB_U32                     depthOneIdx = leafIndex;
    EB_U32                     depthZeroIdx;
    EB_U32                     siblingLeftIdx;
    EB_U32                     siblingTopIdx;
    EB_U32                     siblingTopLeftIdx;

    // Rate/cost scratch values are shared by all stages and never reset in between
    EB_U64                     noSplitRate = 0;
    EB_U64                     splitRate = 0;
    EB_U64                     noSplitCost = 0;
    EB_U64                     splitCost = 0;
    EB_U64                     leafNoSplitRate = 0;

    EB_U32                     originX;
    EB_U32                     originY;

    (void)lcuAddr;

    leafStats = GetCodedUnitStats(leafIndex);
    originX = tbOriginX + leafStats->originX;
    originY = tbOriginY + leafStats->originY;

    // Parent is winner, update its cost, and trigger an inter-depth check-point.
    SplitFlagRate(
        contextPtr,
        tbPtr->codedLeafArrayPtr[leafIndex],
        0,
        &leafNoSplitRate,
        contextPtr->fullLambda,
        contextPtr->mdRateEstimationPtr,
        sequenceControlSetPtr->maxLcuDepth);

    contextPtr->mdLocalCuUnit[leafIndex].cost += leafNoSplitRate;

    // Keep the group counters in step with the depth of the leaf just closed
    switch (leafStats->depth) {
    case 0:
        contextPtr->groupOf16x16BlocksCount = 0;
        break;
    case 1:
        contextPtr->groupOf16x16BlocksCount++;
        contextPtr->groupOf8x8BlocksCount = 0;
        break;
    case 2:
        contextPtr->groupOf8x8BlocksCount++;
        break;
    default:
        break;
    }

    /*** Stage 0: Inter depth decision: depth 2 vs depth 3 ***/

    // Walks to the last coded 8x8 block for merging
    if (GROUP_OF_4_8x8_BLOCKS(originX, originY)) {

        contextPtr->groupOf8x8BlocksCount++;

        // The three earlier siblings, then the parent candidate just before them
        siblingLeftIdx    = leafIndex - DEPTH_THREE_STEP;
        siblingTopIdx     = siblingLeftIdx - DEPTH_THREE_STEP;
        siblingTopLeftIdx = siblingTopIdx - DEPTH_THREE_STEP;
        depthTwoIdx       = siblingTopLeftIdx - 1;

        // The N block takes the neighbour mode/depth of the top-left N+1 block
        // for the split-context calculation, in case the N block was never
        // evaluated itself.
        ExitInterDepthInheritNeighborInfo(contextPtr, depthTwoIdx, siblingTopLeftIdx);

        // Depth N cost
        SplitFlagRate(
            contextPtr,
            tbPtr->codedLeafArrayPtr[depthTwoIdx],
            0,
            &noSplitRate,
            fullLambda,
            mdRateEstimationPtr,
            tbMaxDepth);
        if (contextPtr->mdLocalCuUnit[depthTwoIdx].testedCuFlag == EB_FALSE)
            contextPtr->mdLocalCuUnit[depthTwoIdx].cost = MAX_CU_COST;

        noSplitCost = contextPtr->mdLocalCuUnit[depthTwoIdx].cost + noSplitRate;

        // Depth N+1 cost
        SplitFlagRate(
            contextPtr,
            tbPtr->codedLeafArrayPtr[depthTwoIdx],
            1,
            &splitRate,
            fullLambda,
            mdRateEstimationPtr,
            tbMaxDepth);
        splitCost =
            contextPtr->mdLocalCuUnit[leafIndex].cost +
            contextPtr->mdLocalCuUnit[siblingLeftIdx].cost +
            contextPtr->mdLocalCuUnit[siblingTopIdx].cost +
            contextPtr->mdLocalCuUnit[siblingTopLeftIdx].cost +
            splitRate;

        // Inter depth comparison: depth 2 vs depth 3
        if (splitCost < noSplitCost) {
            // Splitting wins: record the children's total on the parent so the
            // next inter depth decision can use it.
            contextPtr->mdLocalCuUnit[depthTwoIdx].cost = splitCost;
        }
        else {
            // Parent wins: stop splitting here and make it the last CU.
            tbPtr->codedLeafArrayPtr[depthTwoIdx]->splitFlag = EB_FALSE;
            contextPtr->mdLocalCuUnit[depthTwoIdx].cost = noSplitCost;
            keptCuIndex = depthTwoIdx;
        }
    }

    // Stage 1: Inter depth decision: depth 1 vs depth 2

    // Walks to the last coded 16x16 block for merging
    depthTwoStats = GetCodedUnitStats(depthTwoIdx);
    originX = tbOriginX + depthTwoStats->originX;
    originY = tbOriginY + depthTwoStats->originY;

    if (GROUP_OF_4_16x16_BLOCKS(originX, originY) &&
        (contextPtr->groupOf8x8BlocksCount == 4)) {

        contextPtr->groupOf8x8BlocksCount = 0;
        contextPtr->groupOf16x16BlocksCount++;

        // The three earlier siblings, then the parent candidate just before them
        siblingLeftIdx    = depthTwoIdx - DEPTH_TWO_STEP;
        siblingTopIdx     = siblingLeftIdx - DEPTH_TWO_STEP;
        siblingTopLeftIdx = siblingTopIdx - DEPTH_TWO_STEP;
        depthOneIdx       = siblingTopLeftIdx - 1;

        // Neighbour info is inherited before the depth check below
        ExitInterDepthInheritNeighborInfo(contextPtr, depthOneIdx, siblingTopLeftIdx);

        depthOneStats = GetCodedUnitStats(depthOneIdx);
        if (depthOneStats->depth == 1) {

            // Depth N cost
            SplitFlagRate(
                contextPtr,
                tbPtr->codedLeafArrayPtr[depthOneIdx],
                0,
                &noSplitRate,
                fullLambda,
                mdRateEstimationPtr,
                tbMaxDepth);
            if (contextPtr->mdLocalCuUnit[depthOneIdx].testedCuFlag == EB_FALSE)
                contextPtr->mdLocalCuUnit[depthOneIdx].cost = MAX_CU_COST;
            noSplitCost = contextPtr->mdLocalCuUnit[depthOneIdx].cost + noSplitRate;

            // Depth N+1 cost
            SplitFlagRate(
                contextPtr,
                tbPtr->codedLeafArrayPtr[depthOneIdx],
                1,
                &splitRate,
                fullLambda,
                mdRateEstimationPtr,
                tbMaxDepth);
            splitCost =
                contextPtr->mdLocalCuUnit[depthTwoIdx].cost +
                contextPtr->mdLocalCuUnit[siblingLeftIdx].cost +
                contextPtr->mdLocalCuUnit[siblingTopIdx].cost +
                contextPtr->mdLocalCuUnit[siblingTopLeftIdx].cost +
                splitRate;

            // Every child is expected to carry a real cost at this point
            CHECK_REPORT_ERROR(
                (contextPtr->mdLocalCuUnit[depthTwoIdx].cost != MAX_CU_COST),
                encodeContextPtr->appCallbackPtr,
                EB_ENC_FL_ERROR4);
            CHECK_REPORT_ERROR(
                (contextPtr->mdLocalCuUnit[siblingLeftIdx].cost != MAX_CU_COST),
                encodeContextPtr->appCallbackPtr,
                EB_ENC_FL_ERROR4);
            CHECK_REPORT_ERROR(
                (contextPtr->mdLocalCuUnit[siblingTopIdx].cost != MAX_CU_COST),
                encodeContextPtr->appCallbackPtr,
                EB_ENC_FL_ERROR4);
            CHECK_REPORT_ERROR(
                (contextPtr->mdLocalCuUnit[siblingTopLeftIdx].cost != MAX_CU_COST),
                encodeContextPtr->appCallbackPtr,
                EB_ENC_FL_ERROR4);

            // Inter depth comparison: depth 1 vs depth 2
            if (splitCost < noSplitCost) {
                // Splitting wins: record the children's total on the parent so
                // the next inter depth decision can use it.
                contextPtr->mdLocalCuUnit[depthOneIdx].cost = splitCost;
            }
            else {
                // Parent wins: stop splitting here and make it the last CU.
                tbPtr->codedLeafArrayPtr[depthOneIdx]->splitFlag = EB_FALSE;
                contextPtr->mdLocalCuUnit[depthOneIdx].cost = noSplitCost;
                keptCuIndex = depthOneIdx;
            }
        }
    }

    // Stage 2: Inter depth decision: depth 0 vs depth 1

    // Walks to the last coded 32x32 block for merging
    // Stage 2 isn't performed in I slices because of the absence of 64x64 candidates
    // NOTE(review): the depth-1 stats are fetched here, but the 32x32 group test
    // below is fed the depth-2 block origin (unchanged behaviour) - confirm intent.
    depthOneStats = GetCodedUnitStats(depthOneIdx);
    originX = tbOriginX + depthTwoStats->originX;
    originY = tbOriginY + depthTwoStats->originY;

    if ((pictureControlSetPtr->sliceType == EB_P_PICTURE || pictureControlSetPtr->sliceType == EB_B_PICTURE)
        && GROUP_OF_4_32x32_BLOCKS(originX, originY) &&
        (contextPtr->groupOf16x16BlocksCount == 4)) {

        contextPtr->groupOf16x16BlocksCount = 0;

        // The three earlier siblings, then the parent candidate just before them
        siblingLeftIdx    = depthOneIdx - DEPTH_ONE_STEP;
        siblingTopIdx     = siblingLeftIdx - DEPTH_ONE_STEP;
        siblingTopLeftIdx = siblingTopIdx - DEPTH_ONE_STEP;
        depthZeroIdx      = siblingTopLeftIdx - 1;

        // Neighbour info is inherited before the depth check below
        ExitInterDepthInheritNeighborInfo(contextPtr, depthZeroIdx, siblingTopLeftIdx);

        depthZeroStats = GetCodedUnitStats(depthZeroIdx);
        if (depthZeroStats->depth == 0) {

            // Depth N cost
            SplitFlagRate(
                contextPtr,
                tbPtr->codedLeafArrayPtr[depthZeroIdx],
                0,
                &noSplitRate,
                fullLambda,
                mdRateEstimationPtr,
                tbMaxDepth);
            if (contextPtr->mdLocalCuUnit[depthZeroIdx].testedCuFlag == EB_FALSE)
                contextPtr->mdLocalCuUnit[depthZeroIdx].cost = MAX_CU_COST;
            noSplitCost = contextPtr->mdLocalCuUnit[depthZeroIdx].cost + noSplitRate;

            // Depth N+1 cost
            SplitFlagRate(
                contextPtr,
                tbPtr->codedLeafArrayPtr[depthZeroIdx],
                1,
                &splitRate,
                fullLambda,
                mdRateEstimationPtr,
                tbMaxDepth);
            splitCost =
                contextPtr->mdLocalCuUnit[depthOneIdx].cost +
                contextPtr->mdLocalCuUnit[siblingLeftIdx].cost +
                contextPtr->mdLocalCuUnit[siblingTopIdx].cost +
                contextPtr->mdLocalCuUnit[siblingTopLeftIdx].cost +
                splitRate;

            // Inter depth comparison: depth 0 vs depth 1
            // (no cost is written back at this stage)
            if (!(splitCost < noSplitCost)) {
                tbPtr->codedLeafArrayPtr[depthZeroIdx]->splitFlag = EB_FALSE;
                keptCuIndex = depthZeroIdx;
            }
        }
    }

    return keptCuIndex;
}
1391 
StopSplitCondition(SequenceControlSet_t * sequenceControlSetPtr,PictureControlSet_t * pictureControlSetPtr,ModeDecisionContext_t * contextPtr,const CodedUnitStats_t * curCuStatsPtr,EB_U32 lcuAddr,EB_U32 leafIndex)1392 EB_BOOL  StopSplitCondition(
1393     SequenceControlSet_t    *sequenceControlSetPtr,
1394     PictureControlSet_t     *pictureControlSetPtr,
1395     ModeDecisionContext_t   *contextPtr,
1396     const CodedUnitStats_t  *curCuStatsPtr,
1397     EB_U32                   lcuAddr,
1398     EB_U32                   leafIndex)
1399 {
1400 
1401     LcuParams_t *lcuParams = &sequenceControlSetPtr->lcuParamsArray[lcuAddr];
1402 
1403     EB_BOOL stopSplitFlag = EB_TRUE;
1404 
1405     if ( pictureControlSetPtr->ParentPcsPtr->depthMode == PICT_FULL85_DEPTH_MODE ||
1406          pictureControlSetPtr->ParentPcsPtr->depthMode == PICT_FULL84_DEPTH_MODE ||
1407          (pictureControlSetPtr->ParentPcsPtr->depthMode == PICT_LCU_SWITCH_DEPTH_MODE && (pictureControlSetPtr->ParentPcsPtr->lcuMdModeArray[lcuAddr] == LCU_FULL85_DEPTH_MODE || pictureControlSetPtr->ParentPcsPtr->lcuMdModeArray[lcuAddr] == LCU_FULL84_DEPTH_MODE || pictureControlSetPtr->ParentPcsPtr->lcuMdModeArray[lcuAddr] == LCU_AVC_DEPTH_MODE))
1408         ) {
1409 
1410         stopSplitFlag = EB_FALSE;
1411     }
1412     else if (pictureControlSetPtr->temporalLayerIndex == 0) {
1413         stopSplitFlag = EB_FALSE;
1414     }
1415     else{
1416         if (sequenceControlSetPtr->staticConfig.qp >= 20    &&
1417             pictureControlSetPtr->sliceType != EB_I_PICTURE      &&
1418             pictureControlSetPtr->temporalLayerIndex == 0   &&
1419             pictureControlSetPtr->ParentPcsPtr->logoPicFlag &&
1420             pictureControlSetPtr->ParentPcsPtr->edgeResultsPtr[lcuAddr].edgeBlockNum) {
1421 
1422             stopSplitFlag = EB_FALSE;
1423         }
1424 
1425 
1426         if (stopSplitFlag != EB_FALSE)
1427         {
1428 
1429             EB_U32 lcuEdgeFlag = pictureControlSetPtr->ParentPcsPtr->edgeResultsPtr[lcuAddr].edgeBlockNum == 0 ? 0 : 1;
1430             EB_U64 d0Th;
1431             EB_U64 d1Th;
1432             EB_U64 d2Th;
1433 
1434             d0Th = depth0Th[lcuEdgeFlag][pictureControlSetPtr->ParentPcsPtr->hierarchicalLevels][pictureControlSetPtr->temporalLayerIndex];
1435             d1Th = depth1Th[lcuEdgeFlag][pictureControlSetPtr->ParentPcsPtr->hierarchicalLevels][pictureControlSetPtr->temporalLayerIndex];
1436             d2Th = depth2Th[lcuEdgeFlag][pictureControlSetPtr->ParentPcsPtr->hierarchicalLevels][pictureControlSetPtr->temporalLayerIndex];
1437 
1438             EB_BOOL interSlice =    (pictureControlSetPtr->sliceType == EB_P_PICTURE) || (pictureControlSetPtr->sliceType == EB_B_PICTURE) ? EB_TRUE : EB_FALSE;
1439             EB_BOOL stopAtDepth0 = ((curCuStatsPtr->depth == 0) && (contextPtr->mdLocalCuUnit[leafIndex].fullDistortion < d0Th)) ? EB_TRUE : EB_FALSE;
1440             EB_BOOL stopAtDepth1 = ((curCuStatsPtr->depth == 1) && (contextPtr->mdLocalCuUnit[leafIndex].fullDistortion < d1Th)) ? EB_TRUE : EB_FALSE;
1441             EB_BOOL stopAtDepth2 = ((curCuStatsPtr->depth == 2) && (contextPtr->mdLocalCuUnit[leafIndex].fullDistortion < d2Th)) ? EB_TRUE : EB_FALSE;
1442 
1443             stopSplitFlag = (interSlice && (stopAtDepth0 || stopAtDepth1 || stopAtDepth2)) ? EB_TRUE : EB_FALSE;
1444 
1445             if  (!lcuParams->isCompleteLcu                                                        ||
1446                  pictureControlSetPtr->ParentPcsPtr->lcuIsolatedNonHomogeneousAreaArray[lcuAddr] ||
1447                  (sequenceControlSetPtr->inputResolution < INPUT_SIZE_4K_RANGE && pictureControlSetPtr->lcuPtrArray[lcuAddr]->auraStatus == AURA_STATUS_1)) {
1448 
1449                 stopSplitFlag = EB_FALSE;
1450             }
1451 
1452         }
1453 
1454     }
1455     return stopSplitFlag;
1456 }
1457 
1458 /**********************************************
1459  * Inter Depth Split Decision
1460  **********************************************/
ProductPerformInterDepthDecision(ModeDecisionContext_t * contextPtr,EB_U32 leafIndex,LargestCodingUnit_t * tbPtr,EB_U32 lcuAddr,EB_U32 tbOriginX,EB_U32 tbOriginY,EB_U64 fullLambda,MdRateEstimationContext_t * mdRateEstimationPtr,PictureControlSet_t * pictureControlSetPtr)1461 EB_U32 ProductPerformInterDepthDecision(
1462     ModeDecisionContext_t          *contextPtr,
1463     EB_U32                          leafIndex,
1464     LargestCodingUnit_t            *tbPtr,
1465     EB_U32                          lcuAddr,
1466     EB_U32                          tbOriginX,
1467     EB_U32                          tbOriginY,
1468     EB_U64                          fullLambda,
1469     MdRateEstimationContext_t      *mdRateEstimationPtr,
1470     PictureControlSet_t            *pictureControlSetPtr)
1471 {
1472     EB_U32                     lastCuIndex;
1473     EB_U32                     leftCuIndex;
1474     EB_U32                     topCuIndex;
1475     EB_U32                     topLeftCuIndex;
1476     EB_U32                     depthZeroCandidateCuIndex;
1477     EB_U32                     depthOneCandidateCuIndex = leafIndex;
1478     EB_U32                     depthTwoCandidateCuIndex = leafIndex;
1479     EB_U64                     depthNRate = 0;
1480     EB_U64                     depthNPlusOneRate = 0;
1481     EB_U64                     depthNCost = 0;
1482     EB_U64                     depthNPlusOneCost = 0;
1483     EB_U32                     cuOriginX;
1484     EB_U32                     cuOriginY;
1485 
1486     EB_U32                     tbMaxDepth = ((SequenceControlSet_t*)pictureControlSetPtr->sequenceControlSetWrapperPtr->objectPtr)->maxLcuDepth;
1487     EB_BOOL                    stopSplitFlag ;
1488     EB_BOOL                    lastDepthFlag = tbPtr->codedLeafArrayPtr[leafIndex]->splitFlag == EB_FALSE ? EB_TRUE : EB_FALSE;
1489     EncodeContext_t           *encodeContextPtr = ((SequenceControlSet_t*)(pictureControlSetPtr->sequenceControlSetWrapperPtr->objectPtr))->encodeContextPtr;
1490     SequenceControlSet_t      *sequenceControlSetPtr = (SequenceControlSet_t*)pictureControlSetPtr->sequenceControlSetWrapperPtr->objectPtr;
1491     const CodedUnitStats_t    *curCuStatsPtr;
1492     const CodedUnitStats_t    *depthTwoCuStatsPtr;
1493     const CodedUnitStats_t    *depthOneCuStatsPtr;
1494     const CodedUnitStats_t    *depthZeroCuStatsPtr;
1495 
1496     lastCuIndex = leafIndex;
1497     curCuStatsPtr = GetCodedUnitStats(leafIndex);
1498     cuOriginX = tbOriginX + curCuStatsPtr->originX;
1499     cuOriginY = tbOriginY + curCuStatsPtr->originY;
1500     EB_U8 interDepthW12 = 0;
1501     EB_U8 interDepthW01 = 0;
1502 
1503     stopSplitFlag = StopSplitCondition(
1504         sequenceControlSetPtr,
1505         pictureControlSetPtr,
1506         contextPtr,
1507         curCuStatsPtr,
1508         lcuAddr,
1509         leafIndex);
1510 
1511     if (lastDepthFlag || stopSplitFlag) {
1512 		tbPtr->codedLeafArrayPtr[leafIndex]->splitFlag = EB_FALSE;
1513 
1514 
1515         if (curCuStatsPtr->depth == 1) {
1516             contextPtr->groupOf16x16BlocksCount ++;
1517         } else if (curCuStatsPtr->depth == 2) {
1518             contextPtr->groupOf8x8BlocksCount ++;
1519         }
1520     }
1521 
1522     /*** Stage 0: Inter depth decision: depth 2 vs depth 3 ***/
1523 
1524     // Walks to the last coded 8x8 block for merging
1525     if ((GROUP_OF_4_8x8_BLOCKS(cuOriginX, cuOriginY))) {
1526 
1527         depthTwoCandidateCuIndex = leafIndex - DEPTH_THREE_STEP - DEPTH_THREE_STEP - DEPTH_THREE_STEP - 1;
1528 
1529         contextPtr->groupOf8x8BlocksCount ++;
1530 
1531         // From the last coded cu index, get the indices of the left, top, and top left cus
1532         leftCuIndex = leafIndex - DEPTH_THREE_STEP;
1533         topCuIndex = leftCuIndex - DEPTH_THREE_STEP;
1534         topLeftCuIndex = topCuIndex - DEPTH_THREE_STEP;
1535 
1536         // From the top left index, get the index of the candidate pu for merging
1537         depthTwoCandidateCuIndex = topLeftCuIndex - 1;
1538 
1539         // Copy the Mode & Depth of the Top-Left N+1 block to the N block for the SplitContext calculation
1540         //   This needs to be done in the case that the N block was initially not calculated.
1541 
1542 		contextPtr->mdLocalCuUnit[depthTwoCandidateCuIndex].leftNeighborMode = contextPtr->mdLocalCuUnit[topLeftCuIndex].leftNeighborMode;
1543 		contextPtr->mdLocalCuUnit[depthTwoCandidateCuIndex].leftNeighborDepth = contextPtr->mdLocalCuUnit[topLeftCuIndex].leftNeighborDepth;
1544 		contextPtr->mdLocalCuUnit[depthTwoCandidateCuIndex].topNeighborMode = contextPtr->mdLocalCuUnit[topLeftCuIndex].topNeighborMode;
1545 		contextPtr->mdLocalCuUnit[depthTwoCandidateCuIndex].topNeighborDepth = contextPtr->mdLocalCuUnit[topLeftCuIndex].topNeighborDepth;
1546 
1547         // Compute depth N cost
1548 		SplitFlagRate(
1549 			contextPtr,
1550             tbPtr->codedLeafArrayPtr[depthTwoCandidateCuIndex],
1551             0,
1552             &depthNRate,
1553             fullLambda,
1554             mdRateEstimationPtr,
1555 			tbMaxDepth);
1556 		if (contextPtr->mdLocalCuUnit[depthTwoCandidateCuIndex].testedCuFlag == EB_FALSE)
1557 			contextPtr->mdLocalCuUnit[depthTwoCandidateCuIndex].cost = MAX_CU_COST;
1558 
1559 		depthNCost = contextPtr->mdLocalCuUnit[depthTwoCandidateCuIndex].cost + depthNRate;
1560 
1561 		// Compute depth N+1 cost
1562 		SplitFlagRate(
1563 			contextPtr,
1564 			tbPtr->codedLeafArrayPtr[depthTwoCandidateCuIndex],
1565 			1,
1566 			&depthNPlusOneRate,
1567 			fullLambda,
1568 			mdRateEstimationPtr,
1569 			tbMaxDepth);
1570 		depthNPlusOneCost =
1571 			contextPtr->mdLocalCuUnit[leafIndex].cost +
1572 			contextPtr->mdLocalCuUnit[leftCuIndex].cost +
1573 			contextPtr->mdLocalCuUnit[topCuIndex].cost +
1574 			contextPtr->mdLocalCuUnit[topLeftCuIndex].cost +
1575 			depthNPlusOneRate;
1576 		// Inter depth comparison: depth 2 vs depth 3
1577 		if (depthNCost <= depthNPlusOneCost){
1578 
1579 			// If the cost is low enough to warrant not spliting further:
1580 			// 1. set the split flag of the candidate pu for merging to false
1581 			// 2. update the last pu index
1582 			tbPtr->codedLeafArrayPtr[depthTwoCandidateCuIndex]->splitFlag = EB_FALSE;
1583 			contextPtr->mdLocalCuUnit[depthTwoCandidateCuIndex].cost = depthNCost;
1584 			lastCuIndex = depthTwoCandidateCuIndex;
1585 		}
1586 		else {
1587 			// If the cost is not low enough:
1588 			// update the cost of the candidate pu for merging
1589 			// this update is required for the next inter depth decision
1590 			contextPtr->mdLocalCuUnit[depthTwoCandidateCuIndex].cost = depthNPlusOneCost;
1591 		}
1592 
1593 
1594     }
1595 
1596     // Stage 1: Inter depth decision: depth 1 vs depth 2
1597 
1598     // Walks to the last coded 16x16 block for merging
1599     depthTwoCuStatsPtr = GetCodedUnitStats(depthTwoCandidateCuIndex);
1600     cuOriginX  = tbOriginX + depthTwoCuStatsPtr->originX;
1601     cuOriginY  = tbOriginY + depthTwoCuStatsPtr->originY;
1602     if (GROUP_OF_4_16x16_BLOCKS(cuOriginX, cuOriginY) &&
1603         (contextPtr->groupOf8x8BlocksCount == 4 ) ){
1604 
1605         depthOneCandidateCuIndex = depthTwoCandidateCuIndex - DEPTH_TWO_STEP - DEPTH_TWO_STEP - DEPTH_TWO_STEP - 1;
1606 
1607         contextPtr->groupOf8x8BlocksCount = 0;
1608         contextPtr->groupOf16x16BlocksCount ++;
1609 
1610         // From the last coded pu index, get the indices of the left, top, and top left pus
1611         leftCuIndex = depthTwoCandidateCuIndex - DEPTH_TWO_STEP;
1612         topCuIndex = leftCuIndex - DEPTH_TWO_STEP;
1613         topLeftCuIndex = topCuIndex - DEPTH_TWO_STEP;
1614 
1615         // Copy the Mode & Depth of the Top-Left N+1 block to the N block for the SplitContext calculation
1616         //   This needs to be done in the case that the N block was initially not calculated.
1617 
1618 		contextPtr->mdLocalCuUnit[depthOneCandidateCuIndex].leftNeighborMode = contextPtr->mdLocalCuUnit[topLeftCuIndex].leftNeighborMode;
1619 		contextPtr->mdLocalCuUnit[depthOneCandidateCuIndex].leftNeighborDepth = contextPtr->mdLocalCuUnit[topLeftCuIndex].leftNeighborDepth;
1620 		contextPtr->mdLocalCuUnit[depthOneCandidateCuIndex].topNeighborMode = contextPtr->mdLocalCuUnit[topLeftCuIndex].topNeighborMode;
1621 		contextPtr->mdLocalCuUnit[depthOneCandidateCuIndex].topNeighborDepth = contextPtr->mdLocalCuUnit[topLeftCuIndex].topNeighborDepth;
1622 
1623         // From the top left index, get the index of the candidate pu for merging
1624         depthOneCandidateCuIndex = topLeftCuIndex - 1;
1625 
1626         depthOneCuStatsPtr = GetCodedUnitStats(depthOneCandidateCuIndex);
1627         if (depthOneCuStatsPtr->depth == 1) {
1628 
1629             // Compute depth N cost
1630 			SplitFlagRate(
1631 				contextPtr,
1632                 tbPtr->codedLeafArrayPtr[depthOneCandidateCuIndex],
1633                 0,
1634                 &depthNRate,
1635                 fullLambda,
1636                 mdRateEstimationPtr,
1637 				tbMaxDepth);
1638 			if (contextPtr->mdLocalCuUnit[depthOneCandidateCuIndex].testedCuFlag == EB_FALSE)
1639 				contextPtr->mdLocalCuUnit[depthOneCandidateCuIndex].cost = MAX_CU_COST;
1640 			depthNCost = contextPtr->mdLocalCuUnit[depthOneCandidateCuIndex].cost + depthNRate;
1641 
1642 			// Compute depth N+1 cost
1643 			SplitFlagRate(
1644 				contextPtr,
1645 				tbPtr->codedLeafArrayPtr[depthOneCandidateCuIndex],
1646 				1,
1647 				&depthNPlusOneRate,
1648 				fullLambda,
1649 				mdRateEstimationPtr,
1650 				tbMaxDepth);
1651 			depthNPlusOneCost =
1652 				contextPtr->mdLocalCuUnit[depthTwoCandidateCuIndex].cost +
1653 				contextPtr->mdLocalCuUnit[leftCuIndex].cost +
1654 				contextPtr->mdLocalCuUnit[topCuIndex].cost +
1655 				contextPtr->mdLocalCuUnit[topLeftCuIndex].cost +
1656 				depthNPlusOneRate;
1657 			CHECK_REPORT_ERROR(
1658 				(contextPtr->mdLocalCuUnit[depthTwoCandidateCuIndex].cost != MAX_CU_COST),
1659 				encodeContextPtr->appCallbackPtr,
1660 				EB_ENC_FL_ERROR4);
1661 			CHECK_REPORT_ERROR(
1662 				(contextPtr->mdLocalCuUnit[leftCuIndex].cost != MAX_CU_COST),
1663 				encodeContextPtr->appCallbackPtr,
1664 				EB_ENC_FL_ERROR4);
1665 			CHECK_REPORT_ERROR(
1666 				(contextPtr->mdLocalCuUnit[topCuIndex].cost != MAX_CU_COST),
1667 				encodeContextPtr->appCallbackPtr,
1668 				EB_ENC_FL_ERROR4);
1669 			CHECK_REPORT_ERROR(
1670 				(contextPtr->mdLocalCuUnit[topLeftCuIndex].cost != MAX_CU_COST),
1671 				encodeContextPtr->appCallbackPtr,
1672 				EB_ENC_FL_ERROR4);
1673 
1674 			if (depthNPlusOneCost < MAX_CU_COST)
1675 				depthNPlusOneCost = depthNPlusOneCost + ((EB_S64)depthNPlusOneCost*interDepthW12) / 100;
1676 
1677 			// Inter depth comparison: depth 1 vs depth 2
1678 			if (depthNCost <= depthNPlusOneCost){
1679 
1680 				// If the cost is low enough to warrant not spliting further:
1681 				// 1. set the split flag of the candidate pu for merging to false
1682 				// 2. update the last pu index
1683 				tbPtr->codedLeafArrayPtr[depthOneCandidateCuIndex]->splitFlag = EB_FALSE;
1684 				contextPtr->mdLocalCuUnit[depthOneCandidateCuIndex].cost = depthNCost;
1685 				lastCuIndex = depthOneCandidateCuIndex;
1686 			}
1687 			else {
1688 				// If the cost is not low enough:
1689 				// update the cost of the candidate pu for merging
1690 				// this update is required for the next inter depth decision
1691 				contextPtr->mdLocalCuUnit[depthOneCandidateCuIndex].cost = depthNPlusOneCost;
1692 			}
1693 
1694 
1695         }
1696     }
1697 
1698     // Stage 2: Inter depth decision: depth 0 vs depth 1
1699 
1700     // Walks to the last coded 32x32 block for merging
1701     // Stage 2 isn't performed in I slices since the abcense of 64x64 candidates
1702     depthOneCuStatsPtr = GetCodedUnitStats(depthOneCandidateCuIndex);
1703     cuOriginX  = tbOriginX + depthTwoCuStatsPtr->originX;
1704     cuOriginY  = tbOriginY + depthTwoCuStatsPtr->originY;
1705     if ((pictureControlSetPtr->sliceType == EB_P_PICTURE || pictureControlSetPtr->sliceType == EB_B_PICTURE )
1706         && GROUP_OF_4_32x32_BLOCKS(cuOriginX, cuOriginY) &&
1707         (contextPtr->groupOf16x16BlocksCount == 4)) {
1708 
1709         depthZeroCandidateCuIndex = depthOneCandidateCuIndex - DEPTH_ONE_STEP - DEPTH_ONE_STEP - DEPTH_ONE_STEP - 1;
1710 
1711         contextPtr->groupOf16x16BlocksCount = 0;
1712 
1713         // From the last coded pu index, get the indices of the left, top, and top left pus
1714         leftCuIndex = depthOneCandidateCuIndex - DEPTH_ONE_STEP;
1715         topCuIndex = leftCuIndex - DEPTH_ONE_STEP;
1716         topLeftCuIndex = topCuIndex - DEPTH_ONE_STEP;
1717 
1718         // Copy the Mode & Depth of the Top-Left N+1 block to the N block for the SplitContext calculation
1719         //   This needs to be done in the case that the N block was initially not calculated.
1720 
1721 		contextPtr->mdLocalCuUnit[depthZeroCandidateCuIndex].leftNeighborMode = contextPtr->mdLocalCuUnit[topLeftCuIndex].leftNeighborMode;
1722 		contextPtr->mdLocalCuUnit[depthZeroCandidateCuIndex].leftNeighborDepth = contextPtr->mdLocalCuUnit[topLeftCuIndex].leftNeighborDepth;
1723 		contextPtr->mdLocalCuUnit[depthZeroCandidateCuIndex].topNeighborMode = contextPtr->mdLocalCuUnit[topLeftCuIndex].topNeighborMode;
1724 		contextPtr->mdLocalCuUnit[depthZeroCandidateCuIndex].topNeighborDepth = contextPtr->mdLocalCuUnit[topLeftCuIndex].topNeighborDepth;
1725 
1726         // From the top left index, get the index of the candidate pu for merging
1727         depthZeroCandidateCuIndex = topLeftCuIndex - 1;
1728 
1729         depthZeroCuStatsPtr = GetCodedUnitStats(depthZeroCandidateCuIndex);
1730         if (depthZeroCuStatsPtr->depth == 0) {
1731 
1732             // Compute depth N cost
1733 			SplitFlagRate(
1734 				contextPtr,
1735                 tbPtr->codedLeafArrayPtr[depthZeroCandidateCuIndex],
1736                 0,
1737                 &depthNRate,
1738                 fullLambda,
1739                 mdRateEstimationPtr,
1740 				tbMaxDepth);
1741 			if (contextPtr->mdLocalCuUnit[depthZeroCandidateCuIndex].testedCuFlag == EB_FALSE)
1742 				contextPtr->mdLocalCuUnit[depthZeroCandidateCuIndex].cost = MAX_CU_COST;
1743 			depthNCost = contextPtr->mdLocalCuUnit[depthZeroCandidateCuIndex].cost + depthNRate;
1744 			// Compute depth N+1 cost
1745 			SplitFlagRate(
1746 				contextPtr,
1747 				tbPtr->codedLeafArrayPtr[depthZeroCandidateCuIndex],
1748 				1,
1749 				&depthNPlusOneRate,
1750 				fullLambda,
1751 				mdRateEstimationPtr,
1752 				tbMaxDepth);
1753 			depthNPlusOneCost =
1754 				contextPtr->mdLocalCuUnit[depthOneCandidateCuIndex].cost +
1755 				contextPtr->mdLocalCuUnit[leftCuIndex].cost +
1756 				contextPtr->mdLocalCuUnit[topCuIndex].cost +
1757 				contextPtr->mdLocalCuUnit[topLeftCuIndex].cost +
1758 				depthNPlusOneRate;
1759 			if (depthNPlusOneCost < MAX_CU_COST)
1760 				depthNPlusOneCost = depthNPlusOneCost + ((EB_S64)depthNPlusOneCost*interDepthW01) / 100;
1761 
1762 			// Inter depth comparison: depth 0 vs depth 1
1763 			if (depthNCost <= depthNPlusOneCost){
1764 
1765 				// If the cost is low enough to warrant not spliting further:
1766 				// 1. set the split flag of the candidate pu for merging to false
1767 				// 2. update the last pu index
1768 				tbPtr->codedLeafArrayPtr[depthZeroCandidateCuIndex]->splitFlag = EB_FALSE;
1769 				lastCuIndex = depthZeroCandidateCuIndex;
1770 			}
1771 
1772 
1773          }
1774     }
1775 
1776     return lastCuIndex;
1777 }
1778 
PillarInterDepthDecision(ModeDecisionContext_t * contextPtr,EB_U32 leafIndex,LargestCodingUnit_t * tbPtr,EB_U32 tbOriginX,EB_U32 tbOriginY,EB_U64 fullLambda,MdRateEstimationContext_t * mdRateEstimationPtr,PictureControlSet_t * pictureControlSetPtr)1779 EB_U32 PillarInterDepthDecision(
1780     ModeDecisionContext_t          *contextPtr,
1781     EB_U32                          leafIndex,
1782     LargestCodingUnit_t            *tbPtr,
1783     EB_U32                          tbOriginX,
1784     EB_U32                          tbOriginY,
1785     EB_U64                          fullLambda,
1786     MdRateEstimationContext_t      *mdRateEstimationPtr,
1787     PictureControlSet_t            *pictureControlSetPtr)
1788 {
1789     EB_U32                     lastCuIndex;
1790     EB_U32                     leftCuIndex;
1791     EB_U32                     topCuIndex;
1792     EB_U32                     topLeftCuIndex;
1793     EB_U32                     depthZeroCandidateCuIndex;
1794     EB_U32                     depthOneCandidateCuIndex = leafIndex;
1795     EB_U32                     depthTwoCandidateCuIndex = leafIndex;
1796     EB_U64                     depthNRate = 0;
1797     EB_U64                     depthNPlusOneRate = 0;
1798     EB_U64                     depthNCost = 0;
1799     EB_U64                     depthNPlusOneCost = 0;
1800     EB_U32                     cuOriginX;
1801     EB_U32                     cuOriginY;
1802 
1803     SequenceControlSet_t *sequenceControlSetPtr = (SequenceControlSet_t*)pictureControlSetPtr->sequenceControlSetWrapperPtr->objectPtr;
1804     EncodeContext_t      *encodeContextPtr = sequenceControlSetPtr->encodeContextPtr;
1805     EB_U32                tbMaxDepth = sequenceControlSetPtr->maxLcuDepth;
1806     EB_BOOL               lastDepthFlag = tbPtr->codedLeafArrayPtr[leafIndex]->splitFlag == EB_FALSE ? EB_TRUE : EB_FALSE;
1807 
1808     const CodedUnitStats_t    *curCuStatsPtr;
1809     const CodedUnitStats_t    *depthTwoCuStatsPtr;
1810     const CodedUnitStats_t    *depthOneCuStatsPtr;
1811     const CodedUnitStats_t    *depthZeroCuStatsPtr;
1812 
1813     lastCuIndex = leafIndex;
1814     curCuStatsPtr = GetCodedUnitStats(leafIndex);
1815     cuOriginX = tbOriginX + curCuStatsPtr->originX;
1816     cuOriginY = tbOriginY + curCuStatsPtr->originY;
1817     EB_U8 interDepthW12 = 0;
1818     EB_U8 interDepthW01 = 0;
1819 
1820     if (lastDepthFlag) {
1821         tbPtr->codedLeafArrayPtr[leafIndex]->splitFlag = EB_FALSE;
1822 
1823 
1824         if (curCuStatsPtr->depth == 1) {
1825             contextPtr->groupOf16x16BlocksCount++;
1826         }
1827         else if (curCuStatsPtr->depth == 2) {
1828             contextPtr->groupOf8x8BlocksCount++;
1829         }
1830     }
1831 
1832     /*** Stage 0: Inter depth decision: depth 2 vs depth 3 ***/
1833 
1834     // Walks to the last coded 8x8 block for merging
1835     if ((GROUP_OF_4_8x8_BLOCKS(cuOriginX, cuOriginY))) {
1836 
1837         depthTwoCandidateCuIndex = leafIndex - DEPTH_THREE_STEP - DEPTH_THREE_STEP - DEPTH_THREE_STEP - 1;
1838 
1839         contextPtr->groupOf8x8BlocksCount++;
1840 
1841         // From the last coded cu index, get the indices of the left, top, and top left cus
1842         leftCuIndex = leafIndex - DEPTH_THREE_STEP;
1843         topCuIndex = leftCuIndex - DEPTH_THREE_STEP;
1844         topLeftCuIndex = topCuIndex - DEPTH_THREE_STEP;
1845 
1846         // From the top left index, get the index of the candidate pu for merging
1847         depthTwoCandidateCuIndex = topLeftCuIndex - 1;
1848 
1849         // Copy the Mode & Depth of the Top-Left N+1 block to the N block for the SplitContext calculation
1850         //   This needs to be done in the case that the N block was initially not calculated.
1851 
1852 		contextPtr->mdLocalCuUnit[depthTwoCandidateCuIndex].leftNeighborMode = contextPtr->mdLocalCuUnit[topLeftCuIndex].leftNeighborMode;
1853 		contextPtr->mdLocalCuUnit[depthTwoCandidateCuIndex].leftNeighborDepth = contextPtr->mdLocalCuUnit[topLeftCuIndex].leftNeighborDepth;
1854 		contextPtr->mdLocalCuUnit[depthTwoCandidateCuIndex].topNeighborMode = contextPtr->mdLocalCuUnit[topLeftCuIndex].topNeighborMode;
1855 		contextPtr->mdLocalCuUnit[depthTwoCandidateCuIndex].topNeighborDepth = contextPtr->mdLocalCuUnit[topLeftCuIndex].topNeighborDepth;
1856 
1857         // Compute depth N cost
1858 		SplitFlagRate(
1859 			contextPtr,
1860             tbPtr->codedLeafArrayPtr[depthTwoCandidateCuIndex],
1861             0,
1862             &depthNRate,
1863             fullLambda,
1864             mdRateEstimationPtr,
1865 			tbMaxDepth);
1866 		if (contextPtr->mdLocalCuUnit[depthTwoCandidateCuIndex].testedCuFlag == EB_FALSE)
1867 			contextPtr->mdLocalCuUnit[depthTwoCandidateCuIndex].cost = MAX_CU_COST;
1868 
1869 		depthNCost = contextPtr->mdLocalCuUnit[depthTwoCandidateCuIndex].cost + depthNRate;
1870 		// Compute depth N+1 cost
1871 		SplitFlagRate(
1872 			contextPtr,
1873 			tbPtr->codedLeafArrayPtr[depthTwoCandidateCuIndex],
1874 			1,
1875 			&depthNPlusOneRate,
1876 			fullLambda,
1877 			mdRateEstimationPtr,
1878 			tbMaxDepth);
1879 		depthNPlusOneCost =
1880 			contextPtr->mdLocalCuUnit[leafIndex].cost +
1881 			contextPtr->mdLocalCuUnit[leftCuIndex].cost +
1882 			contextPtr->mdLocalCuUnit[topCuIndex].cost +
1883 			contextPtr->mdLocalCuUnit[topLeftCuIndex].cost +
1884 			depthNPlusOneRate;
1885 		// Inter depth comparison: depth 2 vs depth 3
1886 		if (depthNCost <= depthNPlusOneCost){
1887 
1888 			// If the cost is low enough to warrant not spliting further:
1889 			// 1. set the split flag of the candidate pu for merging to false
1890 			// 2. update the last pu index
1891 			tbPtr->codedLeafArrayPtr[depthTwoCandidateCuIndex]->splitFlag = EB_FALSE;
1892 			contextPtr->mdLocalCuUnit[depthTwoCandidateCuIndex].cost = depthNCost;
1893 			lastCuIndex = depthTwoCandidateCuIndex;
1894 		}
1895 		else {
1896 			// If the cost is not low enough:
1897 			// update the cost of the candidate pu for merging
1898 			// this update is required for the next inter depth decision
1899 			contextPtr->mdLocalCuUnit[depthTwoCandidateCuIndex].cost = depthNPlusOneCost;
1900 		}
1901 
1902 
1903     }
1904 
1905     // Stage 1: Inter depth decision: depth 1 vs depth 2
1906 
1907     // Walks to the last coded 16x16 block for merging
1908     depthTwoCuStatsPtr = GetCodedUnitStats(depthTwoCandidateCuIndex);
1909     cuOriginX = tbOriginX + depthTwoCuStatsPtr->originX;
1910     cuOriginY = tbOriginY + depthTwoCuStatsPtr->originY;
1911     if (GROUP_OF_4_16x16_BLOCKS(cuOriginX, cuOriginY) &&
1912         (contextPtr->groupOf8x8BlocksCount == 4)){
1913 
1914         depthOneCandidateCuIndex = depthTwoCandidateCuIndex - DEPTH_TWO_STEP - DEPTH_TWO_STEP - DEPTH_TWO_STEP - 1;
1915 
1916         contextPtr->groupOf8x8BlocksCount = 0;
1917         contextPtr->groupOf16x16BlocksCount++;
1918 
1919         // From the last coded pu index, get the indices of the left, top, and top left pus
1920         leftCuIndex = depthTwoCandidateCuIndex - DEPTH_TWO_STEP;
1921         topCuIndex = leftCuIndex - DEPTH_TWO_STEP;
1922         topLeftCuIndex = topCuIndex - DEPTH_TWO_STEP;
1923 
1924         // Copy the Mode & Depth of the Top-Left N+1 block to the N block for the SplitContext calculation
1925         //   This needs to be done in the case that the N block was initially not calculated.
1926 
1927 		contextPtr->mdLocalCuUnit[depthOneCandidateCuIndex].leftNeighborMode = contextPtr->mdLocalCuUnit[topLeftCuIndex].leftNeighborMode;
1928 		contextPtr->mdLocalCuUnit[depthOneCandidateCuIndex].leftNeighborDepth = contextPtr->mdLocalCuUnit[topLeftCuIndex].leftNeighborDepth;
1929 		contextPtr->mdLocalCuUnit[depthOneCandidateCuIndex].topNeighborMode = contextPtr->mdLocalCuUnit[topLeftCuIndex].topNeighborMode;
1930 		contextPtr->mdLocalCuUnit[depthOneCandidateCuIndex].topNeighborDepth = contextPtr->mdLocalCuUnit[topLeftCuIndex].topNeighborDepth;
1931 
1932         // From the top left index, get the index of the candidate pu for merging
1933         depthOneCandidateCuIndex = topLeftCuIndex - 1;
1934 
1935         depthOneCuStatsPtr = GetCodedUnitStats(depthOneCandidateCuIndex);
1936         if (depthOneCuStatsPtr->depth == 1) {
1937 
1938             // Compute depth N cost
1939 			SplitFlagRate(
1940 				contextPtr,
1941                 tbPtr->codedLeafArrayPtr[depthOneCandidateCuIndex],
1942                 0,
1943                 &depthNRate,
1944                 fullLambda,
1945                 mdRateEstimationPtr,
1946 				tbMaxDepth);
1947 			if (contextPtr->mdLocalCuUnit[depthOneCandidateCuIndex].testedCuFlag == EB_FALSE)
1948 				contextPtr->mdLocalCuUnit[depthOneCandidateCuIndex].cost = MAX_CU_COST;
1949 			depthNCost = contextPtr->mdLocalCuUnit[depthOneCandidateCuIndex].cost + depthNRate;
1950 
1951 			// Compute depth N+1 cost
1952 			SplitFlagRate(
1953 				contextPtr,
1954 				tbPtr->codedLeafArrayPtr[depthOneCandidateCuIndex],
1955 				1,
1956 				&depthNPlusOneRate,
1957 				fullLambda,
1958 				mdRateEstimationPtr,
1959 				tbMaxDepth);
1960 			depthNPlusOneCost =
1961 				contextPtr->mdLocalCuUnit[depthTwoCandidateCuIndex].cost +
1962 				contextPtr->mdLocalCuUnit[leftCuIndex].cost +
1963 				contextPtr->mdLocalCuUnit[topCuIndex].cost +
1964 				contextPtr->mdLocalCuUnit[topLeftCuIndex].cost +
1965 				depthNPlusOneRate;
1966 			CHECK_REPORT_ERROR(
1967 				(contextPtr->mdLocalCuUnit[depthTwoCandidateCuIndex].cost != MAX_CU_COST),
1968 				encodeContextPtr->appCallbackPtr,
1969 				EB_ENC_FL_ERROR4);
1970 			CHECK_REPORT_ERROR(
1971 				(contextPtr->mdLocalCuUnit[leftCuIndex].cost != MAX_CU_COST),
1972 				encodeContextPtr->appCallbackPtr,
1973 				EB_ENC_FL_ERROR4);
1974 			CHECK_REPORT_ERROR(
1975 				(contextPtr->mdLocalCuUnit[topCuIndex].cost != MAX_CU_COST),
1976 				encodeContextPtr->appCallbackPtr,
1977 				EB_ENC_FL_ERROR4);
1978 			CHECK_REPORT_ERROR(
1979 				(contextPtr->mdLocalCuUnit[topLeftCuIndex].cost != MAX_CU_COST),
1980 				encodeContextPtr->appCallbackPtr,
1981 				EB_ENC_FL_ERROR4);
1982 
1983 			if (depthNPlusOneCost < MAX_CU_COST)
1984 				depthNPlusOneCost = depthNPlusOneCost + ((EB_S64)depthNPlusOneCost*interDepthW12) / 100;
1985 
1986 			// Inter depth comparison: depth 1 vs depth 2
1987 			if (depthNCost <= depthNPlusOneCost) {
1988 				// If the cost is low enough to warrant not spliting further:
1989 				// 1. set the split flag of the candidate pu for merging to false
1990 				// 2. update the last pu index
1991 				tbPtr->codedLeafArrayPtr[depthOneCandidateCuIndex]->splitFlag = EB_FALSE;
1992 				contextPtr->mdLocalCuUnit[depthOneCandidateCuIndex].cost = depthNCost;
1993 				lastCuIndex = depthOneCandidateCuIndex;
1994 			}
1995 			else {
1996 				// If the cost is not low enough:
1997 				// update the cost of the candidate pu for merging
1998 				// this update is required for the next inter depth decision
1999 				contextPtr->mdLocalCuUnit[depthOneCandidateCuIndex].cost = depthNPlusOneCost;
2000 			}
2001 
2002 
2003         }
2004     }
2005 
2006     // Stage 2: Inter depth decision: depth 0 vs depth 1
2007 
2008     // Walks to the last coded 32x32 block for merging
2009     // Stage 2 isn't performed in I slices since the abcense of 64x64 candidates
2010     depthOneCuStatsPtr = GetCodedUnitStats(depthOneCandidateCuIndex);
2011     cuOriginX = tbOriginX + depthTwoCuStatsPtr->originX;
2012     cuOriginY = tbOriginY + depthTwoCuStatsPtr->originY;
2013     if ((pictureControlSetPtr->sliceType == EB_P_PICTURE || pictureControlSetPtr->sliceType == EB_B_PICTURE)
2014         && GROUP_OF_4_32x32_BLOCKS(cuOriginX, cuOriginY) &&
2015         (contextPtr->groupOf16x16BlocksCount == 4)) {
2016 
2017         depthZeroCandidateCuIndex = depthOneCandidateCuIndex - DEPTH_ONE_STEP - DEPTH_ONE_STEP - DEPTH_ONE_STEP - 1;
2018 
2019         contextPtr->groupOf16x16BlocksCount = 0;
2020 
2021         // From the last coded pu index, get the indices of the left, top, and top left pus
2022         leftCuIndex = depthOneCandidateCuIndex - DEPTH_ONE_STEP;
2023         topCuIndex = leftCuIndex - DEPTH_ONE_STEP;
2024         topLeftCuIndex = topCuIndex - DEPTH_ONE_STEP;
2025 
2026         // Copy the Mode & Depth of the Top-Left N+1 block to the N block for the SplitContext calculation
2027         //   This needs to be done in the case that the N block was initially not calculated.
2028 
2029 		contextPtr->mdLocalCuUnit[depthZeroCandidateCuIndex].leftNeighborMode = contextPtr->mdLocalCuUnit[topLeftCuIndex].leftNeighborMode;
2030 		contextPtr->mdLocalCuUnit[depthZeroCandidateCuIndex].leftNeighborDepth = contextPtr->mdLocalCuUnit[topLeftCuIndex].leftNeighborDepth;
2031 		contextPtr->mdLocalCuUnit[depthZeroCandidateCuIndex].topNeighborMode = contextPtr->mdLocalCuUnit[topLeftCuIndex].topNeighborMode;
2032 		contextPtr->mdLocalCuUnit[depthZeroCandidateCuIndex].topNeighborDepth = contextPtr->mdLocalCuUnit[topLeftCuIndex].topNeighborDepth;
2033 
2034         // From the top left index, get the index of the candidate pu for merging
2035         depthZeroCandidateCuIndex = topLeftCuIndex - 1;
2036 
2037         depthZeroCuStatsPtr = GetCodedUnitStats(depthZeroCandidateCuIndex);
2038         if (depthZeroCuStatsPtr->depth == 0) {
2039 
2040             // Compute depth N cost
2041 			SplitFlagRate(
2042 				contextPtr,
2043                 tbPtr->codedLeafArrayPtr[depthZeroCandidateCuIndex],
2044                 0,
2045                 &depthNRate,
2046                 fullLambda,
2047                 mdRateEstimationPtr,
2048 				tbMaxDepth);
2049 			if (contextPtr->mdLocalCuUnit[depthZeroCandidateCuIndex].testedCuFlag == EB_FALSE)
2050 				contextPtr->mdLocalCuUnit[depthZeroCandidateCuIndex].cost = MAX_CU_COST;
2051 			depthNCost = contextPtr->mdLocalCuUnit[depthZeroCandidateCuIndex].cost + depthNRate;
2052 			// Compute depth N+1 cost
2053 			SplitFlagRate(
2054 				contextPtr,
2055 				tbPtr->codedLeafArrayPtr[depthZeroCandidateCuIndex],
2056 				1,
2057 				&depthNPlusOneRate,
2058 				fullLambda,
2059 				mdRateEstimationPtr,
2060 				tbMaxDepth);
2061 			depthNPlusOneCost =
2062 				contextPtr->mdLocalCuUnit[depthOneCandidateCuIndex].cost +
2063 				contextPtr->mdLocalCuUnit[leftCuIndex].cost +
2064 				contextPtr->mdLocalCuUnit[topCuIndex].cost +
2065 				contextPtr->mdLocalCuUnit[topLeftCuIndex].cost +
2066 				depthNPlusOneRate;
2067 			if (depthNPlusOneCost < MAX_CU_COST)
2068 				depthNPlusOneCost = depthNPlusOneCost + ((EB_S64)depthNPlusOneCost*interDepthW01) / 100;
2069 
2070 			// Inter depth comparison: depth 0 vs depth 1
2071 			if (depthNCost <= depthNPlusOneCost){
2072 
2073 				// If the cost is low enough to warrant not spliting further:
2074 				// 1. set the split flag of the candidate pu for merging to false
2075 				// 2. update the last pu index
2076 				tbPtr->codedLeafArrayPtr[depthZeroCandidateCuIndex]->splitFlag = EB_FALSE;
2077 				lastCuIndex = depthZeroCandidateCuIndex;
2078 			}
2079 
2080         }
2081     }
2082 
2083     return lastCuIndex;
2084 }
2085 
2086