1 /*
2 * Copyright(c) 2018 Intel Corporation
3 * SPDX - License - Identifier: BSD - 2 - Clause - Patent
4 */
5 
6 #include <string.h>
7 
8 #include "EbDefinitions.h"
9 #include "EbUtility.h"
10 #include "EbTransformUnit.h"
11 #include "EbRateDistortionCost.h"
12 #include "EbDeblockingFilter.h"
13 #include "EbSampleAdaptiveOffset.h"
14 #include "EbPictureOperators.h"
15 
16 #include "EbModeDecisionProcess.h"
17 #include "EbEncDecProcess.h"
18 #include "EbErrorCodes.h"
19 #include "EbErrorHandling.h"
20 #include "EbComputeSAD.h"
21 #include "EbTransforms.h"
22 #include "EbModeDecisionConfiguration.h"
23 #include "emmintrin.h"
24 
25 //#define DEBUG_REF_INFO
26 //#define DUMP_RECON
27 #ifdef DUMP_RECON
/*
 * Debug helper (compiled only with DUMP_RECON): writes the picture area of a
 * reconstructed frame to `filename` as raw planar bytes (all Y rows, then Cb,
 * then Cr). Each frame is stored at byte offset POC * frameSize, so the file
 * ends up ordered by POC no matter in which order frames are dumped.
 *
 *   reconBuffer - picture to dump; every plane row is written as bytes
 *   filename    - output file, created/truncated when POC == 0
 *   POC         - picture order count, used as the frame slot index
 *
 * Open, seek, allocation and write failures are reported on stderr and the
 * dump is abandoned; nothing here depends on assert() being compiled in.
 */
static void dump_buf_desc_to_file(EbPictureBufferDesc_t* reconBuffer, const char* filename, int POC)
{
    if (POC == 0) {
        // First picture: create the file, or drop whatever a previous run left in it.
        // Binary mode keeps the sample bytes untouched on platforms that translate newlines.
        FILE* tmp = fopen(filename, "wb");
        if (tmp == NULL) {
            fprintf(stderr, "dump_buf_desc_to_file: cannot create %s\n", filename);
            return;
        }
        fclose(tmp);
    }

    FILE* fp = fopen(filename, "rb+");
    if (fp == NULL) {
        fprintf(stderr, "dump_buf_desc_to_file: cannot open %s\n", filename);
        return;
    }

    const long lumaSize = (long)reconBuffer->height * reconBuffer->width;                       // Luma
    const long descSize = lumaSize + 2 * (lumaSize >> (3 - reconBuffer->colorFormat));           // + Cb + Cr
    const long offset = descSize * POC;
    int ok = 1;

    fseek(fp, 0, SEEK_END);
    const long fileSize = ftell(fp);
    if (fileSize >= 0 && offset > fileSize) {
        // The slot of this POC lies beyond the current end of file: append whole
        // zero-filled frames so the skipped slots have defined contents.
        const long count = (offset - fileSize) / descSize;
        char *tmpBuf = (char*)calloc((size_t)descSize, 1);
        if (tmpBuf == NULL) {
            fprintf(stderr, "dump_buf_desc_to_file: out of memory while padding %s\n", filename);
            fclose(fp);
            return;
        }
        for (long i = 0; i < count; i++) {
            ok &= (fwrite(tmpBuf, 1, (size_t)descSize, fp) == (size_t)descSize);
        }
        free(tmpBuf);
    }

    //printf("---Seek to offset %d(POC pos) for writting\n", offset/descSize);
    if (fseek(fp, offset, SEEK_SET) != 0 || ftell(fp) != offset) {
        fprintf(stderr, "dump_buf_desc_to_file: cannot seek to POC %d in %s\n", POC, filename);
        fclose(fp);
        return;
    }

    EB_COLOR_FORMAT colorFormat = reconBuffer->colorFormat;    // Chroma format
    EB_U16 subWidthCMinus1 = (colorFormat == EB_YUV444 ? 1 : 2) - 1;
    EB_U16 subHeightCMinus1 = (colorFormat >= EB_YUV422 ? 1 : 2) - 1;
    const size_t lumaWidth = reconBuffer->width;
    const size_t chromaWidth = reconBuffer->width >> subWidthCMinus1;
    const int chromaHeight = reconBuffer->height >> subHeightCMinus1;

    // Skip the padding: start at (originX, originY) of each plane.
    unsigned char* luma_ptr = reconBuffer->bufferY + reconBuffer->strideY*(reconBuffer->originY) + reconBuffer->originX;
    unsigned char* cb_ptr =  reconBuffer->bufferCb + reconBuffer->strideCb*(reconBuffer->originY>>subHeightCMinus1) + (reconBuffer->originX>>subWidthCMinus1);
    unsigned char* cr_ptr =  reconBuffer->bufferCr + reconBuffer->strideCr*(reconBuffer->originY>>subHeightCMinus1) + (reconBuffer->originX>>subWidthCMinus1);

    for (int i = 0; i < reconBuffer->height; i++) {
        ok &= (fwrite(luma_ptr, 1, lumaWidth, fp) == lumaWidth);
        luma_ptr += reconBuffer->strideY;
    }

    for (int i = 0; i < chromaHeight; i++) {
        ok &= (fwrite(cb_ptr, 1, chromaWidth, fp) == chromaWidth);
        cb_ptr += reconBuffer->strideCb;
    }

    for (int i = 0; i < chromaHeight; i++) {
        ok &= (fwrite(cr_ptr, 1, chromaWidth, fp) == chromaWidth);
        cr_ptr += reconBuffer->strideCr;
    }

    //printf("After write POC %d, filesize %d\n", POC, ftell(fp));
    if (fclose(fp) != 0) {
        ok = 0;
    }
    if (!ok) {
        fprintf(stderr, "dump_buf_desc_to_file: write error while dumping POC %d to %s\n", POC, filename);
    }
}
78 #endif
79 
80 #ifdef DEBUG_REF_INFO
/*
 * Debug helper (DEBUG_REF_INFO only): prints `size` consecutive entries of the
 * neighbor unit's leftArray, starting at index y_pos, on a single line.
 */
static void dump_left_array(NeighborArrayUnit_t *neighbor, int y_pos, int size)
{
    printf("*Dump left array\n");

    for (int n = 0; n < size; ++n) {
        const int at = n + y_pos;
        printf("%3u ", neighbor->leftArray[at]);
    }

    printf("\n----------------------\n");
}
89 
/*
 * Debug helper (DEBUG_REF_INFO only): prints the first `size` bytes of one
 * intra reference array.
 *
 *   mask - selects the array: 0 = yIntraReferenceArray,
 *          1 = cbIntraReferenceArray, 2 = crIntraReferenceArray;
 *          any other value trips assert(0).
 */
static void dump_intra_ref(IntraReferenceSamples_t* ref, int size, int mask)
{
    unsigned char* samples = NULL;

    switch (mask) {
    case 0:
        samples = ref->yIntraReferenceArray;
        break;
    case 1:
        samples = ref->cbIntraReferenceArray;
        break;
    case 2:
        samples = ref->crIntraReferenceArray;
        break;
    default:
        assert(0);
        break;
    }

    printf("*Dumping intra reference array for component %d\n", mask);
    for (int n = 0; n < size; ++n) {
        printf("%3u ", samples[n]);
    }
    printf("\n----------------------\n");
}
109 
/*
 * Debug helper (DEBUG_REF_INFO only): prints a size x size block of one plane
 * of a picture buffer to stdout, one block row per output line.
 *
 *   size          - block width/height in samples
 *   buf_tmp       - picture buffer to read from
 *   startX/startY - block position, relative to the buffer origin
 *   componentMask - 0 = luma, 1 = Cb, 2 = Cr; anything else trips assert(0)
 *
 * Samples are printed as bytes when bitDepth is 8 and as EB_S16 when it is 16;
 * any other bit depth prints a message and trips assert(0).
 */
static void dump_block_from_desc(int size, EbPictureBufferDesc_t *buf_tmp, int startX, int startY, int componentMask)
{
    const int bitDepth = buf_tmp->bitDepth;
    const int bytesPerSample = (bitDepth == 8) ? 1 : 2;
    const EB_COLOR_FORMAT colorFormat = buf_tmp->colorFormat;    // Chroma format
    EB_U16 subWidthCMinus1 = (colorFormat == EB_YUV444) ? 0 : 1;
    EB_U16 subHeightCMinus1 = (colorFormat >= EB_YUV422) ? 0 : 1;
    unsigned char *plane = NULL;
    int stride = 0;

    switch (componentMask) {
    case 0:
        plane = buf_tmp->bufferY;
        stride = buf_tmp->strideY;
        // Luma is never subsampled.
        subWidthCMinus1 = 0;
        subHeightCMinus1 = 0;
        break;
    case 1:
        plane = buf_tmp->bufferCb;
        stride = buf_tmp->strideCb;
        break;
    case 2:
        plane = buf_tmp->bufferCr;
        stride = buf_tmp->strideCr;
        break;
    default:
        assert(0);
        break;
    }

    const int offset = ((stride * (buf_tmp->originY + startY)) >> subHeightCMinus1)
                     + ((startX + buf_tmp->originX) >> subWidthCMinus1);
    const char *componentName = (componentMask == 0) ? "luma" : ((componentMask == 1) ? "Cb" : "Cr");

    printf("bitDepth is %d, dump block size %d at offset %d, (%d, %d), component is %s\n",
            bitDepth, size, offset, startX, startY, componentName);

    unsigned char *row = plane + offset * bytesPerSample;
    for (int y = 0; y < size; ++y, row += stride * bytesPerSample) {
        // Each line shows size + 1 samples: the extra one, printed after the
        // "|||" marker, is the sample immediately to the right of the block.
        for (int x = 0; x <= size; ++x) {
            if (x == size) {
                printf("|||");
            } else if (x % 4 == 0) {
                printf("|");
            }

            switch (bitDepth) {
            case 8:
                printf("%4u ", row[x]);
                break;
            case 16:
                printf("%4d ", *((EB_S16*)row + x));
                break;
            default:
                printf("bitDepth is %d\n", bitDepth);
                assert(0);
                break;
            }
        }
        printf("\n");
    }
    printf("------------------------\n");
}
160 #endif
/*******************************************
* set Penalize Skip Flag
*
* Summary: Set the PenalizeSkipFlag to true
* when there is a luminance/chrominance change,
* or in a noisy clip with low motion in a medium
* variance area
*
*******************************************/
170 typedef void (*EB_ENCODE_LOOP_FUNC_PTR)(
171 	EncDecContext_t				*contextPtr,
172 	LargestCodingUnit_t			*lcuPtr,
173 	EB_U32						 originX,
174 	EB_U32						 originY,
175 	EB_U32						 cbQp,
176 	EbPictureBufferDesc_t		*predSamples,         // no basis/offset
177 	EbPictureBufferDesc_t		*coeffSamplesTB,      // lcu based
178 	EbPictureBufferDesc_t		*residual16bit,       // no basis/offset
179 	EbPictureBufferDesc_t		*transform16bit,      // no basis/offset
180 	EB_S16						*transformScratchBuffer,
181 	EB_U32						*countNonZeroCoeffs,
182 	EB_U32						 useDeltaQp,
183 	CabacEncodeContext_t        *cabacEncodeCtxPtr,
184 	EB_U32						 intraLumaMode,
185     EB_U32                       componentMask,
186     EB_COLOR_FORMAT              colorFormat,
187     EB_BOOL                      secondChroma,
188     EB_U32                       tuSize,
189 	CabacCost_t                 *CabacCost,
190 	EB_U32						 dZoffset) ;
191 
192 typedef void (*EB_GENERATE_RECON_FUNC_PTR)(
193     EncDecContext_t       *contextPtr,
194 	EB_U32                 originX,
195 	EB_U32                 originY,
196     EB_U32                 componentMask,
197     EB_COLOR_FORMAT        colorFormat,
198     EB_BOOL                secondChroma,
199     EB_U32                 tuSize,
200 	EbPictureBufferDesc_t *predSamples,     // no basis/offset
201 	EbPictureBufferDesc_t *residual16bit,    // no basis/offset
202 	EB_S16                *transformScratchBuffer);
203 
204 typedef void (*EB_ENCODE_LOOP_INTRA_4x4_FUNC_PTR)(
205 	EncDecContext_t				*contextPtr,
206 	LargestCodingUnit_t			*lcuPtr,
207 	EB_U32						 originX,
208 	EB_U32						 originY,
209 	EB_U32						 cbQp,
210 	EbPictureBufferDesc_t		*predSamples,         // no basis/offset
211 	EbPictureBufferDesc_t		*coeffSamplesTB,      // lcu based
212 	EbPictureBufferDesc_t		*residual16bit,       // no basis/offset
213 	EbPictureBufferDesc_t		*transform16bit,      // no basis/offset
214 	EB_S16						*transformScratchBuffer,
215 	EB_U32						*countNonZeroCoeffs,
216 	EB_U32						 componentMask,
217 	EB_U32						 useDeltaQp,
218 	CabacEncodeContext_t        *cabacEncodeCtxPtr,
219 	EB_U32                        intraLumaMode,
220 	CabacCost_t                 *CabacCost,
221 	EB_U32						  dZoffset) ;
222 
223 typedef void (*EB_GENERATE_RECON_INTRA_4x4_FUNC_PTR)(
224     EncDecContext_t       *contextPtr,
225 	EB_U32                 originX,
226 	EB_U32                 originY,
227 	EbPictureBufferDesc_t *predSamples,     // no basis/offset
228 	EbPictureBufferDesc_t *residual16bit,    // no basis/offset
229 	EB_S16                *transformScratchBuffer,
230 	EB_U32                 componentMask);
231 
232 typedef EB_ERRORTYPE(*EB_GENERATE_INTRA_SAMPLES_FUNC_PTR)(
233     EB_BOOL                     constrainedIntraFlag,   //input parameter, indicates if constrained intra is switched on/off
234     EB_BOOL                     strongIntraSmoothingFlag,
235     EB_U32                      originX,
236     EB_U32                      originY,
237     EB_U32                      size,
238     EB_U32                      lcuSize,
239     EB_U32                      cuDepth,
240     NeighborArrayUnit_t        *modeTypeNeighborArray,
241     NeighborArrayUnit_t        *lumaReconNeighborArray,
242     NeighborArrayUnit_t        *cbReconNeighborArray,
243     NeighborArrayUnit_t        *crReconNeighborArray,
244     void                       *refWrapperPtr,
245     EB_COLOR_FORMAT             colorFormat,
246     EB_BOOL                     pictureLeftBoundary,
247     EB_BOOL                     pictureTopBoundary,
248     EB_BOOL                     pictureRightBoundary);
249 
250 typedef EB_ERRORTYPE(*EB_GENERATE_LUMA_INTRA_SAMPLES_FUNC_PTR)(
251     EB_BOOL                     constrainedIntraFlag,   //input parameter, indicates if constrained intra is switched on/off
252     EB_BOOL                     strongIntraSmoothingFlag,
253     EB_U32                      originX,
254     EB_U32                      originY,
255     EB_U32                      size,
256     EB_U32                      lcuSize,
257     EB_U32                      cuDepth,
258     NeighborArrayUnit_t        *modeTypeNeighborArray,
259     NeighborArrayUnit_t        *lumaReconNeighborArray,
260     NeighborArrayUnit_t        *cbReconNeighborArray,
261     NeighborArrayUnit_t        *crReconNeighborArray,
262     void                       *refWrapperPtr,
263     EB_BOOL                     pictureLeftBoundary,
264     EB_BOOL                     pictureTopBoundary,
265     EB_BOOL                     pictureRightBoundary);
266 
267 typedef EB_ERRORTYPE(*EB_GENERATE_CHROMA_INTRA_SAMPLES_FUNC_PTR)(
268     EB_BOOL                     constrainedIntraFlag,   //input parameter, indicates if constrained intra is switched on/off
269     EB_BOOL                     strongIntraSmoothingFlag,
270     EB_U32                      originX,
271     EB_U32                      originY,
272     EB_U32                      size,
273     EB_U32                      lcuSize,
274     EB_U32                      cuDepth,
275     NeighborArrayUnit_t        *modeTypeNeighborArray,
276     NeighborArrayUnit_t        *lumaReconNeighborArray,
277     NeighborArrayUnit_t        *cbReconNeighborArray,
278     NeighborArrayUnit_t        *crReconNeighborArray,
279     void                       *refWrapperPtr,
280     EB_COLOR_FORMAT             colorFormat,
281     EB_BOOL                     secondChroma,
282     EB_BOOL                     pictureLeftBoundary,
283     EB_BOOL                     pictureTopBoundary,
284     EB_BOOL                     pictureRightBoundary);
285 
286 typedef EB_ERRORTYPE(*EB_ENC_PASS_INTRA_FUNC_PTR)(
287     void                                   *refSamples,
288     EB_U32                                  originX,
289     EB_U32                                  originY,
290     EB_U32                                  puSize,
291     EB_U32                                  puChromaSize,
292     EbPictureBufferDesc_t                  *predictionPtr,
293     EB_COLOR_FORMAT                         colorFormat,
294     EB_BOOL                                 secondChroma,
295     EB_U32                                  lumaMode,
296     EB_U32                                  chromaMode,
297     EB_U32                                  componentMask);
298 
299 typedef EB_ERRORTYPE(*EB_ENC_PASS_INTRA4X4_FUNC_PTR)(
300     void                       *referenceSamples,
301     EB_U32                      originX,
302     EB_U32                      originY,
303     EB_U32                      puSize,
304     EB_U32                      chromaPuSize,
305     EbPictureBufferDesc_t       *predictionPtr,
306     EB_U32                      lumaMode,
307     EB_U32                      chromaMode,
308     EB_COLOR_FORMAT             colorFormat,
309     EB_BOOL                     secondChroma,
310     EB_U32                      componentMask);
311 
312 typedef EB_ERRORTYPE (*EB_LCU_INTERNAL_DLF_FUNC_PTR)(
313 	EbPictureBufferDesc_t *reconpicture,
314     EB_U32                 lcuPosx,
315     EB_U32                 lcuPosy,
316     EB_U32                 lcuWidth,
317     EB_U32                 lcuHeight,
318     EB_U8                 *verticalEdgeBSArray,
319     EB_U8                 *horizontalEdgeBSArray,
320     PictureControlSet_t   *reconPictureControlSet);
321 typedef void (*EB_LCU_BOUNDARY_DLF_FUNC_PTR)(
322 	EbPictureBufferDesc_t *reconpicture,
323     EB_U32                 lcuPos_x,
324     EB_U32                 lcuPos_y,
325     EB_U32                 lcuWidth,
326     EB_U32                 lcuHeight,
327     EB_U8                 *lcuVerticalEdgeBSArray,
328     EB_U8                 *lcuHorizontalEdgeBSArray,
329     EB_U8                 *topLcuVerticalEdgeBSArray,
330     EB_U8                 *leftLcuHorizontalEdgeBSArray,
331     PictureControlSet_t   *pictureControlSetPtr);
332 typedef void (*EB_LCU_PIC_EDGE_DLF_FUNC_PTR)(
333     EbPictureBufferDesc_t *reconPic,
334     EB_U32                 lcuIdx,
335     EB_U32                 lcuPos_x,
336     EB_U32                 lcuPos_y,
337     EB_U32                 lcuWidth,
338     EB_U32                 lcuHeight,
339     PictureControlSet_t   *pictureControlSetPtr);
340 
341 void AddChromaEncDec(
342     PictureControlSet_t     *pictureControlSetPtr,
343     LargestCodingUnit_t     *lcuPtr,
344     CodingUnit_t            *cuPtr,
345     ModeDecisionContext_t   *contextPtr,
346     EncDecContext_t         *contextPtrED,
347     EbPictureBufferDesc_t   *inputPicturePtr,
348     EB_U32                   inputCbOriginIndex,
349     EB_U32					 cuChromaOriginIndex,
350     EB_U32                   candIdxInput);
351 /***************************************************
352 * Update Coding Unit Neighbor Arrays
353 ***************************************************/
/*
 * Writes `depth` into the leaf-depth neighbor array for the size x size block
 * at (originX, originY), using the TOP_AND_LEFT_ONLY neighbor-array mask.
 */
static void EncodePassUpdateLeafDepthNeighborArrays(
    NeighborArrayUnit_t    *leafDepthNeighborArray,
    EB_U8                   depth,
    EB_U32                  originX,
    EB_U32                  originY,
    EB_U32                  size)
{
    // The block is square, so `size` serves as both width and height.
    NeighborArrayUnitModeWrite(leafDepthNeighborArray, &depth,
                               originX, originY,
                               size, size,
                               NEIGHBOR_ARRAY_UNIT_TOP_AND_LEFT_ONLY_MASK);
}
373 
374 /***************************************************
375 * Update Intra Mode Neighbor Arrays
376 ***************************************************/
/*
 * Marks the size x size block at (originX, originY) as INTRA_MODE in the
 * mode-type neighbor array (FULL mask) and records its luma intra mode in the
 * intra-luma-mode neighbor array (TOP_AND_LEFT_ONLY mask).
 */
static void EncodePassUpdateIntraModeNeighborArrays(
    NeighborArrayUnit_t    *modeTypeNeighborArray,
    NeighborArrayUnit_t    *intraLumaModeNeighborArray,
    EB_U8                   lumaMode,
    EB_U32                  originX,
    EB_U32                  originY,
    EB_U32                  size)
{
    EB_U8 intraTag = INTRA_MODE;

    // 1) prediction mode type of the block
    NeighborArrayUnitModeWrite(modeTypeNeighborArray, &intraTag,
                               originX, originY,
                               size, size,
                               NEIGHBOR_ARRAY_UNIT_FULL_MASK);

    // 2) luma intra mode of the block
    NeighborArrayUnitModeWrite(intraLumaModeNeighborArray, &lumaMode,
                               originX, originY,
                               size, size,
                               NEIGHBOR_ARRAY_UNIT_TOP_AND_LEFT_ONLY_MASK);
}
409 
410 /***************************************************
411 * Update Inter Mode Neighbor Arrays
412 ***************************************************/
/*
 * Marks the size x size block at (originX, originY) as INTER_MODE and stores
 * its motion-vector unit and skip flag in their neighbor arrays. Mode type and
 * MV unit are written with the FULL mask, the skip flag with the
 * TOP_AND_LEFT_ONLY mask.
 */
static void EncodePassUpdateInterModeNeighborArrays(
    NeighborArrayUnit_t    *modeTypeNeighborArray,
    NeighborArrayUnit_t    *mvNeighborArray,
    NeighborArrayUnit_t    *skipNeighborArray,
    MvUnit_t               *mvUnit,
    EB_U8                  *skipFlag,
    EB_U32                  originX,
    EB_U32                  originY,
    EB_U32                  size)
{
    EB_U8 interTag = INTER_MODE;

    // 1) prediction mode type of the block
    NeighborArrayUnitModeWrite(modeTypeNeighborArray, &interTag,
                               originX, originY,
                               size, size,
                               NEIGHBOR_ARRAY_UNIT_FULL_MASK);

    // 2) motion vector unit, handed over as raw bytes
    NeighborArrayUnitModeWrite(mvNeighborArray, (EB_U8*)mvUnit,
                               originX, originY,
                               size, size,
                               NEIGHBOR_ARRAY_UNIT_FULL_MASK);

    // 3) skip flag
    NeighborArrayUnitModeWrite(skipNeighborArray, skipFlag,
                               originX, originY,
                               size, size,
                               NEIGHBOR_ARRAY_UNIT_TOP_AND_LEFT_ONLY_MASK);
}
457 
458 /***************************************************
459 * Update Recon Samples Neighbor Arrays
460 ***************************************************/
/*
 * Copies the reconstructed samples of the size x size block at
 * (originX, originY) from reconBuffer into the luma/Cb/Cr recon neighbor
 * arrays (FULL mask).
 *
 *   componentMask - luma is written when PICTURE_BUFFER_DESC_LUMA_MASK is set,
 *                   Cb and Cr when PICTURE_BUFFER_DESC_CHROMA_MASK is set
 *   colorFormat   - determines the chroma subsampling shifts
 *   is16bit       - EB_TRUE selects the 16-bit sample writer (buffers are
 *                   reinterpreted as EB_U16*), otherwise the 8-bit writer
 */
static void EncodePassUpdateReconSampleNeighborArrays(
    NeighborArrayUnit_t    *lumaReconSampleNeighborArray,
    NeighborArrayUnit_t    *cbReconSampleNeighborArray,
    NeighborArrayUnit_t    *crReconSampleNeighborArray,
    EbPictureBufferDesc_t  *reconBuffer,
    EB_U32                  originX,
    EB_U32                  originY,
    EB_U32                  size,
    EB_U32                  componentMask,
    EB_COLOR_FORMAT         colorFormat,
    EB_BOOL                 is16bit)
{
    const EB_U16 subWidthCMinus1 = (colorFormat == EB_YUV444) ? 0 : 1;
    const EB_U16 subHeightCMinus1 = (colorFormat >= EB_YUV422) ? 0 : 1;

    // Source position inside the recon buffer (buffer origin included).
    const EB_U32 lumaSrcX = reconBuffer->originX + originX;
    const EB_U32 lumaSrcY = reconBuffer->originY + originY;
    const EB_U32 chromaSrcX = lumaSrcX >> subWidthCMinus1;
    const EB_U32 chromaSrcY = lumaSrcY >> subHeightCMinus1;

    // Destination position in picture coordinates.
    const EB_U32 chromaPicX = originX >> subWidthCMinus1;
    const EB_U32 chromaPicY = originY >> subHeightCMinus1;

    // Blocks of MIN_PU_SIZE keep their size for chroma; larger ones are shifted
    // by the horizontal subsampling factor, which is used for BOTH dimensions.
    const EB_U32 chromaSize = (size > MIN_PU_SIZE) ? (size >> subWidthCMinus1) : size;

    // Recon Samples - Luma
    if (componentMask & PICTURE_BUFFER_DESC_LUMA_MASK) {
        if (is16bit == EB_TRUE) {
            NeighborArrayUnit16bitSampleWrite(
                lumaReconSampleNeighborArray,
                (EB_U16*)(reconBuffer->bufferY),
                reconBuffer->strideY,
                lumaSrcX, lumaSrcY,
                originX, originY,
                size, size,
                NEIGHBOR_ARRAY_UNIT_FULL_MASK);
        } else {
            NeighborArrayUnitSampleWrite(
                lumaReconSampleNeighborArray,
                reconBuffer->bufferY,
                reconBuffer->strideY,
                lumaSrcX, lumaSrcY,
                originX, originY,
                size, size,
                NEIGHBOR_ARRAY_UNIT_FULL_MASK);
        }
    }

    // Recon Samples - Cb, then Cr
    if (componentMask & PICTURE_BUFFER_DESC_CHROMA_MASK) {
        if (is16bit == EB_TRUE) {
            NeighborArrayUnit16bitSampleWrite(
                cbReconSampleNeighborArray,
                (EB_U16*)(reconBuffer->bufferCb),
                reconBuffer->strideCb,
                chromaSrcX, chromaSrcY,
                chromaPicX, chromaPicY,
                chromaSize, chromaSize,
                NEIGHBOR_ARRAY_UNIT_FULL_MASK);

            NeighborArrayUnit16bitSampleWrite(
                crReconSampleNeighborArray,
                (EB_U16*)(reconBuffer->bufferCr),
                reconBuffer->strideCr,
                chromaSrcX, chromaSrcY,
                chromaPicX, chromaPicY,
                chromaSize, chromaSize,
                NEIGHBOR_ARRAY_UNIT_FULL_MASK);
        } else {
            NeighborArrayUnitSampleWrite(
                cbReconSampleNeighborArray,
                reconBuffer->bufferCb,
                reconBuffer->strideCb,
                chromaSrcX, chromaSrcY,
                chromaPicX, chromaPicY,
                chromaSize, chromaSize,
                NEIGHBOR_ARRAY_UNIT_FULL_MASK);

            NeighborArrayUnitSampleWrite(
                crReconSampleNeighborArray,
                reconBuffer->bufferCr,
                reconBuffer->strideCr,
                chromaSrcX, chromaSrcY,
                chromaPicX, chromaPicY,
                chromaSize, chromaSize,
                NEIGHBOR_ARRAY_UNIT_FULL_MASK);
        }
    }
}
569 
570 
571 
572 
573 /************************************************************
574 * Update Intra Luma Neighbor Modes
575 ************************************************************/
/*
 * Fills intraLumaLeftMode and intraLumaTopMode of the CU's first prediction
 * unit from the neighbor arrays.
 *
 * A neighbor contributes its stored luma intra mode only when the mode-type
 * array marks it as INTRA_MODE; otherwise EB_INTRA_DC is used. The top mode
 * is also forced to EB_INTRA_DC when puOriginY lies on the first row of an
 * LCU, i.e. (puOriginY & (lcuSize - 1)) == 0.
 */
void EbHevcGeneratePuIntraLumaNeighborModes(
    CodingUnit_t           *cuPtr,
    EB_U32                  puOriginX,
    EB_U32                  puOriginY,
    EB_U32                  lcuSize,
    NeighborArrayUnit_t    *intraLumaNeighborArray,
    NeighborArrayUnit_t    *modeTypeNeighborArray)
{
    const EB_U32 leftTypeIdx = GetNeighborArrayUnitLeftIndex(modeTypeNeighborArray, puOriginY);
    const EB_U32 topTypeIdx  = GetNeighborArrayUnitTopIndex(modeTypeNeighborArray, puOriginX);
    const EB_U32 leftModeIdx = GetNeighborArrayUnitLeftIndex(intraLumaNeighborArray, puOriginY);
    const EB_U32 topModeIdx  = GetNeighborArrayUnitTopIndex(intraLumaNeighborArray, puOriginX);

    // DC is the fallback for both neighbors.
    EB_U32 leftMode = EB_INTRA_DC;
    EB_U32 topMode  = EB_INTRA_DC;

    if (modeTypeNeighborArray->leftArray[leftTypeIdx] == INTRA_MODE) {
        leftMode = (EB_U32)intraLumaNeighborArray->leftArray[leftModeIdx];
    }

    if (modeTypeNeighborArray->topArray[topTypeIdx] == INTRA_MODE) {
        // On the top row of an LCU the top neighbor's mode is not used.
        if ((puOriginY & (lcuSize - 1)) != 0) {
            topMode = (EB_U32)intraLumaNeighborArray->topArray[topModeIdx];
        }
    }

    cuPtr->predictionUnitArray[0].intraLumaLeftMode = leftMode;
    cuPtr->predictionUnitArray[0].intraLumaTopMode  = topMode;
}
609 
610 /**********************************************************
611 * Encode Pass - Update Sao Parameter Neighbor Array
612 **********************************************************/
/*
 * Stores the SAO parameters of the size x size block at (originX, originY) in
 * the SAO parameter neighbor array, using the TOP_AND_LEFT_ONLY mask. The
 * parameter struct is handed to the generic writer as raw bytes.
 */
static void EncodePassUpdateSaoNeighborArrays(
    NeighborArrayUnit_t    *saoParamNeighborArray,
    SaoParameters_t        *saoParams,
    EB_U32                  originX,
    EB_U32                  originY,
    EB_U32                  size)
{
    EB_U8 *const rawParams = (EB_U8*)saoParams;

    NeighborArrayUnitModeWrite(saoParamNeighborArray, rawParams,
                               originX, originY,
                               size, size,
                               NEIGHBOR_ARRAY_UNIT_TOP_AND_LEFT_ONLY_MASK);
}
631 
632 /**********************************************************
633 * Encode Loop
634 *
* Summary: Performs an H.265-conformant
*   Transform, Quantization and Inverse Quantization of a TU.
637 *
638 * Inputs:
639 *   originX
640 *   originY
641 *   tuSize
642 *   lcuSize
643 *   input - input samples (position sensitive)
644 *   pred - prediction samples (position independent)
645 *
646 * Outputs:
647 *   Inverse quantized coeff - quantization indices (position sensitive)
648 *
649 **********************************************************/
650 
EncodeLoop(EncDecContext_t * contextPtr,LargestCodingUnit_t * lcuPtr,EB_U32 originX,EB_U32 originY,EB_U32 cbQp,EbPictureBufferDesc_t * predSamples,EbPictureBufferDesc_t * coeffSamplesTB,EbPictureBufferDesc_t * residual16bit,EbPictureBufferDesc_t * transform16bit,EB_S16 * transformScratchBuffer,EB_U32 * countNonZeroCoeffs,EB_U32 useDeltaQp,CabacEncodeContext_t * cabacEncodeCtxPtr,EB_U32 intraLumaMode,EB_U32 componentMask,EB_COLOR_FORMAT colorFormat,EB_BOOL secondChroma,EB_U32 tuSize,CabacCost_t * CabacCost,EB_U32 dZoffset)651 static void EncodeLoop(
652 	EncDecContext_t       *contextPtr,
653 	LargestCodingUnit_t   *lcuPtr,
654 	EB_U32                 originX,
655 	EB_U32                 originY,
656 	EB_U32                 cbQp,
657 	EbPictureBufferDesc_t *predSamples,             // no basis/offset
658 	EbPictureBufferDesc_t *coeffSamplesTB,          // lcu based
659 	EbPictureBufferDesc_t *residual16bit,           // no basis/offset
660 	EbPictureBufferDesc_t *transform16bit,          // no basis/offset
661 	EB_S16                *transformScratchBuffer,
662 	EB_U32				  *countNonZeroCoeffs,
663 	EB_U32				   useDeltaQp,
664 	CabacEncodeContext_t  *cabacEncodeCtxPtr,
665 	EB_U32                 intraLumaMode,
666     EB_U32                 componentMask,
667     EB_COLOR_FORMAT        colorFormat,
668     EB_BOOL                secondChroma,
669     EB_U32                 tuSize,
670 	CabacCost_t           *CabacCost,
671 	EB_U32                 dZoffset)
672 {
673 
674     EB_U32                 chromaQp           = cbQp;
675     CodingUnit_t		  *cuPtr              = contextPtr->cuPtr;
676     TransformUnit_t       *tuPtr              = &cuPtr->transformUnitArray[contextPtr->tuItr];
677     EB_PICTURE             sliceType          = lcuPtr->pictureControlSetPtr->sliceType;
678     EB_U32                 temporalLayerIndex = lcuPtr->pictureControlSetPtr->temporalLayerIndex;
679     EB_U32                 qp                 = cuPtr->qp;
680     EbPictureBufferDesc_t  *inputSamples      = contextPtr->inputSamples;
681 
682     const EB_U16 subWidthCMinus1 = (colorFormat == EB_YUV444 ? 1 : 2) - 1;
683     const EB_U16 subHeightCMinus1 = (colorFormat >= EB_YUV422 ? 1 : 2) - 1;
684     EB_U16 tuChromaOffset = 0;
685     if (colorFormat == EB_YUV422 && secondChroma) {
686         tuChromaOffset = tuSize >> 1;
687     }
688 
689 	const EB_U32 inputLumaOffset = ((originY + inputSamples->originY) * inputSamples->strideY) + (originX + inputSamples->originX);
690     const EB_U32 predLumaOffset = ((predSamples->originY+originY) * predSamples->strideY) + (predSamples->originX+originX);
691     const EB_U32 scratchLumaOffset = ((originY & (64 - 1)) * 64) + (originX & (64 - 1));
692 
693 	const EB_U32 inputCbOffset = ((originX + inputSamples->originX) >> subWidthCMinus1) +
694         (((originY + tuChromaOffset + inputSamples->originY) >> subHeightCMinus1) * inputSamples->strideCb);
695 	const EB_U32 inputCrOffset = ((originX + inputSamples->originX) >> subWidthCMinus1) +
696         (((originY + tuChromaOffset + inputSamples->originY) >> subHeightCMinus1) * inputSamples->strideCr);
697 
698 	const EB_U32 predCbOffset = ((predSamples->originX+originX) >> subWidthCMinus1) +
699         (((predSamples->originY+originY+tuChromaOffset) >> subHeightCMinus1) * predSamples->strideCb);
700 	const EB_U32 predCrOffset = ((predSamples->originX+originX) >> subWidthCMinus1) +
701         (((predSamples->originY+originY+tuChromaOffset) >> subHeightCMinus1) * predSamples->strideCr);
702 
703 	const EB_U32 scratchCbOffset = ((originX & (64 - 1)) >> subWidthCMinus1) +
704         ((((originY + tuChromaOffset) & (64 - 1)) >> subHeightCMinus1) * (64 >> subWidthCMinus1));
705 	const EB_U32 scratchCrOffset = ((originX & (64 - 1)) >> subWidthCMinus1) +
706         ((((originY + tuChromaOffset) & (64 - 1)) >> subHeightCMinus1) * (64 >> subWidthCMinus1));
707 
708     EB_U8 enableContouringQCUpdateFlag;
709 
710     enableContouringQCUpdateFlag = DeriveContouringClass(
711         lcuPtr->pictureControlSetPtr->ParentPcsPtr,
712         lcuPtr->index,
713         cuPtr->leafIndex) && (cuPtr->qp < lcuPtr->pictureControlSetPtr->pictureQp);
714 
715 	//**********************************
716 	// Luma
717 	//**********************************
718     if (componentMask & PICTURE_BUFFER_DESC_LUMA_MASK) {
719         PictureResidual(
720             inputSamples->bufferY + inputLumaOffset,
721             inputSamples->strideY,
722             predSamples->bufferY + predLumaOffset,
723             predSamples->strideY,
724             ((EB_S16*)residual16bit->bufferY) + scratchLumaOffset,
725             residual16bit->strideY, //64,
726             tuSize,
727             tuSize);
728 
729         EstimateTransform(
730             ((EB_S16*)residual16bit->bufferY) + scratchLumaOffset,
731             residual16bit->strideY, //64,
732             ((EB_S16*)transform16bit->bufferY) + scratchLumaOffset,
733             transform16bit->strideY, //64,
734             tuSize,
735             transformScratchBuffer,
736             BIT_INCREMENT_8BIT,
737             (EB_BOOL)(tuSize == MIN_PU_SIZE),
738             contextPtr->transCoeffShapeLuma);
739 
740 		UnifiedQuantizeInvQuantize(
741 			contextPtr,
742 			lcuPtr->pictureControlSetPtr,
743 			((EB_S16*)transform16bit->bufferY) + scratchLumaOffset,
744             transform16bit->strideY, //64,
745 			((EB_S16*)coeffSamplesTB->bufferY) + scratchLumaOffset,
746 			((EB_S16*)residual16bit->bufferY) + scratchLumaOffset,
747 			qp,
748 			inputSamples->bitDepth,
749 			tuSize,
750 			sliceType,
751 			&(countNonZeroCoeffs[0]),
752 			contextPtr->transCoeffShapeLuma,
753 			contextPtr->cleanSparseCeoffPfEncDec,
754 			contextPtr->pmpMaskingLevelEncDec,
755 			cuPtr->predictionModeFlag,
756 			0,
757 			enableContouringQCUpdateFlag,
758 			COMPONENT_LUMA,
759 			temporalLayerIndex,
760 			dZoffset,
761 			cabacEncodeCtxPtr,
762 			contextPtr->fullLambda,
763 			intraLumaMode,
764 			EB_INTRA_CHROMA_DM,
765 			CabacCost);
766 
767 		tuPtr->lumaCbf = countNonZeroCoeffs[0] ? EB_TRUE : EB_FALSE;
768 
769         if (tuSize > MIN_PU_SIZE) {
770 		tuPtr->isOnlyDc[0] = (countNonZeroCoeffs[0] == 1 && (((EB_S16*)residual16bit->bufferY) + scratchLumaOffset)[0] != 0 && tuSize != 32) ?
771 			EB_TRUE :
772 			EB_FALSE;
773 
774             if (contextPtr->transCoeffShapeLuma && tuPtr->lumaCbf && tuPtr->isOnlyDc[0] == EB_FALSE) {
775                 if (contextPtr->transCoeffShapeLuma == N2_SHAPE || contextPtr->transCoeffShapeLuma == N4_SHAPE) {
776                     PfZeroOutUselessQuadrants(
777                             ((EB_S16*)residual16bit->bufferY) + scratchLumaOffset,
778                             residual16bit->strideY, //64,
779                             (tuSize >> 1));
780                 }
781 
782                 if (contextPtr->transCoeffShapeLuma == N4_SHAPE) {
783                     PfZeroOutUselessQuadrants(
784                             ((EB_S16*)residual16bit->bufferY) + scratchLumaOffset,
785                             residual16bit->strideY, //64,
786                             (tuSize >> 2));
787                 }
788             }
789         } else {
790 			if (contextPtr->transCoeffShapeLuma && tuPtr->lumaCbf) {
791 				PfZeroOutUselessQuadrants(
792 						((EB_S16*)residual16bit->bufferY) + scratchLumaOffset,
793 						residual16bit->strideY, //64,
794 						(tuSize >> 1));
795 
796 				if (contextPtr->transCoeffShapeLuma == N4_SHAPE) {
797 					PfZeroOutUselessQuadrants(
798 							((EB_S16*)residual16bit->bufferY) + scratchLumaOffset,
799 							residual16bit->strideY, //64,
800 							(tuSize >> 2));
801 				}
802 			}
803 		}
804 	}
805 
806     if (componentMask & PICTURE_BUFFER_DESC_CHROMA_MASK) {
807 		//**********************************
808 		// Cb
809 		//**********************************
810 		PictureResidual(
811 				inputSamples->bufferCb + inputCbOffset,
812 				inputSamples->strideCb,
813 				predSamples->bufferCb + predCbOffset,
814 				predSamples->strideCb,
815 				((EB_S16*)residual16bit->bufferCb) + scratchCbOffset,
816 				residual16bit->strideCb,
817 				tuSize > MIN_PU_SIZE? (tuSize >> subWidthCMinus1): tuSize,
818 				tuSize > MIN_PU_SIZE? (tuSize >> subWidthCMinus1): tuSize);
819 
820 		// For the case that DC path chosen for chroma, we check the DC values and determine to use DC or N2Shape for chroma. Since there is only one flag for ChromaShaping, we do the prediction of Cr and Cb and decide on the chroma shaping
821 		if (tuSize > MIN_PU_SIZE && contextPtr->transCoeffShapeChroma == ONLY_DC_SHAPE) {
822 			EB_S64 sumResidual = SumResidual_funcPtrArray[!!(ASM_TYPES & AVX2_MASK)](
823 					((EB_S16*)residual16bit->bufferCb) + scratchCbOffset,
824 					tuSize >> subWidthCMinus1,
825 					residual16bit->strideCb);
826 
827             // Normalized based on the size.
828 			sumResidual = (ABS(sumResidual) / (tuSize >> subWidthCMinus1) / (tuSize >> subWidthCMinus1));
829 			if (sumResidual > 0) {
830 				contextPtr->transCoeffShapeChroma = N2_SHAPE;
831 			}
832 		}
833 
834 		EstimateTransform(
835 				((EB_S16*)residual16bit->bufferCb) + scratchCbOffset,
836                 residual16bit->strideCb,
837 				((EB_S16*)transform16bit->bufferCb) + scratchCbOffset,
838                 transform16bit->strideCb,
839 				tuSize > MIN_PU_SIZE? (tuSize >> subWidthCMinus1): tuSize,
840 				transformScratchBuffer,
841 				BIT_INCREMENT_8BIT,
842 				EB_FALSE,
843 				contextPtr->transCoeffShapeChroma);
844 
845 		UnifiedQuantizeInvQuantize(
846 				contextPtr,
847 				lcuPtr->pictureControlSetPtr,
848 				((EB_S16*)transform16bit->bufferCb) + scratchCbOffset,
849                 transform16bit->strideCb,
850 				((EB_S16*)coeffSamplesTB->bufferCb) + scratchCbOffset,
851 				((EB_S16*)residual16bit->bufferCb) + scratchCbOffset,
852 				chromaQp,
853 				inputSamples->bitDepth,
854 				tuSize > MIN_PU_SIZE? (tuSize >> subWidthCMinus1): tuSize,
855 				sliceType,
856 				&(countNonZeroCoeffs[1]),
857 				contextPtr->transCoeffShapeChroma,
858 				contextPtr->cleanSparseCeoffPfEncDec,
859 				contextPtr->pmpMaskingLevelEncDec,
860 				cuPtr->predictionModeFlag,
861 				useDeltaQp == EB_TRUE ? contextPtr->forceCbfFlag : 0,
862 				enableContouringQCUpdateFlag,
863 				COMPONENT_CHROMA,
864 				temporalLayerIndex,
865 				0,
866 				cabacEncodeCtxPtr,
867 				contextPtr->fullLambda,
868 				intraLumaMode,
869 				EB_INTRA_CHROMA_DM,
870 				CabacCost);
871 
872 		if (secondChroma) {
873 			tuPtr->cbCbf2 = countNonZeroCoeffs[1] ? EB_TRUE : EB_FALSE;
874 			tuPtr->isOnlyDc2[0] = (countNonZeroCoeffs[1] == 1 && (((EB_S16*)residual16bit->bufferCb) + scratchCbOffset)[0] != 0) ?
875 				EB_TRUE :
876 				EB_FALSE;
877         } else {
878             tuPtr->cbCbf = countNonZeroCoeffs[1] ? EB_TRUE : EB_FALSE;
879 
880             if (tuSize > MIN_PU_SIZE) {
881                 tuPtr->isOnlyDc[1] = (countNonZeroCoeffs[1] == 1 && (((EB_S16*)residual16bit->bufferCb) + scratchCbOffset)[0] != 0) ?
882                     EB_TRUE :
883                     EB_FALSE;
884 
885                 if (contextPtr->transCoeffShapeChroma && tuPtr->cbCbf && tuPtr->isOnlyDc[1] == EB_FALSE) {
886                     if (contextPtr->transCoeffShapeChroma == PF_N2 || contextPtr->transCoeffShapeChroma == PF_N4) {
887                         PfZeroOutUselessQuadrants(
888                                 ((EB_S16*)residual16bit->bufferCb) + scratchCbOffset,
889                                 residual16bit->strideCb,
890                                 (tuSize >> (1 + subWidthCMinus1)));
891                     }
892 
893                     if (contextPtr->transCoeffShapeChroma == PF_N4) {
894                         PfZeroOutUselessQuadrants(
895                                 ((EB_S16*)residual16bit->bufferCb) + scratchCbOffset,
896                                 residual16bit->strideCb,
897                                 (tuSize >> (2 + subWidthCMinus1)));
898                     }
899                 }
900             } else {
901                 if (contextPtr->transCoeffShapeChroma && tuPtr->cbCbf) {
902                     PfZeroOutUselessQuadrants(
903                             ((EB_S16*)residual16bit->bufferCb) + scratchCbOffset,
904                             residual16bit->strideCb,
905                             (tuSize >> 1));
906 
907                     if (contextPtr->transCoeffShapeChroma == PF_N4) {
908                         PfZeroOutUselessQuadrants(
909                                 ((EB_S16*)residual16bit->bufferCb) + scratchCbOffset,
910                                 residual16bit->strideCb,
911                                 (tuSize >> 2));
912                     }
913                 }
914             }
915         }
916 
917 
918 		//**********************************
919 		// Cr
920 		//**********************************
921 		PictureResidual(
922 				inputSamples->bufferCr + inputCrOffset,
923 				inputSamples->strideCr,
924 				predSamples->bufferCr + predCrOffset,
925 				predSamples->strideCr,
926 				((EB_S16*)residual16bit->bufferCr) + scratchCrOffset,
927                 residual16bit->strideCr,
928 				tuSize > MIN_PU_SIZE? (tuSize >> subWidthCMinus1): tuSize,
929 				tuSize > MIN_PU_SIZE? (tuSize >> subWidthCMinus1): tuSize);
930 
931 		if (tuSize > MIN_PU_SIZE && contextPtr->transCoeffShapeChroma == ONLY_DC_SHAPE) {
932 			EB_S64 sumResidual = SumResidual_funcPtrArray[!!(ASM_TYPES & AVX2_MASK)](
933 					((EB_S16*)residual16bit->bufferCr) + scratchCrOffset,
934 					tuSize >> subWidthCMinus1,
935 					residual16bit->strideCr);
936 
937 			sumResidual = (ABS(sumResidual) / (tuSize >> subWidthCMinus1) / (tuSize >> subWidthCMinus1));
938 			if (sumResidual > 0) {
939 				contextPtr->transCoeffShapeChroma = N2_SHAPE;
940 			}
941 		}
942 
943 		EstimateTransform(
944 				((EB_S16*)residual16bit->bufferCr) + scratchCrOffset,
945                 residual16bit->strideCr,
946 				((EB_S16*)transform16bit->bufferCr) + scratchCrOffset,
947                 transform16bit->strideCr,
948 				tuSize > MIN_PU_SIZE? (tuSize >> subWidthCMinus1): tuSize,
949 				transformScratchBuffer,
950 				BIT_INCREMENT_8BIT,
951 				EB_FALSE,
952 				contextPtr->transCoeffShapeChroma);
953 
954         UnifiedQuantizeInvQuantize(
955                 contextPtr,
956                 lcuPtr->pictureControlSetPtr,
957                 ((EB_S16*)transform16bit->bufferCr) + scratchCrOffset,
958                 transform16bit->strideCr,
959                 ((EB_S16*)coeffSamplesTB->bufferCr) + scratchCrOffset,
960                 ((EB_S16*)residual16bit->bufferCr) + scratchCrOffset,
961                 chromaQp,
962                 inputSamples->bitDepth,
963                 tuSize > MIN_PU_SIZE? (tuSize >> subWidthCMinus1): tuSize,
964                 sliceType,
965                 &(countNonZeroCoeffs[2]),
966                 contextPtr->transCoeffShapeChroma,
967                 contextPtr->cleanSparseCeoffPfEncDec,
968                 contextPtr->pmpMaskingLevelEncDec,
969                 cuPtr->predictionModeFlag,
970                 0,
971                 enableContouringQCUpdateFlag,
972                 COMPONENT_CHROMA,
973                 temporalLayerIndex,
974                 0,
975                 cabacEncodeCtxPtr,
976                 contextPtr->fullLambda,
977                 intraLumaMode,
978                 EB_INTRA_CHROMA_DM,
979                 CabacCost);
980 
981         if ((componentMask & PICTURE_BUFFER_DESC_CHROMA_MASK) && secondChroma) {
982 			tuPtr->crCbf2 = countNonZeroCoeffs[2] ? EB_TRUE : EB_FALSE;
983 			tuPtr->isOnlyDc2[1] = (countNonZeroCoeffs[2] == 1 && (((EB_S16*)residual16bit->bufferCr) + scratchCbOffset)[0] != 0) ?
984 				EB_TRUE :
985 				EB_FALSE;
986         } else {
987             tuPtr->crCbf = countNonZeroCoeffs[2] ? EB_TRUE : EB_FALSE;
988 
989             if (tuSize > MIN_PU_SIZE) {
990                 tuPtr->isOnlyDc[2] = (countNonZeroCoeffs[2] == 1 && (((EB_S16*)residual16bit->bufferCr) + scratchCbOffset)[0] != 0) ?
991                     EB_TRUE :
992                     EB_FALSE;
993                 if (contextPtr->transCoeffShapeChroma && tuPtr->crCbf && tuPtr->isOnlyDc[2] == EB_FALSE) {
994 
995                     if (contextPtr->transCoeffShapeChroma == PF_N2 || contextPtr->transCoeffShapeChroma == PF_N4) {
996                         PfZeroOutUselessQuadrants(
997                                 ((EB_S16*)residual16bit->bufferCr) + scratchCrOffset,
998                                 residual16bit->strideCr,
999                                 (tuSize >> (1 + subWidthCMinus1)));
1000                     }
1001 
1002                     if (contextPtr->transCoeffShapeChroma == PF_N4) {
1003                         PfZeroOutUselessQuadrants(
1004                                 ((EB_S16*)residual16bit->bufferCr) + scratchCrOffset,
1005                                 residual16bit->strideCr,
1006                                 (tuSize >> (2 + subWidthCMinus1)));
1007                     }
1008                 }
1009             } else {
1010                 if (contextPtr->transCoeffShapeChroma && tuPtr->crCbf) {
1011                     PfZeroOutUselessQuadrants(
1012                             ((EB_S16*)residual16bit->bufferCr) + scratchCrOffset,
1013                             residual16bit->strideCr,
1014                             (tuSize >> 1));
1015 
1016                     if (contextPtr->transCoeffShapeChroma == PF_N4) {
1017                         PfZeroOutUselessQuadrants(
1018                                 ((EB_S16*)residual16bit->bufferCr) + scratchCrOffset,
1019                                 residual16bit->strideCr,
1020                                 (tuSize >> 2));
1021                     }
1022                 }
1023             }
1024         }
1025 	}
1026 #ifdef DEBUG_REF_INFO
1027     if (lcuPtr->pictureControlSetPtr->pictureNumber == 0) {
1028         {
1029             int chroma_size = tuSize > MIN_PU_SIZE? (tuSize >> subWidthCMinus1): tuSize;
1030 
1031             printf("\n----- Dump coeff for 1st loop at (%d, %d), qp is %d -----\n", originX, originY, qp);
1032             if (componentMask & PICTURE_BUFFER_DESC_LUMA_MASK) {
1033                 dump_block_from_desc(tuSize, coeffSamplesTB, originX&63, originY&63, 0);
1034             }
1035             //if (componentMask & PICTURE_BUFFER_DESC_CHROMA_MASK) {
1036             //    dump_block_from_desc(chroma_size, coeffSamplesTB, originX&63, originY&63, 1);
1037             //}
1038 
1039             printf("\n----- Dump residual for 1st loop at (%d, %d)-----\n", originX, originY);
1040             if (componentMask & PICTURE_BUFFER_DESC_LUMA_MASK) {
1041                 dump_block_from_desc(tuSize, residual16bit, originX&63, originY&63, 0);
1042             }
1043             //if (componentMask & PICTURE_BUFFER_DESC_CHROMA_MASK) {
1044             //    dump_block_from_desc(chroma_size, residual16bit, originX&63, originY&63, 1);
1045             //}
1046         }
1047     }
1048 #endif
1049 
1050     if ((componentMask & PICTURE_BUFFER_DESC_CHROMA_MASK) && secondChroma) {
1051         tuPtr->nzCoefCount2[0] = (EB_U16)countNonZeroCoeffs[1];
1052         tuPtr->nzCoefCount2[1] = (EB_U16)countNonZeroCoeffs[2];
1053 	    tuPtr->transCoeffShapeChroma2 = contextPtr->transCoeffShapeChroma;
1054     } else {
1055 	    tuPtr->transCoeffShapeLuma   = contextPtr->transCoeffShapeLuma;
1056 	    tuPtr->transCoeffShapeChroma = contextPtr->transCoeffShapeChroma;
1057         tuPtr->nzCoefCount[0] = (EB_U16)countNonZeroCoeffs[0];
1058         tuPtr->nzCoefCount[1] = (EB_U16)countNonZeroCoeffs[1];
1059         tuPtr->nzCoefCount[2] = (EB_U16)countNonZeroCoeffs[2];
1060     }
1061 
1062 	return;
1063 }
1064 
1065 /**********************************************************
1066 * Encode Generate Recon
1067 *
1068 * Summary: Performs a H.265 conformant
1069 *   Inverse Transform and generate
1070 *   the reconstructed samples of a TU.
1071 *
1072 * Inputs:
1073 *   originX
1074 *   originY
1075 *   tuSize
1076 *   lcuSize
*   input - Inverse Quantized Coeff (position sensitive)
1078 *   pred - prediction samples (position independent)
1079 *
1080 * Outputs:
1081 *   Recon  (position independent)
1082 *
1083 **********************************************************/
static void EncodeGenerateRecon(
    EncDecContext_t       *contextPtr,
    EB_U32                 originX,
    EB_U32                 originY,
    EB_U32                 componentMask,
    EB_COLOR_FORMAT        colorFormat,
    EB_BOOL                secondChroma,
    EB_U32                 tuSize,
    EbPictureBufferDesc_t *predSamples,     // no basis/offset
    EbPictureBufferDesc_t *residual16bit,    // no basis/offset
    EB_S16                *transformScratchBuffer)
{
    /*
     * Reconstructs the current TU (contextPtr->cuPtr, index contextPtr->tuItr).
     *
     * The prediction already sits in predSamples; for every plane selected by
     * componentMask whose CBF is set, the coefficients held in residual16bit
     * are inverse-transformed in place and then added onto the prediction.
     * The sum is written back to the same location of predSamples, i.e. the
     * reconstruction overwrites the prediction.
     *
     * secondChroma selects the "...2" chroma fields of the TU (cbCbf2, crCbf2,
     * isOnlyDc2, transCoeffShapeChroma2) and, for EB_YUV422, shifts the chroma
     * block down by tuSize/2 luma rows.
     */
    CodingUnit_t    *codingUnit    = contextPtr->cuPtr;
    TransformUnit_t *transformUnit = &codingUnit->transformUnitArray[contextPtr->tuItr];

    // Nothing is inverse-transformed or added when the CU skip flag is set:
    // the prediction is left untouched as the reconstruction.
    if (codingUnit->skipFlag != EB_FALSE) {
        return;
    }

    const EB_BOOL isMinTu = (EB_BOOL)(tuSize == MIN_PU_SIZE);

    //**********************************
    // Luma
    //**********************************
    if ((componentMask & PICTURE_BUFFER_DESC_LUMA_MASK) && transformUnit->lumaCbf == EB_TRUE) {
        // Position inside the picture buffer (shared by prediction and recon).
        const EB_U32 lumaPicOffset =
            (predSamples->originY + originY) * predSamples->strideY + (predSamples->originX + originX);
        // Position inside the 64x64 scratch block.
        const EB_U32 lumaScratchOffset = ((originY & (63)) * 64) + (originX & (63));

        EB_S16 *lumaResidual = ((EB_S16*)residual16bit->bufferY) + lumaScratchOffset;

        // The DC-only shortcut is never requested for the minimum TU size.
        EB_BOOL lumaDcOnly = EB_FALSE;
        if (!isMinTu) {
            lumaDcOnly = (transformUnit->transCoeffShapeLuma == ONLY_DC_SHAPE || transformUnit->isOnlyDc[0]);
        }

        EncodeInvTransform(
            lumaDcOnly,
            lumaResidual,
            residual16bit->strideY,
            lumaResidual,
            residual16bit->strideY,
            tuSize,
            transformScratchBuffer,
            BIT_INCREMENT_8BIT,
            isMinTu);

        AdditionKernel_funcPtrArray[!!(ASM_TYPES & PREAVX2_MASK)][tuSize >> 3](
            predSamples->bufferY + lumaPicOffset,
            predSamples->strideY,
            lumaResidual,
            residual16bit->strideY,
            predSamples->bufferY + lumaPicOffset,
            predSamples->strideY,
            tuSize,
            tuSize);
    }

    //**********************************
    // Chroma
    //**********************************
    if (!(componentMask & PICTURE_BUFFER_DESC_CHROMA_MASK)) {
        return;
    }

    // Chroma subsampling shifts derived from the colour format.
    const EB_U16 chromaShiftX = (colorFormat == EB_YUV444) ? 0 : 1;
    const EB_U16 chromaShiftY = (colorFormat >= EB_YUV422) ? 0 : 1;
    // Minimum-size TUs keep the luma dimension for their chroma block.
    const EB_U16 sizeShift = isMinTu ? 0 : chromaShiftX;

    EB_U16 secondBlockRowOffset = 0;
    if (colorFormat == EB_YUV422 && secondChroma) {
        secondBlockRowOffset = tuSize >> 1;
    }

    const EB_U32 chromaSize      = tuSize >> sizeShift;
    const EB_U32 chromaKernelIdx = tuSize >> (3 + sizeShift);

    // Column/row of the chroma block inside the picture buffer; only the
    // stride differs between the Cb and Cr planes.
    const EB_U32 chromaPicX = (predSamples->originX + originX) >> chromaShiftX;
    const EB_U32 chromaPicY = (predSamples->originY + originY + secondBlockRowOffset) >> chromaShiftY;

    // Position inside the scratch block; identical for Cb and Cr.
    const EB_U32 chromaScratchOffset = ((originX & 63) >> chromaShiftX) +
        (((originY + secondBlockRowOffset) & 63) >> chromaShiftY) * (64 >> chromaShiftX);

    const EB_BOOL cbCoded = secondChroma ? transformUnit->cbCbf2 : transformUnit->cbCbf;
    const EB_BOOL crCoded = secondChroma ? transformUnit->crCbf2 : transformUnit->crCbf;

    //**********************************
    // Cb
    //**********************************
    if (cbCoded == EB_TRUE) {
        const EB_U32 cbPicOffset = chromaPicX + (chromaPicY * predSamples->strideCb);
        EB_S16 *cbResidual = ((EB_S16*)residual16bit->bufferCb) + chromaScratchOffset;

        EB_BOOL cbDcOnly = EB_FALSE;
        if (!isMinTu) {
            if (secondChroma) {
                cbDcOnly = (transformUnit->transCoeffShapeChroma2 == ONLY_DC_SHAPE || transformUnit->isOnlyDc2[0]);
            } else {
                cbDcOnly = (transformUnit->transCoeffShapeChroma == ONLY_DC_SHAPE || transformUnit->isOnlyDc[1]);
            }
        }

        EncodeInvTransform(
            cbDcOnly,
            cbResidual,
            residual16bit->strideCb,
            cbResidual,
            residual16bit->strideCb,
            chromaSize,
            transformScratchBuffer,
            BIT_INCREMENT_8BIT,
            EB_FALSE);

        AdditionKernel_funcPtrArray[!!(ASM_TYPES & PREAVX2_MASK)][chromaKernelIdx](
            predSamples->bufferCb + cbPicOffset,
            predSamples->strideCb,
            cbResidual,
            residual16bit->strideCb,
            predSamples->bufferCb + cbPicOffset,
            predSamples->strideCb,
            chromaSize,
            chromaSize);
    }

    //**********************************
    // Cr
    //**********************************
    if (crCoded == EB_TRUE) {
        const EB_U32 crPicOffset = chromaPicX + (chromaPicY * predSamples->strideCr);
        EB_S16 *crResidual = ((EB_S16*)residual16bit->bufferCr) + chromaScratchOffset;

        EB_BOOL crDcOnly = EB_FALSE;
        if (!isMinTu) {
            if (secondChroma) {
                crDcOnly = (transformUnit->transCoeffShapeChroma2 == ONLY_DC_SHAPE || transformUnit->isOnlyDc2[1]);
            } else {
                crDcOnly = (transformUnit->transCoeffShapeChroma == ONLY_DC_SHAPE || transformUnit->isOnlyDc[2]);
            }
        }

        EncodeInvTransform(
            crDcOnly,
            crResidual,
            residual16bit->strideCr,
            crResidual,
            residual16bit->strideCr,
            chromaSize,
            transformScratchBuffer,
            BIT_INCREMENT_8BIT,
            EB_FALSE);

        AdditionKernel_funcPtrArray[!!(ASM_TYPES & PREAVX2_MASK)][chromaKernelIdx](
            predSamples->bufferCr + crPicOffset,
            predSamples->strideCr,
            crResidual,
            residual16bit->strideCr,
            predSamples->bufferCr + crPicOffset,
            predSamples->strideCr,
            chromaSize,
            chromaSize);
    }

    return;
}
1225 
1226 /**********************************************************
1227 * Encode Loop
1228 *
1229 * Summary: Performs a H.265 conformant
1230 *   Transform, Quantization  and Inverse Quantization of a TU.
1231 *
1232 * Inputs:
1233 *   originX
1234 *   originY
1235 *   tuSize
1236 *   lcuSize
1237 *   input - input samples (position sensitive)
1238 *   pred - prediction samples (position independent)
1239 *
1240 * Outputs:
1241 *   Inverse quantized coeff - quantization indices (position sensitive)
1242 *
1243 **********************************************************/
EncodeLoop16bit(EncDecContext_t * contextPtr,LargestCodingUnit_t * lcuPtr,EB_U32 originX,EB_U32 originY,EB_U32 cbQp,EbPictureBufferDesc_t * predSamples,EbPictureBufferDesc_t * coeffSamplesTB,EbPictureBufferDesc_t * residual16bit,EbPictureBufferDesc_t * transform16bit,EB_S16 * transformScratchBuffer,EB_U32 * countNonZeroCoeffs,EB_U32 useDeltaQp,CabacEncodeContext_t * cabacEncodeCtxPtr,EB_U32 intraLumaMode,EB_U32 componentMask,EB_COLOR_FORMAT colorFormat,EB_BOOL secondChroma,EB_U32 tuSize,CabacCost_t * CabacCost,EB_U32 dZoffset)1244 static void EncodeLoop16bit(
1245 	EncDecContext_t				*contextPtr,
1246 	LargestCodingUnit_t			*lcuPtr,
1247 	EB_U32						 originX,
1248 	EB_U32						 originY,
1249 	EB_U32						 cbQp,
1250 	EbPictureBufferDesc_t		*predSamples,         // no basis/offset
1251 	EbPictureBufferDesc_t		*coeffSamplesTB,      // lcu based
1252 	EbPictureBufferDesc_t		*residual16bit,       // no basis/offset
1253 	EbPictureBufferDesc_t		*transform16bit,      // no basis/offset
1254 	EB_S16						*transformScratchBuffer,
1255 	EB_U32						*countNonZeroCoeffs,
1256 	EB_U32						 useDeltaQp,
1257 	CabacEncodeContext_t		*cabacEncodeCtxPtr,
1258 	EB_U32						 intraLumaMode,
1259     EB_U32                       componentMask,
1260     EB_COLOR_FORMAT              colorFormat,
1261     EB_BOOL                      secondChroma,
1262     EB_U32                       tuSize,
1263 	CabacCost_t					*CabacCost,
1264 	EB_U32						 dZoffset)
1265 {
1266     EB_U32 chromaQp = cbQp;
1267     CodingUnit_t *cuPtr = contextPtr->cuPtr;
1268     TransformUnit_t *tuPtr = &cuPtr->transformUnitArray[contextPtr->tuItr];
1269     EB_PICTURE sliceType = lcuPtr->pictureControlSetPtr->sliceType;
1270     EB_U32 temporalLayerIndex = lcuPtr->pictureControlSetPtr->temporalLayerIndex;
1271     EB_U32 qp = cuPtr->qp;
1272     EbPictureBufferDesc_t *inputSamples16bit = contextPtr->inputSample16bitBuffer; //64x64 for 16bit, whole frame for 8bit
1273     EbPictureBufferDesc_t *predSamples16bit = predSamples;
1274 
1275     const EB_U16 subWidthCMinus1 = (colorFormat == EB_YUV444 ? 1 : 2) - 1;
1276     const EB_U16 subHeightCMinus1 = (colorFormat >= EB_YUV422 ? 1 : 2) - 1;
1277     EB_U16 tuChromaOffset = 0;
1278     if (colorFormat == EB_YUV422 && secondChroma) {
1279         tuChromaOffset = tuSize >> 1;
1280     }
1281 
1282     const EB_U32 inputLumaOffset = ((originY & 63) * inputSamples16bit->strideY) + (originX & 63);
1283     const EB_U32 predLumaOffset = ((predSamples16bit->originY + originY) * predSamples16bit->strideY) + (predSamples16bit->originX + originX);
1284     const EB_U32 scratchLumaOffset  = ((originY & 63) * 64) + (originX & 63);
1285 
1286 	const EB_U32 inputCbOffset = ((((originY + tuChromaOffset) & 63) >> subHeightCMinus1) * inputSamples16bit->strideCb) + ((originX & 63) >> subWidthCMinus1);
1287 	const EB_U32 inputCrOffset = ((((originY + tuChromaOffset) & 63) >> subHeightCMinus1) * inputSamples16bit->strideCr) + ((originX & 63) >> subWidthCMinus1);
1288 
1289 	const EB_U32 predCbOffset = ((predSamples->originX + originX) >> subWidthCMinus1) +
1290         (((predSamples->originY + originY + tuChromaOffset) >> subHeightCMinus1) * predSamples->strideCb);
1291 	const EB_U32 predCrOffset = ((predSamples->originX + originX) >> subWidthCMinus1) +
1292         (((predSamples->originY + originY + tuChromaOffset) >> subHeightCMinus1) * predSamples->strideCr);
1293 
1294 	const EB_U32 scratchCbOffset = ((originX & (64 - 1)) >> subWidthCMinus1) +
1295         ((((originY + tuChromaOffset) & (64 - 1)) >> subHeightCMinus1) * (64 >> subWidthCMinus1));
1296 	const EB_U32 scratchCrOffset = ((originX & (64 - 1)) >> subWidthCMinus1) +
1297         ((((originY + tuChromaOffset) & (64 - 1)) >> subHeightCMinus1) * (64 >> subWidthCMinus1));
1298 
1299     EB_U8 enableContouringQCUpdateFlag;
1300 
1301     enableContouringQCUpdateFlag = DeriveContouringClass(
1302         lcuPtr->pictureControlSetPtr->ParentPcsPtr,
1303         lcuPtr->index,
1304         cuPtr->leafIndex) && (cuPtr->qp < lcuPtr->pictureControlSetPtr->pictureQp);
1305 
1306     //Update QP for Quant
1307 	qp += QP_BD_OFFSET;
1308 	chromaQp += QP_BD_OFFSET;
1309 
1310 
1311     if (componentMask & PICTURE_BUFFER_DESC_LUMA_MASK) {
1312 		PictureResidual16bit(
1313 			((EB_U16*)inputSamples16bit->bufferY) + inputLumaOffset,
1314 			inputSamples16bit->strideY,
1315 			((EB_U16*)predSamples16bit->bufferY) + predLumaOffset,
1316 			predSamples16bit->strideY,
1317             ((EB_S16*)residual16bit->bufferY) + scratchLumaOffset,
1318             64,
1319             tuSize,
1320             tuSize);
1321 
1322         EncodeTransform(
1323             ((EB_S16*)residual16bit->bufferY) + scratchLumaOffset,
1324             64,
1325             ((EB_S16*)transform16bit->bufferY) + scratchLumaOffset,
1326             64,
1327             tuSize,
1328             transformScratchBuffer,
1329             BIT_INCREMENT_10BIT,
1330             (EB_BOOL)(tuSize == MIN_PU_SIZE),
1331             contextPtr->transCoeffShapeLuma);
1332 
1333 		UnifiedQuantizeInvQuantize(
1334 			contextPtr,
1335 			lcuPtr->pictureControlSetPtr,
1336 			((EB_S16*)transform16bit->bufferY) + scratchLumaOffset,
1337 			64,
1338 			((EB_S16*)coeffSamplesTB->bufferY) + scratchLumaOffset,
1339 			((EB_S16*)residual16bit->bufferY) + scratchLumaOffset,
1340 			qp,
1341 			EB_10BIT,
1342 			tuSize,
1343 			sliceType,
1344 			&(countNonZeroCoeffs[0]),
1345 			contextPtr->transCoeffShapeLuma,
1346 			contextPtr->cleanSparseCeoffPfEncDec,
1347 			contextPtr->pmpMaskingLevelEncDec,
1348 			cuPtr->predictionModeFlag,
1349 			0,
1350 			enableContouringQCUpdateFlag,
1351 			COMPONENT_LUMA,
1352 			temporalLayerIndex,
1353 			dZoffset,
1354 			cabacEncodeCtxPtr,
1355 			contextPtr->fullLambda,
1356 			intraLumaMode,
1357 			EB_INTRA_CHROMA_DM,
1358 			CabacCost);
1359 
1360 		tuPtr->lumaCbf = countNonZeroCoeffs[0] ? EB_TRUE : EB_FALSE;
1361 
1362         if (tuSize > MIN_PU_SIZE) {
1363 		tuPtr->isOnlyDc[0] = (countNonZeroCoeffs[0] == 1 && (((EB_S16*)residual16bit->bufferY) + scratchLumaOffset)[0] != 0 && tuSize != 32) ?
1364 			EB_TRUE :
1365 			EB_FALSE;
1366 
1367             if (contextPtr->transCoeffShapeLuma && tuPtr->lumaCbf && tuPtr->isOnlyDc[0] == EB_FALSE) {
1368                 if (contextPtr->transCoeffShapeLuma == N2_SHAPE || contextPtr->transCoeffShapeLuma == N4_SHAPE) {
1369                     PfZeroOutUselessQuadrants(
1370                             ((EB_S16*)residual16bit->bufferY) + scratchLumaOffset,
1371                             64,
1372                             (tuSize >> 1));
1373                 }
1374 
1375                 if (contextPtr->transCoeffShapeLuma == N4_SHAPE) {
1376                     PfZeroOutUselessQuadrants(
1377                             ((EB_S16*)residual16bit->bufferY) + scratchLumaOffset,
1378                             64,
1379                             (tuSize >> 2));
1380                 }
1381             }
1382         } else {
1383 			if (contextPtr->transCoeffShapeLuma && tuPtr->lumaCbf) {
1384 
1385 				PfZeroOutUselessQuadrants(
1386 						((EB_S16*)residual16bit->bufferY) + scratchLumaOffset,
1387 						64,
1388 						(tuSize >> 1));
1389 
1390 				if (contextPtr->transCoeffShapeLuma == N4_SHAPE) {
1391 
1392 					PfZeroOutUselessQuadrants(
1393 							((EB_S16*)residual16bit->bufferY) + scratchLumaOffset,
1394 							64,
1395 							(tuSize >> 2));
1396 				}
1397 			}
1398 		}
1399 	}
1400 
1401     if (componentMask & PICTURE_BUFFER_DESC_CHROMA_MASK) {
1402 		//**********************************
1403 		// Cb
1404 		//**********************************
1405 		PictureResidual16bit(
1406 				((EB_U16*)inputSamples16bit->bufferCb) + inputCbOffset,
1407 				inputSamples16bit->strideCb,
1408 				((EB_U16*)predSamples16bit->bufferCb) + predCbOffset,
1409 				predSamples16bit->strideCb,
1410 				((EB_S16*)residual16bit->bufferCb) + scratchCbOffset,
1411 				residual16bit->strideCb,
1412 				tuSize > MIN_PU_SIZE? (tuSize >> subWidthCMinus1): tuSize,
1413 				tuSize > MIN_PU_SIZE? (tuSize >> subWidthCMinus1): tuSize);
1414 
1415 		// For the case that DC path chosen for chroma, we check the DC values and determine to use DC or N2Shape for chroma. Since there is only one flag for ChromaShaping, we do the prediction of Cr and Cb and decide on the chroma shaping
1416 		if (tuSize > MIN_PU_SIZE && contextPtr->transCoeffShapeChroma == ONLY_DC_SHAPE) {
1417 			EB_S64 sumResidual = SumResidual_funcPtrArray[!!(ASM_TYPES & AVX2_MASK)](
1418 					((EB_S16*)residual16bit->bufferCb) + scratchCbOffset,
1419 					tuSize >> subWidthCMinus1,
1420 					residual16bit->strideCb);
1421 			sumResidual = (ABS(sumResidual) / (tuSize >> subWidthCMinus1) / (tuSize >> subWidthCMinus1)); // Normalized based on the size. For chroma, tusize/2 +Tusize/2
1422 			if (sumResidual > (1 << BIT_INCREMENT_10BIT)) {
1423 				contextPtr->transCoeffShapeChroma = N2_SHAPE;
1424 			}
1425 		}
1426 
1427 		EncodeTransform(
1428 				((EB_S16*)residual16bit->bufferCb) + scratchCbOffset,
1429 				residual16bit->strideCb,
1430 				((EB_S16*)transform16bit->bufferCb) + scratchCbOffset,
1431 				transform16bit->strideCb,
1432 				tuSize > MIN_PU_SIZE? (tuSize >> subWidthCMinus1): tuSize,
1433 				transformScratchBuffer,
1434 				BIT_INCREMENT_10BIT,
1435 				EB_FALSE,
1436 				contextPtr->transCoeffShapeChroma);
1437 
1438 		UnifiedQuantizeInvQuantize(
1439 				contextPtr,
1440 				lcuPtr->pictureControlSetPtr,
1441 				((EB_S16*)transform16bit->bufferCb) + scratchCbOffset,
1442 				transform16bit->strideCb,
1443 				((EB_S16*)coeffSamplesTB->bufferCb) + scratchCbOffset,
1444 				((EB_S16*)residual16bit->bufferCb) + scratchCbOffset,
1445 				chromaQp,
1446 				EB_10BIT,
1447 				tuSize > MIN_PU_SIZE? (tuSize >> subWidthCMinus1): tuSize,
1448 				sliceType,
1449 				&(countNonZeroCoeffs[1]),
1450 				contextPtr->transCoeffShapeChroma,
1451 				contextPtr->cleanSparseCeoffPfEncDec,
1452 				contextPtr->pmpMaskingLevelEncDec,
1453 				cuPtr->predictionModeFlag,
1454 				0, //useDeltaQp == EB_TRUE ? contextPtr->forceCbfFlag : 0
1455 				enableContouringQCUpdateFlag,
1456 				COMPONENT_CHROMA,
1457 				temporalLayerIndex,
1458 				0,
1459 				cabacEncodeCtxPtr,
1460 				contextPtr->fullLambda,
1461 				intraLumaMode,
1462 				EB_INTRA_CHROMA_DM,
1463 				CabacCost);
1464 
1465 
1466 		if ((componentMask & PICTURE_BUFFER_DESC_CHROMA_MASK) && secondChroma) {
1467 			tuPtr->cbCbf2 = countNonZeroCoeffs[1] ? EB_TRUE : EB_FALSE;
1468 			tuPtr->isOnlyDc2[0] = (countNonZeroCoeffs[1] == 1 && (((EB_S16*)residual16bit->bufferCb) + scratchCbOffset)[0] != 0) ?
1469 				EB_TRUE :
1470 				EB_FALSE;
1471         } else {
1472             tuPtr->cbCbf = countNonZeroCoeffs[1] ? EB_TRUE : EB_FALSE;
1473 
1474             if (tuSize > MIN_PU_SIZE) {
1475                 tuPtr->isOnlyDc[1] = (countNonZeroCoeffs[1] == 1 && (((EB_S16*)residual16bit->bufferCb) + scratchCbOffset)[0] != 0) ?
1476                     EB_TRUE :
1477                     EB_FALSE;
1478 
1479                 if (contextPtr->transCoeffShapeChroma && tuPtr->cbCbf && tuPtr->isOnlyDc[1] == EB_FALSE) {
1480                     if (contextPtr->transCoeffShapeChroma == PF_N2 || contextPtr->transCoeffShapeChroma == PF_N4) {
1481                         PfZeroOutUselessQuadrants(
1482                                 ((EB_S16*)residual16bit->bufferCb) + scratchCbOffset,
1483                                 residual16bit->strideCb,
1484                                 (tuSize >> (1 + subWidthCMinus1)));
1485                     }
1486 
1487                     if (contextPtr->transCoeffShapeChroma == PF_N4) {
1488                         PfZeroOutUselessQuadrants(
1489                                 ((EB_S16*)residual16bit->bufferCb) + scratchCbOffset,
1490                                 residual16bit->strideCb,
1491                                 (tuSize >> (2 + subWidthCMinus1)));
1492                     }
1493                 }
1494             } else {
1495                 if (contextPtr->transCoeffShapeChroma && tuPtr->cbCbf) {
1496                     PfZeroOutUselessQuadrants(
1497                             ((EB_S16*)residual16bit->bufferCb) + scratchCbOffset,
1498                             residual16bit->strideCb,
1499                             (tuSize >> 1));
1500 
1501                     if (contextPtr->transCoeffShapeChroma == PF_N4) {
1502                         PfZeroOutUselessQuadrants(
1503                                 ((EB_S16*)residual16bit->bufferCb) + scratchCbOffset,
1504                                 residual16bit->strideCb,
1505                                 (tuSize >> 2));
1506                     }
1507                 }
1508             }
1509         }
1510 
1511 
1512 		//**********************************
1513 		// Cr
1514 		//**********************************
1515         PictureResidual16bit(
1516 				((EB_U16*)inputSamples16bit->bufferCr) + inputCrOffset,
1517 				inputSamples16bit->strideCr,
1518 				((EB_U16*)predSamples16bit->bufferCr) + predCrOffset,
1519 				predSamples16bit->strideCr,
1520 				((EB_S16*)residual16bit->bufferCr) + scratchCrOffset,
1521 				residual16bit->strideCr,
1522 				tuSize > MIN_PU_SIZE? (tuSize >> subWidthCMinus1): tuSize,
1523 				tuSize > MIN_PU_SIZE? (tuSize >> subWidthCMinus1): tuSize);
1524 
1525 		if (tuSize > MIN_PU_SIZE && contextPtr->transCoeffShapeChroma == ONLY_DC_SHAPE) {
1526 			EB_S64 sumResidual = SumResidual_funcPtrArray[!!(ASM_TYPES & AVX2_MASK)](
1527 					((EB_S16*)residual16bit->bufferCr) + scratchCrOffset,
1528 					tuSize >> subWidthCMinus1,
1529 					residual16bit->strideCr);
1530 
1531 			sumResidual = (ABS(sumResidual) / (tuSize >> subWidthCMinus1) / (tuSize >> subWidthCMinus1)); // Normalized based on the size. For chroma, tusize/2 +Tusize/2
1532 			if (sumResidual > (1 << BIT_INCREMENT_10BIT)) {
1533 				contextPtr->transCoeffShapeChroma = N2_SHAPE;
1534 			}
1535 		}
1536 
1537 		EncodeTransform(
1538 				((EB_S16*)residual16bit->bufferCr) + scratchCrOffset,
1539 				residual16bit->strideCr,
1540 				((EB_S16*)transform16bit->bufferCr) + scratchCrOffset,
1541 				transform16bit->strideCr,
1542 				tuSize > MIN_PU_SIZE? (tuSize >> subWidthCMinus1): tuSize,
1543 				transformScratchBuffer,
1544 				BIT_INCREMENT_10BIT,
1545 				EB_FALSE,
1546 				contextPtr->transCoeffShapeChroma);
1547 
1548 
1549 		{
1550 			UnifiedQuantizeInvQuantize(
1551 					contextPtr,
1552 					lcuPtr->pictureControlSetPtr,
1553 					((EB_S16*)transform16bit->bufferCr) + scratchCrOffset,
1554 					transform16bit->strideCr,
1555 					((EB_S16*)coeffSamplesTB->bufferCr) + scratchCrOffset,
1556 					((EB_S16*)residual16bit->bufferCr) + scratchCrOffset,
1557 					chromaQp,
1558 					EB_10BIT,
1559 					tuSize > MIN_PU_SIZE? (tuSize >> subWidthCMinus1): tuSize,
1560 					sliceType,
1561 					&(countNonZeroCoeffs[2]),
1562 					contextPtr->transCoeffShapeChroma,
1563 					contextPtr->cleanSparseCeoffPfEncDec,
1564 					contextPtr->pmpMaskingLevelEncDec,
1565 					cuPtr->predictionModeFlag,
1566 					useDeltaQp == EB_TRUE ? contextPtr->forceCbfFlag : 0, //Jing: double check here, not align with Cb
1567 					enableContouringQCUpdateFlag,
1568 					COMPONENT_CHROMA,
1569 					temporalLayerIndex,
1570 					0,
1571 					cabacEncodeCtxPtr,
1572 					contextPtr->fullLambda,
1573 					intraLumaMode,
1574 					EB_INTRA_CHROMA_DM,
1575 					CabacCost);
1576 		}
1577 
1578 		if ((componentMask & PICTURE_BUFFER_DESC_CHROMA_MASK) && secondChroma) {
1579 
1580 			tuPtr->crCbf2 = countNonZeroCoeffs[2] ? EB_TRUE : EB_FALSE;
1581 			tuPtr->isOnlyDc2[1] = (countNonZeroCoeffs[2] == 1 && (((EB_S16*)residual16bit->bufferCr) + scratchCbOffset)[0] != 0) ?
1582 				EB_TRUE :
1583 				EB_FALSE;
1584         } else {
1585             tuPtr->crCbf = countNonZeroCoeffs[2] ? EB_TRUE : EB_FALSE;
1586 
1587             if (tuSize > MIN_PU_SIZE) {
1588                 tuPtr->isOnlyDc[2] = (countNonZeroCoeffs[2] == 1 && (((EB_S16*)residual16bit->bufferCr) + scratchCbOffset)[0] != 0) ?
1589                     EB_TRUE :
1590                     EB_FALSE;
1591                 if (contextPtr->transCoeffShapeChroma && tuPtr->crCbf && tuPtr->isOnlyDc[2] == EB_FALSE) {
1592 
1593                     if (contextPtr->transCoeffShapeChroma == PF_N2 || contextPtr->transCoeffShapeChroma == PF_N4) {
1594                         PfZeroOutUselessQuadrants(
1595                                 ((EB_S16*)residual16bit->bufferCr) + scratchCrOffset,
1596                                 residual16bit->strideCr,
1597                                 (tuSize >> (1 + subWidthCMinus1)));
1598                     }
1599 
1600                     if (contextPtr->transCoeffShapeChroma == PF_N4) {
1601                         PfZeroOutUselessQuadrants(
1602                                 ((EB_S16*)residual16bit->bufferCr) + scratchCrOffset,
1603                                 residual16bit->strideCr,
1604                                 (tuSize >> (2 + subWidthCMinus1)));
1605                     }
1606                 }
1607             } else {
1608                 if (contextPtr->transCoeffShapeChroma && tuPtr->crCbf) {
1609                     PfZeroOutUselessQuadrants(
1610                             ((EB_S16*)residual16bit->bufferCr) + scratchCrOffset,
1611                             residual16bit->strideCr,
1612                             (tuSize >> 1));
1613 
1614                     if (contextPtr->transCoeffShapeChroma == PF_N4) {
1615                         PfZeroOutUselessQuadrants(
1616                                 ((EB_S16*)residual16bit->bufferCr) + scratchCrOffset,
1617                                 residual16bit->strideCr,
1618                                 (tuSize >> 2));
1619                     }
1620                 }
1621             }
1622         }
1623 	}
1624 
1625 
1626     if ((componentMask & PICTURE_BUFFER_DESC_CHROMA_MASK) && secondChroma) {
1627         tuPtr->nzCoefCount2[0] = (EB_U16)countNonZeroCoeffs[1];
1628         tuPtr->nzCoefCount2[1] = (EB_U16)countNonZeroCoeffs[2];
1629 	    tuPtr->transCoeffShapeChroma2 = contextPtr->transCoeffShapeChroma;
1630     } else {
1631 	    tuPtr->transCoeffShapeLuma   = contextPtr->transCoeffShapeLuma;
1632 	    tuPtr->transCoeffShapeChroma = contextPtr->transCoeffShapeChroma;
1633         tuPtr->nzCoefCount[0] = (EB_U16)countNonZeroCoeffs[0];
1634         tuPtr->nzCoefCount[1] = (EB_U16)countNonZeroCoeffs[1];
1635         tuPtr->nzCoefCount[2] = (EB_U16)countNonZeroCoeffs[2];
1636     }
1637 	return;
1638 }
1639 
1640 
1641 /**********************************************************
1642 * Encode Generate Recon
1643 *
1644 * Summary: Performs a H.265 conformant
1645 *   Inverse Transform and generate
1646 *   the reconstructed samples of a TU.
1647 *
1648 * Inputs:
1649 *   originX
1650 *   originY
1651 *   tuSize
1652 *   lcuSize
1653 *   input - Inverse Qunatized Coeff (position sensitive)
1654 *   pred - prediction samples (position independent)
1655 *
1656 * Outputs:
1657 *   Recon  (position independent)
1658 *
1659 **********************************************************/
EncodeGenerateRecon16bit(EncDecContext_t * contextPtr,EB_U32 originX,EB_U32 originY,EB_U32 componentMask,EB_COLOR_FORMAT colorFormat,EB_BOOL secondChroma,EB_U32 tuSize,EbPictureBufferDesc_t * predSamples,EbPictureBufferDesc_t * residual16bit,EB_S16 * transformScratchBuffer)1660 static void EncodeGenerateRecon16bit(
1661     EncDecContext_t       *contextPtr,
1662     EB_U32                 originX,
1663     EB_U32                 originY,
1664     EB_U32                 componentMask,
1665     EB_COLOR_FORMAT        colorFormat,
1666     EB_BOOL                secondChroma,
1667     EB_U32                 tuSize,
1668     EbPictureBufferDesc_t *predSamples,     // no basis/offset
1669     EbPictureBufferDesc_t *residual16bit,    // no basis/offset
1670     EB_S16                *transformScratchBuffer)
1671 {
1672 	EB_U32 predLumaOffset;
1673 	EB_U32 predChromaOffset;
1674 	EB_U32 scratchLumaOffset;
1675 	EB_U32 scratchChromaOffset;
1676 	EB_U32 reconLumaOffset;
1677 	EB_U32 reconChromaOffset;
1678 
1679     CodingUnit_t		  *cuPtr              = contextPtr->cuPtr;
1680     TransformUnit_t       *tuPtr              = &cuPtr->transformUnitArray[contextPtr->tuItr];
1681 
1682     const EB_U16 subWidthCMinus1 = (colorFormat == EB_YUV444 ? 1 : 2) - 1;
1683     const EB_U16 subHeightCMinus1 = (colorFormat >= EB_YUV422 ? 1 : 2) - 1;
1684     const EB_U16 shift_bit = (tuSize == MIN_PU_SIZE) ? 0 : subWidthCMinus1;
1685     EB_U16 tuChromaOffset = 0;
1686     if (colorFormat == EB_YUV422 && secondChroma) {
1687         tuChromaOffset = tuSize >> 1;
1688     }
1689     EB_BOOL cbCbf=secondChroma?tuPtr->cbCbf2:tuPtr->cbCbf;
1690     EB_BOOL crCbf=secondChroma?tuPtr->crCbf2:tuPtr->crCbf;
1691 	// *Note - The prediction is built in-place in the Recon buffer. It is overwritten with Reconstructed
1692 	//   samples if the CBF==1 && SKIP==False
1693 
1694 	//**********************************
1695 	// Luma
1696 	//**********************************
1697     if (componentMask & PICTURE_BUFFER_DESC_LUMA_MASK) {
1698         predLumaOffset = (predSamples->originY+originY) * predSamples->strideY + (predSamples->originX+originX);
1699 		scratchLumaOffset = ((originY & (63)) * 64) + (originX & (63));
1700         reconLumaOffset = (predSamples->originY + originY)* predSamples->strideY + (predSamples->originX + originX);
1701 
1702 		if (tuPtr->lumaCbf == EB_TRUE && cuPtr->skipFlag == EB_FALSE) {
1703 			EncodeInvTransform(
1704 				(tuSize==MIN_PU_SIZE)?EB_FALSE:(tuPtr->transCoeffShapeLuma == ONLY_DC_SHAPE || tuPtr->isOnlyDc[0]),
1705 				((EB_S16*)residual16bit->bufferY) + scratchLumaOffset,
1706 				64,
1707 				((EB_S16*)residual16bit->bufferY) + scratchLumaOffset,
1708 				64,
1709 				tuSize,
1710 				transformScratchBuffer,
1711 				BIT_INCREMENT_10BIT,
1712 				(EB_BOOL)(tuSize == MIN_PU_SIZE));
1713 
1714             AdditionKernel_funcPtrArray16bit[!!(ASM_TYPES & PREAVX2_MASK)](
1715                 (EB_U16*)predSamples->bufferY + predLumaOffset,
1716                 predSamples->strideY,
1717                 ((EB_S16*)residual16bit->bufferY) + scratchLumaOffset,
1718                 64,
1719                 (EB_U16*)predSamples->bufferY + reconLumaOffset,
1720                 predSamples->strideY,
1721                 tuSize,
1722                 tuSize);
1723 		}
1724 	}
1725 
1726 	//**********************************
1727 	// Chroma
1728 	//**********************************
1729 
1730     if (componentMask & PICTURE_BUFFER_DESC_CHROMA_MASK) {
1731         predChromaOffset = (((predSamples->originY + originY + tuChromaOffset) >> subHeightCMinus1) * predSamples->strideCb) +
1732             ((predSamples->originX + originX) >> subWidthCMinus1);
1733         scratchChromaOffset = (((originY + tuChromaOffset) & 63) >> subHeightCMinus1) * (64 >> subWidthCMinus1) +
1734             ((originX & 63) >> subWidthCMinus1);
1735         reconChromaOffset = (((predSamples->originY + originY + tuChromaOffset) >> subHeightCMinus1) * predSamples->strideCb) +
1736             ((predSamples->originX + originX) >> subWidthCMinus1);
1737 
1738 		//**********************************
1739 		// Cb
1740 		//**********************************
1741 		if (cbCbf== EB_TRUE && cuPtr->skipFlag == EB_FALSE) {
1742 			EncodeInvTransform(
1743 				(tuSize==MIN_PU_SIZE)?EB_FALSE:(secondChroma ? (tuPtr->transCoeffShapeChroma2 == ONLY_DC_SHAPE || tuPtr->isOnlyDc2[0]) : (tuPtr->transCoeffShapeChroma == ONLY_DC_SHAPE || tuPtr->isOnlyDc[1])),
1744 				((EB_S16*)residual16bit->bufferCb) + scratchChromaOffset,
1745 				residual16bit->strideCb,
1746 				((EB_S16*)residual16bit->bufferCb) + scratchChromaOffset,
1747 				residual16bit->strideCb,
1748                 tuSize >> shift_bit,
1749 				transformScratchBuffer,
1750 				BIT_INCREMENT_10BIT,
1751 				EB_FALSE);
1752 
1753             AdditionKernel_funcPtrArray16bit[!!(ASM_TYPES & PREAVX2_MASK)](
1754                 (EB_U16*)predSamples->bufferCb + predChromaOffset,
1755 				predSamples->strideCb,
1756 				((EB_S16*)residual16bit->bufferCb) + scratchChromaOffset,
1757 				residual16bit->strideCb,
1758                 (EB_U16*)predSamples->bufferCb + reconChromaOffset,
1759                 predSamples->strideCb,
1760                 tuSize >> shift_bit,
1761                 tuSize >> shift_bit);
1762 		}
1763 
1764 		//**********************************
1765 		// Cr
1766 		//**********************************
1767         predChromaOffset = (((predSamples->originY + originY + tuChromaOffset) >> subHeightCMinus1) * predSamples->strideCr) +
1768             ((predSamples->originX + originX) >> subWidthCMinus1);
1769         scratchChromaOffset = (((originY + tuChromaOffset) & 63) >> subHeightCMinus1) * (64 >> subWidthCMinus1) +
1770             ((originX & 63) >> subWidthCMinus1);
1771         reconChromaOffset = (((predSamples->originY + originY + tuChromaOffset) >> subHeightCMinus1) * predSamples->strideCr) +
1772             ((predSamples->originX + originX) >> subWidthCMinus1);
1773 
1774 		if (crCbf == EB_TRUE && cuPtr->skipFlag == EB_FALSE) {
1775 			EncodeInvTransform(
1776 				(tuSize==MIN_PU_SIZE)?EB_FALSE:(secondChroma ? (tuPtr->transCoeffShapeChroma2 == ONLY_DC_SHAPE || tuPtr->isOnlyDc2[1]) : (tuPtr->transCoeffShapeChroma == ONLY_DC_SHAPE || tuPtr->isOnlyDc[2])),
1777 				((EB_S16*)residual16bit->bufferCr) + scratchChromaOffset,
1778 				residual16bit->strideCr,
1779 				((EB_S16*)residual16bit->bufferCr) + scratchChromaOffset,
1780 				residual16bit->strideCr,
1781 				tuSize >> shift_bit,
1782 				transformScratchBuffer,
1783 				BIT_INCREMENT_10BIT,
1784 				EB_FALSE);
1785 
1786             AdditionKernel_funcPtrArray16bit[!!(ASM_TYPES & PREAVX2_MASK)](
1787                 (EB_U16*)predSamples->bufferCr + predChromaOffset,
1788 				predSamples->strideCr,
1789 				((EB_S16*)residual16bit->bufferCr) + scratchChromaOffset,
1790 				residual16bit->strideCr,
1791                 (EB_U16*)predSamples->bufferCr + reconChromaOffset,
1792                 predSamples->strideCr,
1793 				tuSize >> shift_bit,
1794 				tuSize >> shift_bit);
1795 		}
1796 	}
1797 
1798 	return;
1799 }
1800 
1801 static EB_ENCODE_LOOP_FUNC_PTR   EncodeLoopFunctionTable[2] =
1802 {
1803     EncodeLoop,
1804     EncodeLoop16bit
1805 };
1806 
1807 EB_GENERATE_RECON_FUNC_PTR   EncodeGenerateReconFunctionPtr[2] =
1808 {
1809     EncodeGenerateRecon,
1810     EncodeGenerateRecon16bit
1811 };
1812 
1813 
1814 EB_GENERATE_INTRA_SAMPLES_FUNC_PTR GenerateIntraReferenceSamplesFuncTable[2] =
1815 {
1816     GenerateIntraReferenceSamplesEncodePass,
1817     GenerateIntraReference16bitSamplesEncodePass
1818 };
1819 
1820 EB_GENERATE_LUMA_INTRA_SAMPLES_FUNC_PTR GenerateLumaIntraReferenceSamplesFuncTable[2] =
1821 {
1822     GenerateLumaIntraReferenceSamplesEncodePass,
1823     GenerateLumaIntraReference16bitSamplesEncodePass
1824 };
1825 
1826 EB_GENERATE_CHROMA_INTRA_SAMPLES_FUNC_PTR GenerateChromaIntraReferenceSamplesFuncTable[2] =
1827 {
1828     GenerateChromaIntraReferenceSamplesEncodePass,
1829     GenerateChromaIntraReference16bitSamplesEncodePass
1830 };
1831 
1832 EB_ENC_PASS_INTRA_FUNC_PTR EncodePassIntraPredictionFuncTable[2] =
1833 {
1834     EncodePassIntraPrediction,
1835     EncodePassIntraPrediction16bit
1836 };
1837 
1838 EB_LCU_INTERNAL_DLF_FUNC_PTR LcuInternalAreaDLFCoreFuncTable[2] =
1839 {
1840     LCUInternalAreaDLFCore,
1841     LCUInternalAreaDLFCore16bit
1842 };
1843 
1844 EB_LCU_BOUNDARY_DLF_FUNC_PTR LcuBoundaryDLFCoreFuncTable[2] =
1845 {
1846     LCUBoundaryDLFCore,
1847     LCUBoundaryDLFCore16bit
1848 };
1849 
1850 EB_LCU_PIC_EDGE_DLF_FUNC_PTR LcuPicEdgeDLFCoreFuncTable[2] =
1851 {
1852     LCUPictureEdgeDLFCore,
1853     LCUPictureEdgeDLFCore16bit
1854 };
1855 
1856 
1857 
1858 /*************************************************
1859 * Encode Pass Motion Vector Prediction
1860 *************************************************/
EncodePassMvPrediction(SequenceControlSet_t * sequenceControlSetPtr,PictureControlSet_t * pictureControlSetPtr,EB_U32 lcuIndex,EncDecContext_t * contextPtr)1861 static void EncodePassMvPrediction(
1862     SequenceControlSet_t    *sequenceControlSetPtr,
1863     PictureControlSet_t     *pictureControlSetPtr,
1864     EB_U32                   lcuIndex,
1865     EncDecContext_t         *contextPtr)
1866 {
1867     // AMVP Signaled, or we failed to find a Merge MV match
1868     if (contextPtr->cuPtr->predictionUnitArray->mergeFlag == EB_FALSE)
1869     {
1870         EB_U64 mvdBitsIdx0;
1871         EB_U64 mvdBitsIdx1;
1872         EB_S32 xMvdIdx0;
1873         EB_S32 yMvdIdx0;
1874         EB_S32 xMvdIdx1;
1875         EB_S32 yMvdIdx1;
1876 
1877         contextPtr->cuPtr->predictionUnitArray->mergeFlag = EB_FALSE;
1878         contextPtr->cuPtr->skipFlag = EB_FALSE;
1879 
1880         // Generate AMVP List
1881         if (contextPtr->cuPtr->predictionUnitArray->interPredDirectionIndex == UNI_PRED_LIST_0 ||
1882             contextPtr->cuPtr->predictionUnitArray->interPredDirectionIndex == BI_PRED)
1883         {
1884             FillAMVPCandidates(
1885                 pictureControlSetPtr->epMvNeighborArray[contextPtr->encDecTileIndex],
1886                 pictureControlSetPtr->epModeTypeNeighborArray[contextPtr->encDecTileIndex],
1887                 contextPtr->cuOriginX,
1888                 contextPtr->cuOriginY,
1889                 contextPtr->cuStats->size,
1890                 contextPtr->cuStats->size,
1891                 contextPtr->cuStats->size,
1892                 contextPtr->cuStats->depth,
1893                 sequenceControlSetPtr->lcuSize,
1894                 pictureControlSetPtr,
1895                 pictureControlSetPtr->ParentPcsPtr->disableTmvpFlag ? EB_FALSE : EB_TRUE,
1896                 lcuIndex,
1897                 REF_LIST_0,
1898                 contextPtr->xMvAmvpCandidateArrayList0,
1899                 contextPtr->yMvAmvpCandidateArrayList0,
1900                 &contextPtr->amvpCandidateCountRefList0);
1901 
1902             xMvdIdx0 = (contextPtr->cuPtr->predictionUnitArray->mv[REF_LIST_0].x - contextPtr->xMvAmvpCandidateArrayList0[0]);
1903             yMvdIdx0 = (contextPtr->cuPtr->predictionUnitArray->mv[REF_LIST_0].y - contextPtr->yMvAmvpCandidateArrayList0[0]);
1904             EbHevcGetMvdFractionBits(xMvdIdx0, yMvdIdx0, contextPtr->mdRateEstimationPtr, &mvdBitsIdx0);
1905 
1906             if (contextPtr->amvpCandidateCountRefList0 > 1) {
1907                 xMvdIdx1 = (contextPtr->cuPtr->predictionUnitArray->mv[REF_LIST_0].x - contextPtr->xMvAmvpCandidateArrayList0[1]);
1908                 yMvdIdx1 = (contextPtr->cuPtr->predictionUnitArray->mv[REF_LIST_0].y - contextPtr->yMvAmvpCandidateArrayList0[1]);
1909                 EbHevcGetMvdFractionBits(xMvdIdx1, yMvdIdx1, contextPtr->mdRateEstimationPtr, &mvdBitsIdx1);
1910 
1911                 // Assign the AMVP predictor index
1912                 contextPtr->cuPtr->predictionUnitArray->mvd[REF_LIST_0].predIdx = (mvdBitsIdx1 < mvdBitsIdx0);
1913 
1914                 // Assign the MV Predictor
1915                 contextPtr->cuPtr->predictionUnitArray->mvd[REF_LIST_0].mvdX = contextPtr->cuPtr->predictionUnitArray->mvd[REF_LIST_0].predIdx ? xMvdIdx1 : xMvdIdx0;
1916                 contextPtr->cuPtr->predictionUnitArray->mvd[REF_LIST_0].mvdY = contextPtr->cuPtr->predictionUnitArray->mvd[REF_LIST_0].predIdx ? yMvdIdx1 : yMvdIdx0;
1917             }
1918             else {
1919                 contextPtr->cuPtr->predictionUnitArray->mvd[REF_LIST_0].predIdx = 0;
1920 
1921                 // Assign the MV Predictor
1922                 contextPtr->cuPtr->predictionUnitArray->mvd[REF_LIST_0].mvdX = xMvdIdx0;
1923                 contextPtr->cuPtr->predictionUnitArray->mvd[REF_LIST_0].mvdY = yMvdIdx0;
1924             }
1925         }
1926 
1927         // Generate AMVP List
1928         if (contextPtr->cuPtr->predictionUnitArray->interPredDirectionIndex == UNI_PRED_LIST_1 ||
1929             contextPtr->cuPtr->predictionUnitArray->interPredDirectionIndex == BI_PRED)
1930         {
1931             FillAMVPCandidates(
1932                 pictureControlSetPtr->epMvNeighborArray[contextPtr->encDecTileIndex],
1933                 pictureControlSetPtr->epModeTypeNeighborArray[contextPtr->encDecTileIndex],
1934                 contextPtr->cuOriginX,
1935                 contextPtr->cuOriginY,
1936                 contextPtr->cuStats->size,
1937                 contextPtr->cuStats->size,
1938                 contextPtr->cuStats->size,
1939                 contextPtr->cuStats->depth,
1940                 sequenceControlSetPtr->lcuSize,
1941                 pictureControlSetPtr,
1942                 pictureControlSetPtr->ParentPcsPtr->disableTmvpFlag ? EB_FALSE : EB_TRUE,
1943                 lcuIndex,
1944                 REF_LIST_1,
1945                 contextPtr->xMvAmvpCandidateArrayList1,
1946                 contextPtr->yMvAmvpCandidateArrayList1,
1947                 &contextPtr->amvpCandidateCountRefList1);
1948 
1949             // Assign the MV Predictor
1950             xMvdIdx0 = (contextPtr->cuPtr->predictionUnitArray->mv[REF_LIST_1].x - contextPtr->xMvAmvpCandidateArrayList1[0]);
1951             yMvdIdx0 = (contextPtr->cuPtr->predictionUnitArray->mv[REF_LIST_1].y - contextPtr->yMvAmvpCandidateArrayList1[0]);
1952             EbHevcGetMvdFractionBits(xMvdIdx0, yMvdIdx0, contextPtr->mdRateEstimationPtr, &mvdBitsIdx0);
1953 
1954             if (contextPtr->amvpCandidateCountRefList1 > 1) {
1955                 xMvdIdx1 = (contextPtr->cuPtr->predictionUnitArray->mv[REF_LIST_1].x - contextPtr->xMvAmvpCandidateArrayList1[1]);
1956                 yMvdIdx1 = (contextPtr->cuPtr->predictionUnitArray->mv[REF_LIST_1].y - contextPtr->yMvAmvpCandidateArrayList1[1]);
1957                 EbHevcGetMvdFractionBits(xMvdIdx1, yMvdIdx1, contextPtr->mdRateEstimationPtr, &mvdBitsIdx1);
1958 
1959                 // Assign the AMVP predictor index
1960                 contextPtr->cuPtr->predictionUnitArray->mvd[REF_LIST_1].predIdx = (mvdBitsIdx1 < mvdBitsIdx0);
1961                 contextPtr->cuPtr->predictionUnitArray->mvd[REF_LIST_1].mvdX = contextPtr->cuPtr->predictionUnitArray->mvd[REF_LIST_1].predIdx ? xMvdIdx1 : xMvdIdx0;
1962                 contextPtr->cuPtr->predictionUnitArray->mvd[REF_LIST_1].mvdY = contextPtr->cuPtr->predictionUnitArray->mvd[REF_LIST_1].predIdx ? yMvdIdx1 : yMvdIdx0;
1963             }
1964             else {
1965                 contextPtr->cuPtr->predictionUnitArray->mvd[REF_LIST_1].predIdx = 0;
1966                 contextPtr->cuPtr->predictionUnitArray->mvd[REF_LIST_1].mvdX = xMvdIdx0;
1967                 contextPtr->cuPtr->predictionUnitArray->mvd[REF_LIST_1].mvdY = yMvdIdx0;
1968             }
1969 
1970             // Assign the MV Predictor
1971 
1972         }
1973 
1974 
1975     }
1976 
1977     return;
1978 }
1979 
1980 /*******************************************
1981 * Encode Pass - Assign Delta Qp
1982 *******************************************/
EncodePassUpdateQp(PictureControlSet_t * pictureControlSetPtr,EncDecContext_t * contextPtr,EB_BOOL availableCoeff,EB_BOOL isDeltaQpEnable,EB_BOOL * isDeltaQpNotCoded,EB_U32 difCuDeltaQpDepth,EB_U8 * prevCodedQp,EB_U8 * prevQuantGroupCodedQp,EB_U32 tileOriginX,EB_U32 tileOriginY,EB_U32 lcuQp)1983 static void EncodePassUpdateQp(
1984     PictureControlSet_t     *pictureControlSetPtr,
1985     EncDecContext_t         *contextPtr,
1986     EB_BOOL                  availableCoeff,
1987     EB_BOOL                  isDeltaQpEnable,
1988     EB_BOOL                 *isDeltaQpNotCoded,
1989     EB_U32                   difCuDeltaQpDepth,
1990     EB_U8                   *prevCodedQp,
1991     EB_U8                   *prevQuantGroupCodedQp,
1992     EB_U32                   tileOriginX,
1993     EB_U32                   tileOriginY,
1994     EB_U32                   lcuQp
1995     )
1996 {
1997 
1998     EB_U32 refQp;
1999     EB_U8 qp;
2000 
2001     EB_U32  log2MinCuQpDeltaSize = LOG2F_MAX_LCU_SIZE - difCuDeltaQpDepth;
2002     EB_S32  qpTopNeighbor = 0;
2003     EB_S32  qpLeftNeighbor = 0;
2004     EB_BOOL newQuantGroup;
2005     EB_U32  quantGroupX = contextPtr->cuOriginX - (contextPtr->cuOriginX & ((1 << log2MinCuQpDeltaSize) - 1));
2006     EB_U32  quantGroupY = contextPtr->cuOriginY - (contextPtr->cuOriginY & ((1 << log2MinCuQpDeltaSize) - 1));
2007     EB_BOOL sameLcuCheckTop = (((quantGroupY - 1)  >> LOG2F_MAX_LCU_SIZE) == ((quantGroupY)  >> LOG2F_MAX_LCU_SIZE)) ? EB_TRUE : EB_FALSE;
2008     EB_BOOL sameLcuCheckLeft = (((quantGroupX - 1) >> LOG2F_MAX_LCU_SIZE) == ((quantGroupX)  >> LOG2F_MAX_LCU_SIZE)) ? EB_TRUE : EB_FALSE;
2009     // Neighbor Array
2010     EB_U32 qpLeftNeighborIndex = 0;
2011     EB_U32 qpTopNeighborIndex = 0;
2012 
2013     // CU larger than the quantization group
2014     if (Log2f(contextPtr->cuStats->size) >= log2MinCuQpDeltaSize){
2015         *isDeltaQpNotCoded = EB_TRUE;
2016     }
2017 
2018     // At the beginning of a new quantization group
2019     if (((contextPtr->cuOriginX & ((1 << log2MinCuQpDeltaSize) - 1)) == 0) &&
2020         ((contextPtr->cuOriginY & ((1 << log2MinCuQpDeltaSize) - 1)) == 0))
2021     {
2022         *isDeltaQpNotCoded = EB_TRUE;
2023         newQuantGroup = EB_TRUE;
2024     }
2025     else {
2026         newQuantGroup = EB_FALSE;
2027     }
2028 
2029     // setting the previous Quantization Group QP
2030     if (newQuantGroup == EB_TRUE) {
2031         *prevCodedQp = *prevQuantGroupCodedQp;
2032     }
2033 
2034     if ((quantGroupY > tileOriginY) && sameLcuCheckTop) {
2035         qpTopNeighborIndex =
2036             LUMA_SAMPLE_PIC_WISE_LOCATION_TO_QP_ARRAY_IDX(
2037             quantGroupX,
2038             quantGroupY - 1,
2039             pictureControlSetPtr->qpArrayStride);
2040         qpTopNeighbor = pictureControlSetPtr->qpArray[qpTopNeighborIndex];
2041     }
2042     else {
2043         qpTopNeighbor = *prevCodedQp;
2044     }
2045 
2046     if ((quantGroupX > tileOriginX) && sameLcuCheckLeft) {
2047         qpLeftNeighborIndex =
2048             LUMA_SAMPLE_PIC_WISE_LOCATION_TO_QP_ARRAY_IDX(
2049             quantGroupX - 1,
2050             quantGroupY,
2051             pictureControlSetPtr->qpArrayStride);
2052 
2053         qpLeftNeighbor = pictureControlSetPtr->qpArray[qpLeftNeighborIndex];
2054     }
2055     else {
2056         qpLeftNeighbor = *prevCodedQp;
2057     }
2058 
2059     refQp = (qpLeftNeighbor + qpTopNeighbor + 1) >> 1;
2060 
2061     qp = (EB_U8)contextPtr->cuPtr->qp;
2062     // Update the State info
2063     if (isDeltaQpEnable) {
2064         if (*isDeltaQpNotCoded) {
2065             if (availableCoeff){
2066                 qp = (EB_U8)contextPtr->cuPtr->qp;
2067                 *prevCodedQp = qp;
2068                 *prevQuantGroupCodedQp = qp;
2069                 *isDeltaQpNotCoded = EB_FALSE;
2070             }
2071             else{
2072                 qp = (EB_U8)refQp;
2073                 *prevQuantGroupCodedQp = qp;
2074             }
2075         }
2076     }
2077     else{
2078         qp = (EB_U8)lcuQp;
2079     }
2080     contextPtr->cuPtr->qp = qp;
2081     return;
2082 }
2083 
/*
 * SetPmEncDecMode
 *
 * Selects, for the CU currently held in contextPtr, the two EncDec settings
 * contextPtr->pmpMaskingLevelEncDec and contextPtr->cleanSparseCeoffPfEncDec.
 * Both are reset to 0 first and are only raised when bitRateReduction is
 * enabled and the CU/LCU is not flagged as sensitive.
 *
 * pictureControlSetPtr          picture-level state (slice type, temporal
 *                               layer, parent PCS statistics, references)
 * contextPtr                    EncDec context; cuPtr and cuStats are read,
 *                               the two output fields above are written
 * lcuIndex                      index used for every per-LCU array lookup
 * stationaryEdgeOverTimeFlag    non-zero disables the adjustment entirely
 * pmStationaryEdgeOverTimeFlag  non-zero selects level 2 on the 4K path
 *
 * When segment overrides are enabled and EB_TU_FILTER_OV is set for this LCU,
 * filterOv is added to the chosen level and the sum is clamped to [0, 7].
 */
void SetPmEncDecMode(
    PictureControlSet_t     *pictureControlSetPtr,
    EncDecContext_t         *contextPtr,
    EB_U32                   lcuIndex,
    EB_U8                    stationaryEdgeOverTimeFlag,
    EB_U8                    pmStationaryEdgeOverTimeFlag)
{
    SequenceControlSet_t *sequenceControlSetPtr = (SequenceControlSet_t*)pictureControlSetPtr->ParentPcsPtr->sequenceControlSetWrapperPtr->objectPtr;

    EB_BOOL pmSensitiveUncoveredBackground = EB_FALSE;
    EB_BOOL pmSensitiveComplexArea         = EB_FALSE;
    EB_BOOL pmSensitiveSkinArea            = EB_FALSE;
    EB_BOOL pmSensitiveCmplxContrastArea   = EB_FALSE;

    contextPtr->cleanSparseCeoffPfEncDec = 0;
    contextPtr->pmpMaskingLevelEncDec    = 0;

    // Derived for REF P & B pictures only and kept false otherwise
    // (for a temporal distance of 1, uncovered areas are easier to handle).
    if (pictureControlSetPtr->sliceType != EB_I_PICTURE &&
        pictureControlSetPtr->ParentPcsPtr->isUsedAsReferenceFlag) {

        EbReferenceObject_t *refObjL0 = (EbReferenceObject_t*)pictureControlSetPtr->refPicPtrArray[REF_LIST_0]->objectPtr;

        pmSensitiveUncoveredBackground = ((pictureControlSetPtr->ParentPcsPtr->failingMotionLcuFlag[lcuIndex] || contextPtr->cuPtr->predictionModeFlag == INTRA_MODE) && (pictureControlSetPtr->ParentPcsPtr->nonMovingIndexArray[lcuIndex] < PM_NON_MOVING_INDEX_TH || refObjL0->nonMovingIndexArray[lcuIndex] < PM_NON_MOVING_INDEX_TH));
    }

    // Derived for all frames
    pmSensitiveComplexArea = pictureControlSetPtr->highIntraSlection && pictureControlSetPtr->ParentPcsPtr->complexLcuArray[lcuIndex] == LCU_COMPLEXITY_STATUS_1;

    if (pictureControlSetPtr->sceneCaracteristicId == EB_FRAME_CARAC_1 &&
        pictureControlSetPtr->ParentPcsPtr->lcuStatArray[lcuIndex].cuStatArray[0].skinArea) {
        pmSensitiveSkinArea = EB_TRUE;
    }

    if (pictureControlSetPtr->sceneCaracteristicId == EB_FRAME_CARAC_2 &&
        pictureControlSetPtr->ParentPcsPtr->lcuCmplxContrastArray[lcuIndex]) {
        pmSensitiveCmplxContrastArea = EB_TRUE;
    }

    // stationaryEdgeOverTimeFlag is tested here once; it is therefore known
    // to be zero everywhere inside this block and is not re-tested below.
    if (sequenceControlSetPtr->staticConfig.bitRateReduction == EB_TRUE &&
        !contextPtr->forceCbfFlag &&
        !(pictureControlSetPtr->sliceType == EB_I_PICTURE && contextPtr->cuStats->size == 8) &&
        !stationaryEdgeOverTimeFlag &&
        !pmSensitiveSkinArea &&
        !pmSensitiveCmplxContrastArea) {

        if (sequenceControlSetPtr->inputResolution == INPUT_SIZE_4K_RANGE) {

            if (pictureControlSetPtr->sliceType == EB_I_PICTURE) {
                if (!pictureControlSetPtr->ParentPcsPtr->logoPicFlag) {
                    contextPtr->pmpMaskingLevelEncDec = 1;
                }
            }
            else if (pictureControlSetPtr->ParentPcsPtr->logoPicFlag || pmSensitiveUncoveredBackground || pmSensitiveComplexArea) {
                contextPtr->pmpMaskingLevelEncDec = 1;
            }
            else if (pmStationaryEdgeOverTimeFlag) {
                contextPtr->pmpMaskingLevelEncDec = 2;
            }
            else if (pictureControlSetPtr->temporalLayerIndex == 0) {
                contextPtr->pmpMaskingLevelEncDec = (contextPtr->cuPtr->predictionModeFlag == INTRA_MODE) ? 2 : 3;
            }
            else {
                contextPtr->cleanSparseCeoffPfEncDec = 1;

                if (pictureControlSetPtr->ParentPcsPtr->highDarkLowLightAreaDensityFlag && pictureControlSetPtr->ParentPcsPtr->sharpEdgeLcuFlag[lcuIndex] && !pictureControlSetPtr->ParentPcsPtr->similarColocatedLcuArrayAllLayers[lcuIndex]) {
                    contextPtr->pmpMaskingLevelEncDec = 2;
                }
                else if (pictureControlSetPtr->temporalLayerIndex == 3) {
                    contextPtr->pmpMaskingLevelEncDec = (contextPtr->cuPtr->predictionModeFlag == INTRA_MODE) ? 6 : 7;
                }
                else {
                    contextPtr->pmpMaskingLevelEncDec = (contextPtr->cuPtr->predictionModeFlag == INTRA_MODE) ? 4 : 5;
                }
            }

            // 32x32 intra CUs with a luma mode other than DC/planar get level 0.
            if (contextPtr->cuPtr->predictionModeFlag == INTRA_MODE && contextPtr->cuStats->size == 32 && contextPtr->cuPtr->predictionUnitArray->intraLumaMode != EB_INTRA_DC && contextPtr->cuPtr->predictionUnitArray->intraLumaMode != EB_INTRA_PLANAR) {
                contextPtr->pmpMaskingLevelEncDec = 0;
            }

            // On the 4K path P pictures always end up at level 1,
            // overriding everything selected above.
            if (pictureControlSetPtr->sliceType == EB_P_PICTURE) {
                contextPtr->pmpMaskingLevelEncDec = 1;
            }
        }
        else if (!pictureControlSetPtr->ParentPcsPtr->logoPicFlag) {

            if (pictureControlSetPtr->temporalLayerIndex > 0 && !pmSensitiveUncoveredBackground && !pmSensitiveComplexArea) {
                contextPtr->cleanSparseCeoffPfEncDec = 1;
            }

            if (pictureControlSetPtr->sliceType == EB_I_PICTURE) {
                contextPtr->pmpMaskingLevelEncDec = 0;
            }
            else if (pmSensitiveUncoveredBackground || pmSensitiveComplexArea) {
                contextPtr->pmpMaskingLevelEncDec = 1;
            }
            else {
                if (pictureControlSetPtr->temporalLayerIndex == 0) {
                    contextPtr->pmpMaskingLevelEncDec = (contextPtr->cuPtr->predictionModeFlag == INTRA_MODE) ? 2 : 3;
                }
                else if (pictureControlSetPtr->temporalLayerIndex == 3) {
                    contextPtr->pmpMaskingLevelEncDec = (contextPtr->cuPtr->predictionModeFlag == INTRA_MODE) ? 6 : 7;
                }
                else {
                    contextPtr->pmpMaskingLevelEncDec = (contextPtr->cuPtr->predictionModeFlag == INTRA_MODE) ? 4 : 5;
                }

                // Same two overrides as on the 4K path, but here they only
                // apply to non-sensitive areas.
                if (contextPtr->cuPtr->predictionModeFlag == INTRA_MODE && contextPtr->cuStats->size == 32 && contextPtr->cuPtr->predictionUnitArray->intraLumaMode != EB_INTRA_DC && contextPtr->cuPtr->predictionUnitArray->intraLumaMode != EB_INTRA_PLANAR) {
                    contextPtr->pmpMaskingLevelEncDec = 0;
                }

                if (pictureControlSetPtr->sliceType == EB_P_PICTURE) {
                    contextPtr->pmpMaskingLevelEncDec = 1;
                }
            }
        }
    }

    // Optional per-LCU segment override of the selected level.
    if (pictureControlSetPtr->ParentPcsPtr->segmentOvArray != NULL && sequenceControlSetPtr->staticConfig.segmentOvEnabled) {
        SegmentOverride_t *segmentOvPtr = pictureControlSetPtr->ParentPcsPtr->segmentOvArray;
        if (segmentOvPtr[lcuIndex].ovFlags & EB_TU_FILTER_OV) {
            contextPtr->pmpMaskingLevelEncDec = CLIP3(0, 7, contextPtr->pmpMaskingLevelEncDec + segmentOvPtr[lcuIndex].filterOv);
        }
    }
}
2283 
2284 
/*
 * Pack2DBlock
 *
 * Feeds one block of the input picture, plane by plane, to Pack2D_SRC with
 * contextPtr->inputSample16bitBuffer as destination.
 *
 * contextPtr      EncDec context owning the 16-bit destination buffer
 * inputPicture    source picture; both its sample planes (bufferY/Cb/Cr) and
 *                 its bit-increment planes (bufferBitIncY/Cb/Cr) are read
 * originX/originY block position in luma samples, relative to the picture
 *                 origin (inputPicture->originX/originY is added here)
 * width/height    block size in luma samples
 *
 * Chroma positions and sizes are the luma ones shifted right by one, i.e.
 * the code assumes chroma is subsampled by two in both directions.
 * The destination position is the block position modulo 64.
 * NOTE(review): Pack2D_SRC presumably merges the sample and bit-increment
 * planes into 16-bit samples - confirm against its definition.
 */
void Pack2DBlock(
    EncDecContext_t        *contextPtr,
    EbPictureBufferDesc_t  *inputPicture,
    EB_U32                  originX,
    EB_U32                  originY,
    EB_U32                  width,
    EB_U32                  height) {

    // Absolute position of the block inside the (padded) input picture.
    const EB_U32 srcX = originX + inputPicture->originX;
    const EB_U32 srcY = originY + inputPicture->originY;

    const EB_U32 inputLumaOffset       = (srcY * inputPicture->strideY) + srcX;
    const EB_U32 inputBitIncLumaOffset = (srcY * inputPicture->strideBitIncY) + srcX;
    const EB_U32 inputCbOffset         = ((srcY >> 1) * inputPicture->strideCb) + (srcX >> 1);
    const EB_U32 inputBitIncCbOffset   = ((srcY >> 1) * inputPicture->strideBitIncCb) + (srcX >> 1);
    const EB_U32 inputCrOffset         = ((srcY >> 1) * inputPicture->strideCr) + (srcX >> 1);
    const EB_U32 inputBitIncCrOffset   = ((srcY >> 1) * inputPicture->strideBitIncCr) + (srcX >> 1);

    // Position of the block inside the 16-bit destination buffer.
    const EB_U32 blockLumaOffset = ((originY % 64) * contextPtr->inputSample16bitBuffer->strideY) + (originX % 64);
    const EB_U32 blockCbOffset   = (((originY % 64) >> 1) * contextPtr->inputSample16bitBuffer->strideCb) + ((originX % 64) >> 1);
    const EB_U32 blockCrOffset   = (((originY % 64) >> 1) * contextPtr->inputSample16bitBuffer->strideCr) + ((originX % 64) >> 1);

    // Luma
    Pack2D_SRC(
        inputPicture->bufferY + inputLumaOffset,
        inputPicture->strideY,
        inputPicture->bufferBitIncY + inputBitIncLumaOffset,
        inputPicture->strideBitIncY,
        ((EB_U16 *)(contextPtr->inputSample16bitBuffer->bufferY)) + blockLumaOffset,
        contextPtr->inputSample16bitBuffer->strideY,
        width,
        height);  //this should be depending on a configuration param

    // Cb: walk the plane with the Cb strides, the same ones the Cb offsets
    // above were computed with (the Cr strides were passed here before).
    Pack2D_SRC(
        inputPicture->bufferCb + inputCbOffset,
        inputPicture->strideCb,
        inputPicture->bufferBitIncCb + inputBitIncCbOffset,
        inputPicture->strideBitIncCb,
        ((EB_U16 *)(contextPtr->inputSample16bitBuffer->bufferCb)) + blockCbOffset,
        contextPtr->inputSample16bitBuffer->strideCb,
        width >> 1,
        height >> 1);  //this should be depending on a configuration param

    // Cr
    Pack2D_SRC(
        inputPicture->bufferCr + inputCrOffset,
        inputPicture->strideCr,
        inputPicture->bufferBitIncCr + inputBitIncCrOffset,
        inputPicture->strideBitIncCr,
        ((EB_U16 *)(contextPtr->inputSample16bitBuffer->bufferCr)) + blockCrOffset,
        contextPtr->inputSample16bitBuffer->strideCr,
        width >> 1,
        height >> 1); //this should be depending on a configuration param
}
2339 
QpmDeriveBeaAndSkipQpmFlagLcu(SequenceControlSet_t * sequenceControlSetPtr,PictureControlSet_t * pictureControlSetPtr,LargestCodingUnit_t * lcuPtr,EB_U32 lcuIndex,EncDecContext_t * contextPtr)2340 EB_ERRORTYPE QpmDeriveBeaAndSkipQpmFlagLcu(
2341     SequenceControlSet_t                   *sequenceControlSetPtr,
2342     PictureControlSet_t                    *pictureControlSetPtr,
2343     LargestCodingUnit_t                    *lcuPtr,
2344     EB_U32                                 lcuIndex,
2345     EncDecContext_t                        *contextPtr)
2346 {
2347 
2348     EB_ERRORTYPE                    return_error = EB_ErrorNone;
2349     EB_S8                           pictureQp = pictureControlSetPtr->pictureQp;
2350     EB_U8                           minQpAllowed = (EB_U8)sequenceControlSetPtr->staticConfig.minQpAllowed;
2351     EB_U8                           maxQpAllowed = (EB_U8)sequenceControlSetPtr->staticConfig.maxQpAllowed;
2352 
2353 
2354     if (sequenceControlSetPtr->staticConfig.segmentOvEnabled && pictureControlSetPtr->ParentPcsPtr->segmentOvArray != NULL) {
2355         SegmentOverride_t *segmentOvPtr = pictureControlSetPtr->ParentPcsPtr->segmentOvArray;
2356         if (segmentOvPtr[lcuIndex].ovFlags & EB_QP_OV_DIRECT)
2357             pictureQp = segmentOvPtr[lcuIndex].qpOv;
2358         else if (segmentOvPtr[lcuIndex].ovFlags & EB_QP_OV_DELTA)
2359             pictureQp += segmentOvPtr[lcuIndex].qpOv;
2360     }
2361     contextPtr->qpmQp = CLIP3(minQpAllowed, maxQpAllowed, pictureQp);
2362 
2363     LcuStat_t *lcuStatPtr = &(pictureControlSetPtr->ParentPcsPtr->lcuStatArray[lcuIndex]);
2364 
2365 
2366     contextPtr->nonMovingDeltaQp = 0;
2367 
2368     contextPtr->grassEnhancementFlag = ((pictureControlSetPtr->sceneCaracteristicId == EB_FRAME_CARAC_1) && (lcuStatPtr->cuStatArray[0].grassArea)
2369         && (lcuPtr->pictureControlSetPtr->ParentPcsPtr->edgeResultsPtr[lcuIndex].edgeBlockNum > 0))
2370         ? EB_TRUE : EB_FALSE;
2371 
2372 	contextPtr->backgorundEnhancement = EB_FALSE;
2373 
2374 
2375 	contextPtr->backgorundEnhancement = EB_FALSE;
2376 
2377 	contextPtr->skipQpmFlag = (sequenceControlSetPtr->staticConfig.improveSharpness || sequenceControlSetPtr->staticConfig.bitRateReduction) ? EB_FALSE : EB_TRUE;
2378 
2379 	if ((pictureControlSetPtr->ParentPcsPtr->logoPicFlag == EB_FALSE) && ((pictureControlSetPtr->ParentPcsPtr->picNoiseClass >= PIC_NOISE_CLASS_3_1) || (pictureControlSetPtr->ParentPcsPtr->highDarkLowLightAreaDensityFlag) || (pictureControlSetPtr->ParentPcsPtr->intraCodedBlockProbability > 90))){
2380 		contextPtr->skipQpmFlag = EB_TRUE;
2381 	}
2382 
2383     if (contextPtr->skipQpmFlag == EB_FALSE) {
2384         if (pictureControlSetPtr->ParentPcsPtr->picHomogenousOverTimeLcuPercentage > 30 && pictureControlSetPtr->sliceType != EB_I_PICTURE){
2385 			contextPtr->qpmQp = CLIP3(minQpAllowed, maxQpAllowed, pictureQp + 1);
2386         }
2387     }
2388 
2389 	return return_error;
2390 }
2391 
EncQpmDeriveDeltaQPForEachLeafLcu(SequenceControlSet_t * sequenceControlSetPtr,PictureControlSet_t * pictureControlSetPtr,LargestCodingUnit_t * lcuPtr,EB_U32 lcuIndex,CodingUnit_t * cuPtr,EB_U32 cuDepth,EB_U32 cuIndex,EB_U32 cuSize,EB_U8 type,EB_U8 parent32x32Index,EncDecContext_t * contextPtr)2392 EB_ERRORTYPE EncQpmDeriveDeltaQPForEachLeafLcu(
2393 	SequenceControlSet_t                   *sequenceControlSetPtr,
2394 	PictureControlSet_t                    *pictureControlSetPtr,
2395 	LargestCodingUnit_t                    *lcuPtr,
2396 	EB_U32                                  lcuIndex,
2397 	CodingUnit_t                           *cuPtr,
2398 	EB_U32                                  cuDepth,
2399 	EB_U32                                  cuIndex,
2400 	EB_U32                                  cuSize,
2401 	EB_U8                                   type,
2402 	EB_U8                                   parent32x32Index,
2403 	EncDecContext_t                        *contextPtr)
2404 {
2405 	EB_ERRORTYPE                    return_error = EB_ErrorNone;
2406 
2407 
2408 	//LcuParams_t				        lcuParams;
2409 	EB_S64                          complexityDistance;
2410 	EB_S8                           deltaQp = 0;
2411 	EB_U8                           qpmQp = contextPtr->qpmQp;
2412 	EB_U8                           minQpAllowed = (EB_U8)sequenceControlSetPtr->staticConfig.minQpAllowed;
2413 	EB_U8                           maxQpAllowed = (EB_U8)sequenceControlSetPtr->staticConfig.maxQpAllowed;
2414 	EB_S16                          cuQP;
2415 
2416     EB_BOOL  skipOis8x8  = (pictureControlSetPtr->ParentPcsPtr->skipOis8x8 && cuSize == 8);
2417 
2418 	EB_U32 usedDepth = cuDepth;
2419 	if (skipOis8x8)
2420 		usedDepth = 2;
2421 
2422 	EB_U32 cuIndexInRaterScan = MD_SCAN_TO_RASTER_SCAN[cuIndex];
2423 
2424 	EB_BOOL acEnergyBasedAntiContouring = pictureControlSetPtr->sliceType == EB_I_PICTURE ? EB_TRUE : EB_FALSE;
2425 	EB_U8   lowerQPClass;
2426 
2427 	EB_S8	nonMovingDeltaQp = contextPtr->nonMovingDeltaQp;
2428 
2429 	EB_S8	bea64x64DeltaQp;
2430 
2431 	cuQP = qpmQp;
2432 	cuPtr->qp = qpmQp;
2433 
2434 	EB_U32  distortion = 0;
2435 
2436 	if (!contextPtr->skipQpmFlag){
2437 
2438 		// INTRA MODE
2439 		if (type == INTRA_MODE){
2440 
2441 				OisCu32Cu16Results_t  *oisCu32Cu16ResultsPtr = pictureControlSetPtr->ParentPcsPtr->oisCu32Cu16Results[lcuIndex];
2442 				OisCu8Results_t   	  *oisCu8ResultsPtr      = pictureControlSetPtr->ParentPcsPtr->oisCu8Results[lcuIndex];
2443 
2444 				if (cuSize > 32){
2445 					distortion =
2446 						oisCu32Cu16ResultsPtr->sortedOisCandidate[1][0].distortion +
2447 						oisCu32Cu16ResultsPtr->sortedOisCandidate[2][0].distortion +
2448 						oisCu32Cu16ResultsPtr->sortedOisCandidate[3][0].distortion +
2449 						oisCu32Cu16ResultsPtr->sortedOisCandidate[4][0].distortion;
2450 				}
2451 				else if (cuSize == 32) {
2452 					const EB_U32 me2Nx2NTableOffset = contextPtr->cuStats->cuNumInDepth + me2Nx2NOffset[contextPtr->cuStats->depth];
2453 					distortion = oisCu32Cu16ResultsPtr->sortedOisCandidate[me2Nx2NTableOffset][0].distortion;
2454 				}
2455 				else{
2456 					if (cuSize > 8){
2457 						const EB_U32 me2Nx2NTableOffset = contextPtr->cuStats->cuNumInDepth + me2Nx2NOffset[contextPtr->cuStats->depth];
2458 						distortion = oisCu32Cu16ResultsPtr->sortedOisCandidate[me2Nx2NTableOffset][0].distortion;
2459 					}
2460 					else{
2461 						if (skipOis8x8){
2462 
2463 							const CodedUnitStats_t  *cuStats = GetCodedUnitStats(ParentBlockIndex[cuIndex]);
2464 							const EB_U32 me2Nx2NTableOffset = cuStats->cuNumInDepth + me2Nx2NOffset[cuStats->depth];
2465 
2466 							distortion = oisCu32Cu16ResultsPtr->sortedOisCandidate[me2Nx2NTableOffset][0].distortion;
2467 						}
2468 						else {
2469 
2470 							const EB_U32 me2Nx2NTableOffset = contextPtr->cuStats->cuNumInDepth;
2471 
2472 							if (oisCu8ResultsPtr->sortedOisCandidate[me2Nx2NTableOffset][0].validDistortion){
2473 								distortion = oisCu8ResultsPtr->sortedOisCandidate[me2Nx2NTableOffset][0].distortion;
2474 							}
2475 							else{
2476 
2477 								const CodedUnitStats_t  *cuStats = GetCodedUnitStats(ParentBlockIndex[cuIndex]);
2478 								const EB_U32 me2Nx2NTableOffset = cuStats->cuNumInDepth + me2Nx2NOffset[cuStats->depth];
2479 
2480 								if (oisCu32Cu16ResultsPtr->sortedOisCandidate[me2Nx2NTableOffset][0].validDistortion){
2481 									distortion = oisCu32Cu16ResultsPtr->sortedOisCandidate[me2Nx2NTableOffset][0].distortion;
2482 								}
2483 								else {
2484 									distortion = 0;
2485 								}
2486 							}
2487 
2488 						}
2489 					}
2490 				}
2491 
2492 
2493 				distortion = (EB_U32)CLIP3(pictureControlSetPtr->ParentPcsPtr->intraComplexityMin[usedDepth], pictureControlSetPtr->ParentPcsPtr->intraComplexityMax[usedDepth], distortion);
2494 				complexityDistance = ((EB_S32)distortion - (EB_S32)pictureControlSetPtr->ParentPcsPtr->intraComplexityAvg[usedDepth]);
2495 
2496 				if (complexityDistance < 0){
2497 
2498 					deltaQp = (pictureControlSetPtr->ParentPcsPtr->intraMinDistance[usedDepth] != 0) ? (EB_S8)((contextPtr->minDeltaQpWeight * contextPtr->minDeltaQp[usedDepth] * complexityDistance) / (100 * pictureControlSetPtr->ParentPcsPtr->intraMinDistance[usedDepth])) : 0;
2499 				}
2500 				else{
2501 
2502 					deltaQp = (pictureControlSetPtr->ParentPcsPtr->intraMaxDistance[usedDepth] != 0) ? (EB_S8)((contextPtr->maxDeltaQpWeight * contextPtr->maxDeltaQp[usedDepth] * complexityDistance) / (100 * pictureControlSetPtr->ParentPcsPtr->intraMaxDistance[usedDepth])) : 0;
2503 				}
2504 				// QPM action
2505 				if (lcuPtr->pictureControlSetPtr->sceneCaracteristicId == EB_FRAME_CARAC_2) {
2506 					if (lcuPtr->pictureControlSetPtr->ParentPcsPtr->lcuCmplxContrastArray[lcuIndex] && deltaQp > 0) {
2507 						deltaQp = 0;
2508 					}
2509 				}
2510 			}
2511 			// INTER MODE
2512 			else{
2513 				distortion = pictureControlSetPtr->ParentPcsPtr->meResults[lcuIndex][cuIndexInRaterScan].distortionDirection[0].distortion;
2514 				if (skipOis8x8){
2515 					EB_U32 cuIndexRScan = MD_SCAN_TO_RASTER_SCAN[ParentBlockIndex[cuIndex]];
2516 
2517 					distortion = pictureControlSetPtr->ParentPcsPtr->meResults[lcuIndex][cuIndexRScan].distortionDirection[0].distortion;
2518 
2519 				}
2520 				distortion = (EB_U32)CLIP3(pictureControlSetPtr->ParentPcsPtr->interComplexityMin[usedDepth], pictureControlSetPtr->ParentPcsPtr->interComplexityMax[usedDepth], distortion);
2521 				complexityDistance = ((EB_S32)distortion - (EB_S32)pictureControlSetPtr->ParentPcsPtr->interComplexityAvg[usedDepth]);
2522 
2523 				if (complexityDistance < 0){
2524 
2525 					deltaQp = (pictureControlSetPtr->ParentPcsPtr->interMinDistance[usedDepth] != 0) ? (EB_S8)((contextPtr->minDeltaQpWeight * contextPtr->minDeltaQp[usedDepth] * complexityDistance) / (100 * pictureControlSetPtr->ParentPcsPtr->interMinDistance[usedDepth])) : 0;
2526 				}
2527 				else{
2528 
2529 					deltaQp = (pictureControlSetPtr->ParentPcsPtr->interMaxDistance[usedDepth] != 0) ? (EB_S8)((contextPtr->maxDeltaQpWeight * contextPtr->maxDeltaQp[usedDepth] * complexityDistance) / (100 * pictureControlSetPtr->ParentPcsPtr->interMaxDistance[usedDepth])) : 0;
2530 				}
2531 			}
2532 
2533 		if (contextPtr->backgorundEnhancement){
2534 			// Use the 8x8 background enhancement only for the Intra slice, otherwise, use the existing LCU based BEA results
2535 			bea64x64DeltaQp = nonMovingDeltaQp;
2536 
2537 			if (((cuIndex > 0) && ((pictureControlSetPtr->ParentPcsPtr->yMean[lcuIndex][parent32x32Index]) > ANTI_CONTOURING_LUMA_T2 || (pictureControlSetPtr->ParentPcsPtr->yMean[lcuIndex][parent32x32Index]) < ANTI_CONTOURING_LUMA_T1)) ||
2538 				((cuIndex == 0) && ((pictureControlSetPtr->ParentPcsPtr->yMean[lcuIndex][0]) > ANTI_CONTOURING_LUMA_T2 || (pictureControlSetPtr->ParentPcsPtr->yMean[lcuIndex][0]) < ANTI_CONTOURING_LUMA_T1))) {
2539 
2540 				if (bea64x64DeltaQp < 0){
2541 					bea64x64DeltaQp = 0;
2542 				}
2543 
2544 			}
2545 
2546             deltaQp += bea64x64DeltaQp;
2547 		}
2548 
2549         if ((pictureControlSetPtr->ParentPcsPtr->logoPicFlag)){
2550 			deltaQp = (deltaQp < contextPtr->minDeltaQp[0]) ? deltaQp : contextPtr->minDeltaQp[0];
2551 		}
2552 
2553         LcuStat_t *lcuStatPtr = &(pictureControlSetPtr->ParentPcsPtr->lcuStatArray[lcuIndex]);
2554         if (lcuStatPtr->stationaryEdgeOverTimeFlag && deltaQp > 0){
2555             deltaQp = 0;
2556         }
2557 		// QPM action
2558         if (lcuPtr->pictureControlSetPtr->sceneCaracteristicId == EB_FRAME_CARAC_2) {
2559             if (lcuPtr->pictureControlSetPtr->ParentPcsPtr->lcuCmplxContrastArray[lcuIndex] && deltaQp > 0) {
2560                 deltaQp = 0;
2561             }
2562         }
2563 
2564         if (acEnergyBasedAntiContouring) {
2565 
2566             lowerQPClass = DeriveContouringClass(
2567                 lcuPtr->pictureControlSetPtr->ParentPcsPtr,
2568                 lcuPtr->index,
2569                 (EB_U8) cuIndex);
2570 
2571 		    if (lowerQPClass){
2572                 if (lowerQPClass == 3)
2573                     deltaQp = ANTI_CONTOURING_DELTA_QP_0;
2574                 else if (lowerQPClass == 2)
2575                     deltaQp = ANTI_CONTOURING_DELTA_QP_1;
2576                  else if (lowerQPClass == 1)
2577                      deltaQp = ANTI_CONTOURING_DELTA_QP_2;
2578 		    }
2579         }
2580 
2581 
2582         deltaQp -= contextPtr->grassEnhancementFlag ? 3 : 0;
2583 		if (sequenceControlSetPtr->inputResolution == INPUT_SIZE_4K_RANGE)
2584 		deltaQp = ((deltaQp < 0 && sequenceControlSetPtr->staticConfig.bitRateReduction && !sequenceControlSetPtr->staticConfig.improveSharpness) ||
2585 			(deltaQp > 0 && sequenceControlSetPtr->staticConfig.improveSharpness && !sequenceControlSetPtr->staticConfig.bitRateReduction)) ? 0 : deltaQp;
2586 		else
2587 			deltaQp = (deltaQp > 0 && sequenceControlSetPtr->staticConfig.improveSharpness) ? 0 : deltaQp;
2588 		if (sequenceControlSetPtr->staticConfig.rateControlMode == 1 || sequenceControlSetPtr->staticConfig.rateControlMode == 2){
2589 
2590 			if (qpmQp > RC_QPMOD_MAXQP){
2591 				deltaQp = MIN(0, deltaQp);
2592 			}
2593 
2594 			cuQP = (qpmQp + deltaQp);
2595 
2596 
2597 			if ((qpmQp <= RC_QPMOD_MAXQP)){
2598 				cuQP = (EB_U8)CLIP3(
2599 					minQpAllowed,
2600 					RC_QPMOD_MAXQP,
2601 					cuQP);
2602 			}
2603 		}
2604 		else{
2605             cuQP = (qpmQp + deltaQp);
2606 		}
2607 
2608 		cuQP = (EB_U8)CLIP3(
2609 			minQpAllowed,
2610 			maxQpAllowed,
2611 			cuQP);
2612 
2613 
2614 	}
2615 
2616     cuPtr->qp = cuQP ;
2617 
2618 	lcuPtr->qp = (cuSize == 64) ? (EB_U8)cuPtr->qp : lcuPtr->qp;
2619 
2620 
2621 	cuPtr->deltaQp = (EB_S16)cuPtr->qp - (EB_S16)qpmQp;
2622 
2623 	cuPtr->orgDeltaQp = cuPtr->deltaQp;
2624 
2625 
2626 
2627 	return return_error;
2628 }
2629 
2630 
2631 /************************************
2632 this function checks whether any intra
2633 CU is present in the current LCU
2634 *************************************/
isIntraPresent(LargestCodingUnit_t * lcuPtr)2635 EB_BOOL isIntraPresent(
2636 	LargestCodingUnit_t                 *lcuPtr)
2637 {
2638 	EB_U8 leafIndex = 0;
2639 	while (leafIndex < CU_MAX_COUNT) {
2640 
2641 		CodingUnit_t * const cuPtr = lcuPtr->codedLeafArrayPtr[leafIndex];
2642 
2643 		if (cuPtr->splitFlag == EB_FALSE) {
2644 
2645 			const CodedUnitStats_t *cuStatsPtr = GetCodedUnitStats(leafIndex);
2646 			if (cuPtr->predictionModeFlag == INTRA_MODE)
2647 				return EB_TRUE;
2648 
2649 
2650 			leafIndex += DepthOffset[cuStatsPtr->depth];
2651 		}
2652 		else {
2653 			leafIndex++;
2654 		}
2655 	}
2656 
2657 	return EB_FALSE;
2658 
2659 }
2660 
2661 
EncodePassPreFetchRef(PictureControlSet_t * pictureControlSetPtr,EncDecContext_t * contextPtr,CodingUnit_t * cuPtr,const CodedUnitStats_t * cuStats,PredictionUnit_t * puPtr,EB_BOOL is16bit)2662 void EncodePassPreFetchRef(
2663     PictureControlSet_t     *pictureControlSetPtr,
2664     EncDecContext_t         *contextPtr,
2665     CodingUnit_t            *cuPtr,
2666     const CodedUnitStats_t  *cuStats,
2667     PredictionUnit_t        *puPtr,
2668     EB_BOOL                  is16bit)
2669 {
2670 
2671     if (cuPtr->predictionModeFlag == INTER_MODE){
2672 
2673         if (is16bit)
2674         {
2675             puPtr = cuPtr->predictionUnitArray;
2676             contextPtr->mvUnit.predDirection = (EB_U8)puPtr->interPredDirectionIndex;
2677             contextPtr->mvUnit.mv[REF_LIST_0].mvUnion = puPtr->mv[REF_LIST_0].mvUnion;
2678             contextPtr->mvUnit.mv[REF_LIST_1].mvUnion = puPtr->mv[REF_LIST_1].mvUnion;
2679 
2680             if ((contextPtr->mvUnit.predDirection == UNI_PRED_LIST_0) || (contextPtr->mvUnit.predDirection == BI_PRED))
2681             {
2682 
2683                 EbPictureBufferDesc_t  *refPicList0 = 0;
2684                 EbReferenceObject_t    *referenceObject;
2685                 EB_U16                  refList0PosX = 0;
2686                 EB_U16                  refList0PosY = 0;
2687                 EB_U8					counter;
2688                 EB_U16				   *src0Ptr;
2689 
2690                 referenceObject = (EbReferenceObject_t*)pictureControlSetPtr->refPicPtrArray[REF_LIST_0]->objectPtr;
2691                 refPicList0 = (EbPictureBufferDesc_t*)referenceObject->referencePicture16bit;
2692 
2693                 refList0PosX = (EB_U32)CLIP3(
2694                     (EB_S32)((refPicList0->originX - 71) << 2),
2695                     (EB_S32)((refPicList0->width + refPicList0->originX + 7) << 2),
2696                     (EB_S32)((contextPtr->cuOriginX + refPicList0->originX) << 2) + contextPtr->mvUnit.mv[REF_LIST_0].x);
2697 
2698                 refList0PosY = (EB_U32)CLIP3(
2699                     (EB_S32)((refPicList0->originY - 71) << 2),
2700                     (EB_S32)((refPicList0->height + refPicList0->originY + 7) << 2),
2701                     (EB_S32)((contextPtr->cuOriginY + refPicList0->originY) << 2) + contextPtr->mvUnit.mv[REF_LIST_0].y);
2702 
2703                 EB_U32  lumaOffSet = ((refList0PosX >> 2) - 4) * 2 + ((refList0PosY >> 2) - 4) * 2 * refPicList0->strideY;
2704                 EB_U32  cbOffset = ((refList0PosX >> 3) - 2) * 2 + ((refList0PosY >> 3) - 2) * 2 * refPicList0->strideCb;
2705                 EB_U32  crOffset = ((refList0PosX >> 3) - 2) * 2 + ((refList0PosY >> 3) - 2) * 2 * refPicList0->strideCr;
2706 
2707 
2708                 contextPtr->mcpContext->localReferenceBlockL0->bufferY = refPicList0->bufferY + lumaOffSet;
2709                 contextPtr->mcpContext->localReferenceBlockL0->bufferCb = refPicList0->bufferCb + cbOffset;
2710                 contextPtr->mcpContext->localReferenceBlockL0->bufferCr = refPicList0->bufferCr + crOffset;
2711                 contextPtr->mcpContext->localReferenceBlockL0->strideY = refPicList0->strideY;
2712                 contextPtr->mcpContext->localReferenceBlockL0->strideCb = refPicList0->strideCb;
2713                 contextPtr->mcpContext->localReferenceBlockL0->strideCr = refPicList0->strideCr;
2714 
2715 
2716                 src0Ptr = (EB_U16 *)contextPtr->mcpContext->localReferenceBlockL0->bufferY + 4 + 4 * contextPtr->mcpContext->localReferenceBlockL0->strideY;
2717 
2718                 for (counter = 0; counter < cuStats->size; counter++)
2719                 {
2720                     char const* p0 = (char const*)(src0Ptr + counter*contextPtr->mcpContext->localReferenceBlockL0->strideY);
2721                     _mm_prefetch(p0, _MM_HINT_T2);
2722                     char const* p1 = (char const*)(src0Ptr + counter*contextPtr->mcpContext->localReferenceBlockL0->strideY + (cuStats->size >> 1));
2723                     _mm_prefetch(p1, _MM_HINT_T2);
2724                 }
2725 
2726             }
2727 
2728             if ((contextPtr->mvUnit.predDirection == UNI_PRED_LIST_1) || (contextPtr->mvUnit.predDirection == BI_PRED))
2729             {
2730                 // Setup List 0
2731                 EbPictureBufferDesc_t  *refPicList1 = 0;
2732                 EbReferenceObject_t    *referenceObject;
2733                 EB_U16                  refList1PosX = 0;
2734                 EB_U16                  refList1PosY = 0;
2735                 EB_U8					counter;
2736                 EB_U16				   *src1Ptr;
2737 
2738                 referenceObject = (EbReferenceObject_t*)pictureControlSetPtr->refPicPtrArray[REF_LIST_1]->objectPtr;
2739                 refPicList1 = (EbPictureBufferDesc_t*)referenceObject->referencePicture16bit;
2740 
2741                 refList1PosX = (EB_U32)CLIP3(
2742                     (EB_S32)((refPicList1->originX - 71) << 2),
2743                     (EB_S32)((refPicList1->width + refPicList1->originX + 7) << 2),
2744                     (EB_S32)((contextPtr->cuOriginX + refPicList1->originX) << 2) + contextPtr->mvUnit.mv[REF_LIST_1].x);
2745 
2746                 refList1PosY = (EB_U32)CLIP3(
2747                     (EB_S32)((refPicList1->originY - 71) << 2),
2748                     (EB_S32)((refPicList1->height + refPicList1->originY + 7) << 2),
2749                     (EB_S32)((contextPtr->cuOriginY + refPicList1->originY) << 2) + contextPtr->mvUnit.mv[REF_LIST_1].y);
2750 
2751                 EB_U32  lumaOffSet = ((refList1PosX >> 2) - 4) * 2 + ((refList1PosY >> 2) - 4) * 2 * refPicList1->strideY; //refPicList0->originX + refPicList0->originY*refPicList0->strideY; //
2752                 EB_U32  cbOffset = ((refList1PosX >> 3) - 2) * 2 + ((refList1PosY >> 3) - 2) * 2 * refPicList1->strideCb;
2753                 EB_U32  crOffset = ((refList1PosX >> 3) - 2) * 2 + ((refList1PosY >> 3) - 2) * 2 * refPicList1->strideCr;
2754 
2755 
2756                 contextPtr->mcpContext->localReferenceBlockL1->bufferY = refPicList1->bufferY + lumaOffSet;
2757                 contextPtr->mcpContext->localReferenceBlockL1->bufferCb = refPicList1->bufferCb + cbOffset;
2758                 contextPtr->mcpContext->localReferenceBlockL1->bufferCr = refPicList1->bufferCr + crOffset;
2759                 contextPtr->mcpContext->localReferenceBlockL1->strideY = refPicList1->strideY;
2760                 contextPtr->mcpContext->localReferenceBlockL1->strideCb = refPicList1->strideCb;
2761                 contextPtr->mcpContext->localReferenceBlockL1->strideCr = refPicList1->strideCr;
2762 
2763 
2764                 src1Ptr = (EB_U16 *)contextPtr->mcpContext->localReferenceBlockL1->bufferY + 4 + 4 * contextPtr->mcpContext->localReferenceBlockL1->strideY;
2765 
2766                 for (counter = 0; counter < cuStats->size; counter++)
2767                 {
2768                     char const* p0 = (char const*)(src1Ptr + counter*contextPtr->mcpContext->localReferenceBlockL1->strideY);
2769                     _mm_prefetch(p0, _MM_HINT_T2);
2770                     char const* p1 = (char const*)(src1Ptr + counter*contextPtr->mcpContext->localReferenceBlockL1->strideY + (cuStats->size >> 1));
2771                     _mm_prefetch(p1, _MM_HINT_T2);
2772                 }
2773 
2774             }
2775         }
2776         else
2777         {
2778             puPtr = cuPtr->predictionUnitArray;
2779             contextPtr->mvUnit.predDirection = (EB_U8)puPtr->interPredDirectionIndex;
2780             contextPtr->mvUnit.mv[REF_LIST_0].mvUnion = puPtr->mv[REF_LIST_0].mvUnion;
2781             contextPtr->mvUnit.mv[REF_LIST_1].mvUnion = puPtr->mv[REF_LIST_1].mvUnion;
2782 
2783             if ((contextPtr->mvUnit.predDirection == UNI_PRED_LIST_0) || (contextPtr->mvUnit.predDirection == BI_PRED))
2784             {
2785                 // Setup List 0
2786                 EbPictureBufferDesc_t  *refPicList0 = 0;
2787                 EbReferenceObject_t    *referenceObject;
2788                 EB_U16                  refList0PosX = 0;
2789                 EB_U16                  refList0PosY = 0;
2790                 EB_U32					integPosL0x;
2791                 EB_U32					integPosL0y;
2792                 EB_U8					counter;
2793                 EB_U8				   *src0Ptr;
2794 
2795                 referenceObject = (EbReferenceObject_t*)pictureControlSetPtr->refPicPtrArray[REF_LIST_0]->objectPtr;
2796                 refPicList0 = (EbPictureBufferDesc_t*)referenceObject->referencePicture;
2797 
2798                 refList0PosX = (EB_U32)CLIP3(
2799                     (EB_S32)((refPicList0->originX - 71) << 2),
2800                     (EB_S32)((refPicList0->width + refPicList0->originX + 7) << 2),
2801                     (EB_S32)((contextPtr->cuOriginX + refPicList0->originX) << 2) + contextPtr->mvUnit.mv[REF_LIST_0].x);
2802 
2803                 refList0PosY = (EB_U32)CLIP3(
2804                     (EB_S32)((refPicList0->originY - 71) << 2),
2805                     (EB_S32)((refPicList0->height + refPicList0->originY + 7) << 2),
2806                     (EB_S32)((contextPtr->cuOriginY + refPicList0->originY) << 2) + contextPtr->mvUnit.mv[REF_LIST_0].y);
2807 
2808 
2809                 //compute the luma fractional position
2810                 integPosL0x = (refList0PosX >> 2);
2811                 integPosL0y = (refList0PosY >> 2);
2812 
2813 
2814                 src0Ptr = refPicList0->bufferY + integPosL0x + integPosL0y*refPicList0->strideY;
2815                 for (counter = 0; counter < cuStats->size; counter++)
2816                 {
2817                     char const* p0 = (char const*)(src0Ptr + counter*refPicList0->strideY);
2818                     _mm_prefetch(p0, _MM_HINT_T2);
2819                 }
2820 
2821             }
2822 
2823             if ((contextPtr->mvUnit.predDirection == UNI_PRED_LIST_1) || (contextPtr->mvUnit.predDirection == BI_PRED))
2824             {
2825                 // Setup List 0
2826                 EbPictureBufferDesc_t  *refPicList1 = 0;
2827                 EbReferenceObject_t    *referenceObject;
2828                 EB_U16                  refList1PosX = 0;
2829                 EB_U16                  refList1PosY = 0;
2830                 EB_U32					integPosL1x;
2831                 EB_U32					integPosL1y;
2832                 EB_U8					counter;
2833                 EB_U8				   *src1Ptr;
2834 
2835                 referenceObject = (EbReferenceObject_t*)pictureControlSetPtr->refPicPtrArray[REF_LIST_1]->objectPtr;
2836                 refPicList1 = (EbPictureBufferDesc_t*)referenceObject->referencePicture;
2837 
2838                 refList1PosX = (EB_U32)CLIP3(
2839                     (EB_S32)((refPicList1->originX - 71) << 2),
2840                     (EB_S32)((refPicList1->width + refPicList1->originX + 7) << 2),
2841                     (EB_S32)((contextPtr->cuOriginX + refPicList1->originX) << 2) + contextPtr->mvUnit.mv[REF_LIST_1].x);
2842 
2843                 refList1PosY = (EB_U32)CLIP3(
2844                     (EB_S32)((refPicList1->originY - 71) << 2),
2845                     (EB_S32)((refPicList1->height + refPicList1->originY + 7) << 2),
2846                     (EB_S32)((contextPtr->cuOriginY + refPicList1->originY) << 2) + contextPtr->mvUnit.mv[REF_LIST_1].y);
2847 
2848 
2849                 //uni-prediction List1 luma
2850                 integPosL1x = (refList1PosX >> 2);
2851                 integPosL1y = (refList1PosY >> 2);
2852 
2853 
2854                 src1Ptr = refPicList1->bufferY + integPosL1x + integPosL1y*refPicList1->strideY;
2855                 for (counter = 0; counter < cuStats->size; counter++)
2856                 {
2857                     char const* p1 = (char const*)(src1Ptr + counter*refPicList1->strideY);
2858                     _mm_prefetch(p1, _MM_HINT_T2);
2859                 }
2860 
2861             }
2862         }
2863     }
2864 }
2865 
2866 
/*******************************************
* Encode Pass Pack LCU
*
* Summary: Packs the three planes (Y, Cb, Cr) of
*   one LCU of the input picture into
*   contextPtr->inputSample16bitBuffer, which is
*   written as EB_U16 samples.
*
* Inputs:
*   sequenceControlSetPtr - read for
*       staticConfig.compressedTenBitFormat, which
*       selects the packing routine
*   inputPicture          - source picture (sample
*       planes plus bit-increment planes)
*   contextPtr            - owns the destination
*       inputSample16bitBuffer
*   lcuOriginX/lcuOriginY - LCU position in luma
*       samples, without the picture origin offset
*   lcuWidth/lcuHeight    - LCU dimensions in luma
*       samples
*
* Outputs:
*   contextPtr->inputSample16bitBuffer planes, with
*   a luma stride of MAX_LCU_SIZE and a chroma
*   stride of MAX_LCU_SIZE >> subWidthCMinus1
*
* Note: the exact bit layout is defined by
*   CompressedPackLcu / Pack2D_SRC; presumably they
*   merge the 8-bit samples with the bit-increment
*   data into 16-bit samples - confirm against
*   those routines.
*******************************************/
void EncodePassPackLcu(
    SequenceControlSet_t   *sequenceControlSetPtr,
    EbPictureBufferDesc_t  *inputPicture,
    EncDecContext_t        *contextPtr,
    EB_U32                  lcuOriginX,
    EB_U32                  lcuOriginY,
    EB_U32                  lcuWidth,
    EB_U32                  lcuHeight)
{
    const EB_COLOR_FORMAT colorFormat = inputPicture->colorFormat;

    // Chroma subsampling shifts: 0 when the chroma dimension matches luma, 1 when it is halved.
    const EB_U16 subWidthCMinus1 = (colorFormat == EB_YUV444 ? 1 : 2) - 1;
    const EB_U16 subHeightCMinus1 = (colorFormat >= EB_YUV422 ? 1 : 2) - 1;

    // LCU position inside the sample planes, including the picture origin offset.
    const EB_U32 lumaPosX = lcuOriginX + inputPicture->originX;
    const EB_U32 lumaPosY = lcuOriginY + inputPicture->originY;
    const EB_U32 chromaPosX = lumaPosX >> subWidthCMinus1;
    const EB_U32 chromaPosY = lumaPosY >> subHeightCMinus1;

    // LCU dimensions in chroma samples.
    const EB_U32 chromaLcuWidth = lcuWidth >> subWidthCMinus1;
    const EB_U32 chromaLcuHeight = lcuHeight >> subHeightCMinus1;

    // Offsets into the 8-bit sample planes are shared by both packing paths.
    const EB_U32 inputLumaOffset = (lumaPosY * inputPicture->strideY) + lumaPosX;
    const EB_U32 inputCbOffset = (chromaPosY * inputPicture->strideCb) + chromaPosX;
    const EB_U32 inputCrOffset = (chromaPosY * inputPicture->strideCr) + chromaPosX;

    if (sequenceControlSetPtr->staticConfig.compressedTenBitFormat == 1)
    {
        // In this path the bit-increment planes are addressed without the picture
        // origin offset and with a row width of a quarter of the picture width.
        const EB_U16 luma2BitWidth = inputPicture->width / 4;
        const EB_U16 chroma2BitWidth = (inputPicture->width / 4) >> subWidthCMinus1;

        const EB_U32 chromaOriginX = lcuOriginX >> subWidthCMinus1;
        const EB_U32 chromaOriginY = lcuOriginY >> subHeightCMinus1;

        CompressedPackLcu(
            inputPicture->bufferY + inputLumaOffset,
            inputPicture->strideY,
            inputPicture->bufferBitIncY + lcuOriginY*luma2BitWidth + (lcuOriginX / 4)*lcuHeight,
            lcuWidth / 4,
            (EB_U16 *)contextPtr->inputSample16bitBuffer->bufferY,
            MAX_LCU_SIZE,
            lcuWidth,
            lcuHeight);

        CompressedPackLcu(
            inputPicture->bufferCb + inputCbOffset,
            inputPicture->strideCb,
            inputPicture->bufferBitIncCb + chromaOriginY * chroma2BitWidth + (chromaOriginX / 4)*chromaLcuHeight,
            chromaLcuWidth / 4,
            (EB_U16 *)contextPtr->inputSample16bitBuffer->bufferCb,
            MAX_LCU_SIZE >> subWidthCMinus1,
            chromaLcuWidth,
            chromaLcuHeight);

        CompressedPackLcu(
            inputPicture->bufferCr + inputCrOffset,
            inputPicture->strideCr,
            inputPicture->bufferBitIncCr + chromaOriginY * chroma2BitWidth + (chromaOriginX / 4)*chromaLcuHeight,
            chromaLcuWidth / 4,
            (EB_U16 *)contextPtr->inputSample16bitBuffer->bufferCr,
            MAX_LCU_SIZE >> subWidthCMinus1,
            chromaLcuWidth,
            chromaLcuHeight);
    }
    else {
        // In this path the bit-increment planes are addressed like the sample
        // planes (origin offset included), each with its own stride.
        const EB_U32 inputBitIncLumaOffset = (lumaPosY * inputPicture->strideBitIncY) + lumaPosX;
        const EB_U32 inputBitIncCbOffset = chromaPosX + (chromaPosY * inputPicture->strideBitIncCb);
        const EB_U32 inputBitIncCrOffset = chromaPosX + (chromaPosY * inputPicture->strideBitIncCr);

        Pack2D_SRC(
            inputPicture->bufferY + inputLumaOffset,
            inputPicture->strideY,
            inputPicture->bufferBitIncY + inputBitIncLumaOffset,
            inputPicture->strideBitIncY,
            (EB_U16 *)contextPtr->inputSample16bitBuffer->bufferY,
            MAX_LCU_SIZE,
            lcuWidth,
            lcuHeight);

        // The Cb plane is walked with the Cb strides, i.e. the same strides that
        // were used to compute inputCbOffset and inputBitIncCbOffset.
        Pack2D_SRC(
            inputPicture->bufferCb + inputCbOffset,
            inputPicture->strideCb,
            inputPicture->bufferBitIncCb + inputBitIncCbOffset,
            inputPicture->strideBitIncCb,
            (EB_U16 *)contextPtr->inputSample16bitBuffer->bufferCb,
            MAX_LCU_SIZE >> subWidthCMinus1,
            chromaLcuWidth,
            chromaLcuHeight);

        Pack2D_SRC(
            inputPicture->bufferCr + inputCrOffset,
            inputPicture->strideCr,
            inputPicture->bufferBitIncCr + inputBitIncCrOffset,
            inputPicture->strideBitIncCr,
            (EB_U16 *)contextPtr->inputSample16bitBuffer->bufferCr,
            MAX_LCU_SIZE >> subWidthCMinus1,
            chromaLcuWidth,
            chromaLcuHeight);
    }
}
2968 
2969 
2970 /*******************************************
2971 * Encode Pass
2972 *
2973 * Summary: Performs a H.265 conformant
2974 *   reconstruction based on the LCU
2975 *   mode decision.
2976 *
2977 * Inputs:
2978 *   SourcePic
2979 *   Coding Results
2980 *   LCU Location
2981 *   Sequence Control Set
2982 *   Picture Control Set
2983 *
2984 * Outputs:
2985 *   Reconstructed Samples
2986 *   Coefficient Samples
2987 *
2988 *******************************************/
EncodePass(SequenceControlSet_t * sequenceControlSetPtr,PictureControlSet_t * pictureControlSetPtr,LargestCodingUnit_t * lcuPtr,EB_U32 tbAddr,EB_U32 lcuOriginX,EB_U32 lcuOriginY,EB_U32 lcuQp,EB_BOOL enableSaoFlag,EncDecContext_t * contextPtr)2989 EB_EXTERN void EncodePass(
2990     SequenceControlSet_t    *sequenceControlSetPtr,
2991     PictureControlSet_t     *pictureControlSetPtr,
2992     LargestCodingUnit_t     *lcuPtr,
2993     EB_U32                   tbAddr,
2994     EB_U32                   lcuOriginX,
2995     EB_U32                   lcuOriginY,
2996     EB_U32                   lcuQp,
2997     EB_BOOL                  enableSaoFlag,
2998     EncDecContext_t         *contextPtr)
2999 {
3000     EB_BOOL                 is16bit = contextPtr->is16bit;
3001     EB_COLOR_FORMAT         colorFormat = contextPtr->colorFormat;
3002     const EB_U16 subWidthCMinus1 = (colorFormat == EB_YUV444 ? 1 : 2) - 1;
3003 
3004     EB_U32                   tileIdx = contextPtr->encDecTileIndex;
3005     EbPictureBufferDesc_t *reconBuffer = is16bit ? pictureControlSetPtr->reconPicture16bitPtr : pictureControlSetPtr->reconPicturePtr;
3006     EbPictureBufferDesc_t *coeffBufferTB = lcuPtr->quantizedCoeff;
3007 
3008     EbPictureBufferDesc_t *inputPicture;
3009     ModeDecisionContext_t *mdcontextPtr;
3010 
3011     mdcontextPtr = contextPtr->mdContext;
3012     inputPicture = contextPtr->inputSamples = (EbPictureBufferDesc_t*)pictureControlSetPtr->ParentPcsPtr->enhancedPicturePtr;
3013 
3014     LcuStat_t *lcuStatPtr = &(pictureControlSetPtr->ParentPcsPtr->lcuStatArray[tbAddr]);
3015 
3016 
3017     // TMVP
3018     TmvpUnit_t *tmvpMapWritePtr;
3019     EB_U32 tmvpMapHorizontalStartIndex;
3020     EB_U32 tmvpMapVerticalStartIndex;
3021     EB_U32 tmvpMapHorizontalEndIndex;
3022     EB_U32 tmvpMapVerticalEndIndex;
3023     EB_U32 tmvpMapIndex;
3024     EB_U32 mvCompressionUnitSizeMinus1 = (1 << LOG_MV_COMPRESS_UNIT_SIZE) - 1;
3025 
3026     // DLF
3027     EB_U32 startIndex;
3028     EB_U8 blk4x4IndexX;
3029     EB_U8 blk4x4IndexY;
3030     EB_BOOL availableCoeff;
3031 
3032     // QP Neighbor Arrays
3033     EB_BOOL isDeltaQpNotCoded = EB_TRUE;
3034 
3035     // LCU Stats
3036     EB_U32 lcuWidth = MIN(sequenceControlSetPtr->lcuSize, sequenceControlSetPtr->lumaWidth - lcuOriginX);
3037     EB_U32 lcuHeight = MIN(sequenceControlSetPtr->lcuSize, sequenceControlSetPtr->lumaHeight - lcuOriginY);
3038 
3039     // SAO
3040     EB_S64 saoLumaBestCost;
3041     EB_S64 saoChromaBestCost;
3042 
3043     // MV merge mode
3044     EB_U32 yCbf=0;
3045     EB_U32 cbCbf=0;
3046     EB_U32 crCbf=0;
3047     EB_U32 cbCbf2=0;
3048     EB_U32 crCbf2=0;
3049     EB_U64 yCoeffBits;
3050     EB_U64 cbCoeffBits;
3051     EB_U64 crCoeffBits;
3052     EB_U64 yFullDistortion[DIST_CALC_TOTAL];
3053     EB_U64 yTuFullDistortion[DIST_CALC_TOTAL];
3054     EB_U32 countNonZeroCoeffs[3];
3055     EB_U64 yTuCoeffBits;
3056     EB_U64 cbTuCoeffBits;
3057     EB_U64 crTuCoeffBits;
3058     EB_U32 lumaShift;
3059     EB_U32 scratchLumaOffset;
3060     EB_U32 lcuRowIndex = lcuOriginY / MAX_LCU_SIZE;
3061     EncodeContext_t *encodeContextPtr = NULL;
3062 
3063     // Dereferencing early
3064     NeighborArrayUnit_t *epModeTypeNeighborArray = pictureControlSetPtr->epModeTypeNeighborArray[tileIdx];
3065     NeighborArrayUnit_t *epIntraLumaModeNeighborArray = pictureControlSetPtr->epIntraLumaModeNeighborArray[tileIdx];
3066     NeighborArrayUnit_t *epMvNeighborArray = pictureControlSetPtr->epMvNeighborArray[tileIdx];
3067     NeighborArrayUnit_t *epLumaReconNeighborArray = is16bit ? pictureControlSetPtr->epLumaReconNeighborArray16bit[tileIdx] : pictureControlSetPtr->epLumaReconNeighborArray[tileIdx];
3068     NeighborArrayUnit_t *epCbReconNeighborArray = is16bit ? pictureControlSetPtr->epCbReconNeighborArray16bit[tileIdx] : pictureControlSetPtr->epCbReconNeighborArray[tileIdx];
3069     NeighborArrayUnit_t *epCrReconNeighborArray = is16bit ? pictureControlSetPtr->epCrReconNeighborArray16bit[tileIdx] : pictureControlSetPtr->epCrReconNeighborArray[tileIdx];
3070     NeighborArrayUnit_t *epSkipFlagNeighborArray = pictureControlSetPtr->epSkipFlagNeighborArray[tileIdx];
3071     NeighborArrayUnit_t *epLeafDepthNeighborArray = pictureControlSetPtr->epLeafDepthNeighborArray[tileIdx];
3072 
3073     EB_BOOL constrainedIntraFlag = pictureControlSetPtr->constrainedIntraFlag;
3074     EB_BOOL enableStrongIntraSmoothing = sequenceControlSetPtr->enableStrongIntraSmoothing;
3075     CodingUnit_t **codedLeafArrayPtr = lcuPtr->codedLeafArrayPtr;
3076 
3077     EB_BOOL dlfEnableFlag = (EB_BOOL)(!sequenceControlSetPtr->staticConfig.disableDlfFlag) &&
3078         (pictureControlSetPtr->ParentPcsPtr->isUsedAsReferenceFlag ||
3079         sequenceControlSetPtr->staticConfig.reconEnabled);
3080 
3081     dlfEnableFlag =  contextPtr->allowEncDecMismatch ? EB_FALSE : dlfEnableFlag;
3082 
3083     const EB_BOOL isIntraLCU = contextPtr->mdContext->limitIntra ? isIntraPresent(lcuPtr) : EB_TRUE;
3084 
3085     EB_BOOL doRecon = (EB_BOOL)(contextPtr->mdContext->limitIntra == 0 || isIntraLCU == 1) ||
3086         pictureControlSetPtr->ParentPcsPtr->isUsedAsReferenceFlag ||
3087         sequenceControlSetPtr->staticConfig.reconEnabled;
3088 
3089     CabacCost_t *cabacCost = pictureControlSetPtr->cabacCost;
3090     EntropyCoder_t *coeffEstEntropyCoderPtr = pictureControlSetPtr->coeffEstEntropyCoderPtr;
3091     EB_U8 cuItr;
3092     EB_U32 dZoffset = 0;
3093 
3094     if (!lcuStatPtr->stationaryEdgeOverTimeFlag && sequenceControlSetPtr->staticConfig.improveSharpness && pictureControlSetPtr->ParentPcsPtr->picNoiseClass < PIC_NOISE_CLASS_3_1) {
3095         EB_S16 cuDeltaQp = (EB_S16)(lcuPtr->qp - pictureControlSetPtr->ParentPcsPtr->averageQp);
3096         EB_U32 dzCondition = cuDeltaQp > 0 ? 0 : 1;
3097 
3098         if (sequenceControlSetPtr->inputResolution == INPUT_SIZE_4K_RANGE) {
3099 
3100             if (!(pictureControlSetPtr->ParentPcsPtr->isPan ||
3101                 (pictureControlSetPtr->ParentPcsPtr->nonMovingIndexAverage < 10 && lcuPtr->auraStatus == AURA_STATUS_1) ||
3102                 (lcuStatPtr->cuStatArray[0].skinArea) ||
3103                 (pictureControlSetPtr->ParentPcsPtr->intraCodedBlockProbability > 90) ||
3104                 (pictureControlSetPtr->ParentPcsPtr->highDarkAreaDensityFlag))) {
3105 
3106                 if (pictureControlSetPtr->sliceType != EB_I_PICTURE &&
3107                     pictureControlSetPtr->temporalLayerIndex == 0 &&
3108                     pictureControlSetPtr->ParentPcsPtr->intraCodedBlockProbability > 60 &&
3109                     !pictureControlSetPtr->ParentPcsPtr->isTilt &&
3110                     pictureControlSetPtr->ParentPcsPtr->picHomogenousOverTimeLcuPercentage > 40)
3111                 {
3112                     dZoffset = 10;
3113                 }
3114 
3115                 if (dzCondition) {
3116                     if (pictureControlSetPtr->sceneCaracteristicId == EB_FRAME_CARAC_1) {
3117                         if (pictureControlSetPtr->sliceType == EB_I_PICTURE) {
3118                             dZoffset = lcuStatPtr->cuStatArray[0].grassArea ? 10 : dZoffset;
3119                         }
3120                         else if (pictureControlSetPtr->temporalLayerIndex == 0) {
3121                             dZoffset = lcuStatPtr->cuStatArray[0].grassArea ? 9 : dZoffset;
3122                         }
3123                         else if (pictureControlSetPtr->temporalLayerIndex == 1) {
3124                             dZoffset = lcuStatPtr->cuStatArray[0].grassArea ? 5 : dZoffset;
3125                         }
3126                     }
3127 
3128                 }
3129             }
3130         }
3131     }
3132 
3133     QpmDeriveBeaAndSkipQpmFlagLcu(
3134         sequenceControlSetPtr,
3135         pictureControlSetPtr,
3136         lcuPtr,
3137         tbAddr,
3138         contextPtr);
3139 
3140     encodeContextPtr = ((SequenceControlSet_t*)(pictureControlSetPtr->sequenceControlSetWrapperPtr->objectPtr))->encodeContextPtr;
3141 
3142     if (pictureControlSetPtr->ParentPcsPtr->isUsedAsReferenceFlag == EB_TRUE) {
3143         // TMVP init
3144         tmvpMapWritePtr = &(contextPtr->referenceObjectWritePtr->tmvpMap[tbAddr]);
3145         tmvpMapIndex = 0;
3146 
3147         //get the 16bit form of the input LCU
3148         if (is16bit) {
3149 
3150             reconBuffer = ((EbReferenceObject_t*)pictureControlSetPtr->ParentPcsPtr->referencePictureWrapperPtr->objectPtr)->referencePicture16bit;
3151         } else {
3152             reconBuffer = ((EbReferenceObject_t*)pictureControlSetPtr->ParentPcsPtr->referencePictureWrapperPtr->objectPtr)->referencePicture;
3153         }
3154     }
3155     else { // non ref pictures
3156         reconBuffer = is16bit ? pictureControlSetPtr->reconPicture16bitPtr : pictureControlSetPtr->reconPicturePtr;
3157         tmvpMapWritePtr = (TmvpUnit_t*)EB_NULL;
3158     }
3159 
3160 
3161     EB_BOOL useDeltaQp = (EB_BOOL)(sequenceControlSetPtr->staticConfig.improveSharpness || sequenceControlSetPtr->staticConfig.bitRateReduction || sequenceControlSetPtr->staticConfig.segmentOvEnabled);
3162 
3163     EB_BOOL singleSegment = (sequenceControlSetPtr->encDecSegmentColCountArray[pictureControlSetPtr->temporalLayerIndex] == 1) && (sequenceControlSetPtr->encDecSegmentRowCountArray[pictureControlSetPtr->temporalLayerIndex] == 1);
3164 
3165     EB_BOOL useDeltaQpSegments = singleSegment ? 0 : (EB_BOOL)(sequenceControlSetPtr->staticConfig.improveSharpness || sequenceControlSetPtr->staticConfig.bitRateReduction || sequenceControlSetPtr->staticConfig.segmentOvEnabled);
3166 
3167     if (is16bit) {
3168         EncodePassPackLcu(
3169             sequenceControlSetPtr,
3170             inputPicture,
3171             contextPtr,
3172             lcuOriginX,
3173             lcuOriginY,
3174             lcuWidth,
3175             lcuHeight);
3176     }
3177 
3178     contextPtr->intraCodedAreaLCU[tbAddr] = 0;
3179 
3180     // CU Loop
3181     cuItr = 0;
3182     while (cuItr < CU_MAX_COUNT) {
3183         if (codedLeafArrayPtr[cuItr]->splitFlag == EB_FALSE){
3184             // PU Stack variables
3185             PredictionUnit_t        *puPtr                  = (PredictionUnit_t *)EB_NULL; //  done
3186             EbPictureBufferDesc_t   *residualBuffer         = contextPtr->residualBuffer;
3187             EbPictureBufferDesc_t   *transformBuffer        = contextPtr->transformBuffer;
3188             EB_S16                  *transformInnerArrayPtr = contextPtr->transformInnerArrayPtr;
3189             const CodedUnitStats_t  *cuStats                = contextPtr->cuStats   = GetCodedUnitStats(cuItr);
3190             CodingUnit_t            *cuPtr                  = contextPtr->cuPtr     = lcuPtr->codedLeafArrayPtr[cuItr];
3191 
3192             _mm_prefetch((const char *)cuStats, _MM_HINT_T0);
3193 
3194             contextPtr->cuOriginX = (EB_U16)(lcuOriginX + cuStats->originX);
3195             contextPtr->cuOriginY = (EB_U16)(lcuOriginY + cuStats->originY);
3196 
3197             EB_BOOL  tileLeftBoundary = (lcuPtr->lcuEdgeInfoPtr->tileLeftEdgeFlag == EB_TRUE && ((contextPtr->cuOriginX & (lcuPtr->size - 1)) == 0)) ? EB_TRUE : EB_FALSE;
3198             EB_BOOL  tileTopBoundary = (lcuPtr->lcuEdgeInfoPtr->tileTopEdgeFlag == EB_TRUE && ((contextPtr->cuOriginY & (lcuPtr->size - 1)) == 0)) ? EB_TRUE : EB_FALSE;
3199             EB_BOOL  tileRightBoundary = (lcuPtr->lcuEdgeInfoPtr->tileRightEdgeFlag == EB_TRUE && (((contextPtr->cuOriginX + cuStats->size) & (lcuPtr->size - 1)) == 0)) ? EB_TRUE : EB_FALSE;
3200             //printf("LCU (%d, %d), left/top/right boundary %d/%d/%d\n", lcuOriginX, lcuOriginY,
3201             //        tileLeftBoundary, tileTopBoundary, tileRightBoundary);
3202 
3203             EncodePassPreFetchRef(
3204                 pictureControlSetPtr,
3205                 contextPtr,
3206                 cuPtr,
3207                 cuStats,
3208                 puPtr,
3209                 is16bit);
3210 
3211             cuPtr->deltaQp = 0;
3212 
3213 			cuPtr->qp = (sequenceControlSetPtr->staticConfig.improveSharpness || sequenceControlSetPtr->staticConfig.bitRateReduction || sequenceControlSetPtr->staticConfig.segmentOvEnabled) ? contextPtr->qpmQp : pictureControlSetPtr->pictureQp;
3214 			lcuPtr->qp = (sequenceControlSetPtr->staticConfig.improveSharpness || sequenceControlSetPtr->staticConfig.bitRateReduction || sequenceControlSetPtr->staticConfig.segmentOvEnabled) ? contextPtr->qpmQp : pictureControlSetPtr->pictureQp;
3215             cuPtr->orgDeltaQp = cuPtr->deltaQp;
3216 
3217 			if (!contextPtr->skipQpmFlag &&
3218                     (sequenceControlSetPtr->staticConfig.improveSharpness || sequenceControlSetPtr->staticConfig.bitRateReduction) &&
3219                     (contextPtr->cuStats->depth <= pictureControlSetPtr->difCuDeltaQpDepth)) {
3220                 EncQpmDeriveDeltaQPForEachLeafLcu(
3221                     sequenceControlSetPtr,
3222                     pictureControlSetPtr,
3223                     lcuPtr,
3224                     tbAddr,
3225                     cuPtr,
3226                     contextPtr->cuStats->depth,
3227                     cuItr,
3228                     cuStats->size,
3229                     cuPtr->predictionModeFlag,
3230                     contextPtr->cuStats->parent32x32Index,
3231                     contextPtr);
3232             }
3233 
3234             EB_U8  fastEl = (contextPtr->fastEl && contextPtr->cuStats->size > 8);
3235             EB_U64 yCoeffBitsTemp = contextPtr->mdContext->mdEpPipeLcu[cuPtr->leafIndex].yCoeffBits;
3236             EB_S16 yDc = 0;
3237             EB_U16 yCountNonZeroCoeffs = 0;
3238 			EB_U32 yBitsThsld = (contextPtr->cuStats->size > 32) ? contextPtr->yBitsThsld : (contextPtr->cuStats->size > 16) ? (contextPtr->yBitsThsld >> 1) : (contextPtr->yBitsThsld >> 2);
3239 
3240             EB_U8 qpScaled = CLIP3((EB_S8)MIN_QP_VALUE, (EB_S8)MAX_CHROMA_MAP_QP_VALUE, (EB_S8)(cuPtr->qp + pictureControlSetPtr->cbQpOffset + pictureControlSetPtr->sliceCbQpOffset));
3241             EB_U8 cbQp = 0;
3242 
3243             if (colorFormat == EB_YUV420) {
3244                 cbQp = MapChromaQp(qpScaled);
3245             } else {
3246                 cbQp = MIN(qpScaled, 51);
3247             }
3248 
3249             //if (pictureControlSetPtr->pictureNumber == 1) {
3250             //    printf("POC %d, ", pictureControlSetPtr->pictureNumber);
3251             //    if (cuPtr->predictionModeFlag == INTRA_MODE) {
3252             //        printf("(%d, %d), pu size %d, intraLumaMode %d\n",
3253             //                contextPtr->cuOriginX, contextPtr->cuOriginY, cuStats->size, cuPtr->predictionUnitArray->intraLumaMode);
3254             //    } else {
3255             //        printf("(%d, %d), pu size %d,  inter mode, merge flag  %d, mvp (%d, %d), tileIdx %d\n",
3256             //                contextPtr->cuOriginX, contextPtr->cuOriginY, cuStats->size,
3257             //                cuPtr->predictionUnitArray[0].mergeFlag,
3258             //                cuPtr->predictionUnitArray->mv[0].x,
3259             //                cuPtr->predictionUnitArray->mv[0].y, tileIdx);
3260             //    }
3261             //}
3262 
3263 
3264             if (cuPtr->predictionModeFlag == INTRA_MODE &&
3265                     cuPtr->predictionUnitArray->intraLumaMode != EB_INTRA_MODE_4x4) {
3266                 contextPtr->totIntraCodedArea += cuStats->size*cuStats->size;
3267                 if (pictureControlSetPtr->sliceType != EB_I_PICTURE){
3268                     contextPtr->intraCodedAreaLCU[tbAddr] += cuStats->size*cuStats->size;
3269                 }
3270 
3271                 // *Note - Transforms are the same size as predictions
3272                 // Partition Loop
3273                 contextPtr->tuItr = 0;
3274 
3275                 {
3276                     // Set the PU Loop Variables
3277                     puPtr = cuPtr->predictionUnitArray;
3278                     // Generate Intra Luma Neighbor Modes
3279                     EbHevcGeneratePuIntraLumaNeighborModes( // HT done
3280                         cuPtr,
3281                         contextPtr->cuOriginX,
3282                         contextPtr->cuOriginY,
3283                         MAX_LCU_SIZE,
3284                         epIntraLumaModeNeighborArray,
3285                         epModeTypeNeighborArray);
3286 
3287 
3288                     // Transform Loop (not supported)
3289                     {
3290                         // Generate Intra Reference Samples
3291                         if (colorFormat == EB_YUV420) {
3292                             GenerateIntraReferenceSamplesFuncTable[is16bit](
3293                                     constrainedIntraFlag,
3294                                     enableStrongIntraSmoothing,
3295                                     contextPtr->cuOriginX,
3296                                     contextPtr->cuOriginY,
3297                                     cuStats->size,
3298                                     MAX_LCU_SIZE,
3299                                     cuStats->depth,
3300                                     epModeTypeNeighborArray,
3301                                     epLumaReconNeighborArray,
3302                                     epCbReconNeighborArray,
3303                                     epCrReconNeighborArray,
3304                                     is16bit ? (void*)contextPtr->intraRefPtr16 : (void*)contextPtr->intraRefPtr,
3305                                     colorFormat,
3306                                     tileLeftBoundary, tileTopBoundary, tileRightBoundary);
3307                         } else if (colorFormat == EB_YUV422 || colorFormat == EB_YUV444) {
3308                             //Jing: TODO, add tiles support
3309                             GenerateLumaIntraReferenceSamplesFuncTable[is16bit](
3310                                     constrainedIntraFlag,
3311                                     enableStrongIntraSmoothing,
3312                                     contextPtr->cuOriginX,
3313                                     contextPtr->cuOriginY,
3314                                     cuStats->size,
3315                                     MAX_LCU_SIZE,
3316                                     cuStats->depth,
3317                                     epModeTypeNeighborArray,
3318                                     epLumaReconNeighborArray,
3319                                     epCbReconNeighborArray,
3320                                     epCrReconNeighborArray,
3321                                     is16bit ? (void*)contextPtr->intraRefPtr16 : (void*)contextPtr->intraRefPtr,
3322                                     tileLeftBoundary, tileTopBoundary, tileRightBoundary);
3323 
3324 						    GenerateChromaIntraReferenceSamplesFuncTable[is16bit](
3325                                     constrainedIntraFlag,
3326                                     enableStrongIntraSmoothing,
3327                                     contextPtr->cuOriginX,
3328                                     contextPtr->cuOriginY,
3329                                     cuStats->size,
3330                                     MAX_LCU_SIZE,
3331                                     cuStats->depth,
3332                                     epModeTypeNeighborArray,
3333                                     epLumaReconNeighborArray,
3334                                     epCbReconNeighborArray,
3335                                     epCrReconNeighborArray,
3336                                     is16bit ? (void*)contextPtr->intraRefPtr16 : (void*)contextPtr->intraRefPtr,
3337                                     colorFormat,
3338                                     EB_FALSE,
3339                                     tileLeftBoundary, tileTopBoundary, tileRightBoundary);
3340                         }
3341 
3342                         // Prediction
3343                         EncodePassIntraPredictionFuncTable[is16bit](
3344                             is16bit ? (void*)contextPtr->intraRefPtr16 : (void*)contextPtr->intraRefPtr,
3345                             contextPtr->cuOriginX + reconBuffer->originX,
3346                             contextPtr->cuOriginY + reconBuffer->originY,
3347                             cuStats->size,
3348                             cuStats->size >> subWidthCMinus1, //chroma PU size, for 444 chroma PU size is the same as luma
3349                             reconBuffer,
3350                             colorFormat,
3351                             EB_FALSE,
3352                             (EB_U32)puPtr->intraLumaMode,
3353                             EB_INTRA_CHROMA_DM,
3354                             PICTURE_BUFFER_DESC_FULL_MASK );
3355 
3356 #ifdef DEBUG_REF_INFO
3357                         int originX = contextPtr->cuOriginX;
3358                         int originY = contextPtr->cuOriginY;
3359                         int tuSize = cuStats->size;
3360                         printf("\n----- Dump prediction for 1st loop at (%d, %d)-----\n", originX, originY);
3361 
3362                         int chroma_size = tuSize > MIN_PU_SIZE? (tuSize >> subWidthCMinus1): tuSize;
3363 
3364                         //dump_block_from_desc(chroma_size, reconBuffer, originX, originY, 1);
3365                         dump_block_from_desc(tuSize, reconBuffer, originX, originY, 0);
3366 #endif
3367                         // Encode Transform Unit -INTRA-
3368                         {
3369                             contextPtr->forceCbfFlag = (contextPtr->skipQpmFlag) ?
3370                                 EB_FALSE :
3371                                 lcuPtr->lcuEdgeInfoPtr->tileLeftEdgeFlag && ((contextPtr->cuOriginX & (63)) == 0) && (contextPtr->cuOriginY == lcuOriginY);
3372                             SetPmEncDecMode(
3373                                 pictureControlSetPtr,
3374                                 contextPtr,
3375                                 tbAddr,
3376                                 lcuStatPtr->stationaryEdgeOverTimeFlag,
3377                                 pictureControlSetPtr->temporalLayerIndex > 0 ? lcuStatPtr->pmStationaryEdgeOverTimeFlag : lcuStatPtr->stationaryEdgeOverTimeFlag);
3378 
3379                             // Set Fast El coef shaping method
3380                             contextPtr->transCoeffShapeLuma   = DEFAULT_SHAPE;
3381                             contextPtr->transCoeffShapeChroma = DEFAULT_SHAPE;
3382 							if (fastEl && contextPtr->pmpMaskingLevelEncDec > MASK_THSHLD_1) {
3383                                 yDc = contextPtr->mdContext->mdEpPipeLcu[cuPtr->leafIndex].yDc[0];
3384                                 yCountNonZeroCoeffs = contextPtr->mdContext->mdEpPipeLcu[cuPtr->leafIndex].yCountNonZeroCoeffs[0];
3385 
3386 								if ((cuPtr->rootCbf == 0) ||
3387                                         ((yCoeffBitsTemp <= yBitsThsld) && yDc < YDC_THSHLD_1 && yCountNonZeroCoeffs <= 1)) {
3388                                     // Skip pass for cuPtr->rootCbf == 0 caused some VQ issues in chroma, so DC path is used instead
3389                                     contextPtr->transCoeffShapeLuma     = ONLY_DC_SHAPE;
3390                                     contextPtr->transCoeffShapeChroma   = ONLY_DC_SHAPE;
3391                                 } else if ((yCoeffBitsTemp <= yBitsThsld * 4)) {
3392                                     contextPtr->transCoeffShapeLuma = N4_SHAPE;
3393                                     if ((cuStats->size >> 1) > 8) {
3394                                         contextPtr->transCoeffShapeChroma = N4_SHAPE;
3395                                     } else {
3396                                         contextPtr->transCoeffShapeChroma = N2_SHAPE;
3397                                     }
3398                                 } else if ((yCoeffBitsTemp <= yBitsThsld * 16)) {
3399                                     contextPtr->transCoeffShapeLuma     = N2_SHAPE;
3400                                     contextPtr->transCoeffShapeChroma   = N2_SHAPE;
3401                                 }
3402                            }
3403 
3404                             EncodeLoopFunctionTable[is16bit](
3405                                 contextPtr,
3406                                 lcuPtr,
3407                                 contextPtr->cuOriginX,
3408                                 contextPtr->cuOriginY,
3409                                 cbQp,
3410                                 reconBuffer,
3411                                 coeffBufferTB,
3412                                 residualBuffer,
3413                                 transformBuffer,
3414                                 transformInnerArrayPtr,
3415                                 countNonZeroCoeffs,
3416                                 useDeltaQpSegments,
3417 								(CabacEncodeContext_t*)coeffEstEntropyCoderPtr->cabacEncodeContextPtr,
3418 								(EB_U32)puPtr->intraLumaMode,
3419                                 PICTURE_BUFFER_DESC_FULL_MASK,
3420                                 colorFormat,
3421                                 EB_FALSE,
3422                                 (contextPtr->cuStats->size == 64) ? 32 : contextPtr->cuStats->size,
3423 								pictureControlSetPtr->cabacCost,
3424                                 cuPtr->deltaQp > 0 ? 0 : dZoffset);
3425 
3426                             EncodeGenerateReconFunctionPtr[is16bit](
3427                                 contextPtr,
3428                                 contextPtr->cuOriginX,
3429                                 contextPtr->cuOriginY,
3430                                 PICTURE_BUFFER_DESC_FULL_MASK,
3431                                 colorFormat,
3432                                 EB_FALSE,
3433                                 (contextPtr->cuStats->size == 64) ? 32 : contextPtr->cuStats->size,
3434                                 reconBuffer,
3435                                 residualBuffer,
3436                                 transformInnerArrayPtr);
3437                         }
3438 
3439                         // Update Recon Samples-INTRA-
3440                         EncodePassUpdateReconSampleNeighborArrays(
3441                             epLumaReconNeighborArray,
3442                             epCbReconNeighborArray,
3443                             epCrReconNeighborArray,
3444                             reconBuffer,
3445                             contextPtr->cuOriginX,
3446                             contextPtr->cuOriginY,
3447                             cuStats->size,
3448                             PICTURE_BUFFER_DESC_FULL_MASK,
3449                             colorFormat,
3450                             is16bit);
3451                     } // Transform Loop
3452 
3453 					if (colorFormat == EB_YUV422) {
3454 						GenerateChromaIntraReferenceSamplesFuncTable[is16bit](
3455 								constrainedIntraFlag,
3456 								enableStrongIntraSmoothing,
3457 								contextPtr->cuOriginX,
3458 								contextPtr->cuOriginY,
3459 								cuStats->size,
3460 								MAX_LCU_SIZE,
3461 								cuStats->depth,
3462 								epModeTypeNeighborArray,
3463 								epLumaReconNeighborArray,
3464 								epCbReconNeighborArray,
3465 								epCrReconNeighborArray,
3466 								is16bit ? (void*)contextPtr->intraRefPtr16 : (void*)contextPtr->intraRefPtr,
3467 								colorFormat,
3468                                 EB_TRUE,
3469                                 tileLeftBoundary, EB_FALSE, tileRightBoundary);
3470 
3471 						// Prediction
3472 						EncodePassIntraPredictionFuncTable[is16bit](
3473 								is16bit ? (void*)contextPtr->intraRefPtr16 : (void*)contextPtr->intraRefPtr,
3474 								contextPtr->cuOriginX + reconBuffer->originX,
3475 								contextPtr->cuOriginY + reconBuffer->originY,
3476 								cuStats->size,
3477 								cuStats->size>>1,
3478 								reconBuffer,
3479 								colorFormat,
3480                                 EB_TRUE,
3481 								(EB_U32)puPtr->intraLumaMode,
3482                                 EB_INTRA_CHROMA_DM,
3483 								PICTURE_BUFFER_DESC_CHROMA_MASK);
3484 
3485 						//EncodeLoop
3486 						{
3487 							EncodeLoopFunctionTable[is16bit](
3488 									contextPtr,
3489 									lcuPtr,
3490 									contextPtr->cuOriginX,
3491 									contextPtr->cuOriginY,
3492 									cbQp,
3493 									reconBuffer,
3494 									coeffBufferTB,
3495 									residualBuffer,
3496 									transformBuffer,
3497 									transformInnerArrayPtr,
3498 									countNonZeroCoeffs,
3499 									useDeltaQpSegments,
3500 									(CabacEncodeContext_t*)coeffEstEntropyCoderPtr->cabacEncodeContextPtr,
3501 									(EB_U32)puPtr->intraLumaMode,
3502 									PICTURE_BUFFER_DESC_CHROMA_MASK,
3503 									colorFormat,
3504                                     EB_TRUE,
3505                                     (contextPtr->cuStats->size == 64) ? 32 : contextPtr->cuStats->size,
3506 									pictureControlSetPtr->cabacCost,
3507 									cuPtr->deltaQp > 0 ? 0 : dZoffset);
3508 
3509 							EncodeGenerateReconFunctionPtr[is16bit](
3510 									contextPtr,
3511 									contextPtr->cuOriginX,
3512 									contextPtr->cuOriginY,
3513 									PICTURE_BUFFER_DESC_CHROMA_MASK,
3514 									colorFormat,
3515                                     EB_TRUE,
3516                                     (contextPtr->cuStats->size == 64) ? 32 : contextPtr->cuStats->size,
3517 									reconBuffer,
3518 									residualBuffer,
3519 									transformInnerArrayPtr);
3520 
3521 							// Update Recon Samples-INTRA-
3522 							EncodePassUpdateReconSampleNeighborArrays(
3523 									epLumaReconNeighborArray,
3524 									epCbReconNeighborArray,
3525 									epCrReconNeighborArray,
3526 									reconBuffer,
3527 									contextPtr->cuOriginX,
3528 									contextPtr->cuOriginY+(cuStats->size>>1),
3529 									cuStats->size,
3530 									PICTURE_BUFFER_DESC_CHROMA_MASK,
3531 									colorFormat,
3532 									is16bit);
3533 						}
3534 					}
3535 
3536                     // Update the Intra-specific Neighbor Arrays
3537                     EncodePassUpdateIntraModeNeighborArrays(
3538                         epModeTypeNeighborArray,
3539                         epIntraLumaModeNeighborArray,
3540                         (EB_U8)cuPtr->predictionUnitArray->intraLumaMode,
3541                         contextPtr->cuOriginX,
3542                         contextPtr->cuOriginY,
3543                         cuStats->size);
3544 
3545                     // set up the bS based on PU boundary for DLF
3546                     if (dlfEnableFlag){
3547                         // Update the cbf map for DLF
3548                         startIndex = (contextPtr->cuOriginY >> 2) * (sequenceControlSetPtr->lumaWidth >> 2) + (contextPtr->cuOriginX >> 2);
3549                         for (blk4x4IndexY = 0; blk4x4IndexY < (cuStats->size >> 2); ++blk4x4IndexY){
3550                             EB_MEMSET(&pictureControlSetPtr->cbfMapArray[startIndex], (EB_U8)(cuPtr->transformUnitArray[contextPtr->tuItr].lumaCbf), (cuStats->size >> 2));
3551                             startIndex += (sequenceControlSetPtr->lumaWidth >> 2);
3552                         }
3553 
3554                         SetBSArrayBasedOnPUBoundary(
3555                             epModeTypeNeighborArray,
3556                             epMvNeighborArray,
3557                             puPtr,
3558                             cuPtr,
3559                             cuStats,
3560                             lcuOriginX,
3561                             lcuOriginY,
3562                             tileLeftBoundary,
3563                             tileTopBoundary,
3564                             pictureControlSetPtr,
3565                             pictureControlSetPtr->horizontalEdgeBSArray[tbAddr],
3566                             pictureControlSetPtr->verticalEdgeBSArray[tbAddr]);
3567 
3568                     }
3569                 } // Partition Loop
3570             } else if (cuPtr->predictionModeFlag == INTRA_MODE) {
3571                 //*************************
3572                 //       INTRA  4x4
3573                 //*************************
3574                 contextPtr->totIntraCodedArea += cuStats->size*cuStats->size;
3575                 if (pictureControlSetPtr->sliceType != EB_I_PICTURE) {
3576                     contextPtr->intraCodedAreaLCU[tbAddr] += cuStats->size*cuStats->size;
3577                 }
3578 
3579                 // Partition Loop
3580                 EB_U8 partitionIndex;
3581                 EB_U8 componentMask = PICTURE_BUFFER_DESC_LUMA_MASK;
3582 
3583 				for (partitionIndex = 0; partitionIndex < 4; partitionIndex++) {
3584 					// Partition Loop
3585 					contextPtr->tuItr = partitionIndex + 1;
3586 
3587                     EB_U16 partitionOriginX = contextPtr->cuOriginX + INTRA_4x4_OFFSET_X[partitionIndex];
3588                     EB_U16 partitionOriginY = contextPtr->cuOriginY + INTRA_4x4_OFFSET_Y[partitionIndex];
3589 
3590                     EB_BOOL pictureLeftBoundary = (lcuPtr->lcuEdgeInfoPtr->tileLeftEdgeFlag == EB_TRUE && ((partitionOriginX & (lcuPtr->size - 1)) == 0)) ? EB_TRUE : EB_FALSE;
3591                     EB_BOOL pictureTopBoundary = (lcuPtr->lcuEdgeInfoPtr->tileTopEdgeFlag == EB_TRUE && ((partitionOriginY & (lcuPtr->size - 1)) == 0)) ? EB_TRUE : EB_FALSE;
3592                     EB_BOOL pictureRightBoundary = (lcuPtr->lcuEdgeInfoPtr->tileRightEdgeFlag == EB_TRUE && (((partitionOriginX + MIN_PU_SIZE) & (lcuPtr->size - 1)) == 0)) ? EB_TRUE : EB_FALSE;
3593 
3594                     EB_U8   intraLumaMode = lcuPtr->intra4x4Mode[((MD_SCAN_TO_RASTER_SCAN[cuItr] - 21) << 2) + partitionIndex];
3595                     EB_U8   intraLumaModeForChroma = lcuPtr->intra4x4Mode[((MD_SCAN_TO_RASTER_SCAN[cuItr] - 21) << 2)];
3596 
3597                     //printf("Intra 4x4 block (%d, %d), luma mode is %d\n", partitionOriginX, partitionOriginY, intraLumaMode);
3598 
3599                     // Set the PU Loop Variables
3600                     puPtr = cuPtr->predictionUnitArray;
3601 
3602                     // Generate Intra Luma Neighbor Modes
3603                     EbHevcGeneratePuIntraLumaNeighborModes( // HT done
3604                         cuPtr,
3605                         partitionOriginX,
3606                         partitionOriginY,
3607                         MAX_LCU_SIZE,
3608                         epIntraLumaModeNeighborArray,
3609                         epModeTypeNeighborArray);
3610 
3611                     // Generate Intra Reference Samples
3612                     GenerateLumaIntraReferenceSamplesFuncTable[is16bit](
3613                         constrainedIntraFlag,
3614                         enableStrongIntraSmoothing,
3615                         partitionOriginX,
3616                         partitionOriginY,
3617                         MIN_PU_SIZE,
3618                         MAX_LCU_SIZE,
3619                         cuStats->depth,
3620                         epModeTypeNeighborArray,
3621                         epLumaReconNeighborArray,
3622                         epCbReconNeighborArray,
3623                         epCrReconNeighborArray,
3624 						is16bit ? (void*)contextPtr->intraRefPtr16 : (void*)contextPtr->intraRefPtr,
3625 						pictureLeftBoundary,
3626 						pictureTopBoundary,
3627 						pictureRightBoundary);
3628 
3629                     componentMask = PICTURE_BUFFER_DESC_LUMA_MASK;
3630                     if (partitionIndex == 0 ||
3631                             (colorFormat == EB_YUV422 && partitionIndex == 2) ||
3632                             (colorFormat == EB_YUV444)) {
3633                         // For the Intra4x4 case, the Chroma for the CU is coded as a single 4x4 block.
3634                         //   This changes how the right picture boundary is interpreted for the Luma and Chroma blocks
3635                         //   as there is not a one-to-one relationship between the luma/chroma blocks. This affects
3636                         //   only the right picture edge check and not the left or top boundary checks as the block size
3637                         //   has no influence on those checks.
3638                         if (colorFormat == EB_YUV444) {
3639                             pictureRightBoundary = (lcuPtr->lcuEdgeInfoPtr->tileRightEdgeFlag == EB_TRUE && (((partitionOriginX + MIN_PU_SIZE) & (MAX_LCU_SIZE - 1)) == 0)) ? EB_TRUE : EB_FALSE;
3640                         } else {
3641                             pictureRightBoundary = (lcuPtr->lcuEdgeInfoPtr->tileRightEdgeFlag == EB_TRUE && ((((partitionOriginX / 2) + MIN_PU_SIZE) & ((MAX_LCU_SIZE / 2) - 1)) == 0)) ? EB_TRUE : EB_FALSE;
3642                         }
3643                         componentMask = PICTURE_BUFFER_DESC_FULL_MASK;
3644 						GenerateChromaIntraReferenceSamplesFuncTable[is16bit](
3645                                 constrainedIntraFlag,
3646                                 enableStrongIntraSmoothing,
3647                                 (colorFormat == EB_YUV444) ? partitionOriginX : contextPtr->cuOriginX,
3648                                 (colorFormat == EB_YUV444) ? partitionOriginY : contextPtr->cuOriginY,
3649                                 (colorFormat == EB_YUV444) ? MIN_PU_SIZE: (MIN_PU_SIZE << 1), //Jing: really a mess here, clean up later
3650                                 MAX_LCU_SIZE,
3651                                 cuStats->depth,
3652                                 epModeTypeNeighborArray,
3653                                 epLumaReconNeighborArray,
3654                                 epCbReconNeighborArray,
3655                                 epCrReconNeighborArray,
3656 						        is16bit ? (void*)contextPtr->intraRefPtr16 : (void*)contextPtr->intraRefPtr,
3657                                 colorFormat,
3658                                 (colorFormat == EB_YUV444) ? EB_FALSE : (EB_BOOL)partitionIndex,
3659                                 pictureLeftBoundary,
3660                                 pictureTopBoundary,
3661                                 pictureRightBoundary);
3662                     }
3663 
3664 					// Prediction
3665                     if (componentMask & PICTURE_BUFFER_DESC_LUMA_MASK) {
3666                         EncodePassIntraPredictionFuncTable[is16bit](
3667                                 is16bit ? (void*)contextPtr->intraRefPtr16 : (void*)contextPtr->intraRefPtr,
3668                                 partitionOriginX + reconBuffer->originX,
3669                                 partitionOriginY + reconBuffer->originY,
3670                                 MIN_PU_SIZE,
3671                                 MIN_PU_SIZE,
3672                                 reconBuffer,
3673                                 colorFormat,
3674                                 EB_FALSE, //4x4, always 1st block
3675                                 intraLumaMode,
3676                                 EB_INTRA_CHROMA_DM,
3677                                 PICTURE_BUFFER_DESC_LUMA_MASK);
3678                     }
3679 
3680                     if (componentMask & PICTURE_BUFFER_DESC_CHROMA_MASK) {
3681                         // Jing:
3682                         // For 422 intra4x4, the mode for chroma is the mode of 1st luma 4x4
3683                         EncodePassIntraPredictionFuncTable[is16bit](
3684                                 is16bit ? (void*)contextPtr->intraRefPtr16 : (void*)contextPtr->intraRefPtr,
3685                                 partitionOriginX + reconBuffer->originX,
3686                                 partitionOriginY + reconBuffer->originY,
3687                                 MIN_PU_SIZE,
3688                                 MIN_PU_SIZE,
3689                                 reconBuffer,
3690                                 colorFormat,
3691                                 EB_FALSE, //4x4, always 1st block
3692                                 (colorFormat == EB_YUV444) ? intraLumaMode : intraLumaModeForChroma,//420/422 use 1st luma 4x4 mode
3693                                 EB_INTRA_CHROMA_DM,
3694                                 PICTURE_BUFFER_DESC_CHROMA_MASK);
3695                     }
3696 
3697                     // Encode Transform Unit -INTRA-
3698                     contextPtr->forceCbfFlag = (contextPtr->skipQpmFlag) ?
3699                         EB_FALSE :
3700                         lcuPtr->lcuEdgeInfoPtr->tileLeftEdgeFlag && ((contextPtr->cuOriginX & (63)) == 0) && (contextPtr->cuOriginY == lcuOriginY);
3701 
3702                     SetPmEncDecMode(
3703                         pictureControlSetPtr,
3704                         contextPtr,
3705                         tbAddr,
3706 
3707                     lcuStatPtr->stationaryEdgeOverTimeFlag,
3708                         pictureControlSetPtr->temporalLayerIndex > 0 ? lcuStatPtr->pmStationaryEdgeOverTimeFlag : lcuStatPtr->stationaryEdgeOverTimeFlag);
3709 
3710                     contextPtr->transCoeffShapeLuma   = DEFAULT_SHAPE;
3711                     contextPtr->transCoeffShapeChroma = DEFAULT_SHAPE;
3712 
3713                     EncodeLoopFunctionTable[is16bit](
3714                         contextPtr,
3715                         lcuPtr,
3716                         partitionOriginX,
3717                         partitionOriginY,
3718                         cbQp,
3719                         reconBuffer,
3720                         coeffBufferTB,
3721                         residualBuffer,
3722                         transformBuffer,
3723                         transformInnerArrayPtr,
3724                         countNonZeroCoeffs,
3725                         useDeltaQpSegments,
3726                         (CabacEncodeContext_t*)coeffEstEntropyCoderPtr->cabacEncodeContextPtr,
3727                         (EB_U32)puPtr->intraLumaMode,
3728                         componentMask,
3729                         colorFormat,
3730                         EB_FALSE, //always 1st chroma block for 4x4
3731                         MIN_PU_SIZE,
3732                         pictureControlSetPtr->cabacCost,
3733                         cuPtr->deltaQp > 0 ? 0 : dZoffset);
3734 
3735 					EncodeGenerateReconFunctionPtr[is16bit](
3736 						contextPtr,
3737 						partitionOriginX,
3738 						partitionOriginY,
3739 						componentMask,
3740 						colorFormat,
3741 						EB_FALSE,
3742                         MIN_PU_SIZE,
3743 						reconBuffer,
3744 						residualBuffer,
3745 						transformInnerArrayPtr);
3746 
3747                     // Update the Intra-specific Neighbor Arrays
3748                     EncodePassUpdateIntraModeNeighborArrays(
3749                         epModeTypeNeighborArray,
3750                         epIntraLumaModeNeighborArray,
3751                         intraLumaMode,
3752                         partitionOriginX,
3753                         partitionOriginY,
3754                         MIN_PU_SIZE);
3755 
3756                     // Update Recon Samples-INTRA-
3757                     EncodePassUpdateReconSampleNeighborArrays(
3758                         epLumaReconNeighborArray,
3759                         epCbReconNeighborArray,
3760                         epCrReconNeighborArray,
3761                         reconBuffer,
3762                         partitionOriginX,
3763                         partitionOriginY,
3764                         MIN_PU_SIZE,
3765                         componentMask,
3766                         colorFormat,
3767                         is16bit);
3768 
3769                     // set up the bS based on PU boundary for DLF
3770                     if (dlfEnableFlag){
3771                         // Update the cbf map for DLF
3772                         startIndex = (partitionOriginY >> 2) * (sequenceControlSetPtr->lumaWidth >> 2) + (partitionOriginX >> 2);
3773                         for (blk4x4IndexY = 0; blk4x4IndexY < (MIN_PU_SIZE >> 2); ++blk4x4IndexY){
3774                             for (blk4x4IndexX = 0; blk4x4IndexX < (MIN_PU_SIZE >> 2); ++blk4x4IndexX){
3775                                 pictureControlSetPtr->cbfMapArray[startIndex + blk4x4IndexX] = (EB_U8)cuPtr->transformUnitArray[contextPtr->tuItr].lumaCbf;
3776                             }
3777                             startIndex += (sequenceControlSetPtr->lumaWidth >> 2);
3778                         }
3779 
3780                         // Set the bS on TU boundary for DLF
3781                         Intra4x4SetBSArrayBasedOnTUBoundary(
3782                             partitionOriginX,
3783                             partitionOriginY,
3784                             MIN_PU_SIZE,
3785                             MIN_PU_SIZE,
3786                             partitionOriginY == contextPtr->cuOriginY ? EB_TRUE : EB_FALSE,
3787                             partitionOriginX == contextPtr->cuOriginX ? EB_TRUE : EB_FALSE,
3788                             contextPtr->cuStats,
3789                             (EB_PART_MODE)contextPtr->cuPtr->predictionModeFlag,
3790                             lcuOriginX,
3791                             lcuOriginY,
3792                             pictureControlSetPtr,
3793                             pictureControlSetPtr->horizontalEdgeBSArray[tbAddr],
3794                             pictureControlSetPtr->verticalEdgeBSArray[tbAddr]);
3795 
3796 
3797                         Intra4x4SetBSArrayBasedOnPUBoundary(
3798                             epModeTypeNeighborArray,
3799                             epMvNeighborArray,
3800                             puPtr,
3801                             cuPtr,
3802                             cuStats,
3803                             partitionOriginX & (MAX_LCU_SIZE - 1),
3804                             partitionOriginY & (MAX_LCU_SIZE - 1),
3805                             MIN_PU_SIZE,
3806                             MIN_PU_SIZE,
3807                             lcuOriginX,
3808                             lcuOriginY,
3809                             EB_FALSE,
3810                             EB_FALSE,
3811                             pictureControlSetPtr,
3812                             pictureControlSetPtr->horizontalEdgeBSArray[tbAddr],
3813                             pictureControlSetPtr->verticalEdgeBSArray[tbAddr]);
3814 
3815                     }
3816                 } // Partition Loop
3817             } else if (cuPtr->predictionModeFlag == INTER_MODE) {
3818                 EB_U16  tuOriginX;
3819                 EB_U16  tuOriginY;
3820                 EB_U8   tuSize = 0;
3821                 EB_U8   tuSizeChroma;
3822 
3823                 EB_BOOL isCuSkip = EB_FALSE;
3824 
3825                 //********************************
3826                 //        INTER
3827                 //********************************
3828                 EB_BOOL doMVpred = EB_TRUE;
3829                 //if QPM and Segments are used, First Cu in LCU row should have at least one coeff.
3830                 EB_BOOL isFirstCUinRow = (useDeltaQp == 1) &&
3831                     !singleSegment &&
3832                      lcuPtr->lcuEdgeInfoPtr->tileLeftEdgeFlag && ((contextPtr->cuOriginX & (63)) == 0) && (contextPtr->cuOriginY == lcuOriginY) ? EB_TRUE : EB_FALSE;;
3833                 //Motion Compensation could be avoided in the case below
3834                 EB_BOOL doMC = EB_TRUE;
3835 
3836                 // Perform Merge/Skip Decision if the mode coming from MD is merge. for the First CU in Row merge will remain as is.
3837 
3838                 if (cuPtr->predictionUnitArray[0].mergeFlag == EB_TRUE) {
3839                     if (isFirstCUinRow == EB_FALSE) {
3840                         if (lcuPtr->chromaEncodeMode == CHROMA_MODE_BEST) {
3841                             // Jing: using 420 for MD related stuff
3842                             //EbPictureBufferDesc_t *inputPicturePtr = pictureControlSetPtr->ParentPcsPtr->enhancedPicturePtr;
3843                             EbPictureBufferDesc_t *inputPicturePtr = pictureControlSetPtr->ParentPcsPtr->chromaDownSamplePicturePtr;
3844                             const EB_U32 inputCbOriginIndex = ((contextPtr->cuOriginY >> 1) + (inputPicturePtr->originY >> 1)) * inputPicturePtr->strideCb + ((contextPtr->cuOriginX >> 1) + (inputPicturePtr->originX >> 1));
3845                             const EB_U32 cuChromaOriginIndex = (((contextPtr->cuOriginY & 63) * 32) + (contextPtr->cuOriginX & 63)) >> 1;
3846 
3847                             contextPtr->mdContext->cuOriginX = contextPtr->cuOriginX;
3848                             contextPtr->mdContext->cuOriginY = contextPtr->cuOriginY;
3849                             contextPtr->mdContext->puItr = 0;
3850                             contextPtr->mdContext->cuSize = contextPtr->cuStats->size;
3851                             contextPtr->mdContext->cuSizeLog2 = contextPtr->cuStats->sizeLog2;
3852                             contextPtr->mdContext->cuStats = contextPtr->cuStats;
3853 
3854                             AddChromaEncDec(
3855                                     pictureControlSetPtr,
3856                                     lcuPtr,
3857                                     cuPtr,
3858                                     contextPtr->mdContext,
3859                                     contextPtr,
3860                                     inputPicturePtr,
3861                                     inputCbOriginIndex,
3862                                     cuChromaOriginIndex,
3863                                     0);
3864                         }
3865 
3866                         if (pictureControlSetPtr->sliceType == EB_B_PICTURE &&
3867                                 pictureControlSetPtr->ParentPcsPtr->isUsedAsReferenceFlag == EB_FALSE) {
3868                             EbReferenceObject_t  * refObjL0, *refObjL1;
3869                             EB_U16 cuVar = (pictureControlSetPtr->ParentPcsPtr->variance[lcuPtr->index][0]);
3870                             EB_U8 INTRA_AREA_TH[MAX_TEMPORAL_LAYERS] = { 40, 30, 30, 0, 0, 0 };
3871                             refObjL0 = (EbReferenceObject_t*)pictureControlSetPtr->refPicPtrArray[REF_LIST_0]->objectPtr;
3872                             refObjL1 = (EbReferenceObject_t*)pictureControlSetPtr->refPicPtrArray[REF_LIST_1]->objectPtr;
3873 
3874                             if (cuVar < 200 && (refObjL0->intraCodedArea > INTRA_AREA_TH[refObjL0->tmpLayerIdx] ||
3875                                         refObjL1->intraCodedArea > INTRA_AREA_TH[refObjL1->tmpLayerIdx])) {
3876                                 mdcontextPtr->mdEpPipeLcu[cuPtr->leafIndex].skipCost +=
3877                                     (mdcontextPtr->mdEpPipeLcu[cuPtr->leafIndex].skipCost * 70) / 100;
3878                             }
3879                         }
3880                         isCuSkip = mdcontextPtr->mdEpPipeLcu[cuPtr->leafIndex].skipCost <= mdcontextPtr->mdEpPipeLcu[cuPtr->leafIndex].mergeCost ? 1 : 0;
3881                     }
3882                 }
3883 
3884                 //MC could be avoided in some cases below
3885                 if (isFirstCUinRow == EB_FALSE) {
3886                     if (pictureControlSetPtr->ParentPcsPtr->isUsedAsReferenceFlag == EB_FALSE &&
3887                             constrainedIntraFlag == EB_TRUE &&
3888                             cuPtr->predictionUnitArray[0].mergeFlag == EB_TRUE) {
3889                         if (isCuSkip) {
3890                             //here merge is decided to be skip in nonRef frame.
3891                             doMC = EB_FALSE;
3892                             doMVpred = EB_FALSE;
3893                         }
3894                     } else if (contextPtr->mdContext->limitIntra && isIntraLCU == EB_FALSE) {
3895                         if (isCuSkip) {
3896                             doMC = EB_FALSE;
3897                             doMVpred = EB_FALSE;
3898                         }
3899                     }
3900                 }
3901 
3902                 doMC = (EB_BOOL)(doRecon | doMC);
3903                 doMVpred = (EB_BOOL)(doRecon | doMVpred);
3904                 {
3905                     // 1st Partition Loop
3906                     puPtr = cuPtr->predictionUnitArray;
3907                     if (doMVpred)
3908                         EncodePassMvPrediction(  //AMVP, not merge
3909                                 sequenceControlSetPtr,
3910                                 pictureControlSetPtr,
3911                                 tbAddr,
3912                                 contextPtr);
3913 
3914                     // Set MvUnit
3915                     contextPtr->mvUnit.predDirection = (EB_U8)puPtr->interPredDirectionIndex;
3916                     contextPtr->mvUnit.mv[REF_LIST_0].mvUnion = puPtr->mv[REF_LIST_0].mvUnion;
3917                     contextPtr->mvUnit.mv[REF_LIST_1].mvUnion = puPtr->mv[REF_LIST_1].mvUnion;
3918                     // Inter Prediction
3919                     if (is16bit) {
3920                         if (doMC)
3921                             EncodePassInterPrediction16bit(
3922                                     &contextPtr->mvUnit,
3923                                     contextPtr->cuOriginX,
3924                                     contextPtr->cuOriginY,
3925                                     cuStats->size,
3926                                     cuStats->size,
3927                                     pictureControlSetPtr,
3928                                     reconBuffer,
3929                                     contextPtr->mcpContext);
3930                     } else{
3931                         if (doMC) {
3932                             EncodePassInterPrediction(
3933                                     &contextPtr->mvUnit,
3934                                     contextPtr->cuOriginX,
3935                                     contextPtr->cuOriginY,
3936                                     cuStats->size,
3937                                     cuStats->size,
3938                                     pictureControlSetPtr,
3939                                     reconBuffer,
3940                                     contextPtr->mcpContext);
3941                         }
3942                     }
3943                 }
3944                 contextPtr->tuItr = (cuStats->size < MAX_LCU_SIZE) ? 0 : 1;
3945 
3946                 // Transform Loop
3947                 cuPtr->transformUnitArray[0].lumaCbf = EB_FALSE;
3948                 cuPtr->transformUnitArray[0].cbCbf = EB_FALSE;
3949                 cuPtr->transformUnitArray[0].crCbf = EB_FALSE;
3950                 cuPtr->transformUnitArray[0].cbCbf2 = EB_FALSE;
3951                 cuPtr->transformUnitArray[0].crCbf2 = EB_FALSE;
3952 
3953                 // initialize TU Split
3954                 yFullDistortion[DIST_CALC_RESIDUAL] = 0;
3955                 yFullDistortion[DIST_CALC_PREDICTION] = 0;
3956 
3957                 yCoeffBits = 0;
3958                 cbCoeffBits = 0;
3959                 crCoeffBits = 0;
3960 
3961                 //printf("sizeof %i \n",sizeof(CodingUnit_t));
3962                 EB_U32  totTu = (cuStats->size < MAX_LCU_SIZE) ? 1 : 4;
3963                 EB_U8   tuIt;
3964 
3965                 EB_U32  componentMask   = PICTURE_BUFFER_DESC_FULL_MASK;
3966                 EB_MODETYPE predictionModeFlag = (EB_MODETYPE)cuPtr->predictionModeFlag;
3967 
3968                 if (cuPtr->predictionUnitArray[0].mergeFlag == EB_FALSE) {
3969                     for (tuIt = 0; tuIt < totTu; tuIt++) {
3970                         contextPtr->tuItr = (cuStats->size < MAX_LCU_SIZE) ? 0 : tuIt + 1;
3971                         if (cuStats->size < MAX_LCU_SIZE) {
3972                             tuOriginX = contextPtr->cuOriginX;
3973                             tuOriginY = contextPtr->cuOriginY;
3974                             tuSize = cuStats->size;
3975                             tuSizeChroma = (cuStats->size >> (colorFormat==EB_YUV444 ? 0 : 1));
3976                         } else {
3977                             tuOriginX = contextPtr->cuOriginX + ((tuIt & 1) << 5);
3978                             tuOriginY = contextPtr->cuOriginY + ((tuIt > 1) << 5);
3979                             tuSize = 32;
3980                             tuSizeChroma = (colorFormat == EB_YUV444 ? 32: 16);
3981                         }
3982 
3983                         //TU LOOP for MV mode + Luma CBF decision.
3984                         contextPtr->forceCbfFlag = (contextPtr->skipQpmFlag) ?
3985                             EB_FALSE :
3986                             lcuPtr->lcuEdgeInfoPtr->tileLeftEdgeFlag && ((tuOriginX & 63) == 0) && (tuOriginY == lcuOriginY);
3987 
3988                         SetPmEncDecMode(
3989                                 pictureControlSetPtr,
3990                                 contextPtr,
3991                                 tbAddr,
3992                                 lcuStatPtr->stationaryEdgeOverTimeFlag,
3993                                 pictureControlSetPtr->temporalLayerIndex > 0 ? lcuStatPtr->pmStationaryEdgeOverTimeFlag : lcuStatPtr->stationaryEdgeOverTimeFlag);
3994 
3995                         // Set Fast El coef shaping method
3996                         contextPtr->transCoeffShapeLuma     = DEFAULT_SHAPE;
3997                         contextPtr->transCoeffShapeChroma   = DEFAULT_SHAPE;
3998 
3999                         if (fastEl && isFirstCUinRow == EB_FALSE && contextPtr->pmpMaskingLevelEncDec > MASK_THSHLD_1) {
4000                             yDc = contextPtr->mdContext->mdEpPipeLcu[cuPtr->leafIndex].yDc[tuIt];
4001                             yCountNonZeroCoeffs = contextPtr->mdContext->mdEpPipeLcu[cuPtr->leafIndex].yCountNonZeroCoeffs[tuIt];
4002 
4003                             if ((cuPtr->rootCbf == 0) || ((yCoeffBitsTemp <= yBitsThsld) && yDc < YDC_THSHLD_1 && yCountNonZeroCoeffs <= 1)) {
4004                                 // Skip pass for cuPtr->rootCbf == 0 caused some VQ issues in chroma, so DC path is used instead
4005                                 contextPtr->transCoeffShapeLuma = ONLY_DC_SHAPE;
4006                                 contextPtr->transCoeffShapeChroma = ONLY_DC_SHAPE;
4007                             } else if ((yCoeffBitsTemp <= yBitsThsld * 4)) {
4008                                 contextPtr->transCoeffShapeLuma = N4_SHAPE;
4009                                 if ((cuStats->size >> 1) > 8) {
4010                                     contextPtr->transCoeffShapeChroma = N4_SHAPE;
4011                                 } else {
4012                                     contextPtr->transCoeffShapeChroma = N2_SHAPE;
4013                                 }
4014                             } else if ((yCoeffBitsTemp <= yBitsThsld * 16)) {
4015                                 contextPtr->transCoeffShapeLuma = N2_SHAPE;
4016                                 contextPtr->transCoeffShapeChroma = N2_SHAPE;
4017                             }
4018                         }
4019 
4020                         EncodeLoopFunctionTable[is16bit](
4021                                 contextPtr,
4022                                 lcuPtr,
4023                                 tuOriginX,
4024                                 tuOriginY,
4025                                 cbQp,
4026                                 reconBuffer,
4027                                 coeffBufferTB,
4028                                 residualBuffer,
4029                                 transformBuffer,
4030                                 transformInnerArrayPtr,
4031                                 countNonZeroCoeffs,
4032                                 useDeltaQpSegments,
4033                                 (CabacEncodeContext_t*)coeffEstEntropyCoderPtr->cabacEncodeContextPtr,
4034                                 0,
4035                                 PICTURE_BUFFER_DESC_FULL_MASK,
4036                                 colorFormat,
4037                                 EB_FALSE,
4038                                 tuSize,
4039                                 pictureControlSetPtr->cabacCost,
4040                                 cuPtr->deltaQp > 0 ? 0 : dZoffset);
4041 
4042                         //Jing: For 422, do for the 2nd chroma
4043                         if (colorFormat == EB_YUV422) {
4044                             EB_U32  tmpCountNonZeroCoeffs[3];
4045                             EncodeLoopFunctionTable[is16bit](
4046                                     contextPtr,
4047                                     lcuPtr,
4048                                     tuOriginX,
4049                                     tuOriginY,
4050                                     cbQp,
4051                                     reconBuffer,
4052                                     coeffBufferTB,
4053                                     residualBuffer,
4054                                     transformBuffer,
4055                                     transformInnerArrayPtr,
4056                                     tmpCountNonZeroCoeffs, //Jing: beware of this
4057                                     useDeltaQpSegments,
4058                                     (CabacEncodeContext_t*)coeffEstEntropyCoderPtr->cabacEncodeContextPtr,
4059                                     0,
4060                                     PICTURE_BUFFER_DESC_CHROMA_MASK,
4061                                     colorFormat,
4062                                     EB_TRUE,
4063                                     tuSize,
4064                                     pictureControlSetPtr->cabacCost,
4065                                     cuPtr->deltaQp > 0 ? 0 : dZoffset);
4066                             // Jing, seems not useful here, never used ...
4067                             //countNonZeroCoeffs[1] += tmpCountNonZeroCoeffs[1];
4068                             //countNonZeroCoeffs[2] += tmpCountNonZeroCoeffs[2];
4069                         }
4070 
4071                         // SKIP the CBF zero mode for DC path. There are problems with cost calculations
4072                         if (contextPtr->transCoeffShapeLuma != ONLY_DC_SHAPE && colorFormat == EB_YUV420) {
4073                             // Jing: A bit mess here, seems only luma is used, but chroma everywhere and wastes calculation power
4074                             //       Will clean it later for 422, only enable it for 420 now
4075                             scratchLumaOffset = ((tuOriginY & (63)) * 64) + (tuOriginX & (63));
4076 
4077                             // Compute Tu distortion
4078                             PictureFullDistortionLuma(
4079                                     transformBuffer,
4080                                     scratchLumaOffset,
4081                                     residualBuffer,
4082                                     scratchLumaOffset,
4083                                     contextPtr->transCoeffShapeLuma == ONLY_DC_SHAPE || cuPtr->transformUnitArray[contextPtr->tuItr].isOnlyDc[0] == EB_TRUE ? 1 : (tuSize >> contextPtr->transCoeffShapeLuma),
4084                                     yTuFullDistortion,
4085                                     countNonZeroCoeffs[0],
4086                                     predictionModeFlag);
4087 
4088 
4089                             lumaShift = 2 * (7 - Log2f(tuSize));
4090 
4091                             // Note: for square Transform, the scale is 1/(2^(7-Log2(Transform size)))
4092                             // For NSQT the scale would be 1/ (2^(7-(Log2(first Transform size)+Log2(second Transform size))/2))
4093                             // Add Log2 of Transform size in order to calculating it multiple time in this function
4094 
4095                             yTuFullDistortion[DIST_CALC_RESIDUAL] = (yTuFullDistortion[DIST_CALC_RESIDUAL] + (EB_U64)(1 << (lumaShift - 1))) >> lumaShift;
4096                             yTuFullDistortion[DIST_CALC_PREDICTION] = (yTuFullDistortion[DIST_CALC_PREDICTION] + (EB_U64)(1 << (lumaShift - 1))) >> lumaShift;
4097 
4098                             yTuCoeffBits = 0;
4099                             cbTuCoeffBits = 0;
4100                             crTuCoeffBits = 0;
4101 
4102                             // Estimate Tu Coeff  bits
4103                             TuEstimateCoeffBitsEncDec(
4104                                     (tuOriginY & (63)) * MAX_LCU_SIZE + (tuOriginX & (63)),
4105                                     ((tuOriginY & (63)) * MAX_LCU_SIZE_CHROMA + (tuOriginX & (63))) >> 1, //Jing: 444 is different
4106                                     coeffEstEntropyCoderPtr,
4107                                     coeffBufferTB,
4108                                     countNonZeroCoeffs,
4109                                     &yTuCoeffBits,
4110                                     &cbTuCoeffBits,
4111                                     &crTuCoeffBits,
4112                                     contextPtr->transCoeffShapeLuma == ONLY_DC_SHAPE ? 1 : (tuSize >> contextPtr->transCoeffShapeLuma),
4113                                     contextPtr->transCoeffShapeChroma == ONLY_DC_SHAPE ? 1 : (tuSizeChroma >> contextPtr->transCoeffShapeChroma),
4114                                     predictionModeFlag,
4115                                     cabacCost);
4116 
4117                             // CBF Tu decision
4118                             EncodeTuCalcCost(
4119                                     contextPtr,
4120                                     countNonZeroCoeffs,
4121                                     yTuFullDistortion,
4122                                     &yTuCoeffBits,
4123                                     componentMask);
4124 
4125                             yCoeffBits += yTuCoeffBits;
4126                             cbCoeffBits += cbTuCoeffBits;
4127                             crCoeffBits += crTuCoeffBits;
4128 
4129                             yFullDistortion[DIST_CALC_RESIDUAL] += yTuFullDistortion[DIST_CALC_RESIDUAL];
4130                             yFullDistortion[DIST_CALC_PREDICTION] += yTuFullDistortion[DIST_CALC_PREDICTION];
4131                             //-------------------------------------------------
4132                         }
4133                     } // Transform Loop
4134                 }
4135 
4136                 //Set Final CU data flags after skip/Merge decision.
4137                 if (isFirstCUinRow == EB_FALSE) {
4138                     if (cuPtr->predictionUnitArray[0].mergeFlag == EB_TRUE) {
4139                         cuPtr->skipFlag = (isCuSkip) ? EB_TRUE : EB_FALSE;
4140                         cuPtr->predictionUnitArray[0].mergeFlag = (isCuSkip) ? EB_FALSE : EB_TRUE;
4141                     }
4142                 }
4143 
4144                 // Initialize the Transform Loop
4145                 contextPtr->tuItr = (cuStats->size < MAX_LCU_SIZE) ? 0 : 1;
4146                 yCbf = 0;
4147                 cbCbf = 0;
4148                 crCbf = 0;
4149                 cbCbf2 = 0;
4150                 crCbf2 = 0;
4151 
4152                 for (tuIt = 0; tuIt < totTu; tuIt++) {
4153                     contextPtr->tuItr = (cuStats->size < MAX_LCU_SIZE) ? 0 : tuIt + 1;
4154                     if (cuStats->size < MAX_LCU_SIZE) {
4155                         tuOriginX = contextPtr->cuOriginX;
4156                         tuOriginY = contextPtr->cuOriginY;
4157                         tuSize = cuStats->size;
4158                         tuSizeChroma = (tuSize >> (colorFormat==EB_YUV444 ? 0 : 1));
4159                     } else {
4160                         tuOriginX = contextPtr->cuOriginX + ((tuIt & 1) << 5);
4161                         tuOriginY = contextPtr->cuOriginY + ((tuIt > 1) << 5);
4162                         tuSize = 32;
4163                         tuSizeChroma = colorFormat==EB_YUV444 ? 32 : 16;
4164                     }
4165 
4166                     if (cuPtr->skipFlag == EB_TRUE){
4167                         cuPtr->transformUnitArray[contextPtr->tuItr].lumaCbf = EB_FALSE;
4168                         cuPtr->transformUnitArray[contextPtr->tuItr].cbCbf = EB_FALSE;
4169                         cuPtr->transformUnitArray[contextPtr->tuItr].crCbf = EB_FALSE;
4170                         cuPtr->transformUnitArray[contextPtr->tuItr].cbCbf2 = EB_FALSE;
4171                         cuPtr->transformUnitArray[contextPtr->tuItr].crCbf2 = EB_FALSE;
4172                     } else if (cuPtr->predictionUnitArray[0].mergeFlag == EB_TRUE) {
4173                         contextPtr->forceCbfFlag = (contextPtr->skipQpmFlag) ?
4174                             EB_FALSE :
4175                             lcuPtr->lcuEdgeInfoPtr->tileLeftEdgeFlag && ((tuOriginX & 63) == 0) && (tuOriginY == lcuOriginY);
4176 
4177                         SetPmEncDecMode(
4178                                 pictureControlSetPtr,
4179                                 contextPtr,
4180                                 tbAddr,
4181                                 lcuStatPtr->stationaryEdgeOverTimeFlag,
4182                                 pictureControlSetPtr->temporalLayerIndex > 0 ? lcuStatPtr->pmStationaryEdgeOverTimeFlag : lcuStatPtr->stationaryEdgeOverTimeFlag);
4183 
4184                         // Set Fast El coef shaping method
4185                         contextPtr->transCoeffShapeLuma     = DEFAULT_SHAPE;
4186                         contextPtr->transCoeffShapeChroma   = DEFAULT_SHAPE;
4187 
4188                         if (fastEl && isFirstCUinRow == EB_FALSE && contextPtr->pmpMaskingLevelEncDec > MASK_THSHLD_1) {
4189                             yDc = contextPtr->mdContext->mdEpPipeLcu[cuPtr->leafIndex].yDc[tuIt];
4190                             yCountNonZeroCoeffs = contextPtr->mdContext->mdEpPipeLcu[cuPtr->leafIndex].yCountNonZeroCoeffs[tuIt];
4191 
4192                             if ((cuPtr->rootCbf == 0) || ((yCoeffBitsTemp <= yBitsThsld) && yDc < YDC_THSHLD_1 && yCountNonZeroCoeffs <= 1)) {
4193                                 // Skip pass for cuPtr->rootCbf == 0 caused some VQ issues in chroma, so DC path is used instead
4194                                 contextPtr->transCoeffShapeLuma = ONLY_DC_SHAPE;
4195                                 contextPtr->transCoeffShapeChroma = ONLY_DC_SHAPE;
4196                             } else if ((yCoeffBitsTemp <= yBitsThsld * 4)) {
4197                                 contextPtr->transCoeffShapeLuma = N4_SHAPE;
4198                                 if ((cuStats->size >> 1) > 8) {
4199                                     contextPtr->transCoeffShapeChroma = N4_SHAPE;
4200                                 } else {
4201                                     contextPtr->transCoeffShapeChroma = N2_SHAPE;
4202                                 }
4203                             } else if ((yCoeffBitsTemp <= yBitsThsld * 16)) {
4204                                 contextPtr->transCoeffShapeLuma = N2_SHAPE;
4205                                 contextPtr->transCoeffShapeChroma = N2_SHAPE;
4206                             }
4207                         }
4208 
4209                         EncodeLoopFunctionTable[is16bit](
4210                                 contextPtr,
4211                                 lcuPtr,
4212                                 tuOriginX,
4213                                 tuOriginY,
4214                                 cbQp,
4215                                 reconBuffer,
4216                                 coeffBufferTB,
4217                                 residualBuffer,
4218                                 transformBuffer,
4219                                 transformInnerArrayPtr,
4220                                 countNonZeroCoeffs,
4221                                 useDeltaQpSegments,
4222                                 (CabacEncodeContext_t*)coeffEstEntropyCoderPtr->cabacEncodeContextPtr,
4223                                 0,
4224                                 PICTURE_BUFFER_DESC_FULL_MASK,
4225                                 colorFormat,
4226                                 EB_FALSE,
4227                                 tuSize,
4228                                 pictureControlSetPtr->cabacCost,
4229                                 cuPtr->deltaQp > 0 ? 0 : dZoffset);
4230 
4231                         if (colorFormat == EB_YUV422) {
4232                             EncodeLoopFunctionTable[is16bit](
4233                                     contextPtr,
4234                                     lcuPtr,
4235                                     tuOriginX,
4236                                     tuOriginY,
4237                                     cbQp,
4238                                     reconBuffer,
4239                                     coeffBufferTB,
4240                                     residualBuffer,
4241                                     transformBuffer,
4242                                     transformInnerArrayPtr,
4243                                     countNonZeroCoeffs,
4244                                     useDeltaQpSegments,
4245                                     (CabacEncodeContext_t*)coeffEstEntropyCoderPtr->cabacEncodeContextPtr,
4246                                     0,
4247                                     PICTURE_BUFFER_DESC_CHROMA_MASK,
4248                                     colorFormat,
4249                                     EB_TRUE,
4250                                     tuSize,
4251                                     pictureControlSetPtr->cabacCost,
4252                                     cuPtr->deltaQp > 0 ? 0 : dZoffset);
4253                         }
4254                     }
4255 
4256                     cuPtr->rootCbf = cuPtr->rootCbf |
4257                         cuPtr->transformUnitArray[contextPtr->tuItr].lumaCbf |
4258                         cuPtr->transformUnitArray[contextPtr->tuItr].cbCbf |
4259                         cuPtr->transformUnitArray[contextPtr->tuItr].cbCbf2 |
4260                         cuPtr->transformUnitArray[contextPtr->tuItr].crCbf |
4261                         cuPtr->transformUnitArray[contextPtr->tuItr].crCbf2;
4262 
4263                     if (cuPtr->transformUnitArray[contextPtr->tuItr].cbCbf) {
4264                         cuPtr->transformUnitArray[0].cbCbf = EB_TRUE;
4265                     }
4266 
4267                     if (cuPtr->transformUnitArray[contextPtr->tuItr].cbCbf2) {
4268                         cuPtr->transformUnitArray[0].cbCbf2 = EB_TRUE;
4269                     }
4270 
4271                     contextPtr->forceCbfFlag = (contextPtr->skipQpmFlag) ?
4272                         EB_FALSE :
4273                         lcuPtr->lcuEdgeInfoPtr->tileLeftEdgeFlag && ((tuOriginX & 63) == 0) && (tuOriginY == lcuOriginY);
4274 
4275                     if (cuPtr->transformUnitArray[contextPtr->tuItr].crCbf) {
4276                         cuPtr->transformUnitArray[0].crCbf = EB_TRUE;
4277                     }
4278 
4279                     if (cuPtr->transformUnitArray[contextPtr->tuItr].crCbf2) {
4280                         cuPtr->transformUnitArray[0].crCbf2 = EB_TRUE;
4281                     }
4282 
4283                     if (doRecon) {
4284                         EncodeGenerateReconFunctionPtr[is16bit](
4285                                 contextPtr,
4286                                 tuOriginX,
4287                                 tuOriginY,
4288                                 PICTURE_BUFFER_DESC_FULL_MASK,
4289                                 colorFormat,
4290                                 EB_FALSE,
4291                                 tuSize,
4292                                 reconBuffer,
4293                                 residualBuffer,
4294                                 transformInnerArrayPtr);
4295                         if (colorFormat == EB_YUV422) {
4296                             EncodeGenerateReconFunctionPtr[is16bit](
4297                                     contextPtr,
4298                                     tuOriginX,
4299                                     tuOriginY,
4300                                     PICTURE_BUFFER_DESC_CHROMA_MASK,
4301                                     colorFormat,
4302                                     EB_TRUE,
4303                                     tuSize,
4304                                     reconBuffer,
4305                                     residualBuffer,
4306                                     transformInnerArrayPtr);
4307                         }
4308                     }
4309                     yCbf  |= cuPtr->transformUnitArray[contextPtr->tuItr].lumaCbf;
4310                     cbCbf |= cuPtr->transformUnitArray[contextPtr->tuItr].cbCbf;
4311                     crCbf |= cuPtr->transformUnitArray[contextPtr->tuItr].crCbf;
4312                     cbCbf2 |= cuPtr->transformUnitArray[contextPtr->tuItr].cbCbf2;
4313                     crCbf2 |= cuPtr->transformUnitArray[contextPtr->tuItr].crCbf2;
4314 
4315                     if (dlfEnableFlag) {
4316 
4317                         EB_U32 lumaStride = (sequenceControlSetPtr->lumaWidth >> 2);
4318                         TransformUnit_t *tuPtr = &cuPtr->transformUnitArray[contextPtr->tuItr];
4319 
4320                         // Update the cbf map for DLF
4321                         startIndex = (tuOriginY >> 2) * lumaStride + (tuOriginX >> 2);
4322                         for (blk4x4IndexY = 0; blk4x4IndexY < (tuSize >> 2); ++blk4x4IndexY){
4323                             EB_MEMSET(&pictureControlSetPtr->cbfMapArray[startIndex], (EB_U8)tuPtr->lumaCbf, (tuSize >> 2));
4324                             startIndex += lumaStride;
4325                         }
4326 
4327                         if (cuStats->size == MAX_LCU_SIZE)
4328                             // Set the bS on TU boundary for DLF
4329                             SetBSArrayBasedOnTUBoundary(
4330                                     tuOriginX,
4331                                     tuOriginY,
4332                                     tuSize,
4333                                     tuSize,
4334                                     cuStats,
4335                                     (EB_PART_MODE)cuPtr->predictionModeFlag,
4336                                     lcuOriginX,
4337                                     lcuOriginY,
4338                                     pictureControlSetPtr,
4339                                     pictureControlSetPtr->horizontalEdgeBSArray[tbAddr],
4340                                     pictureControlSetPtr->verticalEdgeBSArray[tbAddr]);
4341                     }
4342                 } // Transform Loop
4343 
4344                 // Calculate Root CBF
4345                 cuPtr->rootCbf = (yCbf | cbCbf | cbCbf2 | crCbf | crCbf2 ) ? EB_TRUE  : EB_FALSE;
4346 
4347                 // Force Skip if MergeFlag == TRUE && RootCbf == 0
4348                 if (cuPtr->skipFlag == EB_FALSE &&
4349                         cuPtr->predictionUnitArray[0].mergeFlag == EB_TRUE &&
4350                         cuPtr->rootCbf == EB_FALSE ) {
4351                     cuPtr->skipFlag = EB_TRUE;
4352                 }
4353 
4354                 {
4355                     // Set the PU Loop Variables
4356                     puPtr = cuPtr->predictionUnitArray;
4357 
4358                     // Set MvUnit
4359                     contextPtr->mvUnit.predDirection = (EB_U8)puPtr->interPredDirectionIndex;
4360                     contextPtr->mvUnit.mv[REF_LIST_0].mvUnion = puPtr->mv[REF_LIST_0].mvUnion;
4361                     contextPtr->mvUnit.mv[REF_LIST_1].mvUnion = puPtr->mv[REF_LIST_1].mvUnion;
4362                     // set up the bS based on PU boundary for DLF
4363                     if (dlfEnableFlag /*&& cuStats->size < MAX_LCU_SIZE*/  ) {
4364                         SetBSArrayBasedOnPUBoundary(
4365                                 epModeTypeNeighborArray,
4366                                 epMvNeighborArray,
4367                                 puPtr,
4368                                 cuPtr,
4369                                 cuStats,
4370                                 lcuOriginX,
4371                                 lcuOriginY,
4372                                 tileLeftBoundary,
4373                                 tileTopBoundary,
4374                                 pictureControlSetPtr,
4375                                 pictureControlSetPtr->horizontalEdgeBSArray[tbAddr],
4376                                 pictureControlSetPtr->verticalEdgeBSArray[tbAddr]);
4377                     }
4378 
4379                     // Update Neighbor Arrays (Mode Type, MVs, SKIP)
4380                     {
4381                         EB_U8 skipFlag = (EB_U8)cuPtr->skipFlag;
4382                         EncodePassUpdateInterModeNeighborArrays(
4383                                 epModeTypeNeighborArray,
4384                                 epMvNeighborArray,
4385                                 epSkipFlagNeighborArray,
4386                                 &contextPtr->mvUnit,
4387                                 &skipFlag,
4388                                 contextPtr->cuOriginX,
4389                                 contextPtr->cuOriginY,
4390                                 cuStats->size);
4391 
4392                     }
4393 
4394                 } // 2nd Partition Loop
4395 
4396 
4397                 // Update Recon Samples Neighbor Arrays -INTER-
4398                 if (doRecon) {
4399                     EncodePassUpdateReconSampleNeighborArrays(
4400                             epLumaReconNeighborArray,
4401                             epCbReconNeighborArray,
4402                             epCrReconNeighborArray,
4403                             reconBuffer,
4404                             contextPtr->cuOriginX,
4405                             contextPtr->cuOriginY,
4406                             cuStats->size,
4407                             PICTURE_BUFFER_DESC_FULL_MASK,
4408                             colorFormat,
4409                             is16bit);
4410 
4411                     if (colorFormat == EB_YUV422) {
4412                         //Here need to update the 2nd chroma for neighbour
4413                         EncodePassUpdateReconSampleNeighborArrays(
4414                                 epLumaReconNeighborArray,
4415                                 epCbReconNeighborArray,
4416                                 epCrReconNeighborArray,
4417                                 reconBuffer,
4418                                 contextPtr->cuOriginX,
4419                                 contextPtr->cuOriginY+(cuStats->size>>1),
4420                                 cuStats->size,
4421                                 PICTURE_BUFFER_DESC_CHROMA_MASK,
4422                                 colorFormat,
4423                                 is16bit);
4424                     }
4425                 }
4426             } else {
4427                 CHECK_REPORT_ERROR_NC(
4428                         encodeContextPtr->appCallbackPtr,
4429                         EB_ENC_CL_ERROR2);
4430             }
4431 
4432 
4433             if (dlfEnableFlag) {
4434                 // Assign the LCU-level QP
4435                 if (cuPtr->predictionModeFlag == INTRA_MODE && puPtr->intraLumaMode == EB_INTRA_MODE_4x4) {
4436                     availableCoeff = (
4437                         contextPtr->cuPtr->transformUnitArray[1].lumaCbf ||
4438                         contextPtr->cuPtr->transformUnitArray[2].lumaCbf ||
4439                         contextPtr->cuPtr->transformUnitArray[3].lumaCbf ||
4440                         contextPtr->cuPtr->transformUnitArray[4].lumaCbf ||
4441                         contextPtr->cuPtr->transformUnitArray[1].crCbf ||
4442                         contextPtr->cuPtr->transformUnitArray[1].cbCbf ||
4443                         contextPtr->cuPtr->transformUnitArray[2].crCbf ||
4444                         contextPtr->cuPtr->transformUnitArray[2].cbCbf ||
4445                         contextPtr->cuPtr->transformUnitArray[3].crCbf ||
4446                         contextPtr->cuPtr->transformUnitArray[3].cbCbf ||
4447                         contextPtr->cuPtr->transformUnitArray[4].crCbf || // 422 case will use 3rd 4x4 for the 2nd chroma
4448                         contextPtr->cuPtr->transformUnitArray[4].cbCbf) ? EB_TRUE : EB_FALSE;
4449                 } else {
4450                     availableCoeff = (cuPtr->predictionModeFlag == INTER_MODE) ? (EB_BOOL)cuPtr->rootCbf :
4451                         (cuPtr->transformUnitArray[cuStats->size == MAX_LCU_SIZE ? 1 : 0].lumaCbf ||
4452                         cuPtr->transformUnitArray[cuStats->size == MAX_LCU_SIZE ? 1 : 0].crCbf ||
4453                         cuPtr->transformUnitArray[cuStats->size == MAX_LCU_SIZE ? 1 : 0].crCbf2 ||
4454                         cuPtr->transformUnitArray[cuStats->size == MAX_LCU_SIZE ? 1 : 0].cbCbf ||
4455                         cuPtr->transformUnitArray[cuStats->size == MAX_LCU_SIZE ? 1 : 0].cbCbf2) ? EB_TRUE : EB_FALSE;
4456                 }
4457 
4458 
4459                 // Assign the LCU-level QP
4460                 EncodePassUpdateQp(
4461                     pictureControlSetPtr,
4462                     contextPtr,
4463                     availableCoeff,
4464                     useDeltaQp,
4465                     &isDeltaQpNotCoded,
4466                     pictureControlSetPtr->difCuDeltaQpDepth,
4467                     &(pictureControlSetPtr->encPrevCodedQp[tileIdx][singleSegment ? 0 : lcuRowIndex]),
4468                     &(pictureControlSetPtr->encPrevQuantGroupCodedQp[tileIdx][singleSegment ? 0 : lcuRowIndex]),
4469                     lcuPtr->tileInfoPtr->tilePxlOriginX,
4470                     lcuPtr->tileInfoPtr->tilePxlOriginY,
4471                     lcuQp);
4472 
4473                 // Assign DLF QP
4474                 SetQpArrayBasedOnCU(
4475                     pictureControlSetPtr,
4476                     contextPtr->cuOriginX,
4477                     contextPtr->cuOriginY,
4478                     cuStats->size / MIN_CU_SIZE,
4479                     cuPtr->qp);
4480             }
4481 
4482             {
4483                 // Update Neighbor Arrays (Leaf Depth)
4484                 EncodePassUpdateLeafDepthNeighborArrays(
4485                     epLeafDepthNeighborArray,
4486                     cuStats->depth,
4487                     contextPtr->cuOriginX,
4488                     contextPtr->cuOriginY,
4489                     cuStats->size);
4490                 {
4491                     // Set the PU Loop Variables
4492                     puPtr = cuPtr->predictionUnitArray;
4493                     // Set MvUnit
4494                     contextPtr->mvUnit.predDirection = (EB_U8)puPtr->interPredDirectionIndex;
4495                     contextPtr->mvUnit.mv[REF_LIST_0].mvUnion = puPtr->mv[REF_LIST_0].mvUnion;
4496                     contextPtr->mvUnit.mv[REF_LIST_1].mvUnion = puPtr->mv[REF_LIST_1].mvUnion;
4497                 }
4498 
4499 
4500                 // Update TMVP Map (create new one and compare to the old one!!!)
4501                 if (tmvpMapWritePtr != EB_NULL){
4502 
4503                     puPtr = cuPtr->predictionUnitArray;
4504                     tmvpMapVerticalStartIndex   = (cuStats->originY + mvCompressionUnitSizeMinus1) >> LOG_MV_COMPRESS_UNIT_SIZE;         //elemPU's vertical index relative to current LCU on 16x16 basic unit
4505                     tmvpMapHorizontalEndIndex   = (cuStats->originX + cuStats->size + mvCompressionUnitSizeMinus1) >> LOG_MV_COMPRESS_UNIT_SIZE;
4506                     tmvpMapVerticalEndIndex     = (cuStats->originY + cuStats->size + mvCompressionUnitSizeMinus1) >> LOG_MV_COMPRESS_UNIT_SIZE; // the problem is at this line, in 64x48 PU, this value turns out to be 4 while it is supposed to be 3
4507                     tmvpMapHorizontalStartIndex = (cuStats->originX + mvCompressionUnitSizeMinus1) >> LOG_MV_COMPRESS_UNIT_SIZE;
4508 
4509                     while (tmvpMapVerticalStartIndex < tmvpMapVerticalEndIndex){
4510                         tmvpMapHorizontalStartIndex = (cuStats->originX + mvCompressionUnitSizeMinus1) >> LOG_MV_COMPRESS_UNIT_SIZE;
4511                         tmvpMapIndex = (tmvpMapVerticalStartIndex * (MAX_LCU_SIZE >> LOG_MV_COMPRESS_UNIT_SIZE)) + tmvpMapHorizontalStartIndex;
4512 
4513                         while ((tmvpMapHorizontalStartIndex) < tmvpMapHorizontalEndIndex){
4514                             switch (cuPtr->predictionModeFlag){
4515                             case INTER_MODE:
4516                                 switch (cuPtr->predictionUnitArray->interPredDirectionIndex){
4517 
4518                                 case UNI_PRED_LIST_0:
4519                                     tmvpMapWritePtr->availabilityFlag[tmvpMapIndex] = EB_TRUE;
4520                                     tmvpMapWritePtr->mv[REF_LIST_0][tmvpMapIndex].mvUnion = puPtr->mv[REF_LIST_0].mvUnion;
4521                                     tmvpMapWritePtr->predictionDirection[tmvpMapIndex] = UNI_PRED_LIST_0;
4522                                     tmvpMapWritePtr->refPicPOC[REF_LIST_0][tmvpMapIndex] = ((EbReferenceObject_t*)
4523                                         pictureControlSetPtr->refPicPtrArray[REF_LIST_0]->objectPtr
4524                                         )->refPOC;
4525                                     break;
4526 
4527                                 case UNI_PRED_LIST_1:
4528                                     tmvpMapWritePtr->availabilityFlag[tmvpMapIndex] = EB_TRUE;
4529                                     tmvpMapWritePtr->mv[REF_LIST_1][tmvpMapIndex].mvUnion = puPtr->mv[REF_LIST_1].mvUnion;
4530                                     tmvpMapWritePtr->predictionDirection[tmvpMapIndex] = UNI_PRED_LIST_1;
4531                                     tmvpMapWritePtr->refPicPOC[REF_LIST_1][tmvpMapIndex] = ((EbReferenceObject_t*)
4532                                         pictureControlSetPtr->refPicPtrArray[REF_LIST_1]->objectPtr
4533                                         )->refPOC;
4534                                     break;
4535 
4536                                 case BI_PRED:
4537                                     if (puPtr->interPredDirectionIndex == BI_PRED || puPtr->interPredDirectionIndex == UNI_PRED_LIST_0){
4538                                         tmvpMapWritePtr->availabilityFlag[tmvpMapIndex] = EB_TRUE;
4539                                         tmvpMapWritePtr->mv[REF_LIST_0][tmvpMapIndex].mvUnion = puPtr->mv[REF_LIST_0].mvUnion;
4540                                         tmvpMapWritePtr->predictionDirection[tmvpMapIndex] = (EB_PREDDIRECTION)puPtr->interPredDirectionIndex;
4541                                         tmvpMapWritePtr->refPicPOC[REF_LIST_0][tmvpMapIndex] = ((EbReferenceObject_t*)
4542                                             pictureControlSetPtr->refPicPtrArray[REF_LIST_0]->objectPtr
4543                                             )->refPOC;
4544                                     }
4545 
4546                                     if (puPtr->interPredDirectionIndex == BI_PRED || puPtr->interPredDirectionIndex == UNI_PRED_LIST_1){
4547                                         tmvpMapWritePtr->availabilityFlag[tmvpMapIndex] = EB_TRUE;
4548                                         tmvpMapWritePtr->mv[REF_LIST_1][tmvpMapIndex].mvUnion = puPtr->mv[REF_LIST_1].mvUnion;
4549                                         tmvpMapWritePtr->predictionDirection[tmvpMapIndex] = (EB_PREDDIRECTION)puPtr->interPredDirectionIndex;
4550                                         tmvpMapWritePtr->refPicPOC[REF_LIST_1][tmvpMapIndex] = ((EbReferenceObject_t*)
4551                                             pictureControlSetPtr->refPicPtrArray[REF_LIST_1]->objectPtr
4552                                             )->refPOC;
4553                                     }
4554 
4555                                     break;
4556 
4557                                 default:
4558                                     CHECK_REPORT_ERROR_NC(
4559                                         encodeContextPtr->appCallbackPtr,
4560                                         EB_ENC_INTER_PRED_ERROR0);
4561 
4562                                 }
4563                                 break;
4564 
4565                             case INTRA_MODE:
4566                                 tmvpMapWritePtr->availabilityFlag[tmvpMapIndex] = EB_FALSE;
4567                                 break;
4568 
4569                             default:
4570 
4571                                 CHECK_REPORT_ERROR_NC(
4572                                     encodeContextPtr->appCallbackPtr,
4573                                     EB_ENC_CL_ERROR2);
4574                                 break;
4575                             }
4576 
4577                             //*Note- Filling the map for list 1 motion info will be added when B-slices are ready
4578 
4579                             ++tmvpMapHorizontalStartIndex;
4580                             ++tmvpMapIndex;
4581                         }
4582                         ++tmvpMapVerticalStartIndex;
4583                     }
4584 
4585                 }
4586             }
4587 
4588             cuItr += DepthOffset[cuStats->depth];
4589         }
4590         else{
4591             cuItr++;
4592         }
4593 
4594     } // CU Loop
4595 
4596     contextPtr->codedLcuCount++;
4597     //Jing:
4598     //For true tile mode, need to change DLF accordingly
4599     // First Pass Deblocking
4600     if (dlfEnableFlag){
4601 
4602         EB_U32 pictureWidthInLcu = (sequenceControlSetPtr->lumaWidth + 63) >> LOG2F_MAX_LCU_SIZE;
4603 
4604         LcuInternalAreaDLFCoreFuncTable[is16bit](
4605             reconBuffer,
4606             lcuOriginX,
4607             lcuOriginY,
4608             lcuWidth,
4609             lcuHeight,
4610             pictureControlSetPtr->verticalEdgeBSArray[tbAddr],
4611             pictureControlSetPtr->horizontalEdgeBSArray[tbAddr],
4612             pictureControlSetPtr);
4613 
4614         LcuBoundaryDLFCoreFuncTable[is16bit](
4615             reconBuffer,
4616             lcuOriginX,
4617             lcuOriginY,
4618             lcuWidth,
4619             lcuHeight,
4620             pictureControlSetPtr->verticalEdgeBSArray[tbAddr],
4621             pictureControlSetPtr->horizontalEdgeBSArray[tbAddr],
4622             //lcuOriginY == 0 ? (EB_U8*)EB_NULL : pictureControlSetPtr->verticalEdgeBSArray[tbAddr - pictureWidthInLcu],
4623             //lcuOriginX == 0 ? (EB_U8*)EB_NULL : pictureControlSetPtr->horizontalEdgeBSArray[tbAddr - 1],
4624             lcuPtr->lcuEdgeInfoPtr->tileTopEdgeFlag ? (EB_U8*)EB_NULL : pictureControlSetPtr->verticalEdgeBSArray[tbAddr - pictureWidthInLcu],
4625             lcuPtr->lcuEdgeInfoPtr->tileLeftEdgeFlag ? (EB_U8*)EB_NULL : pictureControlSetPtr->horizontalEdgeBSArray[tbAddr - 1],
4626             pictureControlSetPtr);
4627 
4628         LcuPicEdgeDLFCoreFuncTable[is16bit](
4629             reconBuffer,
4630             tbAddr,
4631             lcuOriginX,
4632             lcuOriginY,
4633             lcuWidth,
4634             lcuHeight,
4635             pictureControlSetPtr);
4636 
4637     }
4638 
4639 
4640     // SAO Parameter Generation
4641     if (enableSaoFlag) {
4642 
4643         EB_S16 lcuDeltaQp = (EB_S16)(lcuPtr->qp - pictureControlSetPtr->ParentPcsPtr->averageQp);
4644 
4645         SaoParameters_t *leftSaoPtr;
4646         SaoParameters_t *topSaoPtr;
4647 
4648         //Jing: Double check for multi-tile
4649         //if (lcuOriginY != 0){
4650         if (!lcuPtr->lcuEdgeInfoPtr->tileTopEdgeFlag) {
4651             EB_U32 topSaoIndex = GetNeighborArrayUnitTopIndex(
4652                 pictureControlSetPtr->epSaoNeighborArray[tileIdx],
4653                 lcuOriginX);
4654 
4655             topSaoPtr = ((SaoParameters_t*)pictureControlSetPtr->epSaoNeighborArray[tileIdx]->topArray) + topSaoIndex;
4656         }
4657         else{
4658             topSaoPtr = (SaoParameters_t*)EB_NULL;
4659         }
4660         //if (lcuOriginX != 0){
4661         if (!lcuPtr->lcuEdgeInfoPtr->tileLeftEdgeFlag) {
4662             EB_U32 leftSaoIndex = GetNeighborArrayUnitLeftIndex(
4663                 pictureControlSetPtr->epSaoNeighborArray[tileIdx],
4664                 lcuOriginY);
4665 
4666             leftSaoPtr = ((SaoParameters_t*)pictureControlSetPtr->epSaoNeighborArray[tileIdx]->leftArray) + leftSaoIndex;
4667         }
4668         else{
4669             leftSaoPtr = (SaoParameters_t*)EB_NULL;
4670         }
4671 
4672 
4673         EB_U8   varCount32x32 = 0;
4674         varCount32x32 = ((pictureControlSetPtr->ParentPcsPtr->variance[tbAddr][1]) > 1000) +
4675             ((pictureControlSetPtr->ParentPcsPtr->variance[tbAddr][2]) > 1000) +
4676             ((pictureControlSetPtr->ParentPcsPtr->variance[tbAddr][3]) > 1000) +
4677             ((pictureControlSetPtr->ParentPcsPtr->variance[tbAddr][4]) > 1000);
4678 
4679         EB_BOOL shutSaoCondition0;
4680         EB_BOOL shutSaoCondition1;
4681 
4682         shutSaoCondition0 = (sequenceControlSetPtr->inputResolution < INPUT_SIZE_4K_RANGE || contextPtr->saoMode) ?
4683             EB_FALSE :
4684             ((pictureControlSetPtr->ParentPcsPtr->edgeResultsPtr[tbAddr].edgeBlockNum == 0 || (pictureControlSetPtr->sceneCaracteristicId != 0)) && (contextPtr->skipQpmFlag == EB_FALSE) && pictureControlSetPtr->ParentPcsPtr->picNoiseClass >= PIC_NOISE_CLASS_1 && !lcuStatPtr->stationaryEdgeOverTimeFlag);
4685 
4686         shutSaoCondition1 = (contextPtr->saoMode) ?
4687             EB_FALSE :
4688             (sequenceControlSetPtr->inputResolution < INPUT_SIZE_4K_RANGE) ?
4689             (varCount32x32 < 1 && lcuDeltaQp <= 0 && pictureControlSetPtr->sliceType != EB_I_PICTURE && !lcuStatPtr->stationaryEdgeOverTimeFlag) :
4690             (((varCount32x32 < 1) && (lcuDeltaQp <= 0 && pictureControlSetPtr->sliceType != EB_I_PICTURE) && (contextPtr->skipQpmFlag == EB_FALSE)) && pictureControlSetPtr->ParentPcsPtr->picNoiseClass >= PIC_NOISE_CLASS_1 && !lcuStatPtr->stationaryEdgeOverTimeFlag);
4691 
4692         if (doRecon == EB_FALSE || shutSaoCondition0 || shutSaoCondition1) {
4693 
4694             lcuPtr->saoParams.saoTypeIndex[SAO_COMPONENT_LUMA] = 0;
4695             lcuPtr->saoParams.saoTypeIndex[SAO_COMPONENT_CHROMA] = 0;
4696             lcuPtr->saoParams.saoOffset[SAO_COMPONENT_LUMA][0] = 0;
4697             lcuPtr->saoParams.saoOffset[SAO_COMPONENT_LUMA][1] = 0;
4698             lcuPtr->saoParams.saoOffset[SAO_COMPONENT_LUMA][2] = 0;
4699             lcuPtr->saoParams.saoOffset[SAO_COMPONENT_LUMA][3] = 0;
4700             lcuPtr->saoParams.saoBandPosition[SAO_COMPONENT_LUMA] = 0;
4701             lcuPtr->saoParams.saoMergeLeftFlag = EB_FALSE;
4702             lcuPtr->saoParams.saoMergeUpFlag = EB_FALSE;
4703 
4704             saoLumaBestCost = 0xFFFFFFFFFFFFFFFFull;
4705             saoChromaBestCost = 0xFFFFFFFFFFFFFFFFull;
4706 
4707         }
4708         else {
4709             // Generate the SAO Parameters
4710             if (is16bit){
4711                 SaoGenerationDecision16bit(
4712                     contextPtr->inputSample16bitBuffer,
4713                     contextPtr->saoStats,
4714                     &lcuPtr->saoParams,
4715                     contextPtr->mdRateEstimationPtr,
4716                     contextPtr->fullLambda,
4717 		    		contextPtr->fullChromaLambdaSao,
4718                     contextPtr->saoMode,
4719                     pictureControlSetPtr,
4720                     lcuOriginX,
4721                     lcuOriginY,
4722                     lcuWidth,
4723                     lcuHeight,
4724                     &lcuPtr->saoParams,
4725                     leftSaoPtr,
4726                     topSaoPtr,
4727                     &saoLumaBestCost,
4728                     &saoChromaBestCost);
4729 
4730             }
4731             else{
4732                 SaoGenerationDecision(
4733                     contextPtr->saoStats,
4734                     &lcuPtr->saoParams,
4735                     contextPtr->mdRateEstimationPtr,
4736                     contextPtr->fullLambda,
4737                     contextPtr->fullChromaLambdaSao,
4738                     contextPtr->saoMode,
4739                     pictureControlSetPtr,
4740                     lcuOriginX,
4741                     lcuOriginY,
4742                     lcuWidth,
4743                     lcuHeight,
4744                     &lcuPtr->saoParams,
4745                     leftSaoPtr,
4746                     topSaoPtr,
4747                     &saoLumaBestCost,
4748                     &saoChromaBestCost);
4749 
4750                 if (contextPtr->skipQpmFlag == EB_FALSE){
4751                     if (pictureControlSetPtr->ParentPcsPtr->picNoiseClass >= PIC_NOISE_CLASS_3_1 && pictureControlSetPtr->pictureQp >= 37) {
4752                         lcuPtr->saoParams.saoTypeIndex[SAO_COMPONENT_LUMA] = 0;
4753                         lcuPtr->saoParams.saoTypeIndex[SAO_COMPONENT_CHROMA] = 0;
4754                         lcuPtr->saoParams.saoMergeLeftFlag = EB_FALSE;
4755                         lcuPtr->saoParams.saoMergeUpFlag = EB_FALSE;
4756                     }
4757                 }
4758             }
4759         }
4760 
4761         // Update the SAO Neighbor Array
4762         EncodePassUpdateSaoNeighborArrays(
4763             pictureControlSetPtr->epSaoNeighborArray[tileIdx],
4764             &lcuPtr->saoParams,
4765             lcuOriginX,
4766             lcuOriginY,
4767             contextPtr->lcuSize);
4768     }
4769     return;
4770 }
4771 
4772 
UnusedVariablevoidFunc_CodingLoop()4773 void UnusedVariablevoidFunc_CodingLoop()
4774 {
4775     (void)NxMSadLoopKernel_funcPtrArray;
4776     (void)NxMSadAveragingKernel_funcPtrArray;
4777 }
4778