1 /*
2 * Copyright(c) 2018 Intel Corporation
3 * SPDX - License - Identifier: BSD - 2 - Clause - Patent
4 */
5 
6 #include <stdlib.h>
7 #include <string.h>
8 
9 #include "EbDefinitions.h"
10 #include "EbSystemResourceManager.h"
11 #include "EbPictureControlSet.h"
12 #include "EbSequenceControlSet.h"
13 #include "EbPictureBufferDesc.h"
14 
15 #include "EbResourceCoordinationResults.h"
16 #include "EbPictureAnalysisProcess.h"
17 #include "EbPictureAnalysisResults.h"
18 #include "EbMcp.h"
19 #include "EbMotionEstimation.h"
20 #include "EbReferenceObject.h"
21 
22 #include "EbComputeMean.h"
23 #include "EbMeSadCalculation.h"
24 #include "EbPictureOperators.h"
25 #include "EbComputeMean_SSE2.h"
26 #include "EbCombinedAveragingSAD_Intrinsic_AVX2.h"
27 
/*
 * Tuning constants for the picture analysis process.
 * NOTE(review): none of these macros is referenced in the portion of the file
 * reviewed here, so the remarks below only restate what each definition
 * expands to -- confirm the intended meaning against the code that uses them.
 */
#define VARIANCE_PRECISION		16
#define  LCU_LOW_VAR_TH                5
#define  PIC_LOW_VAR_PERCENTAGE_TH    60
#define	FLAT_MAX_VAR			50
// (50-00) evaluates to 50, i.e. the same value as FLAT_MAX_VAR.
#define FLAT_MAX_VAR_DECIM		(50-00)
// The trailing 120000 looks like a previously used value kept for reference -- TODO confirm.
#define	NOISE_MIN_LEVEL_0		 70000//120000
// (70000+000000) evaluates to 70000, i.e. the same value as NOISE_MIN_LEVEL_0.
#define NOISE_MIN_LEVEL_DECIM_0 (70000+000000)//(120000+000000)
#define	NOISE_MIN_LEVEL_1        120000
// (120000+000000) evaluates to 120000, i.e. the same value as NOISE_MIN_LEVEL_1.
#define NOISE_MIN_LEVEL_DECIM_1	(120000+000000)
#define DENOISER_QP_TH			29
#define DENOISER_BITRATE_TH		14000000
// NOTE(review): "PRECENT" is a misspelling of "PERCENT"; renaming would require
// updating every user of these two macros, so the names are left as they are.
#define SAMPLE_THRESHOLD_PRECENT_BORDER_LINE      15
#define SAMPLE_THRESHOLD_PRECENT_TWO_BORDER_LINES 10
41 
/************************************************
* Picture Analysis Context Destructor
*   Installed as the context's dctor by PictureAnalysisContextCtor.
*   Releases what the context holds: the noise picture, the denoised
*   picture and the grad pointer array (lcuTotalCountAllocated entries).
************************************************/
static void PictureAnalysisContextDctor(EB_PTR p)
{
    PictureAnalysisContext_t *contextPtr = (PictureAnalysisContext_t *)p;

    // Picture buffers
    EB_DELETE(contextPtr->noisePicturePtr);
    EB_DELETE(contextPtr->denoisedPicturePtr);

    // Per-LCU grad table: the array and each of its entries
    EB_FREE_PTR_ARRAY(contextPtr->grad, contextPtr->lcuTotalCountAllocated);
}
49 /************************************************
50 * Picture Analysis Context Constructor
51 ************************************************/
PictureAnalysisContextCtor(PictureAnalysisContext_t * contextPtr,EbPictureBufferDescInitData_t * inputPictureBufferDescInitData,EB_BOOL denoiseFlag,EbFifo_t * resourceCoordinationResultsInputFifoPtr,EbFifo_t * pictureAnalysisResultsOutputFifoPtr,EB_U16 lcuTotalCount)52 EB_ERRORTYPE PictureAnalysisContextCtor(
53     PictureAnalysisContext_t       *contextPtr,
54     EbPictureBufferDescInitData_t  *inputPictureBufferDescInitData,
55     EB_BOOL                         denoiseFlag,
56     EbFifo_t                       *resourceCoordinationResultsInputFifoPtr,
57     EbFifo_t                       *pictureAnalysisResultsOutputFifoPtr,
58     EB_U16                          lcuTotalCount)
59 {
60     contextPtr->dctor = PictureAnalysisContextDctor;
61 	contextPtr->resourceCoordinationResultsInputFifoPtr = resourceCoordinationResultsInputFifoPtr;
62 	contextPtr->pictureAnalysisResultsOutputFifoPtr = pictureAnalysisResultsOutputFifoPtr;
63 
64 	if (denoiseFlag == EB_TRUE){
65 
66 		//denoised
67         // If 420/422, re-use luma for chroma
68         // If 444, re-use luma for Cr
69         if (inputPictureBufferDescInitData->colorFormat != EB_YUV444) {
70 		    inputPictureBufferDescInitData->bufferEnableMask = PICTURE_BUFFER_DESC_Y_FLAG;
71         } else {
72 		    inputPictureBufferDescInitData->bufferEnableMask = PICTURE_BUFFER_DESC_Y_FLAG | PICTURE_BUFFER_DESC_Cb_FLAG;
73         }
74 
75         EB_NEW(
76             contextPtr->denoisedPicturePtr,
77             EbPictureBufferDescCtor,
78             inputPictureBufferDescInitData);
79 
80         if (inputPictureBufferDescInitData->colorFormat != EB_YUV444) {
81 		contextPtr->denoisedPicturePtr->bufferCb = contextPtr->denoisedPicturePtr->bufferY;
82 		contextPtr->denoisedPicturePtr->bufferCr = contextPtr->denoisedPicturePtr->bufferY + contextPtr->denoisedPicturePtr->chromaSize;
83         } else {
84 		    contextPtr->denoisedPicturePtr->bufferCr = contextPtr->denoisedPicturePtr->bufferY;
85         }
86 
87 		// noise
88 		inputPictureBufferDescInitData->maxHeight = MAX_LCU_SIZE;
89 		inputPictureBufferDescInitData->bufferEnableMask = PICTURE_BUFFER_DESC_Y_FLAG;
90 
91 
92         EB_NEW(
93             contextPtr->noisePicturePtr,
94             EbPictureBufferDescCtor,
95             inputPictureBufferDescInitData);
96 	}
97     contextPtr->lcuTotalCountAllocated = lcuTotalCount;
98     EB_ALLOC_PTR_ARRAY(contextPtr->grad, lcuTotalCount);
99     for (EB_U16 lcuIndex = 0; lcuIndex < lcuTotalCount; ++lcuIndex) {
100         EB_MALLOC_ARRAY(contextPtr->grad[lcuIndex], CU_MAX_COUNT);
101     }
102 	return EB_ErrorNone;
103 }
104 
DownSampleChroma(EbPictureBufferDesc_t * inputPicturePtr,EbPictureBufferDesc_t * outputPicturePtr)105 static void DownSampleChroma(EbPictureBufferDesc_t* inputPicturePtr, EbPictureBufferDesc_t* outputPicturePtr)
106 {
107 	EB_U32 inputColorFormat = inputPicturePtr->colorFormat;
108 	EB_U16 inputSubWidthCMinus1 = (inputColorFormat == EB_YUV444 ? 1 : 2) - 1;
109 	EB_U16 inputSubHeightCMinus1 = (inputColorFormat >= EB_YUV422 ? 1 : 2) - 1;
110 
111 	EB_U32 outputColorFormat = outputPicturePtr->colorFormat;
112 	EB_U16 outputSubWidthCMinus1 = (outputColorFormat == EB_YUV444 ? 1 : 2) - 1;
113 	EB_U16 outputSubHeightCMinus1 = (outputColorFormat >= EB_YUV422 ? 1 : 2) - 1;
114 
115 	EB_U32 strideIn, strideOut;
116 	EB_U32 inputOriginIndex, outputOriginIndex;
117 
118 	EB_U8 *ptrIn;
119 	EB_U8 *ptrOut;
120 
121 	EB_U32 ii, jj;
122 
123 	//Cb
124 	{
125 		strideIn = inputPicturePtr->strideCb;
126 		inputOriginIndex = (inputPicturePtr->originX >> inputSubWidthCMinus1) +
127             (inputPicturePtr->originY >> inputSubHeightCMinus1)  * inputPicturePtr->strideCb;
128 		ptrIn = &(inputPicturePtr->bufferCb[inputOriginIndex]);
129 
130 		strideOut = outputPicturePtr->strideCb;
131 		outputOriginIndex = (outputPicturePtr->originX >> outputSubWidthCMinus1) +
132             (outputPicturePtr->originY >> outputSubHeightCMinus1)  * outputPicturePtr->strideCb;
133 		ptrOut = &(outputPicturePtr->bufferCb[outputOriginIndex]);
134 
135 		for (jj = 0; jj < (EB_U32)(outputPicturePtr->height >> outputSubHeightCMinus1); jj++) {
136 			for (ii = 0; ii < (EB_U32)(outputPicturePtr->width >> outputSubWidthCMinus1); ii++) {
137 				ptrOut[ii + jj * strideOut] =
138                     ptrIn[(ii << (1 - inputSubWidthCMinus1)) +
139                     (jj << (1 - inputSubHeightCMinus1)) * strideIn];
140 			}
141 		}
142 
143 	}
144 
145 	//Cr
146 	{
147 		strideIn = inputPicturePtr->strideCr;
148 		inputOriginIndex = (inputPicturePtr->originX >> inputSubWidthCMinus1) + (inputPicturePtr->originY >> inputSubHeightCMinus1)  * inputPicturePtr->strideCr;
149 		ptrIn = &(inputPicturePtr->bufferCr[inputOriginIndex]);
150 
151 		strideOut = outputPicturePtr->strideCr;
152 		outputOriginIndex = (outputPicturePtr->originX >> outputSubWidthCMinus1) + (outputPicturePtr->originY >> outputSubHeightCMinus1)  * outputPicturePtr->strideCr;
153 		ptrOut = &(outputPicturePtr->bufferCr[outputOriginIndex]);
154 
155 		for (jj = 0; jj < (EB_U32)(outputPicturePtr->height >> outputSubHeightCMinus1); jj++) {
156 			for (ii = 0; ii < (EB_U32)(outputPicturePtr->width >> outputSubWidthCMinus1); ii++) {
157 				ptrOut[ii + jj * strideOut] =
158                     ptrIn[(ii << (1 - inputSubWidthCMinus1)) +
159                     (jj << (1 - inputSubHeightCMinus1)) * strideIn];
160 			}
161 		}
162 	}
163 }
164 
165 /************************************************
166  * Picture Analysis Context Destructor
167  ************************************************/
168 
/********************************************
 * Decimation2D
 *      decimates the input by point sampling: keeps the sample at every
 *      decimStep-th column of every decimStep-th row and writes the kept
 *      samples contiguously, one output row per kept input row.
 *
 *      The output index and the input row advance are computed from
 *      decimStep directly, so any non-zero step is handled (the results for
 *      steps 1, 2 and 4 are unchanged).  A decimStep of 0 writes nothing
 *      and returns.
 ********************************************/
void Decimation2D(
	EB_U8 *  inputSamples,      // input parameter, input samples Ptr
	EB_U32   inputStride,       // input parameter, input stride
	EB_U32   inputAreaWidth,    // input parameter, input area width
	EB_U32   inputAreaHeight,   // input parameter, input area height
	EB_U8 *  decimSamples,      // output parameter, decimated samples Ptr
	EB_U32   decimStride,       // input parameter, output stride
	EB_U32   decimStep)         // input parameter, decimation step (in samples, both directions)
{
	EB_U32 horizontalIndex;
	EB_U32 verticalIndex;
	EB_U32 decimIndex;

	// A zero step would never advance the loops below.
	if (decimStep == 0) {
		return;
	}

	for (verticalIndex = 0; verticalIndex < inputAreaHeight; verticalIndex += decimStep) {
		decimIndex = 0;
		for (horizontalIndex = 0; horizontalIndex < inputAreaWidth; horizontalIndex += decimStep) {
			decimSamples[decimIndex++] = inputSamples[horizontalIndex];
		}
		// Skip decimStep input rows, advance one output row.
		inputSamples += (size_t)inputStride * decimStep;
		decimSamples += decimStride;
	}

	return;
}
199 
/********************************************
* CalculateHistogram
*      accumulates a histogram of the input samples, visiting every
*      decimStep-th column of every decimStep-th row of the area.
*
*      histogram is indexed by sample value and is NOT cleared here:
*      counts are added to whatever it already holds.
*      *sum is reset to 0 and then receives the sum of the visited samples.
*
*      The row advance is computed from decimStep directly, so any
*      non-zero step is handled (results for steps 1, 2 and 4 are
*      unchanged).  A decimStep of 0 visits no samples.
********************************************/
static void CalculateHistogram(
	EB_U8 *  inputSamples,      // input parameter, input samples Ptr
	EB_U32   inputAreaWidth,    // input parameter, input area width
	EB_U32   inputAreaHeight,   // input parameter, input area height
	EB_U32   stride,            // input parameter, input stride
	EB_U8    decimStep,         // input parameter, sampling step (in samples, both directions)
	EB_U32  *histogram,			// output parameter, output histogram
	EB_U64  *sum)               // output parameter, sum of the visited samples

{

	EB_U32 horizontalIndex;
	EB_U32 verticalIndex;
	*sum = 0;

	// A zero step would never advance the loops below.
	if (decimStep == 0) {
		return;
	}

	for (verticalIndex = 0; verticalIndex < inputAreaHeight; verticalIndex += decimStep) {
		for (horizontalIndex = 0; horizontalIndex < inputAreaWidth; horizontalIndex += decimStep) {
			++(histogram[inputSamples[horizontalIndex]]);
			*sum += inputSamples[horizontalIndex];
		}
		// Skip decimStep input rows.
		inputSamples += (size_t)stride * decimStep;
	}

	return;
}
229 
230 
/*
 * Computes variance figures for the 32x32 luma area that starts at
 * inputLumaOriginIndex in inputPaddedPicturePtr->bufferY.
 *
 * The area is visited as a 4x4 grid of 8x8 blocks in raster order.  For each
 * block two values are obtained from the ComputeMeanFunc table (row 0 and
 * row 1; presumably the block mean and the mean of the squared samples, going
 * by how the results are combined -- confirm against the table's definition).
 *
 *   variance8x8  output, 16 entries in raster order:
 *                (row-1 value) - (row-0 value)^2 of each 8x8 block
 *   return       the same expression evaluated on the values averaged up to
 *                the whole 32x32 area
 */
static EB_U64 ComputeVariance32x32(
	EbPictureBufferDesc_t       *inputPaddedPicturePtr,         // input parameter, Input Padded Picture
	EB_U32                       inputLumaOriginIndex,          // input parameter, LCU index, used to point to source/reference samples
	EB_U64						*variance8x8)
{
	// Column of the function table, selected once for all calls below.
	const EB_U32 asmIndex = !!(ASM_TYPES & AVX2_MASK);

	EB_U64 mean8x8[16];
	EB_U64 meanSquared8x8[16];

	EB_U64 mean16x16[4];
	EB_U64 meanSquared16x16[4];

	EB_U64 mean32x32;
	EB_U64 meanSquared32x32;

	EB_U32 blockRow;
	EB_U32 blockCol;
	EB_U32 block;
	EB_U32 group;

	// 8x8: per-block values, raster order over the 4x4 grid
	for (blockRow = 0; blockRow < 4; ++blockRow) {
		for (blockCol = 0; blockCol < 4; ++blockCol) {
			const EB_U32 sampleIndex = inputLumaOriginIndex +
				blockRow * 8 * inputPaddedPicturePtr->strideY +
				blockCol * 8;

			block = (blockRow << 2) + blockCol;

			mean8x8[block] = ComputeMeanFunc[0][asmIndex](&(inputPaddedPicturePtr->bufferY[sampleIndex]), inputPaddedPicturePtr->strideY, 8, 8);
			meanSquared8x8[block] = ComputeMeanFunc[1][asmIndex](&(inputPaddedPicturePtr->bufferY[sampleIndex]), inputPaddedPicturePtr->strideY, 8, 8);
		}
	}

	for (block = 0; block < 16; ++block) {
		variance8x8[block] = meanSquared8x8[block] - (mean8x8[block] * mean8x8[block]);
	}

	// 16x16: four averages of four 8x8 values each.
	// NOTE(review): group g combines entries {2g, 2g+1, 2g+8, 2g+9}.  With the
	// 4-wide raster indexing used above these are not the four blocks of a
	// spatial 16x16 quadrant (that would be {0,1,4,5} for g = 0).  The groups
	// are only used to form the 32x32 averages and every 8x8 entry is used
	// exactly once, so the grouping is kept as is to preserve the exact
	// truncation behaviour.
	for (group = 0; group < 4; ++group) {
		const EB_U32 first = group << 1;

		mean16x16[group] = (mean8x8[first] + mean8x8[first + 1] + mean8x8[first + 8] + mean8x8[first + 9]) >> 2;
		meanSquared16x16[group] = (meanSquared8x8[first] + meanSquared8x8[first + 1] + meanSquared8x8[first + 8] + meanSquared8x8[first + 9]) >> 2;
	}

	// 32x32
	mean32x32 = (mean16x16[0] + mean16x16[1] + mean16x16[2] + mean16x16[3]) >> 2;
	meanSquared32x32 = (meanSquared16x16[0] + meanSquared16x16[1] + meanSquared16x16[2] + meanSquared16x16[3]) >> 2;

	return (meanSquared32x32 - (mean32x32 * mean32x32));
}
376 
/*
 * Computes variance figures for the 16x16 luma area that starts at
 * inputLumaOriginIndex in inputPaddedPicturePtr->bufferY.
 *
 * The area is visited as a 2x2 grid of 8x8 blocks in raster order.  For each
 * block two values are obtained from the ComputeMeanFunc table (row 0 and
 * row 1; presumably the block mean and the mean of the squared samples, going
 * by how the results are combined -- confirm against the table's definition).
 *
 *   variance8x8  output, 4 entries in raster order:
 *                (row-1 value) - (row-0 value)^2 of each 8x8 block
 *   return       the same expression evaluated on the four values averaged
 *                over the whole 16x16 area
 */
static EB_U64 ComputeVariance16x16(
	EbPictureBufferDesc_t       *inputPaddedPicturePtr,         // input parameter, Input Padded Picture
	EB_U32                       inputLumaOriginIndex,          // input parameter, LCU index, used to point to source/reference samples
	EB_U64						*variance8x8)
{
	// Column of the function table, selected once for all calls below.
	const EB_U32 asmIndex = !!(ASM_TYPES & AVX2_MASK);

	EB_U64 mean8x8[4];
	EB_U64 meanSquared8x8[4];

	EB_U64 mean16x16;
	EB_U64 meanSquared16x16;

	EB_U32 blockRow;
	EB_U32 blockCol;
	EB_U32 block;

	// 8x8: per-block values, raster order over the 2x2 grid
	for (blockRow = 0; blockRow < 2; ++blockRow) {
		for (blockCol = 0; blockCol < 2; ++blockCol) {
			const EB_U32 sampleIndex = inputLumaOriginIndex +
				blockRow * 8 * inputPaddedPicturePtr->strideY +
				blockCol * 8;

			block = (blockRow << 1) + blockCol;

			mean8x8[block] = ComputeMeanFunc[0][asmIndex](&(inputPaddedPicturePtr->bufferY[sampleIndex]), inputPaddedPicturePtr->strideY, 8, 8);
			meanSquared8x8[block] = ComputeMeanFunc[1][asmIndex](&(inputPaddedPicturePtr->bufferY[sampleIndex]), inputPaddedPicturePtr->strideY, 8, 8);
		}
	}

	for (block = 0; block < 4; ++block) {
		variance8x8[block] = meanSquared8x8[block] - (mean8x8[block] * mean8x8[block]);
	}

	// 16x16: average of the four 8x8 values
	mean16x16 = (mean8x8[0] + mean8x8[1] + mean8x8[2] + mean8x8[3]) >> 2;
	meanSquared16x16 = (meanSquared8x8[0] + meanSquared8x8[1] + meanSquared8x8[2] + meanSquared8x8[3]) >> 2;

	return (meanSquared16x16 - (mean16x16 * mean16x16));
}
423 
/*******************************************
ComputeVariance64x64
this function is exactly the same as
PictureAnalysisComputeVarianceLcu except it
does not store data for every block,
just returns the 64x64 data point
*******************************************/
ComputeVariance64x64(EbPictureBufferDesc_t * inputPaddedPicturePtr,EB_U32 inputLumaOriginIndex,EB_U64 * variance32x32)431 static EB_U64 ComputeVariance64x64(
432     EbPictureBufferDesc_t       *inputPaddedPicturePtr,         // input parameter, Input Padded Picture
433 	EB_U32                       inputLumaOriginIndex,          // input parameter, LCU index, used to point to source/reference samples
434 	EB_U64						*variance32x32)
435 {
436 
437 
438 	EB_U32 blockIndex;
439 
440 	EB_U64 meanOf8x8Blocks[64];
441 	EB_U64 meanOf8x8SquaredValuesBlocks[64];
442 
443 	EB_U64 meanOf16x16Blocks[16];
444 	EB_U64 meanOf16x16SquaredValuesBlocks[16];
445 
446 	EB_U64 meanOf32x32Blocks[4];
447 	EB_U64 meanOf32x32SquaredValuesBlocks[4];
448 
449 	EB_U64 meanOf64x64Blocks;
450 	EB_U64 meanOf64x64SquaredValuesBlocks;
451 
452 	// (0,0)
453 	blockIndex = inputLumaOriginIndex;
454 	const EB_U16 strideY = inputPaddedPicturePtr->strideY;
455 
456     if (!!(ASM_TYPES & AVX2_MASK)) {
457 
458         ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[0], &meanOf8x8SquaredValuesBlocks[0]);
459 
460         // (0,1)
461         blockIndex = blockIndex + 32;
462 
463         ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[4], &meanOf8x8SquaredValuesBlocks[4]);
464         // (0,5)
465         blockIndex = blockIndex + 24;
466 
467         // (1,0)
468         blockIndex = inputLumaOriginIndex + (strideY << 3);
469 
470         ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[8], &meanOf8x8SquaredValuesBlocks[8]);
471 
472         // (1,1)
473         blockIndex = blockIndex + 32;
474 
475         ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[12], &meanOf8x8SquaredValuesBlocks[12]);
476 
477         // (1,5)
478         blockIndex = blockIndex + 24;
479 
480         // (2,0)
481         blockIndex = inputLumaOriginIndex + (strideY << 4);
482 
483         ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[16], &meanOf8x8SquaredValuesBlocks[16]);
484 
485         // (2,1)
486         blockIndex = blockIndex + 32;
487 
488         ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[20], &meanOf8x8SquaredValuesBlocks[20]);
489 
490         // (2,5)
491         blockIndex = blockIndex + 24;
492 
493         // (3,0)
494         blockIndex = inputLumaOriginIndex + (strideY << 3) + (strideY << 4);
495 
496         ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[24], &meanOf8x8SquaredValuesBlocks[24]);
497 
498         // (3,1)
499         blockIndex = blockIndex + 32;
500 
501         ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[28], &meanOf8x8SquaredValuesBlocks[28]);
502 
503         // (3,5)
504         blockIndex = blockIndex + 24;
505 
506         // (4,0)
507         blockIndex = inputLumaOriginIndex + (strideY << 5);
508 
509         ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[32], &meanOf8x8SquaredValuesBlocks[32]);
510 
511         // (4,1)
512         blockIndex = blockIndex + 32;
513 
514         ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[36], &meanOf8x8SquaredValuesBlocks[36]);
515 
516         // (4,5)
517         blockIndex = blockIndex + 24;
518 
519         // (5,0)
520         blockIndex = inputLumaOriginIndex + (strideY << 3) + (strideY << 5);
521 
522         ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[40], &meanOf8x8SquaredValuesBlocks[40]);
523 
524         // (5,1)
525         blockIndex = blockIndex + 32;
526 
527         ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[44], &meanOf8x8SquaredValuesBlocks[44]);
528 
529         // (5,5)
530         blockIndex = blockIndex + 24;
531 
532         // (6,0)
533         blockIndex = inputLumaOriginIndex + (strideY << 4) + (strideY << 5);
534 
535         ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[48], &meanOf8x8SquaredValuesBlocks[48]);
536 
537         // (6,1)
538         blockIndex = blockIndex + 32;
539 
540         ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[52], &meanOf8x8SquaredValuesBlocks[52]);
541 
542         // (6,5)
543         blockIndex = blockIndex + 24;
544 
545         // (7,0)
546         blockIndex = inputLumaOriginIndex + (strideY << 3) + (strideY << 4) + (strideY << 5);
547 
548         ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[56], &meanOf8x8SquaredValuesBlocks[56]);
549 
550         // (7,1)
551         blockIndex = blockIndex + 32;
552 
553         ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[60], &meanOf8x8SquaredValuesBlocks[60]);
554 
555 
556     }
557     else{
558         meanOf8x8Blocks[0] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
559 	    meanOf8x8SquaredValuesBlocks[0] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
560 
561 	    // (0,1)
562 	    blockIndex = blockIndex + 8;
563 	    meanOf8x8Blocks[1] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
564 	    meanOf8x8SquaredValuesBlocks[1] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
565 
566 	    // (0,2)
567 	    blockIndex = blockIndex + 8;
568 	    meanOf8x8Blocks[2] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
569 	    meanOf8x8SquaredValuesBlocks[2] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
570 
571 	    // (0,3)
572 	    blockIndex = blockIndex + 8;
573 	    meanOf8x8Blocks[3] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
574 	    meanOf8x8SquaredValuesBlocks[3] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
575 
576 	    // (0,4)
577 	    blockIndex = blockIndex + 8;
578 	    meanOf8x8Blocks[4] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
579 	    meanOf8x8SquaredValuesBlocks[4] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
580 
581 	    // (0,5)
582 	    blockIndex = blockIndex + 8;
583 	    meanOf8x8Blocks[5] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
584 	    meanOf8x8SquaredValuesBlocks[5] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
585 
586 	    // (0,6)
587 	    blockIndex = blockIndex + 8;
588 	    meanOf8x8Blocks[6] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
589 	    meanOf8x8SquaredValuesBlocks[6] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
590 
591 	    // (0,7)
592 	    blockIndex = blockIndex + 8;
593 	    meanOf8x8Blocks[7] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
594 	    meanOf8x8SquaredValuesBlocks[7] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
595 
596 	    // (1,0)
597 	    blockIndex = inputLumaOriginIndex + (strideY << 3);
598 	    meanOf8x8Blocks[8] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
599 	    meanOf8x8SquaredValuesBlocks[8] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
600 
601 	    // (1,1)
602 	    blockIndex = blockIndex + 8;
603 	    meanOf8x8Blocks[9] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
604 	    meanOf8x8SquaredValuesBlocks[9] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
605 
606 	    // (1,2)
607 	    blockIndex = blockIndex + 8;
608 	    meanOf8x8Blocks[10] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
609 	    meanOf8x8SquaredValuesBlocks[10] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
610 
611 	    // (1,3)
612 	    blockIndex = blockIndex + 8;
613 	    meanOf8x8Blocks[11] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
614 	    meanOf8x8SquaredValuesBlocks[11] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
615 
616 	    // (1,4)
617 	    blockIndex = blockIndex + 8;
618 	    meanOf8x8Blocks[12] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
619 	    meanOf8x8SquaredValuesBlocks[12] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
620 
621 	    // (1,5)
622 	    blockIndex = blockIndex + 8;
623 	    meanOf8x8Blocks[13] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
624 	    meanOf8x8SquaredValuesBlocks[13] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
625 
626 	    // (1,6)
627 	    blockIndex = blockIndex + 8;
628 	    meanOf8x8Blocks[14] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
629 	    meanOf8x8SquaredValuesBlocks[14] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
630 
631 	    // (1,7)
632 	    blockIndex = blockIndex + 8;
633 	    meanOf8x8Blocks[15] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
634 	    meanOf8x8SquaredValuesBlocks[15] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
635 
636 	    // (2,0)
637 	    blockIndex = inputLumaOriginIndex + (strideY << 4);
638 	    meanOf8x8Blocks[16] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
639 	    meanOf8x8SquaredValuesBlocks[16] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
640 
641 	    // (2,1)
642 	    blockIndex = blockIndex + 8;
643 	    meanOf8x8Blocks[17] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
644 	    meanOf8x8SquaredValuesBlocks[17] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
645 
646 	    // (2,2)
647 	    blockIndex = blockIndex + 8;
648 	    meanOf8x8Blocks[18] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
649 	    meanOf8x8SquaredValuesBlocks[18] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
650 
651 	    // (2,3)
652 	    blockIndex = blockIndex + 8;
653 	    meanOf8x8Blocks[19] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
654 	    meanOf8x8SquaredValuesBlocks[19] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
655 
656 	    /// (2,4)
657 	    blockIndex = blockIndex + 8;
658 	    meanOf8x8Blocks[20] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
659 	    meanOf8x8SquaredValuesBlocks[20] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
660 
661 	    // (2,5)
662 	    blockIndex = blockIndex + 8;
663 	    meanOf8x8Blocks[21] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
664 	    meanOf8x8SquaredValuesBlocks[21] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
665 
666 	    // (2,6)
667 	    blockIndex = blockIndex + 8;
668 	    meanOf8x8Blocks[22] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
669 	    meanOf8x8SquaredValuesBlocks[22] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
670 
671 	    // (2,7)
672 	    blockIndex = blockIndex + 8;
673 	    meanOf8x8Blocks[23] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
674 	    meanOf8x8SquaredValuesBlocks[23] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
675 
676 	    // (3,0)
677 	    blockIndex = inputLumaOriginIndex + (strideY << 3) + (strideY << 4);
678 	    meanOf8x8Blocks[24] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
679 	    meanOf8x8SquaredValuesBlocks[24] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
680 
681 	    // (3,1)
682 	    blockIndex = blockIndex + 8;
683 	    meanOf8x8Blocks[25] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
684 	    meanOf8x8SquaredValuesBlocks[25] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
685 
686 	    // (3,2)
687 	    blockIndex = blockIndex + 8;
688 	    meanOf8x8Blocks[26] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
689 	    meanOf8x8SquaredValuesBlocks[26] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
690 
691 	    // (3,3)
692 	    blockIndex = blockIndex + 8;
693 	    meanOf8x8Blocks[27] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
694 	    meanOf8x8SquaredValuesBlocks[27] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
695 
696 	    // (3,4)
697 	    blockIndex = blockIndex + 8;
698 	    meanOf8x8Blocks[28] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
699 	    meanOf8x8SquaredValuesBlocks[28] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
700 
701 	    // (3,5)
702 	    blockIndex = blockIndex + 8;
703 	    meanOf8x8Blocks[29] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
704 	    meanOf8x8SquaredValuesBlocks[29] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
705 
706 	    // (3,6)
707 	    blockIndex = blockIndex + 8;
708 	    meanOf8x8Blocks[30] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
709 	    meanOf8x8SquaredValuesBlocks[30] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
710 
711 	    // (3,7)
712 	    blockIndex = blockIndex + 8;
713 	    meanOf8x8Blocks[31] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
714 	    meanOf8x8SquaredValuesBlocks[31] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
715 
716 	    // (4,0)
717 	    blockIndex = inputLumaOriginIndex + (strideY << 5);
718 	    meanOf8x8Blocks[32] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
719 	    meanOf8x8SquaredValuesBlocks[32] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
720 
721 	    // (4,1)
722 	    blockIndex = blockIndex + 8;
723 	    meanOf8x8Blocks[33] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
724 	    meanOf8x8SquaredValuesBlocks[33] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
725 
726 	    // (4,2)
727 	    blockIndex = blockIndex + 8;
728 	    meanOf8x8Blocks[34] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
729 	    meanOf8x8SquaredValuesBlocks[34] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
730 
731 	    // (4,3)
732 	    blockIndex = blockIndex + 8;
733 	    meanOf8x8Blocks[35] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
734 	    meanOf8x8SquaredValuesBlocks[35] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
735 
736 	    // (4,4)
737 	    blockIndex = blockIndex + 8;
738 	    meanOf8x8Blocks[36] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
739 	    meanOf8x8SquaredValuesBlocks[36] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
740 
741 	    // (4,5)
742 	    blockIndex = blockIndex + 8;
743 	    meanOf8x8Blocks[37] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
744 	    meanOf8x8SquaredValuesBlocks[37] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
745 
746 	    // (4,6)
747 	    blockIndex = blockIndex + 8;
748 	    meanOf8x8Blocks[38] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
749 	    meanOf8x8SquaredValuesBlocks[38] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
750 
751 	    // (4,7)
752 	    blockIndex = blockIndex + 8;
753 	    meanOf8x8Blocks[39] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
754 	    meanOf8x8SquaredValuesBlocks[39] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
755 
756 	    // (5,0)
757 	    blockIndex = inputLumaOriginIndex + (strideY << 3) + (strideY << 5);
758 	    meanOf8x8Blocks[40] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
759 	    meanOf8x8SquaredValuesBlocks[40] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
760 
761 	    // (5,1)
762 	    blockIndex = blockIndex + 8;
763 	    meanOf8x8Blocks[41] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
764 	    meanOf8x8SquaredValuesBlocks[41] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
765 
766 	    // (5,2)
767 	    blockIndex = blockIndex + 8;
768 	    meanOf8x8Blocks[42] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
769 	    meanOf8x8SquaredValuesBlocks[42] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
770 
771 	    // (5,3)
772 	    blockIndex = blockIndex + 8;
773 	    meanOf8x8Blocks[43] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
774 	    meanOf8x8SquaredValuesBlocks[43] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
775 
776 	    // (5,4)
777 	    blockIndex = blockIndex + 8;
778 	    meanOf8x8Blocks[44] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
779 	    meanOf8x8SquaredValuesBlocks[44] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
780 
781 	    // (5,5)
782 	    blockIndex = blockIndex + 8;
783 	    meanOf8x8Blocks[45] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
784 	    meanOf8x8SquaredValuesBlocks[45] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
785 
786 	    // (5,6)
787 	    blockIndex = blockIndex + 8;
788 	    meanOf8x8Blocks[46] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
789 	    meanOf8x8SquaredValuesBlocks[46] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
790 
791 	    // (5,7)
792 	    blockIndex = blockIndex + 8;
793 	    meanOf8x8Blocks[47] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
794 	    meanOf8x8SquaredValuesBlocks[47] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
795 
796 	    // (6,0)
797 	    blockIndex = inputLumaOriginIndex + (strideY << 4) + (strideY << 5);
798 	    meanOf8x8Blocks[48] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
799 	    meanOf8x8SquaredValuesBlocks[48] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
800 
801 	    // (6,1)
802 	    blockIndex = blockIndex + 8;
803 	    meanOf8x8Blocks[49] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
804 	    meanOf8x8SquaredValuesBlocks[49] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
805 
806 	    // (6,2)
807 	    blockIndex = blockIndex + 8;
808 	    meanOf8x8Blocks[50] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
809 	    meanOf8x8SquaredValuesBlocks[50] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
810 
811 	    // (6,3)
812 	    blockIndex = blockIndex + 8;
813 	    meanOf8x8Blocks[51] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
814 	    meanOf8x8SquaredValuesBlocks[51] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
815 
816 	    // (6,4)
817 	    blockIndex = blockIndex + 8;
818 	    meanOf8x8Blocks[52] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
819 	    meanOf8x8SquaredValuesBlocks[52] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
820 
821 	    // (6,5)
822 	    blockIndex = blockIndex + 8;
823 	    meanOf8x8Blocks[53] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
824 	    meanOf8x8SquaredValuesBlocks[53] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
825 
826 	    // (6,6)
827 	    blockIndex = blockIndex + 8;
828 	    meanOf8x8Blocks[54] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
829 	    meanOf8x8SquaredValuesBlocks[54] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
830 
831 	    // (6,7)
832 	    blockIndex = blockIndex + 8;
833 	    meanOf8x8Blocks[55] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
834 	    meanOf8x8SquaredValuesBlocks[55] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
835 
836 	    // (7,0)
837 	    blockIndex = inputLumaOriginIndex + (strideY << 3) + (strideY << 4) + (strideY << 5);
838 	    meanOf8x8Blocks[56] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
839 	    meanOf8x8SquaredValuesBlocks[56] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
840 
841 	    // (7,1)
842 	    blockIndex = blockIndex + 8;
843 	    meanOf8x8Blocks[57] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
844 	    meanOf8x8SquaredValuesBlocks[57] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
845 
846 	    // (7,2)
847 	    blockIndex = blockIndex + 8;
848 	    meanOf8x8Blocks[58] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
849 	    meanOf8x8SquaredValuesBlocks[58] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
850 
851 	    // (7,3)
852 	    blockIndex = blockIndex + 8;
853 	    meanOf8x8Blocks[59] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
854 	    meanOf8x8SquaredValuesBlocks[59] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
855 
856 	    // (7,4)
857 	    blockIndex = blockIndex + 8;
858 	    meanOf8x8Blocks[60] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
859 	    meanOf8x8SquaredValuesBlocks[60] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
860 
861 	    // (7,5)
862 	    blockIndex = blockIndex + 8;
863 	    meanOf8x8Blocks[61] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
864 	    meanOf8x8SquaredValuesBlocks[61] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
865 
866 	    // (7,6)
867 	    blockIndex = blockIndex + 8;
868 	    meanOf8x8Blocks[62] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
869 	    meanOf8x8SquaredValuesBlocks[62] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
870 
871 	    // (7,7)
872 	    blockIndex = blockIndex + 8;
873 	    meanOf8x8Blocks[63] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
874 	    meanOf8x8SquaredValuesBlocks[63] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]),  strideY);
875 
876 
877     }
878 
879 
880 	// 16x16
881 	meanOf16x16Blocks[0] = (meanOf8x8Blocks[0] + meanOf8x8Blocks[1] + meanOf8x8Blocks[8] + meanOf8x8Blocks[9]) >> 2;
882 	meanOf16x16Blocks[1] = (meanOf8x8Blocks[2] + meanOf8x8Blocks[3] + meanOf8x8Blocks[10] + meanOf8x8Blocks[11]) >> 2;
883 	meanOf16x16Blocks[2] = (meanOf8x8Blocks[4] + meanOf8x8Blocks[5] + meanOf8x8Blocks[12] + meanOf8x8Blocks[13]) >> 2;
884 	meanOf16x16Blocks[3] = (meanOf8x8Blocks[6] + meanOf8x8Blocks[7] + meanOf8x8Blocks[14] + meanOf8x8Blocks[15]) >> 2;
885 
886 	meanOf16x16Blocks[4] = (meanOf8x8Blocks[16] + meanOf8x8Blocks[17] + meanOf8x8Blocks[24] + meanOf8x8Blocks[25]) >> 2;
887 	meanOf16x16Blocks[5] = (meanOf8x8Blocks[18] + meanOf8x8Blocks[19] + meanOf8x8Blocks[26] + meanOf8x8Blocks[27]) >> 2;
888 	meanOf16x16Blocks[6] = (meanOf8x8Blocks[20] + meanOf8x8Blocks[21] + meanOf8x8Blocks[28] + meanOf8x8Blocks[29]) >> 2;
889 	meanOf16x16Blocks[7] = (meanOf8x8Blocks[22] + meanOf8x8Blocks[23] + meanOf8x8Blocks[30] + meanOf8x8Blocks[31]) >> 2;
890 
891 	meanOf16x16Blocks[8] = (meanOf8x8Blocks[32] + meanOf8x8Blocks[33] + meanOf8x8Blocks[40] + meanOf8x8Blocks[41]) >> 2;
892 	meanOf16x16Blocks[9] = (meanOf8x8Blocks[34] + meanOf8x8Blocks[35] + meanOf8x8Blocks[42] + meanOf8x8Blocks[43]) >> 2;
893 	meanOf16x16Blocks[10] = (meanOf8x8Blocks[36] + meanOf8x8Blocks[37] + meanOf8x8Blocks[44] + meanOf8x8Blocks[45]) >> 2;
894 	meanOf16x16Blocks[11] = (meanOf8x8Blocks[38] + meanOf8x8Blocks[39] + meanOf8x8Blocks[46] + meanOf8x8Blocks[47]) >> 2;
895 
896 	meanOf16x16Blocks[12] = (meanOf8x8Blocks[48] + meanOf8x8Blocks[49] + meanOf8x8Blocks[56] + meanOf8x8Blocks[57]) >> 2;
897 	meanOf16x16Blocks[13] = (meanOf8x8Blocks[50] + meanOf8x8Blocks[51] + meanOf8x8Blocks[58] + meanOf8x8Blocks[59]) >> 2;
898 	meanOf16x16Blocks[14] = (meanOf8x8Blocks[52] + meanOf8x8Blocks[53] + meanOf8x8Blocks[60] + meanOf8x8Blocks[61]) >> 2;
899 	meanOf16x16Blocks[15] = (meanOf8x8Blocks[54] + meanOf8x8Blocks[55] + meanOf8x8Blocks[62] + meanOf8x8Blocks[63]) >> 2;
900 
901 	meanOf16x16SquaredValuesBlocks[0] = (meanOf8x8SquaredValuesBlocks[0] + meanOf8x8SquaredValuesBlocks[1] + meanOf8x8SquaredValuesBlocks[8] + meanOf8x8SquaredValuesBlocks[9]) >> 2;
902 	meanOf16x16SquaredValuesBlocks[1] = (meanOf8x8SquaredValuesBlocks[2] + meanOf8x8SquaredValuesBlocks[3] + meanOf8x8SquaredValuesBlocks[10] + meanOf8x8SquaredValuesBlocks[11]) >> 2;
903 	meanOf16x16SquaredValuesBlocks[2] = (meanOf8x8SquaredValuesBlocks[4] + meanOf8x8SquaredValuesBlocks[5] + meanOf8x8SquaredValuesBlocks[12] + meanOf8x8SquaredValuesBlocks[13]) >> 2;
904 	meanOf16x16SquaredValuesBlocks[3] = (meanOf8x8SquaredValuesBlocks[6] + meanOf8x8SquaredValuesBlocks[7] + meanOf8x8SquaredValuesBlocks[14] + meanOf8x8SquaredValuesBlocks[15]) >> 2;
905 
906 	meanOf16x16SquaredValuesBlocks[4] = (meanOf8x8SquaredValuesBlocks[16] + meanOf8x8SquaredValuesBlocks[17] + meanOf8x8SquaredValuesBlocks[24] + meanOf8x8SquaredValuesBlocks[25]) >> 2;
907 	meanOf16x16SquaredValuesBlocks[5] = (meanOf8x8SquaredValuesBlocks[18] + meanOf8x8SquaredValuesBlocks[19] + meanOf8x8SquaredValuesBlocks[26] + meanOf8x8SquaredValuesBlocks[27]) >> 2;
908 	meanOf16x16SquaredValuesBlocks[6] = (meanOf8x8SquaredValuesBlocks[20] + meanOf8x8SquaredValuesBlocks[21] + meanOf8x8SquaredValuesBlocks[28] + meanOf8x8SquaredValuesBlocks[29]) >> 2;
909 	meanOf16x16SquaredValuesBlocks[7] = (meanOf8x8SquaredValuesBlocks[22] + meanOf8x8SquaredValuesBlocks[23] + meanOf8x8SquaredValuesBlocks[30] + meanOf8x8SquaredValuesBlocks[31]) >> 2;
910 
911 	meanOf16x16SquaredValuesBlocks[8] = (meanOf8x8SquaredValuesBlocks[32] + meanOf8x8SquaredValuesBlocks[33] + meanOf8x8SquaredValuesBlocks[40] + meanOf8x8SquaredValuesBlocks[41]) >> 2;
912 	meanOf16x16SquaredValuesBlocks[9] = (meanOf8x8SquaredValuesBlocks[34] + meanOf8x8SquaredValuesBlocks[35] + meanOf8x8SquaredValuesBlocks[42] + meanOf8x8SquaredValuesBlocks[43]) >> 2;
913 	meanOf16x16SquaredValuesBlocks[10] = (meanOf8x8SquaredValuesBlocks[36] + meanOf8x8SquaredValuesBlocks[37] + meanOf8x8SquaredValuesBlocks[44] + meanOf8x8SquaredValuesBlocks[45]) >> 2;
914 	meanOf16x16SquaredValuesBlocks[11] = (meanOf8x8SquaredValuesBlocks[38] + meanOf8x8SquaredValuesBlocks[39] + meanOf8x8SquaredValuesBlocks[46] + meanOf8x8SquaredValuesBlocks[47]) >> 2;
915 
916 	meanOf16x16SquaredValuesBlocks[12] = (meanOf8x8SquaredValuesBlocks[48] + meanOf8x8SquaredValuesBlocks[49] + meanOf8x8SquaredValuesBlocks[56] + meanOf8x8SquaredValuesBlocks[57]) >> 2;
917 	meanOf16x16SquaredValuesBlocks[13] = (meanOf8x8SquaredValuesBlocks[50] + meanOf8x8SquaredValuesBlocks[51] + meanOf8x8SquaredValuesBlocks[58] + meanOf8x8SquaredValuesBlocks[59]) >> 2;
918 	meanOf16x16SquaredValuesBlocks[14] = (meanOf8x8SquaredValuesBlocks[52] + meanOf8x8SquaredValuesBlocks[53] + meanOf8x8SquaredValuesBlocks[60] + meanOf8x8SquaredValuesBlocks[61]) >> 2;
919 	meanOf16x16SquaredValuesBlocks[15] = (meanOf8x8SquaredValuesBlocks[54] + meanOf8x8SquaredValuesBlocks[55] + meanOf8x8SquaredValuesBlocks[62] + meanOf8x8SquaredValuesBlocks[63]) >> 2;
920 
921 	// 32x32
922 	meanOf32x32Blocks[0] = (meanOf16x16Blocks[0] + meanOf16x16Blocks[1] + meanOf16x16Blocks[4] + meanOf16x16Blocks[5]) >> 2;
923 	meanOf32x32Blocks[1] = (meanOf16x16Blocks[2] + meanOf16x16Blocks[3] + meanOf16x16Blocks[6] + meanOf16x16Blocks[7]) >> 2;
924 	meanOf32x32Blocks[2] = (meanOf16x16Blocks[8] + meanOf16x16Blocks[9] + meanOf16x16Blocks[12] + meanOf16x16Blocks[13]) >> 2;
925 	meanOf32x32Blocks[3] = (meanOf16x16Blocks[10] + meanOf16x16Blocks[11] + meanOf16x16Blocks[14] + meanOf16x16Blocks[15]) >> 2;
926 
927 	meanOf32x32SquaredValuesBlocks[0] = (meanOf16x16SquaredValuesBlocks[0] + meanOf16x16SquaredValuesBlocks[1] + meanOf16x16SquaredValuesBlocks[4] + meanOf16x16SquaredValuesBlocks[5]) >> 2;
928 	meanOf32x32SquaredValuesBlocks[1] = (meanOf16x16SquaredValuesBlocks[2] + meanOf16x16SquaredValuesBlocks[3] + meanOf16x16SquaredValuesBlocks[6] + meanOf16x16SquaredValuesBlocks[7]) >> 2;
929 	meanOf32x32SquaredValuesBlocks[2] = (meanOf16x16SquaredValuesBlocks[8] + meanOf16x16SquaredValuesBlocks[9] + meanOf16x16SquaredValuesBlocks[12] + meanOf16x16SquaredValuesBlocks[13]) >> 2;
930 	meanOf32x32SquaredValuesBlocks[3] = (meanOf16x16SquaredValuesBlocks[10] + meanOf16x16SquaredValuesBlocks[11] + meanOf16x16SquaredValuesBlocks[14] + meanOf16x16SquaredValuesBlocks[15]) >> 2;
931 
932 
933 	variance32x32[0] = meanOf32x32SquaredValuesBlocks[0] - (meanOf32x32Blocks[0] * meanOf32x32Blocks[0]);
934 	variance32x32[1] = meanOf32x32SquaredValuesBlocks[1] - (meanOf32x32Blocks[1] * meanOf32x32Blocks[1]);
935 	variance32x32[2] = meanOf32x32SquaredValuesBlocks[2] - (meanOf32x32Blocks[2] * meanOf32x32Blocks[2]);
936 	variance32x32[3] = meanOf32x32SquaredValuesBlocks[3] - (meanOf32x32Blocks[3] * meanOf32x32Blocks[3]);
937 
938 
939 	// 64x64
940 	meanOf64x64Blocks = (meanOf32x32Blocks[0] + meanOf32x32Blocks[1] + meanOf32x32Blocks[2] + meanOf32x32Blocks[3]) >> 2;
941 	meanOf64x64SquaredValuesBlocks = (meanOf32x32SquaredValuesBlocks[0] + meanOf32x32SquaredValuesBlocks[1] + meanOf32x32SquaredValuesBlocks[2] + meanOf32x32SquaredValuesBlocks[3]) >> 2;
942 
943 	return (meanOf64x64SquaredValuesBlocks - (meanOf64x64Blocks * meanOf64x64Blocks));
944 }
945 
946 
947 
getFilteredTypes(EB_U8 * ptr,EB_U32 stride,EB_U8 EbHevcFilterType)948 static EB_U8  getFilteredTypes(EB_U8  *ptr,
949 	EB_U32  stride,
950 	EB_U8   EbHevcFilterType)
951 {
952 	EB_U8 *p = ptr - 1 - stride;
953 
954 	EB_U32 a = 0;
955 
956 	if (EbHevcFilterType == 0){
957 
958 		//Luma
959 		a = (p[1] +
960 			p[0 + stride] + 4 * p[1 + stride] + p[2 + stride] +
961 			p[1 + 2 * stride]) / 8;
962 
963 	}
964 	else if (EbHevcFilterType == 1){
965         a = (                    2 * p[1] +
966 			 2 * p[0 + stride] + 4 * p[1 + stride] + 2 * p[2 + stride] +
967 			                     2 * p[1 + 2 * stride]  );
968 
969         a =  (( (EB_U32)((a *2730) >> 14) + 1) >> 1) & 0xFFFF;
970 
971         //fixed point version of a=a/12 to mimic x86 instruction _mm256_mulhrs_epi16;
972         //a= (a*2730)>>15;
973 	}
974 	else if (EbHevcFilterType == 2){
975 
976 
977 		a = (4 * p[1] +
978 			4 * p[0 + stride] + 4 * p[1 + stride] + 4 * p[2 + stride] +
979 			4 * p[1 + 2 * stride]) / 20;
980 	}
981 	else if (EbHevcFilterType == 3){
982 
983 		a = (1 * p[0] + 1 * p[1] + 1 * p[2] +
984 			1 * p[0 + stride] + 4 * p[1 + stride] + 1 * p[2 + stride] +
985 			1 * p[0 + 2 * stride] + 1 * p[1 + 2 * stride] + 1 * p[2 + 2 * stride]) / 12;
986 
987 
988 	}
989 	else if (EbHevcFilterType == 4){
990 
991 		//gaussian matrix(Chroma)
992 		a = (1 * p[0] + 2 * p[1] + 1 * p[2] +
993 			2 * p[0 + stride] + 4 * p[1 + stride] + 2 * p[2 + stride] +
994 			1 * p[0 + 2 * stride] + 2 * p[1 + 2 * stride] + 1 * p[2 + 2 * stride]) / 16;
995 
996 	}
997 	else if (EbHevcFilterType == 5){
998 
999 		a = (2 * p[0] + 2 * p[1] + 2 * p[2] +
1000 			2 * p[0 + stride] + 4 * p[1 + stride] + 2 * p[2 + stride] +
1001 			2 * p[0 + 2 * stride] + 2 * p[1 + 2 * stride] + 2 * p[2 + 2 * stride]) / 20;
1002 
1003 	}
1004 	else if (EbHevcFilterType == 6){
1005 
1006 		a = (4 * p[0] + 4 * p[1] + 4 * p[2] +
1007 			4 * p[0 + stride] + 4 * p[1 + stride] + 4 * p[2 + stride] +
1008 			4 * p[0 + 2 * stride] + 4 * p[1 + 2 * stride] + 4 * p[2 + 2 * stride]) / 36;
1009 
1010 	}
1011 
1012 	return  (EB_U8)CLIP3EQ(0, 255, a);
1013 }
1014 
1015 
1016 /*******************************************
1017 * noiseExtractLumaStrong
1018 *  strong filter Luma.
1019 *******************************************/
noiseExtractLumaStrong(EbPictureBufferDesc_t * inputPicturePtr,EbPictureBufferDesc_t * denoisedPicturePtr,EB_U32 lcuOriginY,EB_U32 lcuOriginX)1020 void noiseExtractLumaStrong(
1021 	EbPictureBufferDesc_t       *inputPicturePtr,
1022 	EbPictureBufferDesc_t       *denoisedPicturePtr,
1023 	EB_U32                       lcuOriginY
1024 	, EB_U32                       lcuOriginX
1025 	)
1026 {
1027 	EB_U32  ii, jj;
1028 	EB_U32  picHeight, lcuHeight;
1029 	EB_U32  picWidth;
1030 	EB_U32  inputOriginIndex;
1031 	EB_U32  inputOriginIndexPad;
1032 
1033 	EB_U8 *ptrIn;
1034 	EB_U32 strideIn;
1035 	EB_U8 *ptrDenoised;
1036 
1037 	EB_U32 strideOut;
1038 	EB_U32 idx = (lcuOriginX + MAX_LCU_SIZE > inputPicturePtr->width) ? lcuOriginX : 0;
1039 
1040 	//Luma
1041 	{
1042 		picHeight = inputPicturePtr->height;
1043 		picWidth = inputPicturePtr->width;
1044 		lcuHeight = MIN(MAX_LCU_SIZE, picHeight - lcuOriginY);
1045 
1046 		strideIn = inputPicturePtr->strideY;
1047 		inputOriginIndex = inputPicturePtr->originX + (inputPicturePtr->originY + lcuOriginY)* inputPicturePtr->strideY;
1048 		ptrIn = &(inputPicturePtr->bufferY[inputOriginIndex]);
1049 
1050 		inputOriginIndexPad = denoisedPicturePtr->originX + (denoisedPicturePtr->originY + lcuOriginY) * denoisedPicturePtr->strideY;
1051 		strideOut = denoisedPicturePtr->strideY;
1052 		ptrDenoised = &(denoisedPicturePtr->bufferY[inputOriginIndexPad]);
1053 
1054 		for (jj = 0; jj < lcuHeight; jj++){
1055 			for (ii = idx; ii < picWidth; ii++){
1056 
1057 				if ((jj>0 || lcuOriginY > 0) && (jj < lcuHeight - 1 || lcuOriginY + lcuHeight < picHeight) && ii>0 && ii < picWidth - 1){
1058 
1059 					ptrDenoised[ii + jj*strideOut] = getFilteredTypes(&ptrIn[ii + jj*strideIn], strideIn, 4);
1060 
1061 				}
1062 				else{
1063 					ptrDenoised[ii + jj*strideOut] = ptrIn[ii + jj*strideIn];
1064 
1065 				}
1066 
1067 			}
1068 		}
1069 	}
1070 
1071 }
1072 
1073 /*******************************************
1074 * noiseExtractChromaStrong
1075 *  strong filter chroma.
1076 *******************************************/
noiseExtractChromaStrong(EbPictureBufferDesc_t * inputPicturePtr,EbPictureBufferDesc_t * denoisedPicturePtr,EB_U32 lcuOriginY,EB_U32 lcuOriginX)1077 void noiseExtractChromaStrong(
1078 	EbPictureBufferDesc_t       *inputPicturePtr,
1079 	EbPictureBufferDesc_t       *denoisedPicturePtr,
1080 	EB_U32                       lcuOriginY
1081 	, EB_U32                       lcuOriginX
1082 	)
1083 {
1084 	EB_U32  ii, jj;
1085 	EB_U32  picHeight, lcuHeight;
1086 	EB_U32  picWidth;
1087 	EB_U32  inputOriginIndex;
1088 	EB_U32  inputOriginIndexPad;
1089 
1090 	EB_U8 *ptrIn;
1091 	EB_U32 strideIn;
1092 	EB_U8 *ptrDenoised;
1093 
1094 	EB_U32 strideOut;
1095 	EB_U32 idx = (lcuOriginX + MAX_LCU_SIZE > inputPicturePtr->width) ? lcuOriginX : 0;
1096 
1097     EB_U32 colorFormat      = inputPicturePtr->colorFormat;
1098     EB_U16 subWidthCMinus1  = (colorFormat == EB_YUV444 ? 1 : 2) - 1;
1099     EB_U16 subHeightCMinus1 = (colorFormat >= EB_YUV422 ? 1 : 2) - 1;
1100 
1101 
1102 	//Cb
1103 	{
1104 		picHeight = inputPicturePtr->height >> subHeightCMinus1;
1105 		picWidth = inputPicturePtr->width >> subWidthCMinus1;
1106 		lcuHeight = MIN(MAX_LCU_SIZE >> subHeightCMinus1, picHeight - lcuOriginY);
1107 
1108 		strideIn = inputPicturePtr->strideCb;
1109 		inputOriginIndex = (inputPicturePtr->originX >> subWidthCMinus1) + ((inputPicturePtr->originY >> subHeightCMinus1) + lcuOriginY)  * inputPicturePtr->strideCb;
1110 		ptrIn = &(inputPicturePtr->bufferCb[inputOriginIndex]);
1111 
1112 		inputOriginIndexPad = (denoisedPicturePtr->originX >> subWidthCMinus1) + ((denoisedPicturePtr->originY >> subHeightCMinus1) + lcuOriginY)  * denoisedPicturePtr->strideCb;
1113 		strideOut = denoisedPicturePtr->strideCb;
1114 		ptrDenoised = &(denoisedPicturePtr->bufferCb[inputOriginIndexPad]);
1115 
1116 
1117 		for (jj = 0; jj < lcuHeight; jj++){
1118 			for (ii = idx; ii < picWidth; ii++){
1119 
1120 
1121 				if ((jj>0 || lcuOriginY > 0) && (jj < lcuHeight - 1 || (lcuOriginY + lcuHeight) < picHeight) && ii > 0 && ii < picWidth - 1){
1122 					ptrDenoised[ii + jj*strideOut] = getFilteredTypes(&ptrIn[ii + jj*strideIn], strideIn, 6);
1123 				}
1124 				else{
1125 					ptrDenoised[ii + jj*strideOut] = ptrIn[ii + jj*strideIn];
1126 				}
1127 
1128 			}
1129 		}
1130 	}
1131 
1132 	//Cr
1133 	{
1134 		picHeight = inputPicturePtr->height >> subHeightCMinus1;
1135 		picWidth = inputPicturePtr->width >> subWidthCMinus1;
1136 		lcuHeight = MIN(MAX_LCU_SIZE >> subHeightCMinus1, picHeight - lcuOriginY);
1137 
1138 		strideIn = inputPicturePtr->strideCr;
1139 		inputOriginIndex = (inputPicturePtr->originX >> subWidthCMinus1) + ((inputPicturePtr->originY >> subHeightCMinus1) + lcuOriginY)  * inputPicturePtr->strideCr;
1140 		ptrIn = &(inputPicturePtr->bufferCr[inputOriginIndex]);
1141 
1142 		inputOriginIndexPad = (denoisedPicturePtr->originX >> subWidthCMinus1) + ((denoisedPicturePtr->originY >> subHeightCMinus1) + lcuOriginY)  * denoisedPicturePtr->strideCr;
1143 		strideOut = denoisedPicturePtr->strideCr;
1144 		ptrDenoised = &(denoisedPicturePtr->bufferCr[inputOriginIndexPad]);
1145 
1146 
1147 		for (jj = 0; jj < lcuHeight; jj++){
1148 			for (ii = idx; ii < picWidth; ii++){
1149 
1150 				if ((jj>0 || lcuOriginY > 0) && (jj < lcuHeight - 1 || (lcuOriginY + lcuHeight) < picHeight) && ii > 0 && ii < picWidth - 1){
1151 					ptrDenoised[ii + jj*strideOut] = getFilteredTypes(&ptrIn[ii + jj*strideIn], strideIn, 6);
1152 				}
1153 				else{
1154 					ptrDenoised[ii + jj*strideOut] = ptrIn[ii + jj*strideIn];
1155 				}
1156 
1157 			}
1158 		}
1159 	}
1160 }
1161 
1162 /*******************************************
1163 * noiseExtractChromaWeak
1164 *  weak filter chroma.
1165 *******************************************/
noiseExtractChromaWeak(EbPictureBufferDesc_t * inputPicturePtr,EbPictureBufferDesc_t * denoisedPicturePtr,EB_U32 lcuOriginY,EB_U32 lcuOriginX)1166 void noiseExtractChromaWeak(
1167 	EbPictureBufferDesc_t       *inputPicturePtr,
1168 	EbPictureBufferDesc_t       *denoisedPicturePtr,
1169 	EB_U32                       lcuOriginY
1170 	, EB_U32                       lcuOriginX
1171 	)
1172 {
1173 	EB_U32  ii, jj;
1174 	EB_U32  picHeight, lcuHeight;
1175 	EB_U32  picWidth;
1176 	EB_U32  inputOriginIndex;
1177 	EB_U32  inputOriginIndexPad;
1178 
1179 	EB_U8 *ptrIn;
1180 	EB_U32 strideIn;
1181 	EB_U8 *ptrDenoised;
1182 
1183 	EB_U32 strideOut;
1184 
1185 	EB_U32 idx = (lcuOriginX + MAX_LCU_SIZE > inputPicturePtr->width) ? lcuOriginX : 0;
1186 
1187     EB_U32 colorFormat      = inputPicturePtr->colorFormat;
1188     EB_U16 subWidthCMinus1  = (colorFormat == EB_YUV444 ? 1 : 2) - 1;
1189     EB_U16 subHeightCMinus1 = (colorFormat >= EB_YUV422 ? 1 : 2) - 1;
1190 
1191 
1192 	//Cb
1193 	{
1194 		picHeight = inputPicturePtr->height >> subHeightCMinus1;
1195 		picWidth = inputPicturePtr->width >> subWidthCMinus1;
1196 
1197 		lcuHeight = MIN(MAX_LCU_SIZE >> subHeightCMinus1, picHeight - lcuOriginY);
1198 
1199 		strideIn = inputPicturePtr->strideCb;
1200 		inputOriginIndex = (inputPicturePtr->originX >> subWidthCMinus1) + ((inputPicturePtr->originY >> subHeightCMinus1) + lcuOriginY)* inputPicturePtr->strideCb;
1201 		ptrIn = &(inputPicturePtr->bufferCb[inputOriginIndex]);
1202 
1203 		inputOriginIndexPad = (denoisedPicturePtr->originX >> subWidthCMinus1) + ((denoisedPicturePtr->originY >> subHeightCMinus1) + lcuOriginY)* denoisedPicturePtr->strideCb;
1204 		strideOut = denoisedPicturePtr->strideCb;
1205 		ptrDenoised = &(denoisedPicturePtr->bufferCb[inputOriginIndexPad]);
1206 
1207 
1208 		for (jj = 0; jj < lcuHeight; jj++){
1209 			for (ii = idx; ii < picWidth; ii++){
1210 
1211 
1212 				if ((jj>0 || lcuOriginY > 0) && (jj < lcuHeight - 1 || (lcuOriginY + lcuHeight) < picHeight) && ii > 0 && ii < picWidth - 1){
1213 					ptrDenoised[ii + jj*strideOut] = getFilteredTypes(&ptrIn[ii + jj*strideIn], strideIn, 4);
1214 				}
1215 				else{
1216 					ptrDenoised[ii + jj*strideOut] = ptrIn[ii + jj*strideIn];
1217 				}
1218 
1219 			}
1220 		}
1221 	}
1222 
1223 	//Cr
1224 	{
1225 		picHeight = inputPicturePtr->height >> subHeightCMinus1;
1226 		picWidth = inputPicturePtr->width >> subWidthCMinus1;
1227 		lcuHeight = MIN(MAX_LCU_SIZE >> subHeightCMinus1, picHeight - lcuOriginY);
1228 
1229 		strideIn = inputPicturePtr->strideCr;
1230 		inputOriginIndex = (inputPicturePtr->originX >> subWidthCMinus1) + ((inputPicturePtr->originY >> subHeightCMinus1) + lcuOriginY)* inputPicturePtr->strideCr;
1231 		ptrIn = &(inputPicturePtr->bufferCr[inputOriginIndex]);
1232 
1233 		inputOriginIndexPad = (denoisedPicturePtr->originX >> subWidthCMinus1) + ((denoisedPicturePtr->originY >> subHeightCMinus1) + lcuOriginY)* denoisedPicturePtr->strideCr;
1234 		strideOut = denoisedPicturePtr->strideCr;
1235 		ptrDenoised = &(denoisedPicturePtr->bufferCr[inputOriginIndexPad]);
1236 
1237 
1238 		for (jj = 0; jj < lcuHeight; jj++){
1239 			for (ii = idx; ii < picWidth; ii++){
1240 
1241 				if ((jj>0 || lcuOriginY > 0) && (jj < lcuHeight - 1 || (lcuOriginY + lcuHeight) < picHeight) && ii > 0 && ii < picWidth - 1){
1242 					ptrDenoised[ii + jj*strideOut] = getFilteredTypes(&ptrIn[ii + jj*strideIn], strideIn, 4);
1243 				}
1244 				else{
1245 					ptrDenoised[ii + jj*strideOut] = ptrIn[ii + jj*strideIn];
1246 				}
1247 
1248 			}
1249 		}
1250 	}
1251 
1252 }
1253 
1254 /*******************************************
1255 * noiseExtractLumaWeak
1256 *  weak filter Luma and store noise.
1257 *******************************************/
noiseExtractLumaWeak(EbPictureBufferDesc_t * inputPicturePtr,EbPictureBufferDesc_t * denoisedPicturePtr,EbPictureBufferDesc_t * noisePicturePtr,EB_U32 lcuOriginY,EB_U32 lcuOriginX)1258 void noiseExtractLumaWeak(
1259 	EbPictureBufferDesc_t       *inputPicturePtr,
1260 	EbPictureBufferDesc_t       *denoisedPicturePtr,
1261 	EbPictureBufferDesc_t       *noisePicturePtr,
1262 	EB_U32                       lcuOriginY
1263 	, EB_U32						 lcuOriginX
1264 	)
1265 {
1266 	EB_U32  ii, jj;
1267 	EB_U32  picHeight, lcuHeight;
1268 	EB_U32  picWidth;
1269 	EB_U32  inputOriginIndex;
1270 	EB_U32  inputOriginIndexPad;
1271 	EB_U32  noiseOriginIndex;
1272 
1273 	EB_U8 *ptrIn;
1274 	EB_U32 strideIn;
1275 	EB_U8 *ptrDenoised;
1276 
1277 	EB_U8 *ptrNoise;
1278 	EB_U32 strideOut;
1279 
1280 	EB_U32 idx = (lcuOriginX + MAX_LCU_SIZE > inputPicturePtr->width) ? lcuOriginX : 0;
1281 
1282 	//Luma
1283 	{
1284 		picHeight = inputPicturePtr->height;
1285 		picWidth = inputPicturePtr->width;
1286 		lcuHeight = MIN(MAX_LCU_SIZE, picHeight - lcuOriginY);
1287 
1288 		strideIn = inputPicturePtr->strideY;
1289 		inputOriginIndex = inputPicturePtr->originX + (inputPicturePtr->originY + lcuOriginY) * inputPicturePtr->strideY;
1290 		ptrIn = &(inputPicturePtr->bufferY[inputOriginIndex]);
1291 
1292 		inputOriginIndexPad = denoisedPicturePtr->originX + (denoisedPicturePtr->originY + lcuOriginY) * denoisedPicturePtr->strideY;
1293 		strideOut = denoisedPicturePtr->strideY;
1294 		ptrDenoised = &(denoisedPicturePtr->bufferY[inputOriginIndexPad]);
1295 
1296 		noiseOriginIndex = noisePicturePtr->originX + noisePicturePtr->originY * noisePicturePtr->strideY;
1297 		ptrNoise = &(noisePicturePtr->bufferY[noiseOriginIndex]);
1298 
1299 
1300 		for (jj = 0; jj < lcuHeight; jj++){
1301 			for (ii = idx; ii < picWidth; ii++){
1302 
1303 				if ((jj>0 || lcuOriginY > 0) && (jj < lcuHeight - 1 || lcuOriginY + lcuHeight < picHeight) && ii>0 && ii < picWidth - 1){
1304 
1305 					ptrDenoised[ii + jj*strideOut] = getFilteredTypes(&ptrIn[ii + jj*strideIn], strideIn, 0);
1306 					ptrNoise[ii + jj*strideOut] = CLIP3EQ(0, 255, ptrIn[ii + jj*strideIn] - ptrDenoised[ii + jj*strideOut]);
1307 
1308 				}
1309 				else{
1310 					ptrDenoised[ii + jj*strideOut] = ptrIn[ii + jj*strideIn];
1311 					ptrNoise[ii + jj*strideOut] = 0;
1312 				}
1313 
1314 			}
1315 		}
1316 	}
1317 
1318 }
1319 
/*******************************************
* noiseExtractLumaWeakLcu
*  weak filter Luma and store noise for a single LCU.
*  Processes the block whose top-left luma sample is
*  (lcuOriginX, lcuOriginY), at most MAX_LCU_SIZE x MAX_LCU_SIZE and
*  cropped to the picture. For every sample the denoised picture
*  receives getFilteredTypes(..., 0) and the noise picture receives
*  (input - denoised) clipped to [0, 255]. Samples on the first/last
*  row or first/last column of the picture are copied unfiltered and
*  their noise is set to 0.
*
*  All three pointers are already offset by lcuOriginX, so the column
*  index is LCU-relative and runs over [0, lcuWidth). The previous code
*  started the scan at lcuOriginX whenever the LCU crossed the right
*  picture edge, which skipped the whole block for any LCU-aligned
*  non-zero origin.
*
*  The noise picture is addressed without lcuOriginY (the block is
*  stored starting at the noise picture's first row) and its rows are
*  stepped with noisePicturePtr->strideY, the same stride used for its
*  origin, rather than the denoised picture's stride.
*******************************************/
void noiseExtractLumaWeakLcu(
	EbPictureBufferDesc_t       *inputPicturePtr,
	EbPictureBufferDesc_t       *denoisedPicturePtr,
	EbPictureBufferDesc_t       *noisePicturePtr,
	EB_U32                       lcuOriginY,
	EB_U32                       lcuOriginX)
{
	const EB_U32 picHeight = inputPicturePtr->height;
	const EB_U32 picWidth  = inputPicturePtr->width;
	const EB_U32 lcuHeight = MIN(MAX_LCU_SIZE, picHeight - lcuOriginY);
	const EB_U32 lcuWidth  = MIN(MAX_LCU_SIZE, picWidth - lcuOriginX);

	// Each buffer is walked with its own row pitch.
	const EB_U32 strideIn       = inputPicturePtr->strideY;
	const EB_U32 strideDenoised = denoisedPicturePtr->strideY;
	const EB_U32 strideNoise    = noisePicturePtr->strideY;

	// Offsets of the LCU's top-left sample inside each buffer.
	const EB_U32 inputOriginIndex    = inputPicturePtr->originX + lcuOriginX + (inputPicturePtr->originY + lcuOriginY) * inputPicturePtr->strideY;
	const EB_U32 denoisedOriginIndex = denoisedPicturePtr->originX + lcuOriginX + (denoisedPicturePtr->originY + lcuOriginY) * denoisedPicturePtr->strideY;
	const EB_U32 noiseOriginIndex    = noisePicturePtr->originX + lcuOriginX + noisePicturePtr->originY * noisePicturePtr->strideY;

	EB_U8 *ptrIn       = &(inputPicturePtr->bufferY[inputOriginIndex]);
	EB_U8 *ptrDenoised = &(denoisedPicturePtr->bufferY[denoisedOriginIndex]);
	EB_U8 *ptrNoise    = &(noisePicturePtr->bufferY[noiseOriginIndex]);

	EB_U32 row, col;

	for (row = 0; row < lcuHeight; row++) {
		const EB_U32 inRow       = row * strideIn;
		const EB_U32 denoisedRow = row * strideDenoised;
		const EB_U32 noiseRow    = row * strideNoise;

		// The first and the last row of the picture are never filtered.
		const int rowIsInterior =
			(row > 0 || lcuOriginY > 0) &&
			(row < lcuHeight - 1 || lcuOriginY + lcuHeight < picHeight);

		for (col = 0; col < lcuWidth; col++) {
			// col is LCU-relative; col + lcuOriginX is the picture column.
			const int colIsInterior =
				(col > 0 || lcuOriginX > 0) &&
				(col + lcuOriginX) < picWidth - 1;

			if (rowIsInterior && colIsInterior) {
				ptrDenoised[col + denoisedRow] = getFilteredTypes(&ptrIn[col + inRow], strideIn, 0);
				// The difference is evaluated as int, so negative noise clips to 0.
				ptrNoise[col + noiseRow] = CLIP3EQ(0, 255, ptrIn[col + inRow] - ptrDenoised[col + denoisedRow]);
			}
			else {
				ptrDenoised[col + denoisedRow] = ptrIn[col + inRow];
				ptrNoise[col + noiseRow] = 0;
			}
		}
	}
}
1382 
ZeroOutChromaBlockMean(PictureParentControlSet_t * pictureControlSetPtr,EB_U32 lcuCodingOrder)1383 static EB_ERRORTYPE ZeroOutChromaBlockMean(
1384 	PictureParentControlSet_t   *pictureControlSetPtr,          // input parameter, Picture Control Set Ptr
1385 	EB_U32                       lcuCodingOrder                // input parameter, LCU address
1386 	)
1387 {
1388 
1389 	EB_ERRORTYPE return_error = EB_ErrorNone;
1390 	// 16x16 mean
1391 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_0] = 0;
1392 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_1] = 0;
1393 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_2] = 0;
1394 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_3] = 0;
1395 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_4] = 0;
1396 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_5] = 0;
1397 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_6] = 0;
1398 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_7] = 0;
1399 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_8] = 0;
1400 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_9] = 0;
1401 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_10] = 0;
1402 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_11] = 0;
1403 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_12] = 0;
1404 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_13] = 0;
1405 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_14] = 0;
1406 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_15] = 0;
1407 
1408 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_0] = 0;
1409 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_1] = 0;
1410 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_2] = 0;
1411 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_3] = 0;
1412 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_4] = 0;
1413 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_5] = 0;
1414 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_6] = 0;
1415 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_7] = 0;
1416 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_8] = 0;
1417 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_9] = 0;
1418 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_10] = 0;
1419 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_11] = 0;
1420 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_12] = 0;
1421 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_13] = 0;
1422 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_14] = 0;
1423 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_15] = 0;
1424 
1425 	// 32x32 mean
1426 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_32x32_0] = 0;
1427 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_32x32_1] = 0;
1428 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_32x32_2] = 0;
1429 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_32x32_3] = 0;
1430 
1431 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_32x32_0] = 0;
1432 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_32x32_1] = 0;
1433 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_32x32_2] = 0;
1434 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_32x32_3] = 0;
1435 
1436 	// 64x64 mean
1437 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_64x64] = 0;
1438 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_64x64] = 0;
1439 
1440 	return return_error;
1441 
1442 }
1443 
1444 /*******************************************
1445 * ComputeChromaBlockMean
1446 *   computes the chroma block mean for 64x64, 32x32 and 16x16 CUs inside the tree block
1447 *******************************************/
ComputeChromaBlockMean(PictureParentControlSet_t * pictureControlSetPtr,EbPictureBufferDesc_t * inputPaddedPicturePtr,EB_U32 lcuCodingOrder,EB_U32 inputCbOriginIndex,EB_U32 inputCrOriginIndex)1448 static EB_ERRORTYPE ComputeChromaBlockMean(
1449 	PictureParentControlSet_t   *pictureControlSetPtr,          // input parameter, Picture Control Set Ptr
1450 	EbPictureBufferDesc_t       *inputPaddedPicturePtr,         // input parameter, Input Padded Picture
1451 	EB_U32                       lcuCodingOrder,                // input parameter, LCU address
1452 	EB_U32                       inputCbOriginIndex,            // input parameter, LCU index, used to point to source/reference samples
1453 	EB_U32                       inputCrOriginIndex)            // input parameter, LCU index, used to point to source/reference samples
1454 {
1455 
1456 	EB_ERRORTYPE return_error = EB_ErrorNone;
1457 
1458 	EB_U32 cbBlockIndex, crBlockIndex;
1459 
1460 	EB_U64 cbMeanOf16x16Blocks[16];
1461 	EB_U64 crMeanOf16x16Blocks[16];
1462 
1463 	EB_U64 cbMeanOf32x32Blocks[4];
1464 	EB_U64 crMeanOf32x32Blocks[4];
1465 
1466 	EB_U64 cbMeanOf64x64Blocks;
1467 	EB_U64 crMeanOf64x64Blocks;
1468 
1469 
1470 	// (0,0) 16x16 block
1471 	cbBlockIndex = inputCbOriginIndex;
1472 	crBlockIndex = inputCrOriginIndex;
1473 
1474 	const EB_U16 strideCb = inputPaddedPicturePtr->strideCb;
1475 	const EB_U16 strideCr = inputPaddedPicturePtr->strideCr;
1476 
1477 	cbMeanOf16x16Blocks[0] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCb[cbBlockIndex]), strideCb);
1478 	crMeanOf16x16Blocks[0] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCr[crBlockIndex]), strideCr);
1479 
1480 	// (0,1)
1481 	cbBlockIndex = cbBlockIndex + 8;
1482 	crBlockIndex = crBlockIndex + 8;
1483 	cbMeanOf16x16Blocks[1] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCb[cbBlockIndex]), strideCb);
1484 	crMeanOf16x16Blocks[1] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCr[crBlockIndex]), strideCr);
1485 
1486 	// (0,2)
1487 	cbBlockIndex = cbBlockIndex + 8;
1488 	crBlockIndex = crBlockIndex + 8;
1489 	cbMeanOf16x16Blocks[2] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCb[cbBlockIndex]), strideCb);
1490 	crMeanOf16x16Blocks[2] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCr[crBlockIndex]), strideCr);
1491 
1492 	// (0,3)
1493 	cbBlockIndex = cbBlockIndex + 8;
1494 	crBlockIndex = crBlockIndex + 8;
1495 	cbMeanOf16x16Blocks[3] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCb[cbBlockIndex]), strideCb);
1496 	crMeanOf16x16Blocks[3] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCr[crBlockIndex]), strideCr);
1497 
1498 	// (1,0)
1499 	cbBlockIndex = inputCbOriginIndex + (strideCb << 3);
1500 	crBlockIndex = inputCrOriginIndex + (strideCr << 3);
1501 	cbMeanOf16x16Blocks[4] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCb[cbBlockIndex]), strideCb);
1502 	crMeanOf16x16Blocks[4] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCr[crBlockIndex]), strideCr);
1503 
1504 	// (1,1)
1505 	cbBlockIndex = cbBlockIndex + 8;
1506 	crBlockIndex = crBlockIndex + 8;
1507 	cbMeanOf16x16Blocks[5] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCb[cbBlockIndex]), strideCb);
1508 	crMeanOf16x16Blocks[5] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCr[crBlockIndex]), strideCr);
1509 
1510 	// (1,2)
1511 	cbBlockIndex = cbBlockIndex + 8;
1512 	crBlockIndex = crBlockIndex + 8;
1513 	cbMeanOf16x16Blocks[6] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCb[cbBlockIndex]), strideCb);
1514 	crMeanOf16x16Blocks[6] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCr[crBlockIndex]), strideCr);
1515 
1516 	// (1,3)
1517 	cbBlockIndex = cbBlockIndex + 8;
1518 	crBlockIndex = crBlockIndex + 8;
1519 	cbMeanOf16x16Blocks[7] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCb[cbBlockIndex]), strideCb);
1520 	crMeanOf16x16Blocks[7] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCr[crBlockIndex]), strideCr);
1521 
1522 	// (2,0)
1523 	cbBlockIndex = inputCbOriginIndex + (strideCb << 4);
1524 	crBlockIndex = inputCrOriginIndex + (strideCr << 4);
1525 	cbMeanOf16x16Blocks[8] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCb[cbBlockIndex]), strideCb);
1526 	crMeanOf16x16Blocks[8] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCr[crBlockIndex]), strideCr);
1527 
1528 	// (2,1)
1529 	cbBlockIndex = cbBlockIndex + 8;
1530 	crBlockIndex = crBlockIndex + 8;
1531 	cbMeanOf16x16Blocks[9] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCb[cbBlockIndex]), strideCb);
1532 	crMeanOf16x16Blocks[9] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCr[crBlockIndex]), strideCr);
1533 
1534 	// (2,2)
1535 	cbBlockIndex = cbBlockIndex + 8;
1536 	crBlockIndex = crBlockIndex + 8;
1537 	cbMeanOf16x16Blocks[10] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCb[cbBlockIndex]), strideCb);
1538 	crMeanOf16x16Blocks[10] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCr[crBlockIndex]), strideCr);
1539 
1540 	// (2,3)
1541 	cbBlockIndex = cbBlockIndex + 8;
1542 	crBlockIndex = crBlockIndex + 8;
1543 	cbMeanOf16x16Blocks[11] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCb[cbBlockIndex]), strideCb);
1544 	crMeanOf16x16Blocks[11] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCr[crBlockIndex]), strideCr);
1545 
1546 	// (3,0)
1547 	cbBlockIndex = inputCbOriginIndex + (strideCb * 24);
1548 	crBlockIndex = inputCrOriginIndex + (strideCr * 24);
1549 	cbMeanOf16x16Blocks[12] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCb[cbBlockIndex]), strideCb);
1550 	crMeanOf16x16Blocks[12] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCr[crBlockIndex]), strideCr);
1551 
1552 	// (3,1)
1553 	cbBlockIndex = cbBlockIndex + 8;
1554 	crBlockIndex = crBlockIndex + 8;
1555 	cbMeanOf16x16Blocks[13] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCb[cbBlockIndex]), strideCb);
1556 	crMeanOf16x16Blocks[13] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCr[crBlockIndex]), strideCr);
1557 
1558 	// (3,2)
1559 	cbBlockIndex = cbBlockIndex + 8;
1560 	crBlockIndex = crBlockIndex + 8;
1561 	cbMeanOf16x16Blocks[14] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCb[cbBlockIndex]), strideCb);
1562 	crMeanOf16x16Blocks[14] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCr[crBlockIndex]), strideCr);
1563 
1564 	// (3,3)
1565 	cbBlockIndex = cbBlockIndex + 8;
1566 	crBlockIndex = crBlockIndex + 8;
1567 	cbMeanOf16x16Blocks[15] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCb[cbBlockIndex]), strideCb);
1568 	crMeanOf16x16Blocks[15] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCr[crBlockIndex]), strideCr);
1569 
1570 
1571 	// 32x32
1572 	cbMeanOf32x32Blocks[0] = (cbMeanOf16x16Blocks[0] + cbMeanOf16x16Blocks[1] + cbMeanOf16x16Blocks[4] + cbMeanOf16x16Blocks[5]) >> 2;
1573 	crMeanOf32x32Blocks[0] = (crMeanOf16x16Blocks[0] + crMeanOf16x16Blocks[1] + crMeanOf16x16Blocks[4] + crMeanOf16x16Blocks[5]) >> 2;
1574 
1575 	cbMeanOf32x32Blocks[1] = (cbMeanOf16x16Blocks[2] + cbMeanOf16x16Blocks[3] + cbMeanOf16x16Blocks[6] + cbMeanOf16x16Blocks[7]) >> 2;
1576 	crMeanOf32x32Blocks[1] = (crMeanOf16x16Blocks[2] + crMeanOf16x16Blocks[3] + crMeanOf16x16Blocks[6] + crMeanOf16x16Blocks[7]) >> 2;
1577 
1578 
1579 	cbMeanOf32x32Blocks[2] = (cbMeanOf16x16Blocks[8] + cbMeanOf16x16Blocks[9] + cbMeanOf16x16Blocks[12] + cbMeanOf16x16Blocks[13]) >> 2;
1580 	crMeanOf32x32Blocks[2] = (crMeanOf16x16Blocks[8] + crMeanOf16x16Blocks[9] + crMeanOf16x16Blocks[12] + crMeanOf16x16Blocks[13]) >> 2;
1581 
1582 	cbMeanOf32x32Blocks[3] = (cbMeanOf16x16Blocks[10] + cbMeanOf16x16Blocks[11] + cbMeanOf16x16Blocks[14] + cbMeanOf16x16Blocks[15]) >> 2;
1583 	crMeanOf32x32Blocks[3] = (crMeanOf16x16Blocks[10] + crMeanOf16x16Blocks[11] + crMeanOf16x16Blocks[14] + crMeanOf16x16Blocks[15]) >> 2;
1584 
1585 	// 64x64
1586 	cbMeanOf64x64Blocks = (cbMeanOf32x32Blocks[0] + cbMeanOf32x32Blocks[1] + cbMeanOf32x32Blocks[3] + cbMeanOf32x32Blocks[3]) >> 2;
1587 	crMeanOf64x64Blocks = (crMeanOf32x32Blocks[0] + crMeanOf32x32Blocks[1] + crMeanOf32x32Blocks[3] + crMeanOf32x32Blocks[3]) >> 2;
1588 	// 16x16 mean
1589 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_0] = (EB_U8) (cbMeanOf16x16Blocks[0] >> MEAN_PRECISION);
1590 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_1] = (EB_U8) (cbMeanOf16x16Blocks[1] >> MEAN_PRECISION);
1591 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_2] = (EB_U8) (cbMeanOf16x16Blocks[2] >> MEAN_PRECISION);
1592 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_3] = (EB_U8) (cbMeanOf16x16Blocks[3] >> MEAN_PRECISION);
1593 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_4] = (EB_U8) (cbMeanOf16x16Blocks[4] >> MEAN_PRECISION);
1594 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_5] = (EB_U8) (cbMeanOf16x16Blocks[5] >> MEAN_PRECISION);
1595 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_6] = (EB_U8) (cbMeanOf16x16Blocks[6] >> MEAN_PRECISION);
1596 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_7] = (EB_U8) (cbMeanOf16x16Blocks[7] >> MEAN_PRECISION);
1597 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_8] = (EB_U8) (cbMeanOf16x16Blocks[8] >> MEAN_PRECISION);
1598 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_9] = (EB_U8) (cbMeanOf16x16Blocks[9] >> MEAN_PRECISION);
1599 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_10] = (EB_U8) (cbMeanOf16x16Blocks[10] >> MEAN_PRECISION);
1600 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_11] = (EB_U8) (cbMeanOf16x16Blocks[11] >> MEAN_PRECISION);
1601 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_12] = (EB_U8) (cbMeanOf16x16Blocks[12] >> MEAN_PRECISION);
1602 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_13] = (EB_U8) (cbMeanOf16x16Blocks[13] >> MEAN_PRECISION);
1603 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_14] = (EB_U8) (cbMeanOf16x16Blocks[14] >> MEAN_PRECISION);
1604 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_15] = (EB_U8) (cbMeanOf16x16Blocks[15] >> MEAN_PRECISION);
1605 
1606 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_0] = (EB_U8) (crMeanOf16x16Blocks[0] >> MEAN_PRECISION);
1607 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_1] = (EB_U8) (crMeanOf16x16Blocks[1] >> MEAN_PRECISION);
1608 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_2] = (EB_U8) (crMeanOf16x16Blocks[2] >> MEAN_PRECISION);
1609 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_3] = (EB_U8) (crMeanOf16x16Blocks[3] >> MEAN_PRECISION);
1610 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_4] = (EB_U8) (crMeanOf16x16Blocks[4] >> MEAN_PRECISION);
1611 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_5] = (EB_U8) (crMeanOf16x16Blocks[5] >> MEAN_PRECISION);
1612 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_6] = (EB_U8) (crMeanOf16x16Blocks[6] >> MEAN_PRECISION);
1613 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_7] = (EB_U8) (crMeanOf16x16Blocks[7] >> MEAN_PRECISION);
1614 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_8] = (EB_U8) (crMeanOf16x16Blocks[8] >> MEAN_PRECISION);
1615 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_9] = (EB_U8) (crMeanOf16x16Blocks[9] >> MEAN_PRECISION);
1616 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_10] = (EB_U8) (crMeanOf16x16Blocks[10] >> MEAN_PRECISION);
1617 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_11] = (EB_U8) (crMeanOf16x16Blocks[11] >> MEAN_PRECISION);
1618 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_12] = (EB_U8) (crMeanOf16x16Blocks[12] >> MEAN_PRECISION);
1619 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_13] = (EB_U8) (crMeanOf16x16Blocks[13] >> MEAN_PRECISION);
1620 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_14] = (EB_U8) (crMeanOf16x16Blocks[14] >> MEAN_PRECISION);
1621 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_15] = (EB_U8) (crMeanOf16x16Blocks[15] >> MEAN_PRECISION);
1622 
1623 	// 32x32 mean
1624 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_32x32_0] = (EB_U8) (cbMeanOf32x32Blocks[0] >> MEAN_PRECISION);
1625 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_32x32_1] = (EB_U8) (cbMeanOf32x32Blocks[1] >> MEAN_PRECISION);
1626 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_32x32_2] = (EB_U8) (cbMeanOf32x32Blocks[2] >> MEAN_PRECISION);
1627 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_32x32_3] = (EB_U8) (cbMeanOf32x32Blocks[3] >> MEAN_PRECISION);
1628 
1629 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_32x32_0] = (EB_U8)(crMeanOf32x32Blocks[0] >> MEAN_PRECISION);
1630 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_32x32_1] = (EB_U8)(crMeanOf32x32Blocks[1] >> MEAN_PRECISION);
1631 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_32x32_2] = (EB_U8)(crMeanOf32x32Blocks[2] >> MEAN_PRECISION);
1632 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_32x32_3] = (EB_U8)(crMeanOf32x32Blocks[3] >> MEAN_PRECISION);
1633 
1634 	// 64x64 mean
1635 	pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_64x64] = (EB_U8) (cbMeanOf64x64Blocks >> MEAN_PRECISION);
1636 	pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_64x64] = (EB_U8) (crMeanOf64x64Blocks >> MEAN_PRECISION);
1637 
1638 	return return_error;
1639 }
1640 
1641 
1642 /*******************************************
1643 * ComputeBlockMeanComputeVariance
1644 *   computes the variance and the block mean of all CUs inside the tree block
1645 *******************************************/
ComputeBlockMeanComputeVariance(PictureParentControlSet_t * pictureControlSetPtr,EbPictureBufferDesc_t * inputPaddedPicturePtr,EB_U32 lcuIndex,EB_U32 inputLumaOriginIndex)1646 static EB_ERRORTYPE ComputeBlockMeanComputeVariance(
1647     PictureParentControlSet_t   *pictureControlSetPtr,          // input parameter, Picture Control Set Ptr
1648     EbPictureBufferDesc_t       *inputPaddedPicturePtr,         // input parameter, Input Padded Picture
1649     EB_U32                       lcuIndex,                      // input parameter, LCU address
1650     EB_U32                       inputLumaOriginIndex)          // input parameter, LCU index, used to point to source/reference samples
1651 {
1652 
1653     EB_ERRORTYPE return_error = EB_ErrorNone;
1654 
1655     EB_U32 blockIndex;
1656 
1657     EB_U64 meanOf8x8Blocks[64];
1658     EB_U64 meanOf8x8SquaredValuesBlocks[64];
1659 
1660     EB_U64 meanOf16x16Blocks[16];
1661     EB_U64 meanOf16x16SquaredValuesBlocks[16];
1662 
1663     EB_U64 meanOf32x32Blocks[4];
1664     EB_U64 meanOf32x32SquaredValuesBlocks[4];
1665 
1666     EB_U64 meanOf64x64Blocks;
1667     EB_U64 meanOf64x64SquaredValuesBlocks;
1668 
1669 	if (pictureControlSetPtr->disableVarianceFlag) {
1670 		memset16bit(pictureControlSetPtr->variance[lcuIndex], 125, MAX_ME_PU_COUNT);
1671 		EB_MEMSET(pictureControlSetPtr->yMean[lcuIndex], 125, sizeof(EB_U8) * MAX_ME_PU_COUNT);
1672 
1673 	}
1674 	else {
1675 
1676 		// (0,0)
1677 		blockIndex = inputLumaOriginIndex;
1678 
1679     const EB_U16 strideY = inputPaddedPicturePtr->strideY;
1680 
1681     if (!!(ASM_TYPES & AVX2_MASK)){
1682 
1683         ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[0], &meanOf8x8SquaredValuesBlocks[0]);
1684 
1685         // (0,1)
1686         blockIndex = blockIndex + 32;
1687 
1688 
1689         ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[4], &meanOf8x8SquaredValuesBlocks[4]);
1690 
1691         // (0,5)
1692         blockIndex = blockIndex + 24;
1693 
1694         // (1,0)
1695         blockIndex = inputLumaOriginIndex + (strideY << 3);
1696 
1697         ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[8], &meanOf8x8SquaredValuesBlocks[8]);
1698 
1699         // (1,1)
1700         blockIndex = blockIndex + 32;
1701 
1702         ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[12], &meanOf8x8SquaredValuesBlocks[12]);
1703 
1704         // (1,5)
1705         blockIndex = blockIndex + 24;
1706 
1707         // (2,0)
1708         blockIndex = inputLumaOriginIndex + (strideY << 4);
1709 
1710         ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[16], &meanOf8x8SquaredValuesBlocks[16]);
1711 
1712         // (2,1)
1713         blockIndex = blockIndex + 32;
1714 
1715         ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[20], &meanOf8x8SquaredValuesBlocks[20]);
1716 
1717         // (2,5)
1718         blockIndex = blockIndex + 24;
1719 
1720         // (3,0)
1721         blockIndex = inputLumaOriginIndex + (strideY << 3) + (strideY << 4);
1722 
1723 
1724         ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[24], &meanOf8x8SquaredValuesBlocks[24]);
1725 
1726         // (3,1)
1727         blockIndex = blockIndex + 32;
1728 
1729         ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[28], &meanOf8x8SquaredValuesBlocks[28]);
1730 
1731         // (3,5)
1732         blockIndex = blockIndex + 24;
1733 
1734         // (4,0)
1735         blockIndex = inputLumaOriginIndex + (strideY << 5);
1736 
1737         ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[32], &meanOf8x8SquaredValuesBlocks[32]);
1738 
1739         // (4,1)
1740         blockIndex = blockIndex + 32;
1741 
1742         ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[36], &meanOf8x8SquaredValuesBlocks[36]);
1743 
1744         // (4,5)
1745         blockIndex = blockIndex + 24;
1746 
1747         // (5,0)
1748         blockIndex = inputLumaOriginIndex + (strideY << 3) + (strideY << 5);
1749 
1750         ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[40], &meanOf8x8SquaredValuesBlocks[40]);
1751 
1752         // (5,1)
1753         blockIndex = blockIndex + 32;
1754 
1755         ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[44], &meanOf8x8SquaredValuesBlocks[44]);
1756 
1757         // (5,5)
1758         blockIndex = blockIndex + 24;
1759 
1760         // (6,0)
1761         blockIndex = inputLumaOriginIndex + (strideY << 4) + (strideY << 5);
1762 
1763         ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[48], &meanOf8x8SquaredValuesBlocks[48]);
1764 
1765         // (6,1)
1766         blockIndex = blockIndex + 32;
1767 
1768         ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[52], &meanOf8x8SquaredValuesBlocks[52]);
1769 
1770         // (6,5)
1771         blockIndex = blockIndex + 24;
1772 
1773 
1774         // (7,0)
1775         blockIndex = inputLumaOriginIndex + (strideY << 3) + (strideY << 4) + (strideY << 5);
1776 
1777         ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[56], &meanOf8x8SquaredValuesBlocks[56]);
1778 
1779 
1780         // (7,1)
1781         blockIndex = blockIndex + 32;
1782 
1783         ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[60], &meanOf8x8SquaredValuesBlocks[60]);
1784 
1785 
1786     }
1787     else{
1788         meanOf8x8Blocks[0] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1789         meanOf8x8SquaredValuesBlocks[0] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1790 
1791         // (0,1)
1792         blockIndex = blockIndex + 8;
1793         meanOf8x8Blocks[1] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1794         meanOf8x8SquaredValuesBlocks[1] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1795 
1796         // (0,2)
1797         blockIndex = blockIndex + 8;
1798         meanOf8x8Blocks[2] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1799         meanOf8x8SquaredValuesBlocks[2] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1800 
1801         // (0,3)
1802         blockIndex = blockIndex + 8;
1803         meanOf8x8Blocks[3] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1804         meanOf8x8SquaredValuesBlocks[3] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1805 
1806         // (0,4)
1807         blockIndex = blockIndex + 8;
1808         meanOf8x8Blocks[4] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1809         meanOf8x8SquaredValuesBlocks[4] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1810 
1811         // (0,5)
1812         blockIndex = blockIndex + 8;
1813         meanOf8x8Blocks[5] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1814         meanOf8x8SquaredValuesBlocks[5] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1815 
1816         // (0,6)
1817         blockIndex = blockIndex + 8;
1818         meanOf8x8Blocks[6] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1819         meanOf8x8SquaredValuesBlocks[6] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1820 
1821         // (0,7)
1822         blockIndex = blockIndex + 8;
1823         meanOf8x8Blocks[7] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1824         meanOf8x8SquaredValuesBlocks[7] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1825 
1826         // (1,0)
1827         blockIndex = inputLumaOriginIndex + (strideY << 3);
1828         meanOf8x8Blocks[8] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1829         meanOf8x8SquaredValuesBlocks[8] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1830 
1831         // (1,1)
1832         blockIndex = blockIndex + 8;
1833         meanOf8x8Blocks[9] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1834         meanOf8x8SquaredValuesBlocks[9] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1835 
1836         // (1,2)
1837         blockIndex = blockIndex + 8;
1838         meanOf8x8Blocks[10] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1839         meanOf8x8SquaredValuesBlocks[10] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1840 
1841         // (1,3)
1842         blockIndex = blockIndex + 8;
1843         meanOf8x8Blocks[11] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1844         meanOf8x8SquaredValuesBlocks[11] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1845 
1846         // (1,4)
1847         blockIndex = blockIndex + 8;
1848         meanOf8x8Blocks[12] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1849         meanOf8x8SquaredValuesBlocks[12] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1850 
1851         // (1,5)
1852         blockIndex = blockIndex + 8;
1853         meanOf8x8Blocks[13] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1854         meanOf8x8SquaredValuesBlocks[13] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1855 
1856         // (1,6)
1857         blockIndex = blockIndex + 8;
1858         meanOf8x8Blocks[14] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1859         meanOf8x8SquaredValuesBlocks[14] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1860 
1861         // (1,7)
1862         blockIndex = blockIndex + 8;
1863         meanOf8x8Blocks[15] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1864         meanOf8x8SquaredValuesBlocks[15] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1865 
1866         // (2,0)
1867         blockIndex = inputLumaOriginIndex + (strideY << 4);
1868         meanOf8x8Blocks[16] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1869         meanOf8x8SquaredValuesBlocks[16] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1870 
1871         // (2,1)
1872         blockIndex = blockIndex + 8;
1873         meanOf8x8Blocks[17] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1874         meanOf8x8SquaredValuesBlocks[17] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1875 
1876         // (2,2)
1877         blockIndex = blockIndex + 8;
1878         meanOf8x8Blocks[18] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1879         meanOf8x8SquaredValuesBlocks[18] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1880 
1881         // (2,3)
1882         blockIndex = blockIndex + 8;
1883         meanOf8x8Blocks[19] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1884         meanOf8x8SquaredValuesBlocks[19] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1885 
1886         /// (2,4)
1887         blockIndex = blockIndex + 8;
1888         meanOf8x8Blocks[20] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1889         meanOf8x8SquaredValuesBlocks[20] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1890 
1891         // (2,5)
1892         blockIndex = blockIndex + 8;
1893         meanOf8x8Blocks[21] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1894         meanOf8x8SquaredValuesBlocks[21] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1895 
1896         // (2,6)
1897         blockIndex = blockIndex + 8;
1898         meanOf8x8Blocks[22] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1899         meanOf8x8SquaredValuesBlocks[22] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1900 
1901         // (2,7)
1902         blockIndex = blockIndex + 8;
1903         meanOf8x8Blocks[23] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1904         meanOf8x8SquaredValuesBlocks[23] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1905 
1906         // (3,0)
1907         blockIndex = inputLumaOriginIndex + (strideY << 3) + (strideY << 4);
1908         meanOf8x8Blocks[24] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1909         meanOf8x8SquaredValuesBlocks[24] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1910 
1911         // (3,1)
1912         blockIndex = blockIndex + 8;
1913         meanOf8x8Blocks[25] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1914         meanOf8x8SquaredValuesBlocks[25] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1915 
1916         // (3,2)
1917         blockIndex = blockIndex + 8;
1918         meanOf8x8Blocks[26] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1919         meanOf8x8SquaredValuesBlocks[26] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1920 
1921         // (3,3)
1922         blockIndex = blockIndex + 8;
1923         meanOf8x8Blocks[27] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1924         meanOf8x8SquaredValuesBlocks[27] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1925 
1926         // (3,4)
1927         blockIndex = blockIndex + 8;
1928         meanOf8x8Blocks[28] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1929         meanOf8x8SquaredValuesBlocks[28] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1930 
1931         // (3,5)
1932         blockIndex = blockIndex + 8;
1933         meanOf8x8Blocks[29] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1934         meanOf8x8SquaredValuesBlocks[29] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1935 
1936         // (3,6)
1937         blockIndex = blockIndex + 8;
1938         meanOf8x8Blocks[30] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1939         meanOf8x8SquaredValuesBlocks[30] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1940 
1941         // (3,7)
1942         blockIndex = blockIndex + 8;
1943         meanOf8x8Blocks[31] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1944         meanOf8x8SquaredValuesBlocks[31] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1945 
1946         // (4,0)
1947         blockIndex = inputLumaOriginIndex + (strideY << 5);
1948         meanOf8x8Blocks[32] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1949         meanOf8x8SquaredValuesBlocks[32] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1950 
1951         // (4,1)
1952         blockIndex = blockIndex + 8;
1953         meanOf8x8Blocks[33] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1954         meanOf8x8SquaredValuesBlocks[33] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1955 
1956         // (4,2)
1957         blockIndex = blockIndex + 8;
1958         meanOf8x8Blocks[34] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1959         meanOf8x8SquaredValuesBlocks[34] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1960 
1961         // (4,3)
1962         blockIndex = blockIndex + 8;
1963         meanOf8x8Blocks[35] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1964         meanOf8x8SquaredValuesBlocks[35] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1965 
1966         // (4,4)
1967         blockIndex = blockIndex + 8;
1968         meanOf8x8Blocks[36] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1969         meanOf8x8SquaredValuesBlocks[36] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1970 
1971         // (4,5)
1972         blockIndex = blockIndex + 8;
1973         meanOf8x8Blocks[37] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1974         meanOf8x8SquaredValuesBlocks[37] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1975 
1976         // (4,6)
1977         blockIndex = blockIndex + 8;
1978         meanOf8x8Blocks[38] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1979         meanOf8x8SquaredValuesBlocks[38] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1980 
1981         // (4,7)
1982         blockIndex = blockIndex + 8;
1983         meanOf8x8Blocks[39] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1984         meanOf8x8SquaredValuesBlocks[39] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1985 
1986         // (5,0)
1987         blockIndex = inputLumaOriginIndex + (strideY << 3) + (strideY << 5);
1988         meanOf8x8Blocks[40] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1989         meanOf8x8SquaredValuesBlocks[40] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1990 
1991         // (5,1)
1992         blockIndex = blockIndex + 8;
1993         meanOf8x8Blocks[41] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1994         meanOf8x8SquaredValuesBlocks[41] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1995 
1996         // (5,2)
1997         blockIndex = blockIndex + 8;
1998         meanOf8x8Blocks[42] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1999         meanOf8x8SquaredValuesBlocks[42] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2000 
2001         // (5,3)
2002         blockIndex = blockIndex + 8;
2003         meanOf8x8Blocks[43] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2004         meanOf8x8SquaredValuesBlocks[43] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2005 
2006         // (5,4)
2007         blockIndex = blockIndex + 8;
2008         meanOf8x8Blocks[44] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2009         meanOf8x8SquaredValuesBlocks[44] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2010 
2011         // (5,5)
2012         blockIndex = blockIndex + 8;
2013         meanOf8x8Blocks[45] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2014         meanOf8x8SquaredValuesBlocks[45] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2015 
2016         // (5,6)
2017         blockIndex = blockIndex + 8;
2018         meanOf8x8Blocks[46] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2019         meanOf8x8SquaredValuesBlocks[46] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2020 
2021         // (5,7)
2022         blockIndex = blockIndex + 8;
2023         meanOf8x8Blocks[47] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2024         meanOf8x8SquaredValuesBlocks[47] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2025 
2026         // (6,0)
2027         blockIndex = inputLumaOriginIndex + (strideY << 4) + (strideY << 5);
2028         meanOf8x8Blocks[48] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2029         meanOf8x8SquaredValuesBlocks[48] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2030 
2031         // (6,1)
2032         blockIndex = blockIndex + 8;
2033         meanOf8x8Blocks[49] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2034         meanOf8x8SquaredValuesBlocks[49] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2035 
2036         // (6,2)
2037         blockIndex = blockIndex + 8;
2038         meanOf8x8Blocks[50] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2039         meanOf8x8SquaredValuesBlocks[50] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2040 
2041         // (6,3)
2042         blockIndex = blockIndex + 8;
2043         meanOf8x8Blocks[51] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2044         meanOf8x8SquaredValuesBlocks[51] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2045 
2046         // (6,4)
2047         blockIndex = blockIndex + 8;
2048         meanOf8x8Blocks[52] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2049         meanOf8x8SquaredValuesBlocks[52] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2050 
2051         // (6,5)
2052         blockIndex = blockIndex + 8;
2053         meanOf8x8Blocks[53] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2054         meanOf8x8SquaredValuesBlocks[53] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2055 
2056         // (6,6)
2057         blockIndex = blockIndex + 8;
2058         meanOf8x8Blocks[54] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2059         meanOf8x8SquaredValuesBlocks[54] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2060 
2061         // (6,7)
2062         blockIndex = blockIndex + 8;
2063         meanOf8x8Blocks[55] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2064         meanOf8x8SquaredValuesBlocks[55] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2065 
2066         // (7,0)
2067         blockIndex = inputLumaOriginIndex + (strideY << 3) + (strideY << 4) + (strideY << 5);
2068         meanOf8x8Blocks[56] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2069         meanOf8x8SquaredValuesBlocks[56] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2070 
2071         // (7,1)
2072         blockIndex = blockIndex + 8;
2073         meanOf8x8Blocks[57] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2074         meanOf8x8SquaredValuesBlocks[57] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2075 
2076         // (7,2)
2077         blockIndex = blockIndex + 8;
2078         meanOf8x8Blocks[58] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2079         meanOf8x8SquaredValuesBlocks[58] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2080 
2081         // (7,3)
2082         blockIndex = blockIndex + 8;
2083         meanOf8x8Blocks[59] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2084         meanOf8x8SquaredValuesBlocks[59] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2085 
2086         // (7,4)
2087         blockIndex = blockIndex + 8;
2088         meanOf8x8Blocks[60] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2089         meanOf8x8SquaredValuesBlocks[60] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2090 
2091         // (7,5)
2092         blockIndex = blockIndex + 8;
2093         meanOf8x8Blocks[61] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2094         meanOf8x8SquaredValuesBlocks[61] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2095 
2096         // (7,6)
2097         blockIndex = blockIndex + 8;
2098         meanOf8x8Blocks[62] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2099         meanOf8x8SquaredValuesBlocks[62] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2100 
2101         // (7,7)
2102         blockIndex = blockIndex + 8;
2103         meanOf8x8Blocks[63] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2104         meanOf8x8SquaredValuesBlocks[63] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2105     }
2106 
2107 
2108 	// 16x16
2109 	meanOf16x16Blocks[0] = (meanOf8x8Blocks[0] + meanOf8x8Blocks[1] + meanOf8x8Blocks[8] + meanOf8x8Blocks[9]) >> 2;
2110 	meanOf16x16Blocks[1] = (meanOf8x8Blocks[2] + meanOf8x8Blocks[3] + meanOf8x8Blocks[10] + meanOf8x8Blocks[11]) >> 2;
2111 	meanOf16x16Blocks[2] = (meanOf8x8Blocks[4] + meanOf8x8Blocks[5] + meanOf8x8Blocks[12] + meanOf8x8Blocks[13]) >> 2;
2112 	meanOf16x16Blocks[3] = (meanOf8x8Blocks[6] + meanOf8x8Blocks[7] + meanOf8x8Blocks[14] + meanOf8x8Blocks[15]) >> 2;
2113 
2114 	meanOf16x16Blocks[4] = (meanOf8x8Blocks[16] + meanOf8x8Blocks[17] + meanOf8x8Blocks[24] + meanOf8x8Blocks[25]) >> 2;
2115 	meanOf16x16Blocks[5] = (meanOf8x8Blocks[18] + meanOf8x8Blocks[19] + meanOf8x8Blocks[26] + meanOf8x8Blocks[27]) >> 2;
2116 	meanOf16x16Blocks[6] = (meanOf8x8Blocks[20] + meanOf8x8Blocks[21] + meanOf8x8Blocks[28] + meanOf8x8Blocks[29]) >> 2;
2117 	meanOf16x16Blocks[7] = (meanOf8x8Blocks[22] + meanOf8x8Blocks[23] + meanOf8x8Blocks[30] + meanOf8x8Blocks[31]) >> 2;
2118 
2119 	meanOf16x16Blocks[8] = (meanOf8x8Blocks[32] + meanOf8x8Blocks[33] + meanOf8x8Blocks[40] + meanOf8x8Blocks[41]) >> 2;
2120 	meanOf16x16Blocks[9] = (meanOf8x8Blocks[34] + meanOf8x8Blocks[35] + meanOf8x8Blocks[42] + meanOf8x8Blocks[43]) >> 2;
2121 	meanOf16x16Blocks[10] = (meanOf8x8Blocks[36] + meanOf8x8Blocks[37] + meanOf8x8Blocks[44] + meanOf8x8Blocks[45]) >> 2;
2122 	meanOf16x16Blocks[11] = (meanOf8x8Blocks[38] + meanOf8x8Blocks[39] + meanOf8x8Blocks[46] + meanOf8x8Blocks[47]) >> 2;
2123 
2124 	meanOf16x16Blocks[12] = (meanOf8x8Blocks[48] + meanOf8x8Blocks[49] + meanOf8x8Blocks[56] + meanOf8x8Blocks[57]) >> 2;
2125 	meanOf16x16Blocks[13] = (meanOf8x8Blocks[50] + meanOf8x8Blocks[51] + meanOf8x8Blocks[58] + meanOf8x8Blocks[59]) >> 2;
2126 	meanOf16x16Blocks[14] = (meanOf8x8Blocks[52] + meanOf8x8Blocks[53] + meanOf8x8Blocks[60] + meanOf8x8Blocks[61]) >> 2;
2127 	meanOf16x16Blocks[15] = (meanOf8x8Blocks[54] + meanOf8x8Blocks[55] + meanOf8x8Blocks[62] + meanOf8x8Blocks[63]) >> 2;
2128 
2129 	meanOf16x16SquaredValuesBlocks[0] = (meanOf8x8SquaredValuesBlocks[0] + meanOf8x8SquaredValuesBlocks[1] + meanOf8x8SquaredValuesBlocks[8] + meanOf8x8SquaredValuesBlocks[9]) >> 2;
2130 	meanOf16x16SquaredValuesBlocks[1] = (meanOf8x8SquaredValuesBlocks[2] + meanOf8x8SquaredValuesBlocks[3] + meanOf8x8SquaredValuesBlocks[10] + meanOf8x8SquaredValuesBlocks[11]) >> 2;
2131 	meanOf16x16SquaredValuesBlocks[2] = (meanOf8x8SquaredValuesBlocks[4] + meanOf8x8SquaredValuesBlocks[5] + meanOf8x8SquaredValuesBlocks[12] + meanOf8x8SquaredValuesBlocks[13]) >> 2;
2132 	meanOf16x16SquaredValuesBlocks[3] = (meanOf8x8SquaredValuesBlocks[6] + meanOf8x8SquaredValuesBlocks[7] + meanOf8x8SquaredValuesBlocks[14] + meanOf8x8SquaredValuesBlocks[15]) >> 2;
2133 
2134 	meanOf16x16SquaredValuesBlocks[4] = (meanOf8x8SquaredValuesBlocks[16] + meanOf8x8SquaredValuesBlocks[17] + meanOf8x8SquaredValuesBlocks[24] + meanOf8x8SquaredValuesBlocks[25]) >> 2;
2135 	meanOf16x16SquaredValuesBlocks[5] = (meanOf8x8SquaredValuesBlocks[18] + meanOf8x8SquaredValuesBlocks[19] + meanOf8x8SquaredValuesBlocks[26] + meanOf8x8SquaredValuesBlocks[27]) >> 2;
2136 	meanOf16x16SquaredValuesBlocks[6] = (meanOf8x8SquaredValuesBlocks[20] + meanOf8x8SquaredValuesBlocks[21] + meanOf8x8SquaredValuesBlocks[28] + meanOf8x8SquaredValuesBlocks[29]) >> 2;
2137 	meanOf16x16SquaredValuesBlocks[7] = (meanOf8x8SquaredValuesBlocks[22] + meanOf8x8SquaredValuesBlocks[23] + meanOf8x8SquaredValuesBlocks[30] + meanOf8x8SquaredValuesBlocks[31]) >> 2;
2138 
2139 	meanOf16x16SquaredValuesBlocks[8] = (meanOf8x8SquaredValuesBlocks[32] + meanOf8x8SquaredValuesBlocks[33] + meanOf8x8SquaredValuesBlocks[40] + meanOf8x8SquaredValuesBlocks[41]) >> 2;
2140 	meanOf16x16SquaredValuesBlocks[9] = (meanOf8x8SquaredValuesBlocks[34] + meanOf8x8SquaredValuesBlocks[35] + meanOf8x8SquaredValuesBlocks[42] + meanOf8x8SquaredValuesBlocks[43]) >> 2;
2141 	meanOf16x16SquaredValuesBlocks[10] = (meanOf8x8SquaredValuesBlocks[36] + meanOf8x8SquaredValuesBlocks[37] + meanOf8x8SquaredValuesBlocks[44] + meanOf8x8SquaredValuesBlocks[45]) >> 2;
2142 	meanOf16x16SquaredValuesBlocks[11] = (meanOf8x8SquaredValuesBlocks[38] + meanOf8x8SquaredValuesBlocks[39] + meanOf8x8SquaredValuesBlocks[46] + meanOf8x8SquaredValuesBlocks[47]) >> 2;
2143 
2144 	meanOf16x16SquaredValuesBlocks[12] = (meanOf8x8SquaredValuesBlocks[48] + meanOf8x8SquaredValuesBlocks[49] + meanOf8x8SquaredValuesBlocks[56] + meanOf8x8SquaredValuesBlocks[57]) >> 2;
2145 	meanOf16x16SquaredValuesBlocks[13] = (meanOf8x8SquaredValuesBlocks[50] + meanOf8x8SquaredValuesBlocks[51] + meanOf8x8SquaredValuesBlocks[58] + meanOf8x8SquaredValuesBlocks[59]) >> 2;
2146 	meanOf16x16SquaredValuesBlocks[14] = (meanOf8x8SquaredValuesBlocks[52] + meanOf8x8SquaredValuesBlocks[53] + meanOf8x8SquaredValuesBlocks[60] + meanOf8x8SquaredValuesBlocks[61]) >> 2;
2147 	meanOf16x16SquaredValuesBlocks[15] = (meanOf8x8SquaredValuesBlocks[54] + meanOf8x8SquaredValuesBlocks[55] + meanOf8x8SquaredValuesBlocks[62] + meanOf8x8SquaredValuesBlocks[63]) >> 2;
2148 
2149 	// 32x32
2150 	meanOf32x32Blocks[0] = (meanOf16x16Blocks[0] + meanOf16x16Blocks[1] + meanOf16x16Blocks[4] + meanOf16x16Blocks[5]) >> 2;
2151 	meanOf32x32Blocks[1] = (meanOf16x16Blocks[2] + meanOf16x16Blocks[3] + meanOf16x16Blocks[6] + meanOf16x16Blocks[7]) >> 2;
2152 	meanOf32x32Blocks[2] = (meanOf16x16Blocks[8] + meanOf16x16Blocks[9] + meanOf16x16Blocks[12] + meanOf16x16Blocks[13]) >> 2;
2153 	meanOf32x32Blocks[3] = (meanOf16x16Blocks[10] + meanOf16x16Blocks[11] + meanOf16x16Blocks[14] + meanOf16x16Blocks[15]) >> 2;
2154 
2155 	meanOf32x32SquaredValuesBlocks[0] = (meanOf16x16SquaredValuesBlocks[0] + meanOf16x16SquaredValuesBlocks[1] + meanOf16x16SquaredValuesBlocks[4] + meanOf16x16SquaredValuesBlocks[5]) >> 2;
2156 	meanOf32x32SquaredValuesBlocks[1] = (meanOf16x16SquaredValuesBlocks[2] + meanOf16x16SquaredValuesBlocks[3] + meanOf16x16SquaredValuesBlocks[6] + meanOf16x16SquaredValuesBlocks[7]) >> 2;
2157 	meanOf32x32SquaredValuesBlocks[2] = (meanOf16x16SquaredValuesBlocks[8] + meanOf16x16SquaredValuesBlocks[9] + meanOf16x16SquaredValuesBlocks[12] + meanOf16x16SquaredValuesBlocks[13]) >> 2;
2158 	meanOf32x32SquaredValuesBlocks[3] = (meanOf16x16SquaredValuesBlocks[10] + meanOf16x16SquaredValuesBlocks[11] + meanOf16x16SquaredValuesBlocks[14] + meanOf16x16SquaredValuesBlocks[15]) >> 2;
2159 
2160 	// 64x64
2161 	meanOf64x64Blocks = (meanOf32x32Blocks[0] + meanOf32x32Blocks[1] + meanOf32x32Blocks[2] + meanOf32x32Blocks[3]) >> 2;
2162 	meanOf64x64SquaredValuesBlocks = (meanOf32x32SquaredValuesBlocks[0] + meanOf32x32SquaredValuesBlocks[1] + meanOf32x32SquaredValuesBlocks[2] + meanOf32x32SquaredValuesBlocks[3]) >> 2;
2163 
2164 	// 8x8 means
2165 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_0] = (EB_U8)(meanOf8x8Blocks[0] >> MEAN_PRECISION);
2166 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_1] = (EB_U8)(meanOf8x8Blocks[1] >> MEAN_PRECISION);
2167 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_2] = (EB_U8)(meanOf8x8Blocks[2] >> MEAN_PRECISION);
2168 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_3] = (EB_U8)(meanOf8x8Blocks[3] >> MEAN_PRECISION);
2169 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_4] = (EB_U8)(meanOf8x8Blocks[4] >> MEAN_PRECISION);
2170 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_5] = (EB_U8)(meanOf8x8Blocks[5] >> MEAN_PRECISION);
2171 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_6] = (EB_U8)(meanOf8x8Blocks[6] >> MEAN_PRECISION);
2172 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_7] = (EB_U8)(meanOf8x8Blocks[7] >> MEAN_PRECISION);
2173 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_8] = (EB_U8)(meanOf8x8Blocks[8] >> MEAN_PRECISION);
2174 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_9] = (EB_U8)(meanOf8x8Blocks[9] >> MEAN_PRECISION);
2175 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_10] = (EB_U8)(meanOf8x8Blocks[10] >> MEAN_PRECISION);
2176 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_11] = (EB_U8)(meanOf8x8Blocks[11] >> MEAN_PRECISION);
2177 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_12] = (EB_U8)(meanOf8x8Blocks[12] >> MEAN_PRECISION);
2178 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_13] = (EB_U8)(meanOf8x8Blocks[13] >> MEAN_PRECISION);
2179 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_14] = (EB_U8)(meanOf8x8Blocks[14] >> MEAN_PRECISION);
2180 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_15] = (EB_U8)(meanOf8x8Blocks[15] >> MEAN_PRECISION);
2181 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_16] = (EB_U8)(meanOf8x8Blocks[16] >> MEAN_PRECISION);
2182 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_17] = (EB_U8)(meanOf8x8Blocks[17] >> MEAN_PRECISION);
2183 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_18] = (EB_U8)(meanOf8x8Blocks[18] >> MEAN_PRECISION);
2184 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_19] = (EB_U8)(meanOf8x8Blocks[19] >> MEAN_PRECISION);
2185 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_20] = (EB_U8)(meanOf8x8Blocks[20] >> MEAN_PRECISION);
2186 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_21] = (EB_U8)(meanOf8x8Blocks[21] >> MEAN_PRECISION);
2187 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_22] = (EB_U8)(meanOf8x8Blocks[22] >> MEAN_PRECISION);
2188 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_23] = (EB_U8)(meanOf8x8Blocks[23] >> MEAN_PRECISION);
2189 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_24] = (EB_U8)(meanOf8x8Blocks[24] >> MEAN_PRECISION);
2190 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_25] = (EB_U8)(meanOf8x8Blocks[25] >> MEAN_PRECISION);
2191 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_26] = (EB_U8)(meanOf8x8Blocks[26] >> MEAN_PRECISION);
2192 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_27] = (EB_U8)(meanOf8x8Blocks[27] >> MEAN_PRECISION);
2193 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_28] = (EB_U8)(meanOf8x8Blocks[28] >> MEAN_PRECISION);
2194 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_29] = (EB_U8)(meanOf8x8Blocks[29] >> MEAN_PRECISION);
2195 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_30] = (EB_U8)(meanOf8x8Blocks[30] >> MEAN_PRECISION);
2196 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_31] = (EB_U8)(meanOf8x8Blocks[31] >> MEAN_PRECISION);
2197 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_32] = (EB_U8)(meanOf8x8Blocks[32] >> MEAN_PRECISION);
2198 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_33] = (EB_U8)(meanOf8x8Blocks[33] >> MEAN_PRECISION);
2199 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_34] = (EB_U8)(meanOf8x8Blocks[34] >> MEAN_PRECISION);
2200 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_35] = (EB_U8)(meanOf8x8Blocks[35] >> MEAN_PRECISION);
2201 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_36] = (EB_U8)(meanOf8x8Blocks[36] >> MEAN_PRECISION);
2202 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_37] = (EB_U8)(meanOf8x8Blocks[37] >> MEAN_PRECISION);
2203 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_38] = (EB_U8)(meanOf8x8Blocks[38] >> MEAN_PRECISION);
2204 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_39] = (EB_U8)(meanOf8x8Blocks[39] >> MEAN_PRECISION);
2205 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_40] = (EB_U8)(meanOf8x8Blocks[40] >> MEAN_PRECISION);
2206 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_41] = (EB_U8)(meanOf8x8Blocks[41] >> MEAN_PRECISION);
2207 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_42] = (EB_U8)(meanOf8x8Blocks[42] >> MEAN_PRECISION);
2208 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_43] = (EB_U8)(meanOf8x8Blocks[43] >> MEAN_PRECISION);
2209 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_44] = (EB_U8)(meanOf8x8Blocks[44] >> MEAN_PRECISION);
2210 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_45] = (EB_U8)(meanOf8x8Blocks[45] >> MEAN_PRECISION);
2211 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_46] = (EB_U8)(meanOf8x8Blocks[46] >> MEAN_PRECISION);
2212 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_47] = (EB_U8)(meanOf8x8Blocks[47] >> MEAN_PRECISION);
2213 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_48] = (EB_U8)(meanOf8x8Blocks[48] >> MEAN_PRECISION);
2214 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_49] = (EB_U8)(meanOf8x8Blocks[49] >> MEAN_PRECISION);
2215 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_50] = (EB_U8)(meanOf8x8Blocks[50] >> MEAN_PRECISION);
2216 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_51] = (EB_U8)(meanOf8x8Blocks[51] >> MEAN_PRECISION);
2217 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_52] = (EB_U8)(meanOf8x8Blocks[52] >> MEAN_PRECISION);
2218 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_53] = (EB_U8)(meanOf8x8Blocks[53] >> MEAN_PRECISION);
2219 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_54] = (EB_U8)(meanOf8x8Blocks[54] >> MEAN_PRECISION);
2220 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_55] = (EB_U8)(meanOf8x8Blocks[55] >> MEAN_PRECISION);
2221 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_56] = (EB_U8)(meanOf8x8Blocks[56] >> MEAN_PRECISION);
2222 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_57] = (EB_U8)(meanOf8x8Blocks[57] >> MEAN_PRECISION);
2223 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_58] = (EB_U8)(meanOf8x8Blocks[58] >> MEAN_PRECISION);
2224 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_59] = (EB_U8)(meanOf8x8Blocks[59] >> MEAN_PRECISION);
2225 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_60] = (EB_U8)(meanOf8x8Blocks[60] >> MEAN_PRECISION);
2226 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_61] = (EB_U8)(meanOf8x8Blocks[61] >> MEAN_PRECISION);
2227 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_62] = (EB_U8)(meanOf8x8Blocks[62] >> MEAN_PRECISION);
2228 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_63] = (EB_U8)(meanOf8x8Blocks[63] >> MEAN_PRECISION);
2229 
2230 	// 16x16 mean
2231 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_16x16_0] = (EB_U8)(meanOf16x16Blocks[0] >> MEAN_PRECISION);
2232 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_16x16_1] = (EB_U8)(meanOf16x16Blocks[1] >> MEAN_PRECISION);
2233 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_16x16_2] = (EB_U8)(meanOf16x16Blocks[2] >> MEAN_PRECISION);
2234 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_16x16_3] = (EB_U8)(meanOf16x16Blocks[3] >> MEAN_PRECISION);
2235 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_16x16_4] = (EB_U8)(meanOf16x16Blocks[4] >> MEAN_PRECISION);
2236 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_16x16_5] = (EB_U8)(meanOf16x16Blocks[5] >> MEAN_PRECISION);
2237 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_16x16_6] = (EB_U8)(meanOf16x16Blocks[6] >> MEAN_PRECISION);
2238 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_16x16_7] = (EB_U8)(meanOf16x16Blocks[7] >> MEAN_PRECISION);
2239 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_16x16_8] = (EB_U8)(meanOf16x16Blocks[8] >> MEAN_PRECISION);
2240 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_16x16_9] = (EB_U8)(meanOf16x16Blocks[9] >> MEAN_PRECISION);
2241 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_16x16_10] = (EB_U8)(meanOf16x16Blocks[10] >> MEAN_PRECISION);
2242 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_16x16_11] = (EB_U8)(meanOf16x16Blocks[11] >> MEAN_PRECISION);
2243 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_16x16_12] = (EB_U8)(meanOf16x16Blocks[12] >> MEAN_PRECISION);
2244 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_16x16_13] = (EB_U8)(meanOf16x16Blocks[13] >> MEAN_PRECISION);
2245 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_16x16_14] = (EB_U8)(meanOf16x16Blocks[14] >> MEAN_PRECISION);
2246 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_16x16_15] = (EB_U8)(meanOf16x16Blocks[15] >> MEAN_PRECISION);
2247 
2248 	// 32x32 mean
2249 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_32x32_0] = (EB_U8)(meanOf32x32Blocks[0] >> MEAN_PRECISION);
2250 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_32x32_1] = (EB_U8)(meanOf32x32Blocks[1] >> MEAN_PRECISION);
2251 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_32x32_2] = (EB_U8)(meanOf32x32Blocks[2] >> MEAN_PRECISION);
2252 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_32x32_3] = (EB_U8)(meanOf32x32Blocks[3] >> MEAN_PRECISION);
2253 
2254 	// 64x64 mean
2255 	pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_64x64] = (EB_U8)(meanOf64x64Blocks >> MEAN_PRECISION);
2256 
2257 	// 8x8 variances
2258 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_0] = (EB_U16)((meanOf8x8SquaredValuesBlocks[0] - (meanOf8x8Blocks[0] * meanOf8x8Blocks[0])) >> VARIANCE_PRECISION);
2259 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_1] = (EB_U16)((meanOf8x8SquaredValuesBlocks[1] - (meanOf8x8Blocks[1] * meanOf8x8Blocks[1])) >> VARIANCE_PRECISION);
2260 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_2] = (EB_U16)((meanOf8x8SquaredValuesBlocks[2] - (meanOf8x8Blocks[2] * meanOf8x8Blocks[2])) >> VARIANCE_PRECISION);
2261 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_3] = (EB_U16)((meanOf8x8SquaredValuesBlocks[3] - (meanOf8x8Blocks[3] * meanOf8x8Blocks[3])) >> VARIANCE_PRECISION);
2262 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_4] = (EB_U16)((meanOf8x8SquaredValuesBlocks[4] - (meanOf8x8Blocks[4] * meanOf8x8Blocks[4])) >> VARIANCE_PRECISION);
2263 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_5] = (EB_U16)((meanOf8x8SquaredValuesBlocks[5] - (meanOf8x8Blocks[5] * meanOf8x8Blocks[5])) >> VARIANCE_PRECISION);
2264 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_6] = (EB_U16)((meanOf8x8SquaredValuesBlocks[6] - (meanOf8x8Blocks[6] * meanOf8x8Blocks[6])) >> VARIANCE_PRECISION);
2265 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_7] = (EB_U16)((meanOf8x8SquaredValuesBlocks[7] - (meanOf8x8Blocks[7] * meanOf8x8Blocks[7])) >> VARIANCE_PRECISION);
2266 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_8] = (EB_U16)((meanOf8x8SquaredValuesBlocks[8] - (meanOf8x8Blocks[8] * meanOf8x8Blocks[8])) >> VARIANCE_PRECISION);
2267 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_9] = (EB_U16)((meanOf8x8SquaredValuesBlocks[9] - (meanOf8x8Blocks[9] * meanOf8x8Blocks[9])) >> VARIANCE_PRECISION);
2268 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_10] = (EB_U16)((meanOf8x8SquaredValuesBlocks[10] - (meanOf8x8Blocks[10] * meanOf8x8Blocks[10])) >> VARIANCE_PRECISION);
2269 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_11] = (EB_U16)((meanOf8x8SquaredValuesBlocks[11] - (meanOf8x8Blocks[11] * meanOf8x8Blocks[11])) >> VARIANCE_PRECISION);
2270 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_12] = (EB_U16)((meanOf8x8SquaredValuesBlocks[12] - (meanOf8x8Blocks[12] * meanOf8x8Blocks[12])) >> VARIANCE_PRECISION);
2271 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_13] = (EB_U16)((meanOf8x8SquaredValuesBlocks[13] - (meanOf8x8Blocks[13] * meanOf8x8Blocks[13])) >> VARIANCE_PRECISION);
2272 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_14] = (EB_U16)((meanOf8x8SquaredValuesBlocks[14] - (meanOf8x8Blocks[14] * meanOf8x8Blocks[14])) >> VARIANCE_PRECISION);
2273 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_15] = (EB_U16)((meanOf8x8SquaredValuesBlocks[15] - (meanOf8x8Blocks[15] * meanOf8x8Blocks[15])) >> VARIANCE_PRECISION);
2274 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_16] = (EB_U16)((meanOf8x8SquaredValuesBlocks[16] - (meanOf8x8Blocks[16] * meanOf8x8Blocks[16])) >> VARIANCE_PRECISION);
2275 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_17] = (EB_U16)((meanOf8x8SquaredValuesBlocks[17] - (meanOf8x8Blocks[17] * meanOf8x8Blocks[17])) >> VARIANCE_PRECISION);
2276 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_18] = (EB_U16)((meanOf8x8SquaredValuesBlocks[18] - (meanOf8x8Blocks[18] * meanOf8x8Blocks[18])) >> VARIANCE_PRECISION);
2277 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_19] = (EB_U16)((meanOf8x8SquaredValuesBlocks[19] - (meanOf8x8Blocks[19] * meanOf8x8Blocks[19])) >> VARIANCE_PRECISION);
2278 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_20] = (EB_U16)((meanOf8x8SquaredValuesBlocks[20] - (meanOf8x8Blocks[20] * meanOf8x8Blocks[20])) >> VARIANCE_PRECISION);
2279 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_21] = (EB_U16)((meanOf8x8SquaredValuesBlocks[21] - (meanOf8x8Blocks[21] * meanOf8x8Blocks[21])) >> VARIANCE_PRECISION);
2280 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_22] = (EB_U16)((meanOf8x8SquaredValuesBlocks[22] - (meanOf8x8Blocks[22] * meanOf8x8Blocks[22])) >> VARIANCE_PRECISION);
2281 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_23] = (EB_U16)((meanOf8x8SquaredValuesBlocks[23] - (meanOf8x8Blocks[23] * meanOf8x8Blocks[23])) >> VARIANCE_PRECISION);
2282 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_24] = (EB_U16)((meanOf8x8SquaredValuesBlocks[24] - (meanOf8x8Blocks[24] * meanOf8x8Blocks[24])) >> VARIANCE_PRECISION);
2283 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_25] = (EB_U16)((meanOf8x8SquaredValuesBlocks[25] - (meanOf8x8Blocks[25] * meanOf8x8Blocks[25])) >> VARIANCE_PRECISION);
2284 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_26] = (EB_U16)((meanOf8x8SquaredValuesBlocks[26] - (meanOf8x8Blocks[26] * meanOf8x8Blocks[26])) >> VARIANCE_PRECISION);
2285 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_27] = (EB_U16)((meanOf8x8SquaredValuesBlocks[27] - (meanOf8x8Blocks[27] * meanOf8x8Blocks[27])) >> VARIANCE_PRECISION);
2286 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_28] = (EB_U16)((meanOf8x8SquaredValuesBlocks[28] - (meanOf8x8Blocks[28] * meanOf8x8Blocks[28])) >> VARIANCE_PRECISION);
2287 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_29] = (EB_U16)((meanOf8x8SquaredValuesBlocks[29] - (meanOf8x8Blocks[29] * meanOf8x8Blocks[29])) >> VARIANCE_PRECISION);
2288 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_30] = (EB_U16)((meanOf8x8SquaredValuesBlocks[30] - (meanOf8x8Blocks[30] * meanOf8x8Blocks[30])) >> VARIANCE_PRECISION);
2289 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_31] = (EB_U16)((meanOf8x8SquaredValuesBlocks[31] - (meanOf8x8Blocks[31] * meanOf8x8Blocks[31])) >> VARIANCE_PRECISION);
2290 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_32] = (EB_U16)((meanOf8x8SquaredValuesBlocks[32] - (meanOf8x8Blocks[32] * meanOf8x8Blocks[32])) >> VARIANCE_PRECISION);
2291 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_33] = (EB_U16)((meanOf8x8SquaredValuesBlocks[33] - (meanOf8x8Blocks[33] * meanOf8x8Blocks[33])) >> VARIANCE_PRECISION);
2292 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_34] = (EB_U16)((meanOf8x8SquaredValuesBlocks[34] - (meanOf8x8Blocks[34] * meanOf8x8Blocks[34])) >> VARIANCE_PRECISION);
2293 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_35] = (EB_U16)((meanOf8x8SquaredValuesBlocks[35] - (meanOf8x8Blocks[35] * meanOf8x8Blocks[35])) >> VARIANCE_PRECISION);
2294 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_36] = (EB_U16)((meanOf8x8SquaredValuesBlocks[36] - (meanOf8x8Blocks[36] * meanOf8x8Blocks[36])) >> VARIANCE_PRECISION);
2295 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_37] = (EB_U16)((meanOf8x8SquaredValuesBlocks[37] - (meanOf8x8Blocks[37] * meanOf8x8Blocks[37])) >> VARIANCE_PRECISION);
2296 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_38] = (EB_U16)((meanOf8x8SquaredValuesBlocks[38] - (meanOf8x8Blocks[38] * meanOf8x8Blocks[38])) >> VARIANCE_PRECISION);
2297 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_39] = (EB_U16)((meanOf8x8SquaredValuesBlocks[39] - (meanOf8x8Blocks[39] * meanOf8x8Blocks[39])) >> VARIANCE_PRECISION);
2298 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_40] = (EB_U16)((meanOf8x8SquaredValuesBlocks[40] - (meanOf8x8Blocks[40] * meanOf8x8Blocks[40])) >> VARIANCE_PRECISION);
2299 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_41] = (EB_U16)((meanOf8x8SquaredValuesBlocks[41] - (meanOf8x8Blocks[41] * meanOf8x8Blocks[41])) >> VARIANCE_PRECISION);
2300 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_42] = (EB_U16)((meanOf8x8SquaredValuesBlocks[42] - (meanOf8x8Blocks[42] * meanOf8x8Blocks[42])) >> VARIANCE_PRECISION);
2301 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_43] = (EB_U16)((meanOf8x8SquaredValuesBlocks[43] - (meanOf8x8Blocks[43] * meanOf8x8Blocks[43])) >> VARIANCE_PRECISION);
2302 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_44] = (EB_U16)((meanOf8x8SquaredValuesBlocks[44] - (meanOf8x8Blocks[44] * meanOf8x8Blocks[44])) >> VARIANCE_PRECISION);
2303 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_45] = (EB_U16)((meanOf8x8SquaredValuesBlocks[45] - (meanOf8x8Blocks[45] * meanOf8x8Blocks[45])) >> VARIANCE_PRECISION);
2304 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_46] = (EB_U16)((meanOf8x8SquaredValuesBlocks[46] - (meanOf8x8Blocks[46] * meanOf8x8Blocks[46])) >> VARIANCE_PRECISION);
2305 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_47] = (EB_U16)((meanOf8x8SquaredValuesBlocks[47] - (meanOf8x8Blocks[47] * meanOf8x8Blocks[47])) >> VARIANCE_PRECISION);
2306 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_48] = (EB_U16)((meanOf8x8SquaredValuesBlocks[48] - (meanOf8x8Blocks[48] * meanOf8x8Blocks[48])) >> VARIANCE_PRECISION);
2307 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_49] = (EB_U16)((meanOf8x8SquaredValuesBlocks[49] - (meanOf8x8Blocks[49] * meanOf8x8Blocks[49])) >> VARIANCE_PRECISION);
2308 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_50] = (EB_U16)((meanOf8x8SquaredValuesBlocks[50] - (meanOf8x8Blocks[50] * meanOf8x8Blocks[50])) >> VARIANCE_PRECISION);
2309 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_51] = (EB_U16)((meanOf8x8SquaredValuesBlocks[51] - (meanOf8x8Blocks[51] * meanOf8x8Blocks[51])) >> VARIANCE_PRECISION);
2310 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_52] = (EB_U16)((meanOf8x8SquaredValuesBlocks[52] - (meanOf8x8Blocks[52] * meanOf8x8Blocks[52])) >> VARIANCE_PRECISION);
2311 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_53] = (EB_U16)((meanOf8x8SquaredValuesBlocks[53] - (meanOf8x8Blocks[53] * meanOf8x8Blocks[53])) >> VARIANCE_PRECISION);
2312 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_54] = (EB_U16)((meanOf8x8SquaredValuesBlocks[54] - (meanOf8x8Blocks[54] * meanOf8x8Blocks[54])) >> VARIANCE_PRECISION);
2313 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_55] = (EB_U16)((meanOf8x8SquaredValuesBlocks[55] - (meanOf8x8Blocks[55] * meanOf8x8Blocks[55])) >> VARIANCE_PRECISION);
2314 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_56] = (EB_U16)((meanOf8x8SquaredValuesBlocks[56] - (meanOf8x8Blocks[56] * meanOf8x8Blocks[56])) >> VARIANCE_PRECISION);
2315 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_57] = (EB_U16)((meanOf8x8SquaredValuesBlocks[57] - (meanOf8x8Blocks[57] * meanOf8x8Blocks[57])) >> VARIANCE_PRECISION);
2316 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_58] = (EB_U16)((meanOf8x8SquaredValuesBlocks[58] - (meanOf8x8Blocks[58] * meanOf8x8Blocks[58])) >> VARIANCE_PRECISION);
2317 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_59] = (EB_U16)((meanOf8x8SquaredValuesBlocks[59] - (meanOf8x8Blocks[59] * meanOf8x8Blocks[59])) >> VARIANCE_PRECISION);
2318 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_60] = (EB_U16)((meanOf8x8SquaredValuesBlocks[60] - (meanOf8x8Blocks[60] * meanOf8x8Blocks[60])) >> VARIANCE_PRECISION);
2319 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_61] = (EB_U16)((meanOf8x8SquaredValuesBlocks[61] - (meanOf8x8Blocks[61] * meanOf8x8Blocks[61])) >> VARIANCE_PRECISION);
2320 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_62] = (EB_U16)((meanOf8x8SquaredValuesBlocks[62] - (meanOf8x8Blocks[62] * meanOf8x8Blocks[62])) >> VARIANCE_PRECISION);
2321 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_63] = (EB_U16)((meanOf8x8SquaredValuesBlocks[63] - (meanOf8x8Blocks[63] * meanOf8x8Blocks[63])) >> VARIANCE_PRECISION);
2322 
2323 	// 16x16 variances
2324 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_16x16_0] = (EB_U16)((meanOf16x16SquaredValuesBlocks[0] - (meanOf16x16Blocks[0] * meanOf16x16Blocks[0])) >> VARIANCE_PRECISION);
2325 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_16x16_1] = (EB_U16)((meanOf16x16SquaredValuesBlocks[1] - (meanOf16x16Blocks[1] * meanOf16x16Blocks[1])) >> VARIANCE_PRECISION);
2326 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_16x16_2] = (EB_U16)((meanOf16x16SquaredValuesBlocks[2] - (meanOf16x16Blocks[2] * meanOf16x16Blocks[2])) >> VARIANCE_PRECISION);
2327 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_16x16_3] = (EB_U16)((meanOf16x16SquaredValuesBlocks[3] - (meanOf16x16Blocks[3] * meanOf16x16Blocks[3])) >> VARIANCE_PRECISION);
2328 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_16x16_4] = (EB_U16)((meanOf16x16SquaredValuesBlocks[4] - (meanOf16x16Blocks[4] * meanOf16x16Blocks[4])) >> VARIANCE_PRECISION);
2329 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_16x16_5] = (EB_U16)((meanOf16x16SquaredValuesBlocks[5] - (meanOf16x16Blocks[5] * meanOf16x16Blocks[5])) >> VARIANCE_PRECISION);
2330 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_16x16_6] = (EB_U16)((meanOf16x16SquaredValuesBlocks[6] - (meanOf16x16Blocks[6] * meanOf16x16Blocks[6])) >> VARIANCE_PRECISION);
2331 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_16x16_7] = (EB_U16)((meanOf16x16SquaredValuesBlocks[7] - (meanOf16x16Blocks[7] * meanOf16x16Blocks[7])) >> VARIANCE_PRECISION);
2332 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_16x16_8] = (EB_U16)((meanOf16x16SquaredValuesBlocks[8] - (meanOf16x16Blocks[8] * meanOf16x16Blocks[8])) >> VARIANCE_PRECISION);
2333 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_16x16_9] = (EB_U16)((meanOf16x16SquaredValuesBlocks[9] - (meanOf16x16Blocks[9] * meanOf16x16Blocks[9])) >> VARIANCE_PRECISION);
2334 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_16x16_10] = (EB_U16)((meanOf16x16SquaredValuesBlocks[10] - (meanOf16x16Blocks[10] * meanOf16x16Blocks[10])) >> VARIANCE_PRECISION);
2335 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_16x16_11] = (EB_U16)((meanOf16x16SquaredValuesBlocks[11] - (meanOf16x16Blocks[11] * meanOf16x16Blocks[11])) >> VARIANCE_PRECISION);
2336 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_16x16_12] = (EB_U16)((meanOf16x16SquaredValuesBlocks[12] - (meanOf16x16Blocks[12] * meanOf16x16Blocks[12])) >> VARIANCE_PRECISION);
2337 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_16x16_13] = (EB_U16)((meanOf16x16SquaredValuesBlocks[13] - (meanOf16x16Blocks[13] * meanOf16x16Blocks[13])) >> VARIANCE_PRECISION);
2338 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_16x16_14] = (EB_U16)((meanOf16x16SquaredValuesBlocks[14] - (meanOf16x16Blocks[14] * meanOf16x16Blocks[14])) >> VARIANCE_PRECISION);
2339 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_16x16_15] = (EB_U16)((meanOf16x16SquaredValuesBlocks[15] - (meanOf16x16Blocks[15] * meanOf16x16Blocks[15])) >> VARIANCE_PRECISION);
2340 
2341 	// 32x32 variances
2342 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_32x32_0] = (EB_U16)((meanOf32x32SquaredValuesBlocks[0] - (meanOf32x32Blocks[0] * meanOf32x32Blocks[0])) >> VARIANCE_PRECISION);
2343 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_32x32_1] = (EB_U16)((meanOf32x32SquaredValuesBlocks[1] - (meanOf32x32Blocks[1] * meanOf32x32Blocks[1])) >> VARIANCE_PRECISION);
2344 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_32x32_2] = (EB_U16)((meanOf32x32SquaredValuesBlocks[2] - (meanOf32x32Blocks[2] * meanOf32x32Blocks[2])) >> VARIANCE_PRECISION);
2345 	pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_32x32_3] = (EB_U16)((meanOf32x32SquaredValuesBlocks[3] - (meanOf32x32Blocks[3] * meanOf32x32Blocks[3])) >> VARIANCE_PRECISION);
2346 
2347 		// 64x64 variance
2348 		pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_64x64] = (EB_U16)((meanOf64x64SquaredValuesBlocks - (meanOf64x64Blocks * meanOf64x64Blocks)) >> VARIANCE_PRECISION);
2349 }
2350 	return return_error;
2351 }
2352 
DenoiseInputPicture(PictureAnalysisContext_t * contextPtr,SequenceControlSet_t * sequenceControlSetPtr,PictureParentControlSet_t * pictureControlSetPtr,EbPictureBufferDesc_t * inputPicturePtr,EbPictureBufferDesc_t * denoisedPicturePtr)2353 static EB_ERRORTYPE DenoiseInputPicture(
2354 	PictureAnalysisContext_t	*contextPtr,
2355 	SequenceControlSet_t		*sequenceControlSetPtr,
2356 	PictureParentControlSet_t   *pictureControlSetPtr,
2357 	EbPictureBufferDesc_t       *inputPicturePtr,
2358 	EbPictureBufferDesc_t       *denoisedPicturePtr)
2359 {
2360 	EB_ERRORTYPE return_error = EB_ErrorNone;
2361 
2362 	EB_U32		 lcuIndex;
2363 	EB_U32       lcuOriginX;
2364 	EB_U32       lcuOriginY;
2365 	EB_U16       verticalIdx;
2366     EB_U32 		 colorFormat      = inputPicturePtr->colorFormat;
2367     EB_U16 		 subWidthCMinus1  = (colorFormat == EB_YUV444 ? 1 : 2) - 1;
2368     EB_U16 		 subHeightCMinus1 = (colorFormat >= EB_YUV422 ? 1 : 2) - 1;
2369 	//use denoised input if the source is extremely noisy
2370 	if (pictureControlSetPtr->picNoiseClass >= PIC_NOISE_CLASS_4){
2371 
2372 		EB_U32 inLumaOffSet = inputPicturePtr->originX + inputPicturePtr->originY      * inputPicturePtr->strideY;
2373         EB_U32 inChromaOffSet = (inputPicturePtr->originX >> subWidthCMinus1) + (inputPicturePtr->originY >> subHeightCMinus1) * inputPicturePtr->strideCb;
2374 		EB_U32 denLumaOffSet = denoisedPicturePtr->originX + denoisedPicturePtr->originY   * denoisedPicturePtr->strideY;
2375         EB_U32 denChromaOffSet = (denoisedPicturePtr->originX >> subWidthCMinus1) + (denoisedPicturePtr->originY >> subHeightCMinus1) * denoisedPicturePtr->strideCb;
2376 
2377 		//filter Luma
2378         for (lcuIndex = 0; lcuIndex < pictureControlSetPtr->lcuTotalCount; ++lcuIndex) {
2379 
2380             LcuParams_t *lcuParams = &sequenceControlSetPtr->lcuParamsArray[lcuIndex];
2381 
2382             lcuOriginX = lcuParams->originX;
2383             lcuOriginY = lcuParams->originY;
2384 
2385 
2386 			if (lcuOriginX == 0)
2387 				StrongLumaFilter_funcPtrArray[!!(ASM_TYPES & AVX2_MASK)](
2388 				inputPicturePtr,
2389 				denoisedPicturePtr,
2390 				lcuOriginY,
2391 				lcuOriginX);
2392 
2393 			if (lcuOriginX + MAX_LCU_SIZE > inputPicturePtr->width)
2394 			{
2395 				noiseExtractLumaStrong(
2396 					inputPicturePtr,
2397 					denoisedPicturePtr,
2398 					lcuOriginY,
2399 					lcuOriginX);
2400 			}
2401 
2402 		}
2403 
2404 		//copy Luma
2405 		for (verticalIdx = 0; verticalIdx < inputPicturePtr->height; ++verticalIdx) {
2406 			EB_MEMCPY(inputPicturePtr->bufferY + inLumaOffSet + verticalIdx * inputPicturePtr->strideY,
2407 				denoisedPicturePtr->bufferY + denLumaOffSet + verticalIdx * denoisedPicturePtr->strideY,
2408 				sizeof(EB_U8) * inputPicturePtr->width);
2409 		}
2410 
2411 		//filter chroma
2412         for (lcuIndex = 0; lcuIndex < pictureControlSetPtr->lcuTotalCount; ++lcuIndex) {
2413 
2414             LcuParams_t *lcuParams = &sequenceControlSetPtr->lcuParamsArray[lcuIndex];
2415 
2416             lcuOriginX = lcuParams->originX;
2417             lcuOriginY = lcuParams->originY;
2418 
2419 			if (lcuOriginX == 0)
2420 				StrongChromaFilter_funcPtrArray[!!(ASM_TYPES & AVX2_MASK)](
2421 				inputPicturePtr,
2422 				denoisedPicturePtr,
2423 				lcuOriginY >> subHeightCMinus1,
2424 				lcuOriginX >> subWidthCMinus1);
2425 
2426 			if (lcuOriginX + MAX_LCU_SIZE > inputPicturePtr->width)
2427 			{
2428 				noiseExtractChromaStrong(
2429 					inputPicturePtr,
2430 					denoisedPicturePtr,
2431 					lcuOriginY >> subHeightCMinus1,
2432 					lcuOriginX >> subWidthCMinus1);
2433 			}
2434 
2435 		}
2436 
2437 		//copy chroma
2438 		for (verticalIdx = 0; verticalIdx < inputPicturePtr->height >> subHeightCMinus1; ++verticalIdx) {
2439 
2440 			EB_MEMCPY(inputPicturePtr->bufferCb + inChromaOffSet + verticalIdx * inputPicturePtr->strideCb,
2441 				denoisedPicturePtr->bufferCb + denChromaOffSet + verticalIdx * denoisedPicturePtr->strideCb,
2442 				sizeof(EB_U8) * inputPicturePtr->width >> subWidthCMinus1);
2443 
2444 			EB_MEMCPY(inputPicturePtr->bufferCr + inChromaOffSet + verticalIdx * inputPicturePtr->strideCr,
2445 				denoisedPicturePtr->bufferCr + denChromaOffSet + verticalIdx * denoisedPicturePtr->strideCr,
2446 				sizeof(EB_U8) * inputPicturePtr->width >> subWidthCMinus1);
2447 		}
2448 
2449 	}
2450 	else if (pictureControlSetPtr->picNoiseClass >= PIC_NOISE_CLASS_3_1){
2451 
2452 		EB_U32 inLumaOffSet = inputPicturePtr->originX + inputPicturePtr->originY      * inputPicturePtr->strideY;
2453         EB_U32 inChromaOffSet = (inputPicturePtr->originX >> subWidthCMinus1) + (inputPicturePtr->originY >> subHeightCMinus1) * inputPicturePtr->strideCb;
2454 		EB_U32 denLumaOffSet = denoisedPicturePtr->originX + denoisedPicturePtr->originY   * denoisedPicturePtr->strideY;
2455         EB_U32 denChromaOffSet = (denoisedPicturePtr->originX >> subWidthCMinus1) + (denoisedPicturePtr->originY >> subHeightCMinus1) * denoisedPicturePtr->strideCb;
2456 
2457 
2458 		for (verticalIdx = 0; verticalIdx < inputPicturePtr->height; ++verticalIdx) {
2459 			EB_MEMCPY(inputPicturePtr->bufferY + inLumaOffSet + verticalIdx * inputPicturePtr->strideY,
2460 				denoisedPicturePtr->bufferY + denLumaOffSet + verticalIdx * denoisedPicturePtr->strideY,
2461 				sizeof(EB_U8) * inputPicturePtr->width);
2462 		}
2463 
2464 		//filter chroma
2465         for (lcuIndex = 0; lcuIndex < pictureControlSetPtr->lcuTotalCount; ++lcuIndex) {
2466 
2467             LcuParams_t *lcuParams = &sequenceControlSetPtr->lcuParamsArray[lcuIndex];
2468 
2469             lcuOriginX = lcuParams->originX;
2470             lcuOriginY = lcuParams->originY;
2471 
2472 			if (lcuOriginX == 0)
2473 				WeakChromaFilter_funcPtrArray[!!(ASM_TYPES & AVX2_MASK)](
2474 				inputPicturePtr,
2475 				denoisedPicturePtr,
2476 				lcuOriginY >> subHeightCMinus1,
2477 				lcuOriginX >> subWidthCMinus1);
2478 
2479 			if (lcuOriginX + MAX_LCU_SIZE > inputPicturePtr->width)
2480 			{
2481 				noiseExtractChromaWeak(
2482 					inputPicturePtr,
2483 					denoisedPicturePtr,
2484 					lcuOriginY >> subHeightCMinus1,
2485 					lcuOriginX >> subWidthCMinus1);
2486 			}
2487 
2488 		}
2489 
2490 
2491 
2492 		for (verticalIdx = 0; verticalIdx < inputPicturePtr->height >> subHeightCMinus1; ++verticalIdx) {
2493 
2494 			EB_MEMCPY(inputPicturePtr->bufferCb + inChromaOffSet + verticalIdx * inputPicturePtr->strideCb,
2495 				denoisedPicturePtr->bufferCb + denChromaOffSet + verticalIdx * denoisedPicturePtr->strideCb,
2496 				sizeof(EB_U8) * inputPicturePtr->width >> subWidthCMinus1);
2497 
2498 			EB_MEMCPY(inputPicturePtr->bufferCr + inChromaOffSet + verticalIdx * inputPicturePtr->strideCr,
2499 				denoisedPicturePtr->bufferCr + denChromaOffSet + verticalIdx * denoisedPicturePtr->strideCr,
2500 				sizeof(EB_U8) * inputPicturePtr->width >> subWidthCMinus1);
2501 		}
2502 
2503 	}
2504 
2505     else if (contextPtr->picNoiseVarianceFloat >= 1.0  && sequenceControlSetPtr->inputResolution == INPUT_SIZE_4K_RANGE) {
2506 
2507 		//Luma : use filtered only for flatNoise LCUs
2508         for (lcuIndex = 0; lcuIndex < pictureControlSetPtr->lcuTotalCount; ++lcuIndex) {
2509 
2510             LcuParams_t *lcuParams = &sequenceControlSetPtr->lcuParamsArray[lcuIndex];
2511 
2512             lcuOriginX = lcuParams->originX;
2513             lcuOriginY = lcuParams->originY;
2514 
2515 			EB_U32  lcuHeight = MIN(MAX_LCU_SIZE, inputPicturePtr->height - lcuOriginY);
2516 			EB_U32  lcuWidth = MIN(MAX_LCU_SIZE, inputPicturePtr->width - lcuOriginX);
2517 
2518 			EB_U32 inLumaOffSet = inputPicturePtr->originX + lcuOriginX + (inputPicturePtr->originY + lcuOriginY) * inputPicturePtr->strideY;
2519 			EB_U32 denLumaOffSet = denoisedPicturePtr->originX + lcuOriginX + (denoisedPicturePtr->originY + lcuOriginY) * denoisedPicturePtr->strideY;
2520 
2521 
2522 			if (pictureControlSetPtr->lcuFlatNoiseArray[lcuIndex] == 1){
2523 
2524 
2525 				for (verticalIdx = 0; verticalIdx < lcuHeight; ++verticalIdx) {
2526 
2527 					EB_MEMCPY(inputPicturePtr->bufferY + inLumaOffSet + verticalIdx * inputPicturePtr->strideY,
2528 						denoisedPicturePtr->bufferY + denLumaOffSet + verticalIdx * denoisedPicturePtr->strideY,
2529 						sizeof(EB_U8) * lcuWidth);
2530 
2531 				}
2532 			}
2533 		}
2534 	}
2535 
2536 	return return_error;
2537 }
2538 
DetectInputPictureNoise(PictureAnalysisContext_t * contextPtr,SequenceControlSet_t * sequenceControlSetPtr,PictureParentControlSet_t * pictureControlSetPtr,EbPictureBufferDesc_t * inputPicturePtr,EbPictureBufferDesc_t * noisePicturePtr,EbPictureBufferDesc_t * denoisedPicturePtr)2539 static EB_ERRORTYPE DetectInputPictureNoise(
2540 	PictureAnalysisContext_t	*contextPtr,
2541 	SequenceControlSet_t		*sequenceControlSetPtr,
2542 	PictureParentControlSet_t   *pictureControlSetPtr,
2543 	EbPictureBufferDesc_t       *inputPicturePtr,
2544 	EbPictureBufferDesc_t       *noisePicturePtr,
2545 	EbPictureBufferDesc_t       *denoisedPicturePtr)
2546 {
2547 
2548 	EB_ERRORTYPE return_error = EB_ErrorNone;
2549 	EB_U32					 lcuIndex;
2550 
2551 	EB_U64                   picNoiseVariance;
2552 
2553 	EB_U32			         totLcuCount, noiseTh;
2554 
2555 	EB_U32                   lcuOriginX;
2556 	EB_U32                   lcuOriginY;
2557 	EB_U32				     inputLumaOriginIndex;
2558 
2559 	picNoiseVariance = 0;
2560 	totLcuCount = 0;
2561 
2562 	//Variance calc for noise picture
2563     for (lcuIndex = 0; lcuIndex < pictureControlSetPtr->lcuTotalCount; ++lcuIndex) {
2564 
2565         LcuParams_t *lcuParams = &sequenceControlSetPtr->lcuParamsArray[lcuIndex];
2566 
2567         lcuOriginX = lcuParams->originX;
2568         lcuOriginY = lcuParams->originY;
2569 		inputLumaOriginIndex = (noisePicturePtr->originY + lcuOriginY) * noisePicturePtr->strideY +
2570 			noisePicturePtr->originX + lcuOriginX;
2571 
2572 
2573 		EB_U32  noiseOriginIndex = noisePicturePtr->originX + lcuOriginX + noisePicturePtr->originY * noisePicturePtr->strideY;
2574 
2575 		if (lcuOriginX == 0)
2576 			WeakLumaFilter_funcPtrArray[!!(ASM_TYPES & AVX2_MASK)](
2577 			inputPicturePtr,
2578 			denoisedPicturePtr,
2579 			noisePicturePtr,
2580 			lcuOriginY,
2581 			lcuOriginX);
2582 
2583 		if (lcuOriginX + MAX_LCU_SIZE > inputPicturePtr->width)
2584 		{
2585 			noiseExtractLumaWeak(
2586 				inputPicturePtr,
2587 				denoisedPicturePtr,
2588 				noisePicturePtr,
2589 				lcuOriginY,
2590 				lcuOriginX);
2591 		}
2592 
2593 		//do it only for complete 64x64 blocks
2594         if (lcuParams->isCompleteLcu)
2595 		{
2596 
2597 			EB_U64 noiseBlkVar32x32[4], denoiseBlkVar32x32[4];
2598 
2599 			EB_U64 noiseBlkVar = ComputeVariance64x64(
2600                 noisePicturePtr,
2601 				noiseOriginIndex,
2602 				noiseBlkVar32x32);
2603 
2604             EB_U64 noiseBlkVarTh ;
2605             EB_U64 denBlkVarTh = FLAT_MAX_VAR;
2606 
2607 			if (pictureControlSetPtr->noiseDetectionTh == 1)
2608 				noiseBlkVarTh = NOISE_MIN_LEVEL_0;
2609 			else
2610 				noiseBlkVarTh = NOISE_MIN_LEVEL_1;
2611 
2612 			picNoiseVariance += (noiseBlkVar >> 16);
2613 
2614 			EB_U64 denBlkVar = ComputeVariance64x64(
2615                 denoisedPicturePtr,
2616 				inputLumaOriginIndex,
2617 				denoiseBlkVar32x32) >> 16;
2618 
2619 			if (denBlkVar < denBlkVarTh && noiseBlkVar > noiseBlkVarTh) {
2620 				pictureControlSetPtr->lcuFlatNoiseArray[lcuIndex] = 1;
2621 			}
2622 
2623 			totLcuCount++;
2624 		}
2625 
2626 	}
2627 
2628     if (totLcuCount > 0) {
2629         contextPtr->picNoiseVarianceFloat = (double)picNoiseVariance / (double)totLcuCount;
2630 
2631         picNoiseVariance = picNoiseVariance / totLcuCount;
2632     }
2633 
2634 	//the variance of a 64x64 noise area tends to be bigger for small resolutions.
2635 	if (sequenceControlSetPtr->lumaHeight <= 720)
2636 		noiseTh = 25;
2637 	else
2638 		noiseTh = 0;
2639 
2640 	if (picNoiseVariance >= 80 + noiseTh)
2641 		pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_10;
2642 	else if (picNoiseVariance >= 70 + noiseTh)
2643 		pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_9;
2644 	else if (picNoiseVariance >= 60 + noiseTh)
2645 		pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_8;
2646 	else if (picNoiseVariance >= 50 + noiseTh)
2647 		pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_7;
2648 	else if (picNoiseVariance >= 40 + noiseTh)
2649 		pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_6;
2650 	else if (picNoiseVariance >= 30 + noiseTh)
2651 		pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_5;
2652 	else if (picNoiseVariance >= 20 + noiseTh)
2653 		pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_4;
2654 	else if (picNoiseVariance >= 17 + noiseTh)
2655 		pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_3_1;
2656 	else if (picNoiseVariance >= 10 + noiseTh)
2657 		pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_3;
2658 	else if (picNoiseVariance >= 5 + noiseTh)
2659 		pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_2;
2660 	else
2661 		pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_1;
2662 
2663 	if (pictureControlSetPtr->picNoiseClass >= PIC_NOISE_CLASS_4)
2664 		pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_3_1;
2665 
2666 	return return_error;
2667 
2668 }
2669 
FullSampleDenoise(PictureAnalysisContext_t * contextPtr,SequenceControlSet_t * sequenceControlSetPtr,PictureParentControlSet_t * pictureControlSetPtr,EB_U32 lcuTotalCount,EB_BOOL denoiseFlag)2670 static EB_ERRORTYPE FullSampleDenoise(
2671 	PictureAnalysisContext_t	*contextPtr,
2672 	SequenceControlSet_t		*sequenceControlSetPtr,
2673 	PictureParentControlSet_t   *pictureControlSetPtr,
2674 	EB_U32                       lcuTotalCount,
2675 	EB_BOOL                      denoiseFlag)
2676 {
2677 
2678 	EB_ERRORTYPE return_error = EB_ErrorNone;
2679 
2680 	EB_U32					 lcuCodingOrder;
2681 	EbPictureBufferDesc_t	*inputPicturePtr = pictureControlSetPtr->enhancedPicturePtr;
2682 	EbPictureBufferDesc_t	*denoisedPicturePtr = contextPtr->denoisedPicturePtr;
2683 	EbPictureBufferDesc_t	*noisePicturePtr = contextPtr->noisePicturePtr;
2684 
2685 	//Reset the flat noise flag array to False for both RealTime/HighComplexity Modes
2686 	for (lcuCodingOrder = 0; lcuCodingOrder < lcuTotalCount; ++lcuCodingOrder) {
2687 		pictureControlSetPtr->lcuFlatNoiseArray[lcuCodingOrder] = 0;
2688     }
2689 
2690     pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_INV; //this init is for both REAL-TIME and BEST-QUALITY
2691 
2692     DetectInputPictureNoise(
2693         contextPtr,
2694         sequenceControlSetPtr,
2695         pictureControlSetPtr,
2696         inputPicturePtr,
2697         noisePicturePtr,
2698         denoisedPicturePtr);
2699 
2700     if (denoiseFlag == EB_TRUE)
2701     {
2702         DenoiseInputPicture(
2703             contextPtr,
2704             sequenceControlSetPtr,
2705             pictureControlSetPtr,
2706             inputPicturePtr,
2707             denoisedPicturePtr);
2708     }
2709 
2710     return return_error;
2711 
2712 }
2713 
SubSampleFilterNoise(SequenceControlSet_t * sequenceControlSetPtr,PictureParentControlSet_t * pictureControlSetPtr,EbPictureBufferDesc_t * inputPicturePtr,EbPictureBufferDesc_t * noisePicturePtr,EbPictureBufferDesc_t * denoisedPicturePtr)2714 static EB_ERRORTYPE SubSampleFilterNoise(
2715 	SequenceControlSet_t		*sequenceControlSetPtr,
2716 	PictureParentControlSet_t   *pictureControlSetPtr,
2717 	EbPictureBufferDesc_t       *inputPicturePtr,
2718 	EbPictureBufferDesc_t       *noisePicturePtr,
2719 	EbPictureBufferDesc_t       *denoisedPicturePtr)
2720 {
2721 	EB_ERRORTYPE return_error = EB_ErrorNone;
2722 
2723 	EB_U32		 lcuIndex;
2724 	EB_U32       lcuOriginX;
2725 	EB_U32       lcuOriginY;
2726 	EB_U16       verticalIdx;
2727     EB_U32       colorFormat = inputPicturePtr->colorFormat;
2728     EB_U16       subWidthCMinus1 = (colorFormat  == EB_YUV444 ? 1 : 2) - 1;
2729     EB_U16       subHeightCMinus1 = (colorFormat >= EB_YUV422 ? 1 : 2) - 1;
2730 
2731 	if (pictureControlSetPtr->picNoiseClass == PIC_NOISE_CLASS_3_1) {
2732 
2733 		EB_U32 inLumaOffSet = inputPicturePtr->originX + inputPicturePtr->originY      * inputPicturePtr->strideY;
2734         EB_U32 inChromaOffSet = (inputPicturePtr->originX >> subWidthCMinus1) + (inputPicturePtr->originY >> subHeightCMinus1) * inputPicturePtr->strideCb;
2735 		EB_U32 denLumaOffSet = denoisedPicturePtr->originX + denoisedPicturePtr->originY   * denoisedPicturePtr->strideY;
2736         EB_U32 denChromaOffSet = (denoisedPicturePtr->originX >> subWidthCMinus1) + (denoisedPicturePtr->originY >> subHeightCMinus1) * denoisedPicturePtr->strideCb;
2737 
2738 
2739 		//filter Luma
2740         for (lcuIndex = 0; lcuIndex < pictureControlSetPtr->lcuTotalCount; ++lcuIndex) {
2741 
2742             LcuParams_t *lcuParams = &sequenceControlSetPtr->lcuParamsArray[lcuIndex];
2743 
2744             lcuOriginX = lcuParams->originX;
2745             lcuOriginY = lcuParams->originY;
2746 
2747 			if (lcuOriginX == 0)
2748 				WeakLumaFilter_funcPtrArray[!!(ASM_TYPES & AVX2_MASK)](
2749 				inputPicturePtr,
2750 				denoisedPicturePtr,
2751 				noisePicturePtr,
2752 				lcuOriginY,
2753 				lcuOriginX);
2754 
2755 			if (lcuOriginX + MAX_LCU_SIZE > inputPicturePtr->width)
2756 			{
2757 				noiseExtractLumaWeak(
2758 					inputPicturePtr,
2759 					denoisedPicturePtr,
2760 					noisePicturePtr,
2761 					lcuOriginY,
2762 					lcuOriginX);
2763 			}
2764 		}
2765 
2766 		//copy luma
2767 		for (verticalIdx = 0; verticalIdx < inputPicturePtr->height; ++verticalIdx) {
2768 			EB_MEMCPY(inputPicturePtr->bufferY + inLumaOffSet + verticalIdx * inputPicturePtr->strideY,
2769 				denoisedPicturePtr->bufferY + denLumaOffSet + verticalIdx * denoisedPicturePtr->strideY,
2770 				sizeof(EB_U8) * inputPicturePtr->width);
2771 		}
2772 
2773 		//filter chroma
2774         for (lcuIndex = 0; lcuIndex < pictureControlSetPtr->lcuTotalCount; ++lcuIndex) {
2775 
2776             LcuParams_t *lcuParams = &sequenceControlSetPtr->lcuParamsArray[lcuIndex];
2777 
2778             lcuOriginX = lcuParams->originX;
2779             lcuOriginY = lcuParams->originY;
2780 
2781 			if (lcuOriginX == 0)
2782 				WeakChromaFilter_funcPtrArray[!!(ASM_TYPES & AVX2_MASK)](
2783 				inputPicturePtr,
2784 				denoisedPicturePtr,
2785 				lcuOriginY >> subHeightCMinus1,
2786 				lcuOriginX >> subWidthCMinus1);
2787 
2788 			if (lcuOriginX + MAX_LCU_SIZE > inputPicturePtr->width)
2789 			{
2790 				noiseExtractChromaWeak(
2791 					inputPicturePtr,
2792 					denoisedPicturePtr,
2793 					lcuOriginY >> subHeightCMinus1,
2794 					lcuOriginX >> subWidthCMinus1);
2795 			}
2796 
2797 		}
2798 
2799 		//copy chroma
2800 		for (verticalIdx = 0; verticalIdx < inputPicturePtr->height >> subHeightCMinus1; ++verticalIdx) {
2801 
2802 			EB_MEMCPY(inputPicturePtr->bufferCb + inChromaOffSet + verticalIdx * inputPicturePtr->strideCb,
2803 				denoisedPicturePtr->bufferCb + denChromaOffSet + verticalIdx * denoisedPicturePtr->strideCb,
2804 				sizeof(EB_U8) * inputPicturePtr->width >> subWidthCMinus1);
2805 
2806 			EB_MEMCPY(inputPicturePtr->bufferCr + inChromaOffSet + verticalIdx * inputPicturePtr->strideCr,
2807 				denoisedPicturePtr->bufferCr + denChromaOffSet + verticalIdx * denoisedPicturePtr->strideCr,
2808 				sizeof(EB_U8) * inputPicturePtr->width >> subWidthCMinus1);
2809 		}
2810 
2811 	} else if (pictureControlSetPtr->picNoiseClass == PIC_NOISE_CLASS_2){
2812 
2813 		EB_U32 newTotFN = 0;
2814 
2815 		//for each LCU ,re check the FN information for only the FNdecim ones
2816         for (lcuIndex = 0; lcuIndex < pictureControlSetPtr->lcuTotalCount; ++lcuIndex) {
2817 
2818             LcuParams_t *lcuParams = &sequenceControlSetPtr->lcuParamsArray[lcuIndex];
2819 
2820             lcuOriginX = lcuParams->originX;
2821             lcuOriginY = lcuParams->originY;
2822 			EB_U32  inputLumaOriginIndex = noisePicturePtr->originX + lcuOriginX + (noisePicturePtr->originY + lcuOriginY) * noisePicturePtr->strideY;
2823 			EB_U32  noiseOriginIndex = noisePicturePtr->originX + lcuOriginX + (noisePicturePtr->originY * noisePicturePtr->strideY);
2824 
2825 			if (lcuParams->isCompleteLcu && pictureControlSetPtr->lcuFlatNoiseArray[lcuIndex] == 1)
2826 			{
2827 
2828 				WeakLumaFilterLcu_funcPtrArray[!!(ASM_TYPES & AVX2_MASK)](
2829 					inputPicturePtr,
2830 					denoisedPicturePtr,
2831 					noisePicturePtr,
2832 					lcuOriginY,
2833 					lcuOriginX);
2834 
2835 				if (lcuOriginX + MAX_LCU_SIZE > inputPicturePtr->width)
2836 				{
2837 					noiseExtractLumaWeakLcu(
2838 						inputPicturePtr,
2839 						denoisedPicturePtr,
2840 						noisePicturePtr,
2841 						lcuOriginY,
2842 						lcuOriginX);
2843 				}
2844 
2845 				EB_U64 noiseBlkVar32x32[4], denoiseBlkVar32x32[4];
2846 				EB_U64 noiseBlkVar = ComputeVariance64x64(
2847                     noisePicturePtr, noiseOriginIndex, noiseBlkVar32x32);
2848 				EB_U64 denBlkVar = ComputeVariance64x64(
2849                     denoisedPicturePtr, inputLumaOriginIndex, denoiseBlkVar32x32) >> 16;
2850 
2851                 EB_U64 noiseBlkVarTh ;
2852                 EB_U64 denBlkVarTh = FLAT_MAX_VAR;
2853 
2854 			    if (pictureControlSetPtr->noiseDetectionTh == 1)
2855 				    noiseBlkVarTh = NOISE_MIN_LEVEL_0;
2856 			    else
2857 				    noiseBlkVarTh = NOISE_MIN_LEVEL_1;
2858 
2859 				if (denBlkVar<denBlkVarTh && noiseBlkVar> noiseBlkVarTh) {
2860 					pictureControlSetPtr->lcuFlatNoiseArray[lcuIndex] = 1;
2861 					//SVT_LOG("POC %i (%i,%i) denBlkVar: %i  noiseBlkVar :%i\n", pictureControlSetPtr->pictureNumber,lcuOriginX,lcuOriginY, denBlkVar, noiseBlkVar);
2862 					newTotFN++;
2863 
2864 				}
2865 				else{
2866 					pictureControlSetPtr->lcuFlatNoiseArray[lcuIndex] = 0;
2867 				}
2868 			}
2869 		}
2870 
2871         //filter Luma
2872         for (lcuIndex = 0; lcuIndex < pictureControlSetPtr->lcuTotalCount; ++lcuIndex) {
2873 
2874             LcuParams_t *lcuParams = &sequenceControlSetPtr->lcuParamsArray[lcuIndex];
2875 
2876             lcuOriginX = lcuParams->originX;
2877             lcuOriginY = lcuParams->originY;
2878 
2879 			if (lcuOriginX + 64 <= inputPicturePtr->width && lcuOriginY + 64 <= inputPicturePtr->height)
2880 			{
2881 
2882 
2883 				//use the denoised for FN LCUs
2884 				if (pictureControlSetPtr->lcuFlatNoiseArray[lcuIndex] == 1){
2885 
2886 					EB_U32  lcuHeight = MIN(MAX_LCU_SIZE, inputPicturePtr->height - lcuOriginY);
2887 					EB_U32  lcuWidth = MIN(MAX_LCU_SIZE, inputPicturePtr->width - lcuOriginX);
2888 
2889 					EB_U32 inLumaOffSet = inputPicturePtr->originX + lcuOriginX + (inputPicturePtr->originY + lcuOriginY) * inputPicturePtr->strideY;
2890 					EB_U32 denLumaOffSet = denoisedPicturePtr->originX + lcuOriginX + (denoisedPicturePtr->originY + lcuOriginY) * denoisedPicturePtr->strideY;
2891 
2892 					for (verticalIdx = 0; verticalIdx < lcuHeight; ++verticalIdx) {
2893 
2894 						EB_MEMCPY(inputPicturePtr->bufferY + inLumaOffSet + verticalIdx * inputPicturePtr->strideY,
2895 							denoisedPicturePtr->bufferY + denLumaOffSet + verticalIdx * denoisedPicturePtr->strideY,
2896 							sizeof(EB_U8) * lcuWidth);
2897 
2898 					}
2899 				}
2900 
2901 			}
2902 
2903 		}
2904 
2905 	}
2906 	return return_error;
2907 }
2908 
QuarterSampleDetectNoise(PictureAnalysisContext_t * contextPtr,PictureParentControlSet_t * pictureControlSetPtr,EbPictureBufferDesc_t * quarterDecimatedPicturePtr,EbPictureBufferDesc_t * noisePicturePtr,EbPictureBufferDesc_t * denoisedPicturePtr,EB_U32 pictureWidthInLcu)2909 static EB_ERRORTYPE QuarterSampleDetectNoise(
2910 	PictureAnalysisContext_t	*contextPtr,
2911 	PictureParentControlSet_t   *pictureControlSetPtr,
2912 	EbPictureBufferDesc_t       *quarterDecimatedPicturePtr,
2913 	EbPictureBufferDesc_t       *noisePicturePtr,
2914 	EbPictureBufferDesc_t       *denoisedPicturePtr,
2915 	EB_U32						 pictureWidthInLcu)
2916 {
2917 
2918 	EB_ERRORTYPE return_error = EB_ErrorNone;
2919 
2920 	EB_U64                   picNoiseVariance;
2921 
2922 	EB_U32			         totLcuCount, noiseTh;
2923 
2924 	EB_U32				     blockIndex;
2925 
2926 	picNoiseVariance = 0;
2927 	totLcuCount = 0;
2928 
2929 
2930 	EB_U16 vert64x64Index;
2931 	EB_U16 horz64x64Index;
2932 	EB_U32 block64x64X;
2933 	EB_U32 block64x64Y;
2934 	EB_U32 vert32x32Index;
2935 	EB_U32 horz32x32Index;
2936 	EB_U32 block32x32X;
2937 	EB_U32 block32x32Y;
2938 	EB_U32 noiseOriginIndex;
2939 	EB_U32 lcuCodingOrder;
2940 
2941 	// Loop over 64x64 blocks on the downsampled domain (each block would contain 16 LCUs on the full sampled domain)
2942 	for (vert64x64Index = 0; vert64x64Index < (quarterDecimatedPicturePtr->height / 64); vert64x64Index++){
2943 		for (horz64x64Index = 0; horz64x64Index < (quarterDecimatedPicturePtr->width / 64); horz64x64Index++){
2944 
2945 			block64x64X = horz64x64Index * 64;
2946 			block64x64Y = vert64x64Index * 64;
2947 
2948 			if (block64x64X == 0)
2949 				WeakLumaFilter_funcPtrArray[!!(ASM_TYPES & AVX2_MASK)](
2950 				quarterDecimatedPicturePtr,
2951 				denoisedPicturePtr,
2952 				noisePicturePtr,
2953 				block64x64Y,
2954 				block64x64X);
2955 
2956 			if (block64x64Y + MAX_LCU_SIZE > quarterDecimatedPicturePtr->width)
2957 			{
2958 				noiseExtractLumaWeak(
2959 					quarterDecimatedPicturePtr,
2960 					denoisedPicturePtr,
2961 					noisePicturePtr,
2962 					block64x64Y,
2963 					block64x64X);
2964 			}
2965 
2966 
2967 			// Loop over 32x32 blocks (i.e, 64x64 blocks in full resolution)
2968 			for (vert32x32Index = 0; vert32x32Index < 2; vert32x32Index++){
2969 				for (horz32x32Index = 0; horz32x32Index < 2; horz32x32Index++){
2970 
2971 					block32x32X = block64x64X + horz32x32Index * 32;
2972 					block32x32Y = block64x64Y + vert32x32Index * 32;
2973 
2974 					//do it only for complete 32x32 blocks (i.e, complete 64x64 blocks in full resolution)
2975 					if ((block32x32X + 32 <= quarterDecimatedPicturePtr->width) && (block32x32Y + 32 <= quarterDecimatedPicturePtr->height))
2976 					{
2977 
2978 						lcuCodingOrder = ((vert64x64Index * 2) + vert32x32Index) * pictureWidthInLcu + ((horz64x64Index * 2) + horz32x32Index);
2979 
2980 
2981 						EB_U64 noiseBlkVar8x8[16], denoiseBlkVar8x8[16];
2982 
2983 						noiseOriginIndex = noisePicturePtr->originX + block32x32X + noisePicturePtr->originY * noisePicturePtr->strideY;
2984 
2985 						EB_U64 noiseBlkVar = ComputeVariance32x32(
2986 							noisePicturePtr,
2987 							noiseOriginIndex,
2988 							noiseBlkVar8x8);
2989 
2990 
2991 						picNoiseVariance += (noiseBlkVar >> 16);
2992 
2993 						blockIndex = (noisePicturePtr->originY + block32x32Y) * noisePicturePtr->strideY + noisePicturePtr->originX + block32x32X;
2994 
2995 						EB_U64 denBlkVar = ComputeVariance32x32(
2996 							denoisedPicturePtr,
2997 							blockIndex,
2998 							denoiseBlkVar8x8) >> 16;
2999 
3000                         EB_U64 denBlkVarDecTh;
3001 
3002                         if (pictureControlSetPtr->noiseDetectionTh == 0){
3003                             denBlkVarDecTh = NOISE_MIN_LEVEL_DECIM_1;
3004                         }
3005                         else{
3006                             denBlkVarDecTh = NOISE_MIN_LEVEL_DECIM_0;
3007                         }
3008 
3009 						if (denBlkVar < FLAT_MAX_VAR_DECIM && noiseBlkVar> denBlkVarDecTh) {
3010 							pictureControlSetPtr->lcuFlatNoiseArray[lcuCodingOrder] = 1;
3011 						}
3012 
3013 						totLcuCount++;
3014 					}
3015 				}
3016 			}
3017 		}
3018 	}
3019 
3020     if (totLcuCount > 0) {
3021         contextPtr->picNoiseVarianceFloat = (double)picNoiseVariance / (double)totLcuCount;
3022 
3023         picNoiseVariance = picNoiseVariance / totLcuCount;
3024     }
3025 
3026 	//the variance of a 64x64 noise area tends to be bigger for small resolutions.
3027 	//if (sequenceControlSetPtr->lumaHeight <= 720)
3028 	//	noiseTh = 25;
3029 	//else if (sequenceControlSetPtr->lumaHeight <= 1080)
3030 	//	noiseTh = 10;
3031 	//else
3032 	noiseTh = 0;
3033 
3034 	//look for extreme noise or big enough flat noisy area to be denoised.
3035 	if (picNoiseVariance > 60)
3036 		pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_3_1; //Noise+Edge information is too big, so may be this is all noise (action: frame based denoising)
3037 	else if (picNoiseVariance >= 10 + noiseTh)
3038 		pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_3;   //Noise+Edge information is big enough, so there is no big enough flat noisy area (action : no denoising)
3039 	else if (picNoiseVariance >= 5 + noiseTh)
3040 		pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_2;   //Noise+Edge information is relatively small, so there might be a big enough flat noisy area(action : denoising only for FN blocks)
3041 	else
3042 		pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_1;   //Noise+Edge information is very small, so no noise nor edge area (action : no denoising)
3043 
3044 
3045 
3046 	return return_error;
3047 
3048 }
3049 
3050 
3051 
SubSampleDetectNoise(PictureAnalysisContext_t * contextPtr,SequenceControlSet_t * sequenceControlSetPtr,PictureParentControlSet_t * pictureControlSetPtr,EbPictureBufferDesc_t * sixteenthDecimatedPicturePtr,EbPictureBufferDesc_t * noisePicturePtr,EbPictureBufferDesc_t * denoisedPicturePtr,EB_U32 pictureWidthInLcu)3052 static EB_ERRORTYPE SubSampleDetectNoise(
3053 	PictureAnalysisContext_t	*contextPtr,
3054 	SequenceControlSet_t		*sequenceControlSetPtr,
3055 	PictureParentControlSet_t   *pictureControlSetPtr,
3056 	EbPictureBufferDesc_t       *sixteenthDecimatedPicturePtr,
3057 	EbPictureBufferDesc_t       *noisePicturePtr,
3058 	EbPictureBufferDesc_t       *denoisedPicturePtr,
3059 	EB_U32						 pictureWidthInLcu)
3060 {
3061 
3062 	EB_ERRORTYPE return_error = EB_ErrorNone;
3063 
3064 	EB_U64                   picNoiseVariance;
3065 
3066 	EB_U32			         totLcuCount, noiseTh;
3067 
3068 	EB_U32				     blockIndex;
3069 
3070 	picNoiseVariance = 0;
3071 	totLcuCount = 0;
3072 
3073 
3074 	EB_U16 vert64x64Index;
3075 	EB_U16 horz64x64Index;
3076 	EB_U32 block64x64X;
3077 	EB_U32 block64x64Y;
3078 	EB_U32 vert16x16Index;
3079 	EB_U32 horz16x16Index;
3080 	EB_U32 block16x16X;
3081 	EB_U32 block16x16Y;
3082 	EB_U32 noiseOriginIndex;
3083 	EB_U32 lcuCodingOrder;
3084 
3085 	// Loop over 64x64 blocks on the downsampled domain (each block would contain 16 LCUs on the full sampled domain)
3086 	for (vert64x64Index = 0; vert64x64Index < (sixteenthDecimatedPicturePtr->height / 64); vert64x64Index++){
3087 		for (horz64x64Index = 0; horz64x64Index < (sixteenthDecimatedPicturePtr->width / 64); horz64x64Index++){
3088 
3089 			block64x64X = horz64x64Index * 64;
3090 			block64x64Y = vert64x64Index * 64;
3091 
3092 			if (block64x64X == 0)
3093 				WeakLumaFilter_funcPtrArray[!!(ASM_TYPES & AVX2_MASK)](
3094 				sixteenthDecimatedPicturePtr,
3095 				denoisedPicturePtr,
3096 				noisePicturePtr,
3097 				block64x64Y,
3098 				block64x64X);
3099 
3100 			if (block64x64Y + MAX_LCU_SIZE > sixteenthDecimatedPicturePtr->width)
3101 			{
3102 				noiseExtractLumaWeak(
3103 					sixteenthDecimatedPicturePtr,
3104 					denoisedPicturePtr,
3105 					noisePicturePtr,
3106 					block64x64Y,
3107 					block64x64X);
3108 			}
3109 
3110 
3111 			// Loop over 16x16 blocks (i.e, 64x64 blocks in full resolution)
3112 			for (vert16x16Index = 0; vert16x16Index < 4; vert16x16Index++){
3113 				for (horz16x16Index = 0; horz16x16Index < 4; horz16x16Index++){
3114 
3115 					block16x16X = block64x64X + horz16x16Index * 16;
3116 					block16x16Y = block64x64Y + vert16x16Index * 16;
3117 
3118 					//do it only for complete 16x16 blocks (i.e, complete 64x64 blocks in full resolution)
3119 					if (block16x16X + 16 <= sixteenthDecimatedPicturePtr->width && block16x16Y + 16 <= sixteenthDecimatedPicturePtr->height)
3120 					{
3121 
3122 						lcuCodingOrder = ((vert64x64Index * 4) + vert16x16Index) * pictureWidthInLcu + ((horz64x64Index * 4) + horz16x16Index);
3123 
3124 
3125 						EB_U64 noiseBlkVar8x8[4], denoiseBlkVar8x8[4];
3126 
3127 						noiseOriginIndex = noisePicturePtr->originX + block16x16X + noisePicturePtr->originY * noisePicturePtr->strideY;
3128 
3129 						EB_U64 noiseBlkVar = ComputeVariance16x16(
3130 							noisePicturePtr,
3131 							noiseOriginIndex,
3132 							noiseBlkVar8x8);
3133 
3134 
3135 						picNoiseVariance += (noiseBlkVar >> 16);
3136 
3137 						blockIndex = (noisePicturePtr->originY + block16x16Y) * noisePicturePtr->strideY + noisePicturePtr->originX + block16x16X;
3138 
3139 						EB_U64 denBlkVar = ComputeVariance16x16(
3140 							denoisedPicturePtr,
3141 							blockIndex,
3142 							denoiseBlkVar8x8) >> 16;
3143 
3144                         EB_U64  noiseBlkVarDecTh ;
3145                         EB_U64 denBlkVarDecTh = FLAT_MAX_VAR_DECIM;
3146 
3147 						if (pictureControlSetPtr->noiseDetectionTh == 1) {
3148 							noiseBlkVarDecTh = NOISE_MIN_LEVEL_DECIM_0;
3149 						}
3150 						else {
3151 							noiseBlkVarDecTh = NOISE_MIN_LEVEL_DECIM_1;
3152 						}
3153 
3154 						if (denBlkVar < denBlkVarDecTh && noiseBlkVar> noiseBlkVarDecTh) {
3155 							pictureControlSetPtr->lcuFlatNoiseArray[lcuCodingOrder] = 1;
3156 						}
3157 						totLcuCount++;
3158 					}
3159 				}
3160 			}
3161 		}
3162 	}
3163 
3164     if (totLcuCount > 0) {
3165         contextPtr->picNoiseVarianceFloat = (double)picNoiseVariance / (double)totLcuCount;
3166 
3167         picNoiseVariance = picNoiseVariance / totLcuCount;
3168     }
3169 
3170 	//the variance of a 64x64 noise area tends to be bigger for small resolutions.
3171 	if (sequenceControlSetPtr->lumaHeight <= 720)
3172 		noiseTh = 25;
3173 	else if (sequenceControlSetPtr->lumaHeight <= 1080)
3174 		noiseTh = 10;
3175 	else
3176 		noiseTh = 0;
3177 
3178 	//look for extreme noise or big enough flat noisy area to be denoised.
3179 	if (picNoiseVariance >= 55 + noiseTh)
3180 		pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_3_1; //Noise+Edge information is too big, so may be this is all noise (action: frame based denoising)
3181 	else if (picNoiseVariance >= 10 + noiseTh)
3182 		pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_3;   //Noise+Edge information is big enough, so there is no big enough flat noisy area (action : no denoising)
3183 	else if (picNoiseVariance >= 5 + noiseTh)
3184 		pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_2;   //Noise+Edge information is relatively small, so there might be a big enough flat noisy area(action : denoising only for FN blocks)
3185 	else
3186 		pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_1;   //Noise+Edge information is very small, so no noise nor edge area (action : no denoising)
3187 
3188 	return return_error;
3189 
3190 }
3191 
QuarterSampleDenoise(PictureAnalysisContext_t * contextPtr,SequenceControlSet_t * sequenceControlSetPtr,PictureParentControlSet_t * pictureControlSetPtr,EbPictureBufferDesc_t * quarterDecimatedPicturePtr,EB_U32 lcuTotalCount,EB_BOOL denoiseFlag,EB_U32 pictureWidthInLcu)3192 static EB_ERRORTYPE QuarterSampleDenoise(
3193 	PictureAnalysisContext_t	*contextPtr,
3194 	SequenceControlSet_t		*sequenceControlSetPtr,
3195 	PictureParentControlSet_t   *pictureControlSetPtr,
3196 	EbPictureBufferDesc_t		*quarterDecimatedPicturePtr,
3197 	EB_U32                       lcuTotalCount,
3198 	EB_BOOL                      denoiseFlag,
3199 	EB_U32						 pictureWidthInLcu)
3200 {
3201 
3202 	EB_ERRORTYPE return_error = EB_ErrorNone;
3203 
3204 	EB_U32					 lcuCodingOrder;
3205 	EbPictureBufferDesc_t	*inputPicturePtr = pictureControlSetPtr->enhancedPicturePtr;
3206 	EbPictureBufferDesc_t	*denoisedPicturePtr = contextPtr->denoisedPicturePtr;
3207 	EbPictureBufferDesc_t	*noisePicturePtr = contextPtr->noisePicturePtr;
3208 
3209 	//Reset the flat noise flag array to False for both RealTime/HighComplexity Modes
3210 	for (lcuCodingOrder = 0; lcuCodingOrder < lcuTotalCount; ++lcuCodingOrder) {
3211 		pictureControlSetPtr->lcuFlatNoiseArray[lcuCodingOrder] = 0;
3212 	}
3213 
3214 	pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_INV; //this init is for both REAL-TIME and BEST-QUALITY
3215 
3216     Decimation2D(
3217         &inputPicturePtr->bufferY[inputPicturePtr->originX + inputPicturePtr->originY * inputPicturePtr->strideY],
3218         inputPicturePtr->strideY,
3219         inputPicturePtr->width,
3220         inputPicturePtr->height,
3221         &quarterDecimatedPicturePtr->bufferY[quarterDecimatedPicturePtr->originX + (quarterDecimatedPicturePtr->originY * quarterDecimatedPicturePtr->strideY)],
3222         quarterDecimatedPicturePtr->strideY,
3223         2);
3224 
3225 
3226 	QuarterSampleDetectNoise(
3227 		contextPtr,
3228 		pictureControlSetPtr,
3229 		quarterDecimatedPicturePtr,
3230 		noisePicturePtr,
3231 		denoisedPicturePtr,
3232 		pictureWidthInLcu);
3233 
3234 	if (denoiseFlag == EB_TRUE) {
3235 
3236         // Turn OFF the de-noiser for Class 2 at QP=29 and lower (for Fixed_QP) and at the target rate of 14Mbps and higher (for RC=ON)
3237 		if ((pictureControlSetPtr->picNoiseClass == PIC_NOISE_CLASS_3_1) || ((pictureControlSetPtr->picNoiseClass == PIC_NOISE_CLASS_2) && ((sequenceControlSetPtr->staticConfig.rateControlMode == 0 && sequenceControlSetPtr->qp > DENOISER_QP_TH) || (sequenceControlSetPtr->staticConfig.rateControlMode != 0 && sequenceControlSetPtr->staticConfig.targetBitRate < DENOISER_BITRATE_TH)))) {
3238 
3239 			SubSampleFilterNoise(
3240 				sequenceControlSetPtr,
3241 				pictureControlSetPtr,
3242 				inputPicturePtr,
3243 				noisePicturePtr,
3244 				denoisedPicturePtr);
3245 		}
3246 	}
3247 
3248 	return return_error;
3249 
3250 }
3251 
3252 
HalfSampleDenoise(PictureAnalysisContext_t * contextPtr,SequenceControlSet_t * sequenceControlSetPtr,PictureParentControlSet_t * pictureControlSetPtr,EbPictureBufferDesc_t * sixteenthDecimatedPicturePtr,EB_U32 lcuTotalCount,EB_BOOL denoiseFlag,EB_U32 pictureWidthInLcu)3253 static EB_ERRORTYPE HalfSampleDenoise(
3254 	PictureAnalysisContext_t	*contextPtr,
3255 	SequenceControlSet_t		*sequenceControlSetPtr,
3256 	PictureParentControlSet_t   *pictureControlSetPtr,
3257 	EbPictureBufferDesc_t		*sixteenthDecimatedPicturePtr,
3258 	EB_U32                       lcuTotalCount,
3259 	EB_BOOL                      denoiseFlag,
3260 	EB_U32						 pictureWidthInLcu)
3261 {
3262 
3263 	EB_ERRORTYPE return_error = EB_ErrorNone;
3264 
3265 	EB_U32					 lcuCodingOrder;
3266 	EbPictureBufferDesc_t	*inputPicturePtr = pictureControlSetPtr->enhancedPicturePtr;
3267 	EbPictureBufferDesc_t	*denoisedPicturePtr = contextPtr->denoisedPicturePtr;
3268 	EbPictureBufferDesc_t	*noisePicturePtr = contextPtr->noisePicturePtr;
3269 
3270 	//Reset the flat noise flag array to False for both RealTime/HighComplexity Modes
3271 	for (lcuCodingOrder = 0; lcuCodingOrder < lcuTotalCount; ++lcuCodingOrder) {
3272 		pictureControlSetPtr->lcuFlatNoiseArray[lcuCodingOrder] = 0;
3273 	}
3274 
3275 	pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_INV; //this init is for both REAL-TIME and BEST-QUALITY
3276 
3277     Decimation2D(
3278         &inputPicturePtr->bufferY[inputPicturePtr->originX + inputPicturePtr->originY * inputPicturePtr->strideY],
3279         inputPicturePtr->strideY,
3280         inputPicturePtr->width,
3281         inputPicturePtr->height,
3282         &sixteenthDecimatedPicturePtr->bufferY[sixteenthDecimatedPicturePtr->originX + (sixteenthDecimatedPicturePtr->originY * sixteenthDecimatedPicturePtr->strideY)],
3283         sixteenthDecimatedPicturePtr->strideY,
3284         4);
3285 
3286 	SubSampleDetectNoise(
3287 		contextPtr,
3288 		sequenceControlSetPtr,
3289 		pictureControlSetPtr,
3290 		sixteenthDecimatedPicturePtr,
3291 		noisePicturePtr,
3292 		denoisedPicturePtr,
3293 		pictureWidthInLcu);
3294 
3295 	if (denoiseFlag == EB_TRUE) {
3296 
3297 		// Turn OFF the de-noiser for Class 2 at QP=29 and lower (for Fixed_QP) and at the target rate of 14Mbps and higher (for RC=ON)
3298 		if ((pictureControlSetPtr->picNoiseClass == PIC_NOISE_CLASS_3_1) || ((pictureControlSetPtr->picNoiseClass == PIC_NOISE_CLASS_2) && ((sequenceControlSetPtr->staticConfig.rateControlMode == 0 && sequenceControlSetPtr->qp > DENOISER_QP_TH) || (sequenceControlSetPtr->staticConfig.rateControlMode != 0 && sequenceControlSetPtr->staticConfig.targetBitRate < DENOISER_BITRATE_TH)))) {
3299 
3300 			SubSampleFilterNoise(
3301 				sequenceControlSetPtr,
3302 				pictureControlSetPtr,
3303 				inputPicturePtr,
3304 				noisePicturePtr,
3305 				denoisedPicturePtr);
3306 		}
3307 	}
3308 
3309 	return return_error;
3310 
3311 }
3312 
3313 
3314 /************************************************
3315  * Set Picture Parameters based on input configuration
3316  ** Setting Number of regions per resolution
3317  ** Setting width and height for subpicture and when picture scan type is 1
3318  ************************************************/
static void SetPictureParametersForStatisticsGathering(
	SequenceControlSet_t            *sequenceControlSetPtr
	)
{
	// The same "higher than class 1" presets are written unconditionally:
	// the activity-region threshold and the region split per height/width
	// that the picture-analysis statistics read from the sequence control set.
	sequenceControlSetPtr->pictureActivityRegionTh =
		HIGHER_THAN_CLASS_1_PICTURE_ACTIVITY_REGIONS_TH;
	sequenceControlSetPtr->pictureAnalysisNumberOfRegionsPerHeight =
		HIGHER_THAN_CLASS_1_REGION_SPLIT_PER_HEIGHT;
	sequenceControlSetPtr->pictureAnalysisNumberOfRegionsPerWidth =
		HIGHER_THAN_CLASS_1_REGION_SPLIT_PER_WIDTH;
}
3329 
3330 /************************************************
3331  * Picture Pre Processing Operations *
 *** A function that groups all of the pre-processing
3333  * operations performed on the input picture
3334  *** Operations included at this point:
3335  ***** Borders preprocessing
3336  ***** Denoising
3337  ************************************************/
static void PicturePreProcessingOperations(
	PictureParentControlSet_t       *pictureControlSetPtr,
	PictureAnalysisContext_t        *contextPtr,
	SequenceControlSet_t            *sequenceControlSetPtr,
	EbPictureBufferDesc_t           *quarterDecimatedPicturePtr,
	EbPictureBufferDesc_t           *sixteenthDecimatedPicturePtr,
	EB_U32                           lcuTotalCount,
	EB_U32                           pictureWidthInLcu)
{
	// Dispatch on the picture's noise detection method. Exactly one of the
	// three denoise routines runs; their return values are not propagated.

	if (pictureControlSetPtr->noiseDetectionMethod == NOISE_DETECT_HALF_PRECISION) {
		// Half precision works on the sixteenth-decimated buffer
		(void)HalfSampleDenoise(
			contextPtr,
			sequenceControlSetPtr,
			pictureControlSetPtr,
			sixteenthDecimatedPicturePtr,
			lcuTotalCount,
			pictureControlSetPtr->enableDenoiseSrcFlag,
			pictureWidthInLcu);
		return;
	}

	if (pictureControlSetPtr->noiseDetectionMethod == NOISE_DETECT_QUARTER_PRECISION) {
		// Quarter precision works on the quarter-decimated buffer
		(void)QuarterSampleDenoise(
			contextPtr,
			sequenceControlSetPtr,
			pictureControlSetPtr,
			quarterDecimatedPicturePtr,
			lcuTotalCount,
			pictureControlSetPtr->enableDenoiseSrcFlag,
			pictureWidthInLcu);
		return;
	}

	// Any other method falls back to the full-sample path, which needs no decimated buffer
	(void)FullSampleDenoise(
		contextPtr,
		sequenceControlSetPtr,
		pictureControlSetPtr,
		lcuTotalCount,
		pictureControlSetPtr->enableDenoiseSrcFlag
	);
}
3379 
3380 /**************************************************************
3381 * Generate picture histogram bins for YUV pixel intensity *
3382 * Calculation is done on a region based (Set previously, resolution dependent)
3383 **************************************************************/
static void SubSampleLumaGeneratePixelIntensityHistogramBins(
	SequenceControlSet_t            *sequenceControlSetPtr,
	PictureParentControlSet_t       *pictureControlSetPtr,
	EbPictureBufferDesc_t           *inputPicturePtr,
    EB_U64                          *sumAverageIntensityTotalRegionsLuma){

	// For every analysis region of inputPicturePtr: builds the luma histogram
	// (component index 0), stores the rounded average luma of the region, adds
	// 16x the region sum to *sumAverageIntensityTotalRegionsLuma and scales
	// every histogram bin by 16.
	// NOTE(review): the x16 factor presumably compensates for inputPicturePtr
	// being a 1/16-size decimated picture - confirm against the caller.
	const EB_U32 regionsAcross = sequenceControlSetPtr->pictureAnalysisNumberOfRegionsPerWidth;
	const EB_U32 regionsDown   = sequenceControlSetPtr->pictureAnalysisNumberOfRegionsPerHeight;
	const EB_U32 nominalWidth  = inputPicturePtr->width / regionsAcross;
	const EB_U32 nominalHeight = inputPicturePtr->height / regionsDown;
	EB_U32       col;
	EB_U32       row;
	EB_U32       bin;

	for (col = 0; col < regionsAcross; col++) {           // horizontal regions
		// The last column absorbs the pixels left over by the integer division
		const EB_U32 extraWidth = (col == regionsAcross - 1) ?
			inputPicturePtr->width - (regionsAcross * nominalWidth) :
			0;
		const EB_U32 spanWidth = nominalWidth + extraWidth;

		for (row = 0; row < regionsDown; row++) {         // vertical regions
			// The last row absorbs the leftover lines in the same way
			const EB_U32 extraHeight = (row == regionsDown - 1) ?
				inputPicturePtr->height - (regionsDown * nominalHeight) :
				0;
			const EB_U32 spanHeight = nominalHeight + extraHeight;
			const EB_U32 spanArea   = spanWidth * spanHeight;
			EB_U32      *lumaBins   = pictureControlSetPtr->pictureHistogram[col][row][0];
			EB_U64       lumaSum;

			// Pre-fill the bins with the value 1 before accumulating
			InitializeBuffer_32bits_funcPtrArray[!!(ASM_TYPES & PREAVX2_MASK)](lumaBins, 64, 0, 1);

			// Y histogram and pixel sum of this region (decimation step 1)
			CalculateHistogram(
				&inputPicturePtr->bufferY[(inputPicturePtr->originX + col * nominalWidth) + ((inputPicturePtr->originY + row * nominalHeight) * inputPicturePtr->strideY)],
				spanWidth,
				spanHeight,
				inputPicturePtr->strideY,
				1,
				lumaBins,
				&lumaSum);

			// Rounded mean: (sum + area/2) / area
			pictureControlSetPtr->averageIntensityPerRegion[col][row][0] = (EB_U8)((lumaSum + (spanArea >> 1)) / spanArea);

			(*sumAverageIntensityTotalRegionsLuma) += (lumaSum << 4);

			for (bin = 0; bin < HISTOGRAM_NUMBER_OF_BINS; bin++) {
				lumaBins[bin] <<= 4;
			}
		}
	}
}
3439 
/**************************************************************
* Generate picture histogram bins for the Cb and Cr planes *
* For every analysis region: builds the Cb (index 1) and Cr (index 2)
* histograms with a decimation step of 4, scales the region sum and
* every bin by (1 << 4), accumulates the scaled sums into the two
* output totals and stores the rounded average intensity per region.
**************************************************************/
static void SubSampleChromaGeneratePixelIntensityHistogramBins(
    SequenceControlSet_t            *sequenceControlSetPtr,
    PictureParentControlSet_t       *pictureControlSetPtr,
    EbPictureBufferDesc_t           *inputPicturePtr,
    EB_U64                          *sumAverageIntensityTotalRegionsCb,
    EB_U64                          *sumAverageIntensityTotalRegionsCr){

    const EB_U8  decimStep     = 4;
    const EB_U32 regionsAcross = sequenceControlSetPtr->pictureAnalysisNumberOfRegionsPerWidth;
    const EB_U32 regionsDown   = sequenceControlSetPtr->pictureAnalysisNumberOfRegionsPerHeight;
    const EB_U32 nominalWidth  = inputPicturePtr->width / regionsAcross;
    const EB_U32 nominalHeight = inputPicturePtr->height / regionsDown;
    EB_U32       col;
    EB_U32       row;
    EB_U32       plane;
    EB_U16       bin;

    for (col = 0; col < regionsAcross; col++) {        // horizontal regions
        // The last column absorbs the pixels left over by the integer division
        const EB_U32 extraWidth = (col == regionsAcross - 1) ?
            inputPicturePtr->width - (regionsAcross * nominalWidth) :
            0;
        const EB_U32 spanWidth = nominalWidth + extraWidth;

        for (row = 0; row < regionsDown; row++) {      // vertical regions
            const EB_U32 extraHeight = (row == regionsDown - 1) ?
                inputPicturePtr->height - (regionsDown * nominalHeight) :
                0;
            const EB_U32 spanHeight = nominalHeight + extraHeight;
            // Region area expressed in luma samples; the chroma area is a quarter of it
            const EB_U32 lumaArea   = spanWidth * spanHeight;
            // Chroma offsets are the luma offsets halved in each direction
            const EB_U32 chromaX    = (inputPicturePtr->originX + col * nominalWidth) >> 1;
            const EB_U32 chromaY    = (inputPicturePtr->originY + row * nominalHeight) >> 1;

            // Pre-fill both chroma histograms with the value 1
            InitializeBuffer_32bits_funcPtrArray[!!(ASM_TYPES & PREAVX2_MASK)](pictureControlSetPtr->pictureHistogram[col][row][1], 64, 0, 1);
            InitializeBuffer_32bits_funcPtrArray[!!(ASM_TYPES & PREAVX2_MASK)](pictureControlSetPtr->pictureHistogram[col][row][2], 64, 0, 1);

            // plane 1 = Cb (U), plane 2 = Cr (V); processed in that order
            for (plane = 1; plane <= 2; plane++) {
                EB_U8  *planeSamples = (plane == 1) ? inputPicturePtr->bufferCb : inputPicturePtr->bufferCr;
                EB_U32  planeStride  = (plane == 1) ? inputPicturePtr->strideCb : inputPicturePtr->strideCr;
                EB_U64 *planeTotal   = (plane == 1) ? sumAverageIntensityTotalRegionsCb : sumAverageIntensityTotalRegionsCr;
                EB_U32 *planeBins    = pictureControlSetPtr->pictureHistogram[col][row][plane];
                EB_U64  planeSum;

                CalculateHistogram(
                    &planeSamples[chromaX + (chromaY * planeStride)],
                    spanWidth >> 1,
                    spanHeight >> 1,
                    planeStride,
                    decimStep,
                    planeBins,
                    &planeSum);

                // Scale the decimated sum before using it
                planeSum = (planeSum << decimStep);
                *planeTotal += planeSum;

                // Rounded mean over the chroma area: (sum + (area/4)/2) / (area/4)
                pictureControlSetPtr->averageIntensityPerRegion[col][row][plane] = (EB_U8)((planeSum + (lumaArea >> 3)) / (lumaArea >> 2));

                for (bin = 0; bin < HISTOGRAM_NUMBER_OF_BINS; bin++) {
                    planeBins[bin] <<= decimStep;
                }
            }
        }
    }
}
3521 
/******************************************************
* Edge flag derivation from 16x16 block means
* Clears every LcuStat_t, then - on pictures whose number is a
* multiple of 4 only - computes, for each complete potential-logo
* LCU, a gradient per 16x16 block from the Y/Cr/Cb means of its
* horizontal and vertical neighbours inside the LCU, and sets
* edgeCu when the gradient, normalised by the picture maximum, is
* at least 30 (on a scale clipped to 255).
******************************************************/
static void EdgeDetectionMeanLumaChroma16x16(
	SequenceControlSet_t        *sequenceControlSetPtr,
	PictureParentControlSet_t   *pictureControlSetPtr,
    PictureAnalysisContext_t    *contextPtr,
	EB_U32                       totalLcuCount)
{
	EB_U32 lcuIdx;
	EB_U32 peakGrad = 1;   // starts at 1 so the normalisation never divides by zero

	// The per-LCU statistics are wiped on every picture
	for (lcuIdx = 0; lcuIdx < totalLcuCount; lcuIdx++) {
		EB_MEMSET(&pictureControlSetPtr->lcuStatArray[lcuIdx], 0, sizeof(LcuStat_t));
	}

	// The values are calculated for every 4th frame only
	if ((pictureControlSetPtr->pictureNumber & 3) != 0) {
		return;
	}

	// Pass 1: gradient of each 16x16 block and the picture-wide maximum
	for (lcuIdx = 0; lcuIdx < totalLcuCount; lcuIdx++) {
		const LcuParams_t *params = &sequenceControlSetPtr->lcuParamsArray[lcuIdx];
		const EB_U8       *lumaMean;
		const EB_U8       *crMean;
		const EB_U8       *cbMean;
		EB_U16            *lcuGrad;
		EB_U8              rasterIdx;

		if (!(params->potentialLogoLcu && params->isCompleteLcu)) {
			continue;
		}

		lumaMean = pictureControlSetPtr->yMean[lcuIdx];
		crMean   = pictureControlSetPtr->crMean[lcuIdx];
		cbMean   = pictureControlSetPtr->cbMean[lcuIdx];
		lcuGrad  = contextPtr->grad[lcuIdx];

		for (rasterIdx = RASTER_SCAN_CU_INDEX_16x16_0; rasterIdx <= RASTER_SCAN_CU_INDEX_16x16_15; rasterIdx++) {
			// Position of the block in the 4x4 grid of 16x16 blocks
			const EB_U8 blockIdx = rasterIdx - 5;
			const EB_U8 col      = blockIdx & 3;
			const EB_U8 row      = (blockIdx >> 2);
			EB_S32      horz      = 0;
			EB_S32      vert      = 0;
			EB_S32      horzTerms = 0;
			EB_S32      vertTerms = 0;
			EB_U16      blockGrad;

			// Horizontal differences with the left / right neighbours that exist
			if (col > 0) {
				horz += ABS((EB_S32)(lumaMean[rasterIdx]) - (EB_S32)(lumaMean[rasterIdx - 1]));
				horz += ABS((EB_S32)(crMean[rasterIdx]) - (EB_S32)(crMean[rasterIdx - 1]));
				horz += ABS((EB_S32)(cbMean[rasterIdx]) - (EB_S32)(cbMean[rasterIdx - 1]));
				horzTerms++;
			}
			if (col < 3) {
				horz += ABS((EB_S32)(lumaMean[rasterIdx + 1]) - (EB_S32)(lumaMean[rasterIdx]));
				horz += ABS((EB_S32)(crMean[rasterIdx + 1]) - (EB_S32)(crMean[rasterIdx]));
				horz += ABS((EB_S32)(cbMean[rasterIdx + 1]) - (EB_S32)(cbMean[rasterIdx]));
				horzTerms++;
			}
			// At least one horizontal neighbour always exists, so horzTerms >= 1
			horz /= horzTerms;

			// Vertical differences with the upper / lower neighbours that exist
			if (row > 0) {
				vert += ABS((EB_S32)(lumaMean[rasterIdx]) - (EB_S32)(lumaMean[rasterIdx - 4]));
				vert += ABS((EB_S32)(crMean[rasterIdx]) - (EB_S32)(crMean[rasterIdx - 4]));
				vert += ABS((EB_S32)(cbMean[rasterIdx]) - (EB_S32)(cbMean[rasterIdx - 4]));
				vertTerms++;
			}
			if (row < 3) {
				vert += ABS((EB_S32)(lumaMean[rasterIdx + 4]) - (EB_S32)(lumaMean[rasterIdx]));
				vert += ABS((EB_S32)(crMean[rasterIdx + 4]) - (EB_S32)(crMean[rasterIdx]));
				vert += ABS((EB_S32)(cbMean[rasterIdx + 4]) - (EB_S32)(cbMean[rasterIdx]));
				vertTerms++;
			}
			vert /= vertTerms;

			// Both terms are averages of absolute values, hence non-negative
			blockGrad = (EB_U16)(horz + vert);
			lcuGrad[rasterIdx] = blockGrad;
			if (blockGrad > peakGrad) {
				peakGrad = blockGrad;
			}
		}
	}

	// Pass 2: normalise by the maximum gradient and threshold into the edge flag
	for (lcuIdx = 0; lcuIdx < totalLcuCount; lcuIdx++) {
		const LcuParams_t *params = &sequenceControlSetPtr->lcuParamsArray[lcuIdx];
		LcuStat_t         *stat;
		EB_U32             rasterIdx;

		if (!(params->potentialLogoLcu && params->isCompleteLcu)) {
			continue;
		}

		stat = &pictureControlSetPtr->lcuStatArray[lcuIdx];

		for (rasterIdx = RASTER_SCAN_CU_INDEX_16x16_0; rasterIdx <= RASTER_SCAN_CU_INDEX_16x16_15; rasterIdx++) {
			const EB_U16 scaledGrad = (EB_U16)MIN(((contextPtr->grad[lcuIdx][rasterIdx] * (255*3)) / peakGrad), 255);

			stat->cuStatArray[rasterIdx].edgeCu = (scaledGrad < 30) ? 0 : 1;
		}
	}
}
3623 
3624 /******************************************************
3625 * Edge map derivation
3626 ******************************************************/
static void EdgeDetection(
	SequenceControlSet_t            *sequenceControlSetPtr,
	PictureParentControlSet_t       *pictureControlSetPtr)
{
	// Per-LCU edge map derivation. For every LCU that is not on the picture
	// border it:
	//  - stores in edgeBlockNum whether the 64x64 variance exceeds 70% of the
	//    picture average variance, and counts such LCUs,
	//  - sets sharpEdgeLcuFlag when variance entry 0 is above 200 and more
	//    than four of its 16x16 blocks have a variance below 20,
	//  - for LCUs at least four LCUs away from every picture border, marks the
	//    9x9 LCU neighbourhood as isolatedHighIntensityLcu when the LCU mean
	//    is above 180 and at least one of its four direct neighbours has a
	//    mean below 120.
	// Finally writes lcuBlockPercentage, the percentage of high-variance LCUs.

	const EB_U64 highVarianceTh     = (pictureControlSetPtr->picAvgVariance * 70) / 100;
	const EB_U32 pictureWidthInLcu  = (sequenceControlSetPtr->lumaWidth + sequenceControlSetPtr->lcuSize - 1) / sequenceControlSetPtr->lcuSize;
	const EB_U32 pictureHeightInLcu = (sequenceControlSetPtr->lumaHeight + sequenceControlSetPtr->lcuSize - 1) / sequenceControlSetPtr->lcuSize;
	const EB_U32 lcuTotalCount      = pictureControlSetPtr->lcuTotalCount;

	const EB_U8 highIntensityTh    = 180;
	const EB_U8 lowIntensityTh     = 120;
	const EB_U8 highIntensityTh1   = 200;
	const EB_U8 veryLowIntensityTh =  20;

	EdgeLcuResults_t *edgeResultsPtr = pictureControlSetPtr->edgeResultsPtr;
	EB_U32            numberOfEdgeLcu = 0;
	EB_U32            lcuIndex;
	EB_U32            rasterScanCuIndex;
	EB_S32            i, j;

	// Reset all per-LCU results BEFORE the analysis pass. The neighbourhood
	// marking below writes isolatedHighIntensityLcu for LCUs with a larger
	// index than the current one; resetting inside the analysis loop would
	// erase those marks as soon as the loop reached the marked LCUs.
	for (lcuIndex = 0; lcuIndex < lcuTotalCount; ++lcuIndex) {
		edgeResultsPtr[lcuIndex].edgeBlockNum = 0;
		edgeResultsPtr[lcuIndex].isolatedHighIntensityLcu = 0;
		pictureControlSetPtr->sharpEdgeLcuFlag[lcuIndex] = 0;
	}

	for (lcuIndex = 0; lcuIndex < lcuTotalCount; ++lcuIndex) {

		const LcuParams_t *lcuParams = &sequenceControlSetPtr->lcuParamsArray[lcuIndex];
		const EB_U32       lcu_X = lcuParams->horizontalIndex;
		const EB_U32       lcu_Y = lcuParams->verticalIndex;
		const EB_U16      *variancePtr;
		const EB_U8       *meanPtr;
		EB_BOOL            highVarianceLcuFlag;

		// LCUs on the picture border are left with their reset values
		if (!(lcu_X > 0 && lcu_X < (EB_U32)(pictureWidthInLcu - 1) && lcu_Y > 0 && lcu_Y < (EB_U32)(pictureHeightInLcu - 1))) {
			continue;
		}

		variancePtr = pictureControlSetPtr->variance[lcuIndex];
		meanPtr     = pictureControlSetPtr->yMean[lcuIndex];

		highVarianceLcuFlag =
			(variancePtr[RASTER_SCAN_CU_INDEX_64x64] > highVarianceTh) ? EB_TRUE : EB_FALSE;
		edgeResultsPtr[lcuIndex].edgeBlockNum = highVarianceLcuFlag;

		// Sharp edge: high overall variance but several nearly flat 16x16 blocks
		if (variancePtr[0] > highIntensityTh1) {
			EB_U8 flatBlockCount = 0;
			for (rasterScanCuIndex = RASTER_SCAN_CU_INDEX_16x16_0; rasterScanCuIndex <= RASTER_SCAN_CU_INDEX_16x16_15; rasterScanCuIndex++) {
				if (variancePtr[rasterScanCuIndex] < veryLowIntensityTh) {
					flatBlockCount++;
				}
			}
			if (flatBlockCount > 4) {
				pictureControlSetPtr->sharpEdgeLcuFlag[lcuIndex] = 1;
			}
		}

		// The 9x9 neighbourhood must lie entirely inside the picture
		if (lcu_X > 3 && lcu_X < (EB_U32)(pictureWidthInLcu - 4) && lcu_Y > 3 && lcu_Y < (EB_U32)(pictureHeightInLcu - 4)) {

			if (meanPtr[RASTER_SCAN_CU_INDEX_64x64] > highIntensityTh) {

				// Bright LCU: is any of the left / right / top / bottom neighbours dark?
				const EB_BOOL hasDarkNeighbour =
					(pictureControlSetPtr->yMean[lcuIndex - 1][RASTER_SCAN_CU_INDEX_64x64] < lowIntensityTh ||
					 pictureControlSetPtr->yMean[lcuIndex + 1][RASTER_SCAN_CU_INDEX_64x64] < lowIntensityTh ||
					 pictureControlSetPtr->yMean[lcuIndex - pictureWidthInLcu][RASTER_SCAN_CU_INDEX_64x64] < lowIntensityTh ||
					 pictureControlSetPtr->yMean[lcuIndex + pictureWidthInLcu][RASTER_SCAN_CU_INDEX_64x64] < lowIntensityTh) ? EB_TRUE : EB_FALSE;

				if (hasDarkNeighbour) {
					for (i = -4; i < 5; i++) {
						for (j = -4; j < 5; j++) {
							// Unsigned wrap-around of the negative offsets is intended;
							// the sum lands on a valid index thanks to the bounds check above
							const EB_U32 neighbourLcuIndex = lcuIndex + (i * pictureWidthInLcu) + j;
							edgeResultsPtr[neighbourLcuIndex].isolatedHighIntensityLcu = 1;
						}
					}
				}
			}
		}

		if (highVarianceLcuFlag) {
			numberOfEdgeLcu += edgeResultsPtr[lcuIndex].edgeBlockNum;
		}
	}

	// Guard the percentage against an empty picture (division by zero)
	pictureControlSetPtr->lcuBlockPercentage = (lcuTotalCount > 0) ?
		(EB_U8)((numberOfEdgeLcu * 100) / lcuTotalCount) :
		0;

	return;
}
3747 
3748 /******************************************************
3749 * Calculate the variance of variance to determine Homogeneous regions. Note: Variance calculation should be on.
3750 ******************************************************/
static inline void DetermineHomogeneousRegionInPicture(
    SequenceControlSet_t            *sequenceControlSetPtr,
    PictureParentControlSet_t       *pictureControlSetPtr)
{
    // For every complete LCU: computes the variance of the 8x8 block
    // variances over each 32x32 quadrant (varOfVar32x32BasedLcuArray) and over
    // the whole 64x64, and clears lcuHomogeneousAreaArray when the 64x64 value
    // exceeds VAR_BASED_DETAIL_PRESERVATION_SELECTOR_THRSLHD. Incomplete LCUs
    // keep the homogeneous flag and get an all-ones varOfVar.
    // Then derives veryLowVarPicFlag / logoPicFlag from the percentage of
    // complete LCUs whose 64x64 variance is below LCU_LOW_VAR_TH.
    //
    // All squares are computed in 64 bits: multiplying two EB_U16 operands
    // directly promotes them to signed int, which can overflow.

    const EB_U32 lcuTotalCount = pictureControlSetPtr->lcuTotalCount;
    EB_U64       veryLowVarCnt = 0;
    EB_U64       varLcuCnt     = 0;
    EB_U32       lcuIndex;

    for (lcuIndex = 0; lcuIndex < lcuTotalCount; ++lcuIndex) {

        const LcuParams_t *lcuParams   = &sequenceControlSetPtr->lcuParamsArray[lcuIndex];
        const EB_U16      *variancePtr = pictureControlSetPtr->variance[lcuIndex];
        EB_U64             meanSqrVariance64x64Based = 0;
        EB_U64             meanVariance64x64Based    = 0;
        EB_U64             varOfVar64x64Based;
        EB_U32             quadrant;
        EB_U32             varIndex;

        // Initialize
        pictureControlSetPtr->lcuHomogeneousAreaArray[lcuIndex] = EB_TRUE;

        if (!lcuParams->isCompleteLcu) {
            // Should be re-calculated and scaled properly
            for (quadrant = 0; quadrant < 4; quadrant++) {
                pictureControlSetPtr->varOfVar32x32BasedLcuArray[lcuIndex][quadrant] = 0xFFFFFFFFFFFFFFFF;
            }
            continue;
        }

        varLcuCnt++;
        if (variancePtr[ME_TIER_ZERO_PU_64x64] < LCU_LOW_VAR_TH) {
            veryLowVarCnt++;
        }

        // Variance of the sixteen 8x8 variances inside each 32x32 quadrant.
        // The 8x8 entries are addressed as an 8x8 grid (8 blocks per row);
        // quadrant 0/1 = top-left/top-right, 2/3 = bottom-left/bottom-right.
        for (quadrant = 0; quadrant < 4; quadrant++) {
            const EB_U32 firstRow = (quadrant >> 1) * 4;
            const EB_U32 firstCol = (quadrant & 1) * 4;
            EB_U64       meanSqr  = 0;
            EB_U64       mean     = 0;
            EB_U32       cuH, cuW;

            for (cuH = 0; cuH < 4; cuH++) {
                for (cuW = 0; cuW < 4; cuW++) {
                    const EB_U64 blockVariance = variancePtr[ME_TIER_ZERO_PU_8x8_0 + (firstRow + cuH) * 8 + firstCol + cuW];
                    meanSqr += blockVariance * blockVariance;
                    mean    += blockVariance;
                }
            }

            // 16 samples per quadrant
            meanSqr >>= 4;
            mean    >>= 4;
            pictureControlSetPtr->varOfVar32x32BasedLcuArray[lcuIndex][quadrant] = meanSqr - mean * mean;
        }

        // Compute the 64x64 based variance of variance: loop over all 8x8s in a 64x64
        for (varIndex = ME_TIER_ZERO_PU_8x8_0; varIndex <= ME_TIER_ZERO_PU_8x8_63; varIndex++) {
            const EB_U64 blockVariance = variancePtr[varIndex];
            meanSqrVariance64x64Based += blockVariance * blockVariance;
            meanVariance64x64Based    += blockVariance;
        }

        // 64 samples per LCU
        meanSqrVariance64x64Based >>= 6;
        meanVariance64x64Based    >>= 6;

        varOfVar64x64Based = meanSqrVariance64x64Based - meanVariance64x64Based * meanVariance64x64Based;

        // Turn off detail preservation if the varOfVar is greater than a threshold
        if (varOfVar64x64Based > VAR_BASED_DETAIL_PRESERVATION_SELECTOR_THRSLHD) {
            pictureControlSetPtr->lcuHomogeneousAreaArray[lcuIndex] = EB_FALSE;
        }
    }

    // Picture-level flags from the share of very-low-variance complete LCUs
    pictureControlSetPtr->veryLowVarPicFlag = EB_FALSE;
    pictureControlSetPtr->logoPicFlag       = EB_FALSE;

    if (varLcuCnt > 0) {
        const EB_U64 veryLowVarPercentage = (veryLowVarCnt * 100) / varLcuCnt;

        if (veryLowVarPercentage > PIC_LOW_VAR_PERCENTAGE_TH) {
            pictureControlSetPtr->veryLowVarPicFlag = EB_TRUE;
        }
        if (veryLowVarPercentage > 80) {
            pictureControlSetPtr->logoPicFlag = EB_TRUE;
        }
    }

    return;
}
3872 
3873 /************************************************
3874  * ComputePictureSpatialStatistics
3875  ** Compute Block Variance
3876  ** Compute Picture Variance
3877  ** Compute Block Mean for all blocks in the picture
3878  ************************************************/
/*
 * ComputePictureSpatialStatistics
 *
 * Walks every LCU of the picture (pictureControlSetPtr->lcuTotalCount of them) and:
 *   - calls ComputeBlockMeanComputeVariance() on the padded luma picture,
 *   - calls ComputeChromaBlockMean() for complete LCUs, or
 *     ZeroOutChromaBlockMean() for incomplete (border) LCUs,
 *   - accumulates the 64x64 luma variance of the LCU.
 * Afterwards it stores the average 64x64 variance in
 * pictureControlSetPtr->picAvgVariance and runs the homogeneous-region and
 * edge-detection passes.
 *
 * sequenceControlSetPtr  - supplies lcuParamsArray[] (LCU origins, completeness)
 * pictureControlSetPtr   - receives all the statistics
 * contextPtr             - forwarded to EdgeDetectionMeanLumaChroma16x16()
 * inputPicturePtr        - picture used for the chroma means
 * inputPaddedPicturePtr  - padded picture used for the luma mean/variance
 * lcuTotalCount          - divisor used for the picture average variance only;
 *                          the LCU loop bound comes from pictureControlSetPtr
 */
static void ComputePictureSpatialStatistics(
    SequenceControlSet_t            *sequenceControlSetPtr,
    PictureParentControlSet_t       *pictureControlSetPtr,
    PictureAnalysisContext_t        *contextPtr,
    EbPictureBufferDesc_t           *inputPicturePtr,
    EbPictureBufferDesc_t           *inputPaddedPicturePtr,
    EB_U32                           lcuTotalCount)
{
    EB_U64 varianceSum = 0;
    EB_U32 lcu;

    for (lcu = 0; lcu < pictureControlSetPtr->lcuTotalCount; ++lcu) {
        LcuParams_t  *params = &sequenceControlSetPtr->lcuParamsArray[lcu];

        // LCU origin is taken from the sequence-level table (avoids using the child PCS)
        const EB_U32  originX = params->originX;
        const EB_U32  originY = params->originY;

        // Offset of the LCU's first luma sample inside the padded picture
        const EB_U32  lumaIndex =
            (inputPaddedPicturePtr->originY + originY) * inputPaddedPicturePtr->strideY +
            inputPaddedPicturePtr->originX + originX;

        ComputeBlockMeanComputeVariance(
            pictureControlSetPtr,
            inputPaddedPicturePtr,
            lcu,
            lumaIndex);

        if (!params->isCompleteLcu) {
            // Border LCU: no chroma means are computed, they are zeroed instead
            ZeroOutChromaBlockMean(
                pictureControlSetPtr,
                lcu);
        }
        else {
            // Chroma planes are addressed at half the luma coordinates
            const EB_U32 cbIndex =
                ((inputPicturePtr->originY + originY) >> 1) * inputPicturePtr->strideCb +
                ((inputPicturePtr->originX + originX) >> 1);
            const EB_U32 crIndex =
                ((inputPicturePtr->originY + originY) >> 1) * inputPicturePtr->strideCr +
                ((inputPicturePtr->originX + originX) >> 1);

            ComputeChromaBlockMean(
                pictureControlSetPtr,
                inputPicturePtr,
                lcu,
                cbIndex,
                crIndex);
        }

        varianceSum += (pictureControlSetPtr->variance[lcu][RASTER_SCAN_CU_INDEX_64x64]);
    }

    pictureControlSetPtr->picAvgVariance = (EB_U16)(varianceSum / lcuTotalCount);

    // Variance of variance is used to flag homogeneous regions; it relies on
    // the per-LCU variances filled in by the loop above.
    DetermineHomogeneousRegionInPicture(
        sequenceControlSetPtr,
        pictureControlSetPtr);

    EdgeDetectionMeanLumaChroma16x16(
        sequenceControlSetPtr,
        pictureControlSetPtr,
        contextPtr,
        sequenceControlSetPtr->lcuTotalCount);

    EdgeDetection(
        sequenceControlSetPtr,
        pictureControlSetPtr);
}
3952 
/*
 * CalculateInputAverageIntensity
 *
 * Stores the picture-level average sample intensity in
 * pictureControlSetPtr->averageIntensity[].
 *
 * scdMode == SCD_MODE_0:
 *   Only the luma entry ([0]) is written. ComputeSubMean8x8_SSE2_INTRIN() is
 *   summed over every whole 8x8 luma block of the active picture area, the sum
 *   is divided (with rounding) by the number of 8x8 blocks, and the result is
 *   rounded down by MEAN_PRECISION bits.
 *   The block scan starts at the active-area origin (originX, originY) of the
 *   luma buffer; starting at bufferY[0] would average a window that is shifted
 *   into the border padding and misses the bottom/right part of the picture.
 *
 * otherwise:
 *   [0], [1] and [2] are the rounded quotients of the supplied luma/Cb/Cr sums
 *   by width*height (luma) and (width*height) >> 2 (each chroma plane).
 *
 * sequenceControlSetPtr                - supplies scdMode
 * pictureControlSetPtr                 - receives averageIntensity[]
 * inputPicturePtr                      - picture whose luma is scanned / whose
 *                                        dimensions are used as divisors
 * sumAverageIntensityTotalRegions{Luma,Cb,Cr}
 *                                      - precomputed sums, used only when
 *                                        scdMode != SCD_MODE_0
 */
static void CalculateInputAverageIntensity(
    SequenceControlSet_t            *sequenceControlSetPtr,
    PictureParentControlSet_t       *pictureControlSetPtr,
    EbPictureBufferDesc_t           *inputPicturePtr,
    EB_U64                           sumAverageIntensityTotalRegionsLuma,
    EB_U64                           sumAverageIntensityTotalRegionsCb,
    EB_U64                           sumAverageIntensityTotalRegionsCr)
{
    // Widen before multiplying so the product is evaluated in unsigned arithmetic
    const EB_U32 lumaSampleCount = (EB_U32)inputPicturePtr->width * inputPicturePtr->height;

    if (sequenceControlSetPtr->scdMode == SCD_MODE_0) {
        const EB_U16 strideY    = inputPicturePtr->strideY;
        const EB_U32 blockCount = lumaSampleCount >> 6;     // number of 8x8 blocks
        EB_U16       blockIndexInWidth;
        EB_U16       blockIndexInHeight;
        EB_U64       mean = 0;

        // First sample of the active picture area (skips the top/left border)
        EB_U8 *lumaOrigin = inputPicturePtr->bufferY +
            inputPicturePtr->originX + inputPicturePtr->originY * strideY;

        // Loop over 8x8 blocks and accumulate their mean values
        for (blockIndexInHeight = 0; blockIndexInHeight < inputPicturePtr->height >> 3; ++blockIndexInHeight) {
            for (blockIndexInWidth = 0; blockIndexInWidth < inputPicturePtr->width >> 3; ++blockIndexInWidth) {
                mean += ComputeSubMean8x8_SSE2_INTRIN(
                    &lumaOrigin[(blockIndexInWidth << 3) + (blockIndexInHeight << 3) * strideY],
                    strideY);
            }
        }

        // Average over the blocks (round to nearest), then drop the fractional bits
        mean = (mean + (blockCount >> 1)) / blockCount;
        mean = (mean + (1 << (MEAN_PRECISION - 1))) >> MEAN_PRECISION;
        pictureControlSetPtr->averageIntensity[0] = (EB_U8)mean;
    }
    else {
        // Each chroma plane is divided by a quarter of the luma sample count
        const EB_U32 chromaSampleCount = lumaSampleCount >> 2;

        pictureControlSetPtr->averageIntensity[0] =
            (EB_U8)((sumAverageIntensityTotalRegionsLuma + (lumaSampleCount >> 1)) / lumaSampleCount);
        pictureControlSetPtr->averageIntensity[1] =
            (EB_U8)((sumAverageIntensityTotalRegionsCb + (lumaSampleCount >> 3)) / chromaSampleCount);
        pictureControlSetPtr->averageIntensity[2] =
            (EB_U8)((sumAverageIntensityTotalRegionsCr + (lumaSampleCount >> 3)) / chromaSampleCount);
    }
}
3989 
3990 /************************************************
3991  * Gathering statistics per picture
3992  ** Calculating the pixel intensity histogram bins per picture needed for SCD
3993  ** Computing Picture Variance
3994  ************************************************/
/*
 * GatheringPictureStatistics
 *
 * Runs the per-picture statistics passes in this order:
 *   1. luma histogram bins from the 1/16 decimated picture,
 *   2. chroma histogram bins from the input picture,
 *   3. average intensity (fed with the sums produced by steps 1 and 2),
 *   4. spatial statistics (block mean / variance, homogeneity, edges).
 *
 * lcuTotalCount is forwarded unchanged to ComputePictureSpatialStatistics().
 */
static void GatheringPictureStatistics(
    SequenceControlSet_t            *sequenceControlSetPtr,
    PictureParentControlSet_t       *pictureControlSetPtr,
    PictureAnalysisContext_t        *contextPtr,
    EbPictureBufferDesc_t           *inputPicturePtr,
    EbPictureBufferDesc_t           *inputPaddedPicturePtr,
    EbPictureBufferDesc_t           *sixteenthDecimatedPicturePtr,
    EB_U32                           lcuTotalCount)
{
    // Intensity sums filled in by the histogram passes
    EB_U64 lumaIntensitySum = 0;
    EB_U64 cbIntensitySum   = 0;
    EB_U64 crIntensitySum   = 0;

    // Luma histogram: the 1/16 decimated luma is already available
    SubSampleLumaGeneratePixelIntensityHistogramBins(
        sequenceControlSetPtr,
        pictureControlSetPtr,
        sixteenthDecimatedPicturePtr,
        &lumaIntensitySum);

    // Chroma histogram: no decimated chroma exists, so the full input picture is passed
    SubSampleChromaGeneratePixelIntensityHistogramBins(
        sequenceControlSetPtr,
        pictureControlSetPtr,
        inputPicturePtr,
        &cbIntensitySum,
        &crIntensitySum);

    // Picture average intensity
    CalculateInputAverageIntensity(
        sequenceControlSetPtr,
        pictureControlSetPtr,
        inputPicturePtr,
        lumaIntensitySum,
        cbIntensitySum,
        crIntensitySum);

    // Block means, variances, homogeneity and edge information
    ComputePictureSpatialStatistics(
        sequenceControlSetPtr,
        pictureControlSetPtr,
        contextPtr,
        inputPicturePtr,
        inputPaddedPicturePtr,
        lcuTotalCount);
}
4046 
4047 /************************************************
4048  * Pad Picture at the right and bottom sides
4049  ** To match a multiple of min CU size in width and height
4050  ************************************************/
/*
 * PadPictureToMultipleOfMinCuSizeDimensions
 *
 * Calls PadInputPicture() on the Y, Cb and Cr planes of inputPicturePtr, and,
 * when the encoder bit depth is above 8 bits, also on the three "BitInc"
 * planes. Each call receives the plane's unpadded size (picture size minus
 * padRight / padBottom) and the right / bottom pad amounts, all scaled to the
 * plane's resolution.
 *
 * Chroma scaling follows the picture colour format:
 *   horizontal shift is 0 for EB_YUV444, 1 otherwise;
 *   vertical   shift is 0 for EB_YUV422 and above, 1 otherwise.
 */
static void PadPictureToMultipleOfMinCuSizeDimensions(
    SequenceControlSet_t            *sequenceControlSetPtr,
    EbPictureBufferDesc_t           *inputPicturePtr)
{
    const EB_BOOL is16BitInput = (EB_BOOL)(sequenceControlSetPtr->staticConfig.encoderBitDepth > EB_8BIT);
    const EB_U32  colorFormat  = inputPicturePtr->colorFormat;

    // log2 of the chroma sub-sampling factors
    const EB_U16  subWidthCMinus1  = (colorFormat == EB_YUV444) ? 0 : 1;
    const EB_U16  subHeightCMinus1 = (colorFormat >= EB_YUV422) ? 0 : 1;

    // Luma geometry: source size without the padding, and the padding itself
    const EB_U32  padRight   = sequenceControlSetPtr->padRight;
    const EB_U32  padBottom  = sequenceControlSetPtr->padBottom;
    const EB_U32  lumaWidth  = inputPicturePtr->width - padRight;
    const EB_U32  lumaHeight = inputPicturePtr->height - padBottom;
    const EB_U32  lumaOriginX = inputPicturePtr->originX;
    const EB_U32  lumaOriginY = inputPicturePtr->originY;

    // Same quantities at chroma resolution
    const EB_U32  chromaWidth     = lumaWidth >> subWidthCMinus1;
    const EB_U32  chromaHeight    = lumaHeight >> subHeightCMinus1;
    const EB_U32  chromaPadRight  = padRight >> subWidthCMinus1;
    const EB_U32  chromaPadBottom = padBottom >> subHeightCMinus1;
    const EB_U32  chromaOriginX   = lumaOriginX >> subWidthCMinus1;
    const EB_U32  chromaOriginY   = lumaOriginY >> subHeightCMinus1;

    // 8-bit planes: Y, Cb, Cr
    PadInputPicture(
        &inputPicturePtr->bufferY[lumaOriginX + (lumaOriginY * inputPicturePtr->strideY)],
        inputPicturePtr->strideY,
        lumaWidth,
        lumaHeight,
        padRight,
        padBottom);

    PadInputPicture(
        &inputPicturePtr->bufferCb[chromaOriginX + (chromaOriginY * inputPicturePtr->strideCb)],
        inputPicturePtr->strideCb,
        chromaWidth,
        chromaHeight,
        chromaPadRight,
        chromaPadBottom);

    PadInputPicture(
        &inputPicturePtr->bufferCr[chromaOriginX + (chromaOriginY * inputPicturePtr->strideCr)],
        inputPicturePtr->strideCr,
        chromaWidth,
        chromaHeight,
        chromaPadRight,
        chromaPadBottom);

    if (!is16BitInput) {
        return;
    }

    // High bit depth: the bit-increment planes get the same treatment
    PadInputPicture(
        &inputPicturePtr->bufferBitIncY[lumaOriginX + (lumaOriginY * inputPicturePtr->strideBitIncY)],
        inputPicturePtr->strideBitIncY,
        lumaWidth,
        lumaHeight,
        padRight,
        padBottom);

    PadInputPicture(
        &inputPicturePtr->bufferBitIncCb[chromaOriginX + (chromaOriginY * inputPicturePtr->strideBitIncCb)],
        inputPicturePtr->strideBitIncCb,
        chromaWidth,
        chromaHeight,
        chromaPadRight,
        chromaPadBottom);

    PadInputPicture(
        &inputPicturePtr->bufferBitIncCr[chromaOriginX + (chromaOriginY * inputPicturePtr->strideBitIncCr)],
        inputPicturePtr->strideBitIncCr,
        chromaWidth,
        chromaHeight,
        chromaPadRight,
        chromaPadBottom);
}
4114 
4115 /************************************************
4116  * Pad Picture at the right and bottom sides
4117  ** To complete border LCU smaller than LCU size
4118  ************************************************/
/*
 * PadPictureToMultipleOfLcuDimensions
 *
 * Runs GeneratePadding() over the luma plane of the padded input picture,
 * passing the start of the luma buffer, its stride, the picture width and
 * height, and the picture origin (originX, originY) as the border sizes.
 * Only the luma plane is processed here.
 */
static void PadPictureToMultipleOfLcuDimensions(
    EbPictureBufferDesc_t           *inputPaddedPicturePtr
        )
{
    EbPictureBufferDesc_t *const picture = inputPaddedPicturePtr;

    GeneratePadding(
        picture->bufferY,
        picture->strideY,
        picture->width,
        picture->height,
        picture->originX,
        picture->originY);
}
4135 
4136 /************************************************
4137 * 1/4 & 1/16 input picture decimation
4138 ************************************************/
/*
 * DecimateInputPicture
 *
 * Produces the decimated luma pictures from the padded input picture:
 *   - the quarter picture (Decimation2D with step 2, used for HME level 1)
 *     is produced only when speed control is enabled or the picture has
 *     enableHmeLevel1Flag == 1;
 *   - the sixteenth picture (Decimation2D with step 4, used for HME level 0)
 *     is always produced.
 * Each decimated picture is passed to GeneratePadding() right after it is
 * written.
 */
static void DecimateInputPicture(
    SequenceControlSet_t            *sequenceControlSetPtr,
    PictureParentControlSet_t       *pictureControlSetPtr,
    EbPictureBufferDesc_t           *inputPaddedPicturePtr,
    EbPictureBufferDesc_t           *quarterDecimatedPicturePtr,
    EbPictureBufferDesc_t           *sixteenthDecimatedPicturePtr) {

    // Speed control always needs the quarter picture; otherwise follow the HME L1 flag
    const EB_BOOL performQuarterDecimation = (EB_BOOL)(
        sequenceControlSetPtr->staticConfig.speedControlFlag ||
        pictureControlSetPtr->enableHmeLevel1Flag == 1);

    // First active luma sample of the source; shared by both decimations
    EB_U8 *sourceLuma =
        &inputPaddedPicturePtr->bufferY[inputPaddedPicturePtr->originX + inputPaddedPicturePtr->originY * inputPaddedPicturePtr->strideY];

    // Decimate input picture for HME L1
    if (performQuarterDecimation) {
        EbPictureBufferDesc_t *const quarter = quarterDecimatedPicturePtr;

        Decimation2D(
            sourceLuma,
            inputPaddedPicturePtr->strideY,
            inputPaddedPicturePtr->width,
            inputPaddedPicturePtr->height,
            &quarter->bufferY[quarter->originX + quarter->originY * quarter->strideY],
            quarter->strideY,
            2);

        GeneratePadding(
            &quarter->bufferY[0],
            quarter->strideY,
            quarter->width,
            quarter->height,
            quarter->originX,
            quarter->originY);
    }

    // Decimate input picture for HME L0 (sixteenth picture)
    {
        EbPictureBufferDesc_t *const sixteenth = sixteenthDecimatedPicturePtr;

        Decimation2D(
            sourceLuma,
            inputPaddedPicturePtr->strideY,
            inputPaddedPicturePtr->width,
            inputPaddedPicturePtr->height,
            &sixteenth->bufferY[sixteenth->originX + sixteenth->originY * sixteenth->strideY],
            sixteenth->strideY,
            4);

        GeneratePadding(
            &sixteenth->bufferY[0],
            sixteenth->strideY,
            sixteenth->width,
            sixteenth->height,
            sixteenth->originX,
            sixteenth->originY);
    }
}
4199 
4200 /************************************************
4201  * Picture Analysis Kernel
4202  * The Picture Analysis Process pads & decimates the input pictures.
4203  * The Picture Analysis also includes creating an n-bin Histogram,
4204  * gathering picture 1st and 2nd moment statistics for each 8x8 block,
4205  * which are used to compute variance.
4206  * The Picture Analysis process is multithreaded, so pictures can be
4207  * processed out of order as long as all inputs are available.
4208  ************************************************/
PictureAnalysisKernel(void * inputPtr)4209 void* PictureAnalysisKernel(void *inputPtr)
4210 {
4211 	PictureAnalysisContext_t        *contextPtr = (PictureAnalysisContext_t*)inputPtr;
4212 	PictureParentControlSet_t       *pictureControlSetPtr;
4213 	SequenceControlSet_t            *sequenceControlSetPtr;
4214 
4215 	EbObjectWrapper_t               *inputResultsWrapperPtr;
4216 	ResourceCoordinationResults_t   *inputResultsPtr;
4217 	EbObjectWrapper_t               *outputResultsWrapperPtr;
4218 	PictureAnalysisResults_t        *outputResultsPtr;
4219 	EbPaReferenceObject_t           *paReferenceObject;
4220 
4221 	EbPictureBufferDesc_t           *inputPaddedPicturePtr;
4222 	EbPictureBufferDesc_t           *quarterDecimatedPicturePtr;
4223 	EbPictureBufferDesc_t           *sixteenthDecimatedPicturePtr;
4224 	EbPictureBufferDesc_t           *inputPicturePtr;
4225 
4226 	// Variance
4227 	EB_U32                          pictureWidthInLcu;
4228 	EB_U32                          pictureHeighInLcu;
4229 	EB_U32                          lcuTotalCount;
4230 
4231 	for (;;) {
4232 
4233 		// Get Input Full Object
4234 		EbGetFullObject(
4235 			contextPtr->resourceCoordinationResultsInputFifoPtr,
4236 			&inputResultsWrapperPtr);
4237         EB_CHECK_END_OBJ(inputResultsWrapperPtr);
4238 
4239 		inputResultsPtr = (ResourceCoordinationResults_t*)inputResultsWrapperPtr->objectPtr;
4240 		pictureControlSetPtr = (PictureParentControlSet_t*)inputResultsPtr->pictureControlSetWrapperPtr->objectPtr;
4241 		sequenceControlSetPtr = (SequenceControlSet_t*)pictureControlSetPtr->sequenceControlSetWrapperPtr->objectPtr;
4242 		inputPicturePtr = pictureControlSetPtr->enhancedPicturePtr;
4243 #if DEADLOCK_DEBUG
4244         if ((pictureControlSetPtr->pictureNumber >= MIN_POC) && (pictureControlSetPtr->pictureNumber <= MAX_POC))
4245             SVT_LOG("POC %lu PA IN \n", pictureControlSetPtr->pictureNumber);
4246 #endif
4247 		paReferenceObject = (EbPaReferenceObject_t*)pictureControlSetPtr->paReferencePictureWrapperPtr->objectPtr;
4248 		inputPaddedPicturePtr = (EbPictureBufferDesc_t*)paReferenceObject->inputPaddedPicturePtr;
4249 		quarterDecimatedPicturePtr = (EbPictureBufferDesc_t*)paReferenceObject->quarterDecimatedPicturePtr;
4250 		sixteenthDecimatedPicturePtr = (EbPictureBufferDesc_t*)paReferenceObject->sixteenthDecimatedPicturePtr;
4251 
4252 		// Variance
4253 		pictureWidthInLcu = (sequenceControlSetPtr->lumaWidth + sequenceControlSetPtr->lcuSize - 1) / sequenceControlSetPtr->lcuSize;
4254 		pictureHeighInLcu = (sequenceControlSetPtr->lumaHeight + sequenceControlSetPtr->lcuSize - 1) / sequenceControlSetPtr->lcuSize;
4255 		lcuTotalCount = pictureWidthInLcu * pictureHeighInLcu;
4256 
4257 		// Pad pictures to multiple min cu size
4258 		PadPictureToMultipleOfMinCuSizeDimensions(
4259 			sequenceControlSetPtr,
4260 			inputPicturePtr);
4261 
4262         // Backup the Y component data from input picture into PA reference picture, to work arond the race condition that
4263         // the input picture buffer pointed by PA reference picture (in ResourceCoordination) would be updated even though
4264         // it's still being referenced.
4265         EB_U8 *pa = inputPaddedPicturePtr->bufferY + inputPaddedPicturePtr->originX + inputPaddedPicturePtr->originY * inputPaddedPicturePtr->strideY;
4266         EB_U8 *in = inputPicturePtr->bufferY + inputPicturePtr->originX + inputPicturePtr->originY * inputPicturePtr->strideY;
4267         for (EB_U32 row = 0; row < inputPicturePtr->height; row++) {
4268             EB_MEMCPY(pa + row * inputPaddedPicturePtr->strideY, in + row * inputPicturePtr->strideY, sizeof(EB_U8) * inputPicturePtr->width);
4269         }
4270 
4271         // Set picture parameters to account for subpicture, picture scantype, and set regions by resolutions
4272 		SetPictureParametersForStatisticsGathering(
4273 			sequenceControlSetPtr);
4274 
4275 
4276 		// Pre processing operations performed on the input picture
4277         PicturePreProcessingOperations(
4278             pictureControlSetPtr,
4279             contextPtr,
4280             sequenceControlSetPtr,
4281             quarterDecimatedPicturePtr,
4282             sixteenthDecimatedPicturePtr,
4283             lcuTotalCount,
4284             pictureWidthInLcu);
4285 
4286         if (inputPicturePtr->colorFormat >= EB_YUV422) {
4287             // Jing: Do the conversion of 422/444=>420 here since it's multi-threaded kernel
4288             //       Reuse the Y, only add cb/cr in the newly created buffer desc
4289             //       NOTE: since denoise may change the src, so this part is after PicturePreProcessingOperations()
4290             pictureControlSetPtr->chromaDownSamplePicturePtr->bufferY = inputPicturePtr->bufferY;
4291             DownSampleChroma(inputPicturePtr, pictureControlSetPtr->chromaDownSamplePicturePtr);
4292         } else {
4293             pictureControlSetPtr->chromaDownSamplePicturePtr = inputPicturePtr;
4294         }
4295 
4296 		// Pad input picture to complete border LCUs
4297 		PadPictureToMultipleOfLcuDimensions(
4298 			inputPaddedPicturePtr
4299         );
4300 
4301 		// 1/4 & 1/16 input picture decimation
4302 		DecimateInputPicture(
4303             sequenceControlSetPtr,
4304 			pictureControlSetPtr,
4305 			inputPaddedPicturePtr,
4306 			quarterDecimatedPicturePtr,
4307 			sixteenthDecimatedPicturePtr);
4308 
4309 		// Gathering statistics of input picture, including Variance Calculation, Histogram Bins
4310 		GatheringPictureStatistics(
4311 			sequenceControlSetPtr,
4312 			pictureControlSetPtr,
4313             contextPtr,
4314 			pictureControlSetPtr->chromaDownSamplePicturePtr, //420 inputPicturePtr
4315 			inputPaddedPicturePtr,
4316 			sixteenthDecimatedPicturePtr,
4317 			lcuTotalCount);
4318 
4319 
4320 		// Hold the 64x64 variance and mean in the reference frame
4321 		EB_U32 lcuIndex;
4322 		for (lcuIndex = 0; lcuIndex < pictureControlSetPtr->lcuTotalCount; ++lcuIndex){
4323 			paReferenceObject->variance[lcuIndex] = pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_64x64];
4324 			paReferenceObject->yMean[lcuIndex] = pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_64x64];
4325 
4326 		}
4327 
4328 		// Get Empty Results Object
4329 		EbGetEmptyObject(
4330 			contextPtr->pictureAnalysisResultsOutputFifoPtr,
4331 			&outputResultsWrapperPtr);
4332 
4333 		outputResultsPtr = (PictureAnalysisResults_t*)outputResultsWrapperPtr->objectPtr;
4334 		outputResultsPtr->pictureControlSetWrapperPtr = inputResultsPtr->pictureControlSetWrapperPtr;
4335 
4336 		// Release the Input Results
4337 		EbReleaseObject(inputResultsWrapperPtr);
4338 
4339 #if LATENCY_PROFILE
4340         double latency = 0.0;
4341         EB_U64 finishTimeSeconds = 0;
4342         EB_U64 finishTimeuSeconds = 0;
4343         EbHevcFinishTime((uint64_t*)&finishTimeSeconds, (uint64_t*)&finishTimeuSeconds);
4344 
4345         EbHevcComputeOverallElapsedTimeMs(
4346                 pictureControlSetPtr->startTimeSeconds,
4347                 pictureControlSetPtr->startTimeuSeconds,
4348                 finishTimeSeconds,
4349                 finishTimeuSeconds,
4350                 &latency);
4351 
4352         SVT_LOG("POC %lld PA OUT, decoder order %d, latency %3.3f \n",
4353                 pictureControlSetPtr->pictureNumber,
4354                 pictureControlSetPtr->decodeOrder,
4355                 latency);
4356 #endif
4357 		// Post the Full Results Object
4358 		EbPostFullObject(outputResultsWrapperPtr);
4359 
4360 #if DEADLOCK_DEBUG
4361         if ((pictureControlSetPtr->pictureNumber >= MIN_POC) && (pictureControlSetPtr->pictureNumber <= MAX_POC))
4362             SVT_LOG("POC %lu PA OUT \n", pictureControlSetPtr->pictureNumber);
4363 #endif
4364 	}
4365 	return EB_NULL;
4366 }
4367 
UnusedVariablevoidFunc_PA()4368 void UnusedVariablevoidFunc_PA()
4369 {
4370 	(void)SadCalculation_8x8_16x16_funcPtrArray;
4371 	(void)SadCalculation_32x32_64x64_funcPtrArray;
4372 }
4373