/*
* Copyright(c) 2018 Intel Corporation
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
5
6 #include <stdlib.h>
7 #include <string.h>
8
9 #include "EbDefinitions.h"
10 #include "EbSystemResourceManager.h"
11 #include "EbPictureControlSet.h"
12 #include "EbSequenceControlSet.h"
13 #include "EbPictureBufferDesc.h"
14
15 #include "EbResourceCoordinationResults.h"
16 #include "EbPictureAnalysisProcess.h"
17 #include "EbPictureAnalysisResults.h"
18 #include "EbMcp.h"
19 #include "EbMotionEstimation.h"
20 #include "EbReferenceObject.h"
21
22 #include "EbComputeMean.h"
23 #include "EbMeSadCalculation.h"
24 #include "EbPictureOperators.h"
25 #include "EbComputeMean_SSE2.h"
26 #include "EbCombinedAveragingSAD_Intrinsic_AVX2.h"
27
/*
 * Picture-analysis tuning constants.
 * NOTE(review): none of these macros is referenced in the visible part of this
 * file; the remarks below are inferred from the macro names only -- confirm
 * against the use sites before relying on them.
 */
#define VARIANCE_PRECISION                          16
#define LCU_LOW_VAR_TH                              5
#define PIC_LOW_VAR_PERCENTAGE_TH                   60
#define FLAT_MAX_VAR                                50
#define FLAT_MAX_VAR_DECIM                          (50 - 0)        // "_DECIM" variant; offset is currently 0
#define NOISE_MIN_LEVEL_0                           70000           // an earlier value of 120000 was kept as a comment
#define NOISE_MIN_LEVEL_DECIM_0                     (70000 + 0)     // "_DECIM" variant; offset is currently 0 (earlier: 120000 + 0)
#define NOISE_MIN_LEVEL_1                           120000
#define NOISE_MIN_LEVEL_DECIM_1                     (120000 + 0)    // "_DECIM" variant; offset is currently 0
#define DENOISER_QP_TH                              29
#define DENOISER_BITRATE_TH                         14000000
#define SAMPLE_THRESHOLD_PRECENT_BORDER_LINE        15
#define SAMPLE_THRESHOLD_PRECENT_TWO_BORDER_LINES   10
41
/************************************************
 * Picture Analysis Context Destructor
 *   Releases what the context owns: the noise picture, the denoised
 *   picture and the grad pointer array.
 *
 *   p : the PictureAnalysisContext_t to tear down, passed as EB_PTR.
 ************************************************/
static void PictureAnalysisContextDctor(EB_PTR p)
{
    PictureAnalysisContext_t *contextPtr = (PictureAnalysisContext_t *)p;

    // The constructor only creates these two pictures when it is called with
    // denoiseFlag == EB_TRUE.
    // NOTE(review): relies on EB_DELETE tolerating a pointer that was never set -- confirm.
    EB_DELETE(contextPtr->noisePicturePtr);
    EB_DELETE(contextPtr->denoisedPicturePtr);

    // grad is an array of lcuTotalCountAllocated individually allocated entries.
    EB_FREE_PTR_ARRAY(contextPtr->grad, contextPtr->lcuTotalCountAllocated);
}
49 /************************************************
50 * Picture Analysis Context Constructor
51 ************************************************/
PictureAnalysisContextCtor(PictureAnalysisContext_t * contextPtr,EbPictureBufferDescInitData_t * inputPictureBufferDescInitData,EB_BOOL denoiseFlag,EbFifo_t * resourceCoordinationResultsInputFifoPtr,EbFifo_t * pictureAnalysisResultsOutputFifoPtr,EB_U16 lcuTotalCount)52 EB_ERRORTYPE PictureAnalysisContextCtor(
53 PictureAnalysisContext_t *contextPtr,
54 EbPictureBufferDescInitData_t *inputPictureBufferDescInitData,
55 EB_BOOL denoiseFlag,
56 EbFifo_t *resourceCoordinationResultsInputFifoPtr,
57 EbFifo_t *pictureAnalysisResultsOutputFifoPtr,
58 EB_U16 lcuTotalCount)
59 {
60 contextPtr->dctor = PictureAnalysisContextDctor;
61 contextPtr->resourceCoordinationResultsInputFifoPtr = resourceCoordinationResultsInputFifoPtr;
62 contextPtr->pictureAnalysisResultsOutputFifoPtr = pictureAnalysisResultsOutputFifoPtr;
63
64 if (denoiseFlag == EB_TRUE){
65
66 //denoised
67 // If 420/422, re-use luma for chroma
68 // If 444, re-use luma for Cr
69 if (inputPictureBufferDescInitData->colorFormat != EB_YUV444) {
70 inputPictureBufferDescInitData->bufferEnableMask = PICTURE_BUFFER_DESC_Y_FLAG;
71 } else {
72 inputPictureBufferDescInitData->bufferEnableMask = PICTURE_BUFFER_DESC_Y_FLAG | PICTURE_BUFFER_DESC_Cb_FLAG;
73 }
74
75 EB_NEW(
76 contextPtr->denoisedPicturePtr,
77 EbPictureBufferDescCtor,
78 inputPictureBufferDescInitData);
79
80 if (inputPictureBufferDescInitData->colorFormat != EB_YUV444) {
81 contextPtr->denoisedPicturePtr->bufferCb = contextPtr->denoisedPicturePtr->bufferY;
82 contextPtr->denoisedPicturePtr->bufferCr = contextPtr->denoisedPicturePtr->bufferY + contextPtr->denoisedPicturePtr->chromaSize;
83 } else {
84 contextPtr->denoisedPicturePtr->bufferCr = contextPtr->denoisedPicturePtr->bufferY;
85 }
86
87 // noise
88 inputPictureBufferDescInitData->maxHeight = MAX_LCU_SIZE;
89 inputPictureBufferDescInitData->bufferEnableMask = PICTURE_BUFFER_DESC_Y_FLAG;
90
91
92 EB_NEW(
93 contextPtr->noisePicturePtr,
94 EbPictureBufferDescCtor,
95 inputPictureBufferDescInitData);
96 }
97 contextPtr->lcuTotalCountAllocated = lcuTotalCount;
98 EB_ALLOC_PTR_ARRAY(contextPtr->grad, lcuTotalCount);
99 for (EB_U16 lcuIndex = 0; lcuIndex < lcuTotalCount; ++lcuIndex) {
100 EB_MALLOC_ARRAY(contextPtr->grad[lcuIndex], CU_MAX_COUNT);
101 }
102 return EB_ErrorNone;
103 }
104
/*
 * DownSampleChroma
 *   Fills the Cb and Cr planes of outputPicturePtr by picking samples (no
 *   filtering) from the Cb and Cr planes of inputPicturePtr. Both pictures
 *   are addressed from their (originX, originY) position scaled to the chroma
 *   grid of their own colour format.
 *
 *   The number of samples written comes from the OUTPUT picture's width,
 *   height and colour format; the spacing between source samples depends only
 *   on the INPUT colour format.
 *   NOTE(review): this mapping appears to assume a 4:2:0 output picture --
 *   confirm against the callers.
 */
static void DownSampleChroma(EbPictureBufferDesc_t* inputPicturePtr, EbPictureBufferDesc_t* outputPicturePtr)
{
    const EB_U32 srcFormat = inputPicturePtr->colorFormat;
    const EB_U32 dstFormat = outputPicturePtr->colorFormat;

    // log2 of the chroma subsampling factor in each direction (0 or 1).
    const EB_U16 srcShiftX = (srcFormat == EB_YUV444 ? 1 : 2) - 1;
    const EB_U16 srcShiftY = (srcFormat >= EB_YUV422 ? 1 : 2) - 1;
    const EB_U16 dstShiftX = (dstFormat == EB_YUV444 ? 1 : 2) - 1;
    const EB_U16 dstShiftY = (dstFormat >= EB_YUV422 ? 1 : 2) - 1;

    // Plane 0 is Cb, plane 1 is Cr; both are processed identically.
    EB_U8  *srcPlanes[2];
    EB_U8  *dstPlanes[2];
    EB_U32  srcStrides[2];
    EB_U32  dstStrides[2];
    EB_U32  plane;

    srcPlanes[0]  = inputPicturePtr->bufferCb;
    srcPlanes[1]  = inputPicturePtr->bufferCr;
    srcStrides[0] = inputPicturePtr->strideCb;
    srcStrides[1] = inputPicturePtr->strideCr;

    dstPlanes[0]  = outputPicturePtr->bufferCb;
    dstPlanes[1]  = outputPicturePtr->bufferCr;
    dstStrides[0] = outputPicturePtr->strideCb;
    dstStrides[1] = outputPicturePtr->strideCr;

    for (plane = 0; plane < 2; ++plane) {
        const EB_U32 strideIn  = srcStrides[plane];
        const EB_U32 strideOut = dstStrides[plane];

        const EB_U32 srcOrigin = (inputPicturePtr->originX >> srcShiftX) +
                                 (inputPicturePtr->originY >> srcShiftY) * strideIn;
        const EB_U32 dstOrigin = (outputPicturePtr->originX >> dstShiftX) +
                                 (outputPicturePtr->originY >> dstShiftY) * strideOut;

        EB_U8 *src = &(srcPlanes[plane][srcOrigin]);
        EB_U8 *dst = &(dstPlanes[plane][dstOrigin]);

        const EB_U32 dstRows = (EB_U32)(outputPicturePtr->height >> dstShiftY);
        const EB_U32 dstCols = (EB_U32)(outputPicturePtr->width >> dstShiftX);

        EB_U32 x, y;

        for (y = 0; y < dstRows; y++) {
            EB_U8 *srcRow = src + (y << (1 - srcShiftY)) * strideIn;
            EB_U8 *dstRow = dst + y * strideOut;

            for (x = 0; x < dstCols; x++) {
                dstRow[x] = srcRow[x << (1 - srcShiftX)];
            }
        }
    }
}
164
165 /************************************************
166 * Picture Analysis Context Destructor
167 ************************************************/
168
169 /********************************************
170 * Decimation2D
171 * decimates the input
172 ********************************************/
Decimation2D(EB_U8 * inputSamples,EB_U32 inputStride,EB_U32 inputAreaWidth,EB_U32 inputAreaHeight,EB_U8 * decimSamples,EB_U32 decimStride,EB_U32 decimStep)173 void Decimation2D(
174 EB_U8 * inputSamples, // input parameter, input samples Ptr
175 EB_U32 inputStride, // input parameter, input stride
176 EB_U32 inputAreaWidth, // input parameter, input area width
177 EB_U32 inputAreaHeight, // input parameter, input area height
178 EB_U8 * decimSamples, // output parameter, decimated samples Ptr
179 EB_U32 decimStride, // input parameter, output stride
180 EB_U32 decimStep) // input parameter, area height
181 {
182
183 EB_U32 horizontalIndex;
184 EB_U32 verticalIndex;
185
186
187 for (verticalIndex = 0; verticalIndex < inputAreaHeight; verticalIndex += decimStep) {
188 for (horizontalIndex = 0; horizontalIndex < inputAreaWidth; horizontalIndex += decimStep) {
189
190 decimSamples[(horizontalIndex >> (decimStep >> 1))] = inputSamples[horizontalIndex];
191
192 }
193 inputSamples += (inputStride << (decimStep >> 1));
194 decimSamples += decimStride;
195 }
196
197 return;
198 }
199
200 /********************************************
201 * CalculateHistogram
202 * creates n-bins histogram for the input
203 ********************************************/
CalculateHistogram(EB_U8 * inputSamples,EB_U32 inputAreaWidth,EB_U32 inputAreaHeight,EB_U32 stride,EB_U8 decimStep,EB_U32 * histogram,EB_U64 * sum)204 static void CalculateHistogram(
205 EB_U8 * inputSamples, // input parameter, input samples Ptr
206 EB_U32 inputAreaWidth, // input parameter, input area width
207 EB_U32 inputAreaHeight, // input parameter, input area height
208 EB_U32 stride, // input parameter, input stride
209 EB_U8 decimStep, // input parameter, area height
210 EB_U32 *histogram, // output parameter, output histogram
211 EB_U64 *sum)
212
213 {
214
215 EB_U32 horizontalIndex;
216 EB_U32 verticalIndex;
217 *sum = 0;
218
219 for (verticalIndex = 0; verticalIndex < inputAreaHeight; verticalIndex += decimStep) {
220 for (horizontalIndex = 0; horizontalIndex < inputAreaWidth; horizontalIndex += decimStep) {
221 ++(histogram[inputSamples[horizontalIndex]]);
222 *sum += inputSamples[horizontalIndex];
223 }
224 inputSamples += (stride << (decimStep >> 1));
225 }
226
227 return;
228 }
229
230
/*
 * ComputeVariance32x32
 *   Evaluates ComputeMeanFunc[0] and ComputeMeanFunc[1] on each of the
 *   sixteen 8x8 luma blocks of the 32x32 area starting at
 *   inputLumaOriginIndex (row-major, four blocks per row), then:
 *     - writes variance8x8[i] = squared-value term - (mean term)^2 for each
 *       of the 16 blocks,
 *     - returns the same expression computed from the averaged terms of the
 *       whole 32x32 area.
 *
 *   inputPaddedPicturePtr : picture whose bufferY / strideY are read
 *   inputLumaOriginIndex  : index into bufferY of the area's top-left sample
 *   variance8x8           : output, must hold at least 16 entries
 */
static EB_U64 ComputeVariance32x32(
    EbPictureBufferDesc_t *inputPaddedPicturePtr,   // input parameter, Input Padded Picture
    EB_U32                 inputLumaOriginIndex,    // input parameter, LCU index, used to point to source/reference samples
    EB_U64                *variance8x8)             // output parameter, 16 per-8x8-block variances
{
    EB_U64 mean8x8[16];
    EB_U64 meanSquared8x8[16];

    EB_U64 mean32x32        = 0;
    EB_U64 meanSquared32x32 = 0;

    EB_U32 row;
    EB_U32 col;
    EB_U32 idx;
    EB_U32 group;

    // Walk the 4x4 grid of 8x8 blocks in raster order.
    for (row = 0; row < 4; ++row) {
        EB_U32 blockIndex = inputLumaOriginIndex + row * 8 * inputPaddedPicturePtr->strideY;

        for (col = 0; col < 4; ++col) {
            idx = (row << 2) + col;

            mean8x8[idx]        = ComputeMeanFunc[0][!!(ASM_TYPES & AVX2_MASK)](&(inputPaddedPicturePtr->bufferY[blockIndex]), inputPaddedPicturePtr->strideY, 8, 8);
            meanSquared8x8[idx] = ComputeMeanFunc[1][!!(ASM_TYPES & AVX2_MASK)](&(inputPaddedPicturePtr->bufferY[blockIndex]), inputPaddedPicturePtr->strideY, 8, 8);

            blockIndex += 8;
        }
    }

    // Per-8x8 variance.
    for (idx = 0; idx < 16; ++idx) {
        variance8x8[idx] = meanSquared8x8[idx] - (mean8x8[idx] * mean8x8[idx]);
    }

    // Intermediate averages of four 8x8 terms each, then the 32x32 average.
    // The groups are {k, k+1, k+8, k+9} for k = 0, 2, 4, 6. With four blocks per
    // row these pair block rows 0 & 2 and 1 & 3 rather than spatial 16x16
    // quadrants. Every block is still counted exactly once, and the grouping is
    // kept as-is so the truncating shifts give bit-identical results.
    for (group = 0; group < 4; ++group) {
        const EB_U32 first = group << 1;

        mean32x32        += (mean8x8[first] + mean8x8[first + 1] + mean8x8[first + 8] + mean8x8[first + 9]) >> 2;
        meanSquared32x32 += (meanSquared8x8[first] + meanSquared8x8[first + 1] + meanSquared8x8[first + 8] + meanSquared8x8[first + 9]) >> 2;
    }

    mean32x32        >>= 2;
    meanSquared32x32 >>= 2;

    return (meanSquared32x32 - (mean32x32 * mean32x32));
}
376
/*
 * ComputeVariance16x16
 *   Evaluates ComputeMeanFunc[0] and ComputeMeanFunc[1] on each of the four
 *   8x8 luma blocks of the 16x16 area starting at inputLumaOriginIndex
 *   (row-major, two blocks per row), then:
 *     - writes variance8x8[i] = squared-value term - (mean term)^2 for each
 *       of the 4 blocks,
 *     - returns the same expression computed from the averaged terms of the
 *       whole 16x16 area.
 *
 *   inputPaddedPicturePtr : picture whose bufferY / strideY are read
 *   inputLumaOriginIndex  : index into bufferY of the area's top-left sample
 *   variance8x8           : output, must hold at least 4 entries
 */
static EB_U64 ComputeVariance16x16(
    EbPictureBufferDesc_t *inputPaddedPicturePtr,   // input parameter, Input Padded Picture
    EB_U32                 inputLumaOriginIndex,    // input parameter, LCU index, used to point to source/reference samples
    EB_U64                *variance8x8)             // output parameter, 4 per-8x8-block variances
{
    EB_U64 mean8x8[4];
    EB_U64 meanSquared8x8[4];

    EB_U64 mean16x16;
    EB_U64 meanSquared16x16;

    EB_U32 row;
    EB_U32 col;
    EB_U32 idx;

    // Walk the 2x2 grid of 8x8 blocks in raster order.
    for (row = 0; row < 2; ++row) {
        EB_U32 blockIndex = inputLumaOriginIndex + row * 8 * inputPaddedPicturePtr->strideY;

        for (col = 0; col < 2; ++col) {
            idx = (row << 1) + col;

            mean8x8[idx]        = ComputeMeanFunc[0][!!(ASM_TYPES & AVX2_MASK)](&(inputPaddedPicturePtr->bufferY[blockIndex]), inputPaddedPicturePtr->strideY, 8, 8);
            meanSquared8x8[idx] = ComputeMeanFunc[1][!!(ASM_TYPES & AVX2_MASK)](&(inputPaddedPicturePtr->bufferY[blockIndex]), inputPaddedPicturePtr->strideY, 8, 8);

            blockIndex += 8;
        }
    }

    // Per-8x8 variance.
    for (idx = 0; idx < 4; ++idx) {
        variance8x8[idx] = meanSquared8x8[idx] - (mean8x8[idx] * mean8x8[idx]);
    }

    // 16x16: average of the four 8x8 terms (truncating shift).
    mean16x16        = (mean8x8[0] + mean8x8[1] + mean8x8[2] + mean8x8[3]) >> 2;
    meanSquared16x16 = (meanSquared8x8[0] + meanSquared8x8[1] + meanSquared8x8[2] + meanSquared8x8[3]) >> 2;

    return (meanSquared16x16 - (mean16x16 * mean16x16));
}
423
/*******************************************
 * ComputeVariance64x64
 *   This function is exactly the same as
 *   PictureAnalysisComputeVarianceLcu, except
 *   that it does not store data for every
 *   block; it just returns the 64x64 data point.
 *******************************************/
ComputeVariance64x64(EbPictureBufferDesc_t * inputPaddedPicturePtr,EB_U32 inputLumaOriginIndex,EB_U64 * variance32x32)431 static EB_U64 ComputeVariance64x64(
432 EbPictureBufferDesc_t *inputPaddedPicturePtr, // input parameter, Input Padded Picture
433 EB_U32 inputLumaOriginIndex, // input parameter, LCU index, used to point to source/reference samples
434 EB_U64 *variance32x32)
435 {
436
437
438 EB_U32 blockIndex;
439
440 EB_U64 meanOf8x8Blocks[64];
441 EB_U64 meanOf8x8SquaredValuesBlocks[64];
442
443 EB_U64 meanOf16x16Blocks[16];
444 EB_U64 meanOf16x16SquaredValuesBlocks[16];
445
446 EB_U64 meanOf32x32Blocks[4];
447 EB_U64 meanOf32x32SquaredValuesBlocks[4];
448
449 EB_U64 meanOf64x64Blocks;
450 EB_U64 meanOf64x64SquaredValuesBlocks;
451
452 // (0,0)
453 blockIndex = inputLumaOriginIndex;
454 const EB_U16 strideY = inputPaddedPicturePtr->strideY;
455
456 if (!!(ASM_TYPES & AVX2_MASK)) {
457
458 ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[0], &meanOf8x8SquaredValuesBlocks[0]);
459
460 // (0,1)
461 blockIndex = blockIndex + 32;
462
463 ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[4], &meanOf8x8SquaredValuesBlocks[4]);
464 // (0,5)
465 blockIndex = blockIndex + 24;
466
467 // (1,0)
468 blockIndex = inputLumaOriginIndex + (strideY << 3);
469
470 ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[8], &meanOf8x8SquaredValuesBlocks[8]);
471
472 // (1,1)
473 blockIndex = blockIndex + 32;
474
475 ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[12], &meanOf8x8SquaredValuesBlocks[12]);
476
477 // (1,5)
478 blockIndex = blockIndex + 24;
479
480 // (2,0)
481 blockIndex = inputLumaOriginIndex + (strideY << 4);
482
483 ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[16], &meanOf8x8SquaredValuesBlocks[16]);
484
485 // (2,1)
486 blockIndex = blockIndex + 32;
487
488 ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[20], &meanOf8x8SquaredValuesBlocks[20]);
489
490 // (2,5)
491 blockIndex = blockIndex + 24;
492
493 // (3,0)
494 blockIndex = inputLumaOriginIndex + (strideY << 3) + (strideY << 4);
495
496 ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[24], &meanOf8x8SquaredValuesBlocks[24]);
497
498 // (3,1)
499 blockIndex = blockIndex + 32;
500
501 ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[28], &meanOf8x8SquaredValuesBlocks[28]);
502
503 // (3,5)
504 blockIndex = blockIndex + 24;
505
506 // (4,0)
507 blockIndex = inputLumaOriginIndex + (strideY << 5);
508
509 ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[32], &meanOf8x8SquaredValuesBlocks[32]);
510
511 // (4,1)
512 blockIndex = blockIndex + 32;
513
514 ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[36], &meanOf8x8SquaredValuesBlocks[36]);
515
516 // (4,5)
517 blockIndex = blockIndex + 24;
518
519 // (5,0)
520 blockIndex = inputLumaOriginIndex + (strideY << 3) + (strideY << 5);
521
522 ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[40], &meanOf8x8SquaredValuesBlocks[40]);
523
524 // (5,1)
525 blockIndex = blockIndex + 32;
526
527 ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[44], &meanOf8x8SquaredValuesBlocks[44]);
528
529 // (5,5)
530 blockIndex = blockIndex + 24;
531
532 // (6,0)
533 blockIndex = inputLumaOriginIndex + (strideY << 4) + (strideY << 5);
534
535 ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[48], &meanOf8x8SquaredValuesBlocks[48]);
536
537 // (6,1)
538 blockIndex = blockIndex + 32;
539
540 ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[52], &meanOf8x8SquaredValuesBlocks[52]);
541
542 // (6,5)
543 blockIndex = blockIndex + 24;
544
545 // (7,0)
546 blockIndex = inputLumaOriginIndex + (strideY << 3) + (strideY << 4) + (strideY << 5);
547
548 ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[56], &meanOf8x8SquaredValuesBlocks[56]);
549
550 // (7,1)
551 blockIndex = blockIndex + 32;
552
553 ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[60], &meanOf8x8SquaredValuesBlocks[60]);
554
555
556 }
557 else{
558 meanOf8x8Blocks[0] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
559 meanOf8x8SquaredValuesBlocks[0] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
560
561 // (0,1)
562 blockIndex = blockIndex + 8;
563 meanOf8x8Blocks[1] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
564 meanOf8x8SquaredValuesBlocks[1] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
565
566 // (0,2)
567 blockIndex = blockIndex + 8;
568 meanOf8x8Blocks[2] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
569 meanOf8x8SquaredValuesBlocks[2] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
570
571 // (0,3)
572 blockIndex = blockIndex + 8;
573 meanOf8x8Blocks[3] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
574 meanOf8x8SquaredValuesBlocks[3] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
575
576 // (0,4)
577 blockIndex = blockIndex + 8;
578 meanOf8x8Blocks[4] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
579 meanOf8x8SquaredValuesBlocks[4] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
580
581 // (0,5)
582 blockIndex = blockIndex + 8;
583 meanOf8x8Blocks[5] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
584 meanOf8x8SquaredValuesBlocks[5] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
585
586 // (0,6)
587 blockIndex = blockIndex + 8;
588 meanOf8x8Blocks[6] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
589 meanOf8x8SquaredValuesBlocks[6] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
590
591 // (0,7)
592 blockIndex = blockIndex + 8;
593 meanOf8x8Blocks[7] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
594 meanOf8x8SquaredValuesBlocks[7] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
595
596 // (1,0)
597 blockIndex = inputLumaOriginIndex + (strideY << 3);
598 meanOf8x8Blocks[8] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
599 meanOf8x8SquaredValuesBlocks[8] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
600
601 // (1,1)
602 blockIndex = blockIndex + 8;
603 meanOf8x8Blocks[9] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
604 meanOf8x8SquaredValuesBlocks[9] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
605
606 // (1,2)
607 blockIndex = blockIndex + 8;
608 meanOf8x8Blocks[10] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
609 meanOf8x8SquaredValuesBlocks[10] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
610
611 // (1,3)
612 blockIndex = blockIndex + 8;
613 meanOf8x8Blocks[11] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
614 meanOf8x8SquaredValuesBlocks[11] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
615
616 // (1,4)
617 blockIndex = blockIndex + 8;
618 meanOf8x8Blocks[12] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
619 meanOf8x8SquaredValuesBlocks[12] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
620
621 // (1,5)
622 blockIndex = blockIndex + 8;
623 meanOf8x8Blocks[13] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
624 meanOf8x8SquaredValuesBlocks[13] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
625
626 // (1,6)
627 blockIndex = blockIndex + 8;
628 meanOf8x8Blocks[14] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
629 meanOf8x8SquaredValuesBlocks[14] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
630
631 // (1,7)
632 blockIndex = blockIndex + 8;
633 meanOf8x8Blocks[15] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
634 meanOf8x8SquaredValuesBlocks[15] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
635
636 // (2,0)
637 blockIndex = inputLumaOriginIndex + (strideY << 4);
638 meanOf8x8Blocks[16] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
639 meanOf8x8SquaredValuesBlocks[16] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
640
641 // (2,1)
642 blockIndex = blockIndex + 8;
643 meanOf8x8Blocks[17] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
644 meanOf8x8SquaredValuesBlocks[17] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
645
646 // (2,2)
647 blockIndex = blockIndex + 8;
648 meanOf8x8Blocks[18] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
649 meanOf8x8SquaredValuesBlocks[18] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
650
651 // (2,3)
652 blockIndex = blockIndex + 8;
653 meanOf8x8Blocks[19] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
654 meanOf8x8SquaredValuesBlocks[19] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
655
656 /// (2,4)
657 blockIndex = blockIndex + 8;
658 meanOf8x8Blocks[20] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
659 meanOf8x8SquaredValuesBlocks[20] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
660
661 // (2,5)
662 blockIndex = blockIndex + 8;
663 meanOf8x8Blocks[21] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
664 meanOf8x8SquaredValuesBlocks[21] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
665
666 // (2,6)
667 blockIndex = blockIndex + 8;
668 meanOf8x8Blocks[22] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
669 meanOf8x8SquaredValuesBlocks[22] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
670
671 // (2,7)
672 blockIndex = blockIndex + 8;
673 meanOf8x8Blocks[23] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
674 meanOf8x8SquaredValuesBlocks[23] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
675
676 // (3,0)
677 blockIndex = inputLumaOriginIndex + (strideY << 3) + (strideY << 4);
678 meanOf8x8Blocks[24] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
679 meanOf8x8SquaredValuesBlocks[24] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
680
681 // (3,1)
682 blockIndex = blockIndex + 8;
683 meanOf8x8Blocks[25] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
684 meanOf8x8SquaredValuesBlocks[25] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
685
686 // (3,2)
687 blockIndex = blockIndex + 8;
688 meanOf8x8Blocks[26] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
689 meanOf8x8SquaredValuesBlocks[26] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
690
691 // (3,3)
692 blockIndex = blockIndex + 8;
693 meanOf8x8Blocks[27] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
694 meanOf8x8SquaredValuesBlocks[27] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
695
696 // (3,4)
697 blockIndex = blockIndex + 8;
698 meanOf8x8Blocks[28] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
699 meanOf8x8SquaredValuesBlocks[28] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
700
701 // (3,5)
702 blockIndex = blockIndex + 8;
703 meanOf8x8Blocks[29] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
704 meanOf8x8SquaredValuesBlocks[29] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
705
706 // (3,6)
707 blockIndex = blockIndex + 8;
708 meanOf8x8Blocks[30] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
709 meanOf8x8SquaredValuesBlocks[30] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
710
711 // (3,7)
712 blockIndex = blockIndex + 8;
713 meanOf8x8Blocks[31] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
714 meanOf8x8SquaredValuesBlocks[31] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
715
716 // (4,0)
717 blockIndex = inputLumaOriginIndex + (strideY << 5);
718 meanOf8x8Blocks[32] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
719 meanOf8x8SquaredValuesBlocks[32] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
720
721 // (4,1)
722 blockIndex = blockIndex + 8;
723 meanOf8x8Blocks[33] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
724 meanOf8x8SquaredValuesBlocks[33] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
725
726 // (4,2)
727 blockIndex = blockIndex + 8;
728 meanOf8x8Blocks[34] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
729 meanOf8x8SquaredValuesBlocks[34] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
730
731 // (4,3)
732 blockIndex = blockIndex + 8;
733 meanOf8x8Blocks[35] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
734 meanOf8x8SquaredValuesBlocks[35] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
735
736 // (4,4)
737 blockIndex = blockIndex + 8;
738 meanOf8x8Blocks[36] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
739 meanOf8x8SquaredValuesBlocks[36] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
740
741 // (4,5)
742 blockIndex = blockIndex + 8;
743 meanOf8x8Blocks[37] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
744 meanOf8x8SquaredValuesBlocks[37] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
745
746 // (4,6)
747 blockIndex = blockIndex + 8;
748 meanOf8x8Blocks[38] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
749 meanOf8x8SquaredValuesBlocks[38] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
750
751 // (4,7)
752 blockIndex = blockIndex + 8;
753 meanOf8x8Blocks[39] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
754 meanOf8x8SquaredValuesBlocks[39] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
755
756 // (5,0)
757 blockIndex = inputLumaOriginIndex + (strideY << 3) + (strideY << 5);
758 meanOf8x8Blocks[40] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
759 meanOf8x8SquaredValuesBlocks[40] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
760
761 // (5,1)
762 blockIndex = blockIndex + 8;
763 meanOf8x8Blocks[41] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
764 meanOf8x8SquaredValuesBlocks[41] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
765
766 // (5,2)
767 blockIndex = blockIndex + 8;
768 meanOf8x8Blocks[42] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
769 meanOf8x8SquaredValuesBlocks[42] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
770
771 // (5,3)
772 blockIndex = blockIndex + 8;
773 meanOf8x8Blocks[43] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
774 meanOf8x8SquaredValuesBlocks[43] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
775
776 // (5,4)
777 blockIndex = blockIndex + 8;
778 meanOf8x8Blocks[44] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
779 meanOf8x8SquaredValuesBlocks[44] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
780
781 // (5,5)
782 blockIndex = blockIndex + 8;
783 meanOf8x8Blocks[45] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
784 meanOf8x8SquaredValuesBlocks[45] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
785
786 // (5,6)
787 blockIndex = blockIndex + 8;
788 meanOf8x8Blocks[46] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
789 meanOf8x8SquaredValuesBlocks[46] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
790
791 // (5,7)
792 blockIndex = blockIndex + 8;
793 meanOf8x8Blocks[47] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
794 meanOf8x8SquaredValuesBlocks[47] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
795
796 // (6,0)
797 blockIndex = inputLumaOriginIndex + (strideY << 4) + (strideY << 5);
798 meanOf8x8Blocks[48] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
799 meanOf8x8SquaredValuesBlocks[48] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
800
801 // (6,1)
802 blockIndex = blockIndex + 8;
803 meanOf8x8Blocks[49] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
804 meanOf8x8SquaredValuesBlocks[49] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
805
806 // (6,2)
807 blockIndex = blockIndex + 8;
808 meanOf8x8Blocks[50] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
809 meanOf8x8SquaredValuesBlocks[50] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
810
811 // (6,3)
812 blockIndex = blockIndex + 8;
813 meanOf8x8Blocks[51] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
814 meanOf8x8SquaredValuesBlocks[51] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
815
816 // (6,4)
817 blockIndex = blockIndex + 8;
818 meanOf8x8Blocks[52] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
819 meanOf8x8SquaredValuesBlocks[52] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
820
821 // (6,5)
822 blockIndex = blockIndex + 8;
823 meanOf8x8Blocks[53] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
824 meanOf8x8SquaredValuesBlocks[53] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
825
826 // (6,6)
827 blockIndex = blockIndex + 8;
828 meanOf8x8Blocks[54] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
829 meanOf8x8SquaredValuesBlocks[54] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
830
831 // (6,7)
832 blockIndex = blockIndex + 8;
833 meanOf8x8Blocks[55] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
834 meanOf8x8SquaredValuesBlocks[55] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
835
836 // (7,0)
837 blockIndex = inputLumaOriginIndex + (strideY << 3) + (strideY << 4) + (strideY << 5);
838 meanOf8x8Blocks[56] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
839 meanOf8x8SquaredValuesBlocks[56] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
840
841 // (7,1)
842 blockIndex = blockIndex + 8;
843 meanOf8x8Blocks[57] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
844 meanOf8x8SquaredValuesBlocks[57] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
845
846 // (7,2)
847 blockIndex = blockIndex + 8;
848 meanOf8x8Blocks[58] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
849 meanOf8x8SquaredValuesBlocks[58] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
850
851 // (7,3)
852 blockIndex = blockIndex + 8;
853 meanOf8x8Blocks[59] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
854 meanOf8x8SquaredValuesBlocks[59] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
855
856 // (7,4)
857 blockIndex = blockIndex + 8;
858 meanOf8x8Blocks[60] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
859 meanOf8x8SquaredValuesBlocks[60] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
860
861 // (7,5)
862 blockIndex = blockIndex + 8;
863 meanOf8x8Blocks[61] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
864 meanOf8x8SquaredValuesBlocks[61] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
865
866 // (7,6)
867 blockIndex = blockIndex + 8;
868 meanOf8x8Blocks[62] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
869 meanOf8x8SquaredValuesBlocks[62] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
870
871 // (7,7)
872 blockIndex = blockIndex + 8;
873 meanOf8x8Blocks[63] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
874 meanOf8x8SquaredValuesBlocks[63] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
875
876
877 }
878
879
880 // 16x16
881 meanOf16x16Blocks[0] = (meanOf8x8Blocks[0] + meanOf8x8Blocks[1] + meanOf8x8Blocks[8] + meanOf8x8Blocks[9]) >> 2;
882 meanOf16x16Blocks[1] = (meanOf8x8Blocks[2] + meanOf8x8Blocks[3] + meanOf8x8Blocks[10] + meanOf8x8Blocks[11]) >> 2;
883 meanOf16x16Blocks[2] = (meanOf8x8Blocks[4] + meanOf8x8Blocks[5] + meanOf8x8Blocks[12] + meanOf8x8Blocks[13]) >> 2;
884 meanOf16x16Blocks[3] = (meanOf8x8Blocks[6] + meanOf8x8Blocks[7] + meanOf8x8Blocks[14] + meanOf8x8Blocks[15]) >> 2;
885
886 meanOf16x16Blocks[4] = (meanOf8x8Blocks[16] + meanOf8x8Blocks[17] + meanOf8x8Blocks[24] + meanOf8x8Blocks[25]) >> 2;
887 meanOf16x16Blocks[5] = (meanOf8x8Blocks[18] + meanOf8x8Blocks[19] + meanOf8x8Blocks[26] + meanOf8x8Blocks[27]) >> 2;
888 meanOf16x16Blocks[6] = (meanOf8x8Blocks[20] + meanOf8x8Blocks[21] + meanOf8x8Blocks[28] + meanOf8x8Blocks[29]) >> 2;
889 meanOf16x16Blocks[7] = (meanOf8x8Blocks[22] + meanOf8x8Blocks[23] + meanOf8x8Blocks[30] + meanOf8x8Blocks[31]) >> 2;
890
891 meanOf16x16Blocks[8] = (meanOf8x8Blocks[32] + meanOf8x8Blocks[33] + meanOf8x8Blocks[40] + meanOf8x8Blocks[41]) >> 2;
892 meanOf16x16Blocks[9] = (meanOf8x8Blocks[34] + meanOf8x8Blocks[35] + meanOf8x8Blocks[42] + meanOf8x8Blocks[43]) >> 2;
893 meanOf16x16Blocks[10] = (meanOf8x8Blocks[36] + meanOf8x8Blocks[37] + meanOf8x8Blocks[44] + meanOf8x8Blocks[45]) >> 2;
894 meanOf16x16Blocks[11] = (meanOf8x8Blocks[38] + meanOf8x8Blocks[39] + meanOf8x8Blocks[46] + meanOf8x8Blocks[47]) >> 2;
895
896 meanOf16x16Blocks[12] = (meanOf8x8Blocks[48] + meanOf8x8Blocks[49] + meanOf8x8Blocks[56] + meanOf8x8Blocks[57]) >> 2;
897 meanOf16x16Blocks[13] = (meanOf8x8Blocks[50] + meanOf8x8Blocks[51] + meanOf8x8Blocks[58] + meanOf8x8Blocks[59]) >> 2;
898 meanOf16x16Blocks[14] = (meanOf8x8Blocks[52] + meanOf8x8Blocks[53] + meanOf8x8Blocks[60] + meanOf8x8Blocks[61]) >> 2;
899 meanOf16x16Blocks[15] = (meanOf8x8Blocks[54] + meanOf8x8Blocks[55] + meanOf8x8Blocks[62] + meanOf8x8Blocks[63]) >> 2;
900
901 meanOf16x16SquaredValuesBlocks[0] = (meanOf8x8SquaredValuesBlocks[0] + meanOf8x8SquaredValuesBlocks[1] + meanOf8x8SquaredValuesBlocks[8] + meanOf8x8SquaredValuesBlocks[9]) >> 2;
902 meanOf16x16SquaredValuesBlocks[1] = (meanOf8x8SquaredValuesBlocks[2] + meanOf8x8SquaredValuesBlocks[3] + meanOf8x8SquaredValuesBlocks[10] + meanOf8x8SquaredValuesBlocks[11]) >> 2;
903 meanOf16x16SquaredValuesBlocks[2] = (meanOf8x8SquaredValuesBlocks[4] + meanOf8x8SquaredValuesBlocks[5] + meanOf8x8SquaredValuesBlocks[12] + meanOf8x8SquaredValuesBlocks[13]) >> 2;
904 meanOf16x16SquaredValuesBlocks[3] = (meanOf8x8SquaredValuesBlocks[6] + meanOf8x8SquaredValuesBlocks[7] + meanOf8x8SquaredValuesBlocks[14] + meanOf8x8SquaredValuesBlocks[15]) >> 2;
905
906 meanOf16x16SquaredValuesBlocks[4] = (meanOf8x8SquaredValuesBlocks[16] + meanOf8x8SquaredValuesBlocks[17] + meanOf8x8SquaredValuesBlocks[24] + meanOf8x8SquaredValuesBlocks[25]) >> 2;
907 meanOf16x16SquaredValuesBlocks[5] = (meanOf8x8SquaredValuesBlocks[18] + meanOf8x8SquaredValuesBlocks[19] + meanOf8x8SquaredValuesBlocks[26] + meanOf8x8SquaredValuesBlocks[27]) >> 2;
908 meanOf16x16SquaredValuesBlocks[6] = (meanOf8x8SquaredValuesBlocks[20] + meanOf8x8SquaredValuesBlocks[21] + meanOf8x8SquaredValuesBlocks[28] + meanOf8x8SquaredValuesBlocks[29]) >> 2;
909 meanOf16x16SquaredValuesBlocks[7] = (meanOf8x8SquaredValuesBlocks[22] + meanOf8x8SquaredValuesBlocks[23] + meanOf8x8SquaredValuesBlocks[30] + meanOf8x8SquaredValuesBlocks[31]) >> 2;
910
911 meanOf16x16SquaredValuesBlocks[8] = (meanOf8x8SquaredValuesBlocks[32] + meanOf8x8SquaredValuesBlocks[33] + meanOf8x8SquaredValuesBlocks[40] + meanOf8x8SquaredValuesBlocks[41]) >> 2;
912 meanOf16x16SquaredValuesBlocks[9] = (meanOf8x8SquaredValuesBlocks[34] + meanOf8x8SquaredValuesBlocks[35] + meanOf8x8SquaredValuesBlocks[42] + meanOf8x8SquaredValuesBlocks[43]) >> 2;
913 meanOf16x16SquaredValuesBlocks[10] = (meanOf8x8SquaredValuesBlocks[36] + meanOf8x8SquaredValuesBlocks[37] + meanOf8x8SquaredValuesBlocks[44] + meanOf8x8SquaredValuesBlocks[45]) >> 2;
914 meanOf16x16SquaredValuesBlocks[11] = (meanOf8x8SquaredValuesBlocks[38] + meanOf8x8SquaredValuesBlocks[39] + meanOf8x8SquaredValuesBlocks[46] + meanOf8x8SquaredValuesBlocks[47]) >> 2;
915
916 meanOf16x16SquaredValuesBlocks[12] = (meanOf8x8SquaredValuesBlocks[48] + meanOf8x8SquaredValuesBlocks[49] + meanOf8x8SquaredValuesBlocks[56] + meanOf8x8SquaredValuesBlocks[57]) >> 2;
917 meanOf16x16SquaredValuesBlocks[13] = (meanOf8x8SquaredValuesBlocks[50] + meanOf8x8SquaredValuesBlocks[51] + meanOf8x8SquaredValuesBlocks[58] + meanOf8x8SquaredValuesBlocks[59]) >> 2;
918 meanOf16x16SquaredValuesBlocks[14] = (meanOf8x8SquaredValuesBlocks[52] + meanOf8x8SquaredValuesBlocks[53] + meanOf8x8SquaredValuesBlocks[60] + meanOf8x8SquaredValuesBlocks[61]) >> 2;
919 meanOf16x16SquaredValuesBlocks[15] = (meanOf8x8SquaredValuesBlocks[54] + meanOf8x8SquaredValuesBlocks[55] + meanOf8x8SquaredValuesBlocks[62] + meanOf8x8SquaredValuesBlocks[63]) >> 2;
920
921 // 32x32
922 meanOf32x32Blocks[0] = (meanOf16x16Blocks[0] + meanOf16x16Blocks[1] + meanOf16x16Blocks[4] + meanOf16x16Blocks[5]) >> 2;
923 meanOf32x32Blocks[1] = (meanOf16x16Blocks[2] + meanOf16x16Blocks[3] + meanOf16x16Blocks[6] + meanOf16x16Blocks[7]) >> 2;
924 meanOf32x32Blocks[2] = (meanOf16x16Blocks[8] + meanOf16x16Blocks[9] + meanOf16x16Blocks[12] + meanOf16x16Blocks[13]) >> 2;
925 meanOf32x32Blocks[3] = (meanOf16x16Blocks[10] + meanOf16x16Blocks[11] + meanOf16x16Blocks[14] + meanOf16x16Blocks[15]) >> 2;
926
927 meanOf32x32SquaredValuesBlocks[0] = (meanOf16x16SquaredValuesBlocks[0] + meanOf16x16SquaredValuesBlocks[1] + meanOf16x16SquaredValuesBlocks[4] + meanOf16x16SquaredValuesBlocks[5]) >> 2;
928 meanOf32x32SquaredValuesBlocks[1] = (meanOf16x16SquaredValuesBlocks[2] + meanOf16x16SquaredValuesBlocks[3] + meanOf16x16SquaredValuesBlocks[6] + meanOf16x16SquaredValuesBlocks[7]) >> 2;
929 meanOf32x32SquaredValuesBlocks[2] = (meanOf16x16SquaredValuesBlocks[8] + meanOf16x16SquaredValuesBlocks[9] + meanOf16x16SquaredValuesBlocks[12] + meanOf16x16SquaredValuesBlocks[13]) >> 2;
930 meanOf32x32SquaredValuesBlocks[3] = (meanOf16x16SquaredValuesBlocks[10] + meanOf16x16SquaredValuesBlocks[11] + meanOf16x16SquaredValuesBlocks[14] + meanOf16x16SquaredValuesBlocks[15]) >> 2;
931
932
933 variance32x32[0] = meanOf32x32SquaredValuesBlocks[0] - (meanOf32x32Blocks[0] * meanOf32x32Blocks[0]);
934 variance32x32[1] = meanOf32x32SquaredValuesBlocks[1] - (meanOf32x32Blocks[1] * meanOf32x32Blocks[1]);
935 variance32x32[2] = meanOf32x32SquaredValuesBlocks[2] - (meanOf32x32Blocks[2] * meanOf32x32Blocks[2]);
936 variance32x32[3] = meanOf32x32SquaredValuesBlocks[3] - (meanOf32x32Blocks[3] * meanOf32x32Blocks[3]);
937
938
939 // 64x64
940 meanOf64x64Blocks = (meanOf32x32Blocks[0] + meanOf32x32Blocks[1] + meanOf32x32Blocks[2] + meanOf32x32Blocks[3]) >> 2;
941 meanOf64x64SquaredValuesBlocks = (meanOf32x32SquaredValuesBlocks[0] + meanOf32x32SquaredValuesBlocks[1] + meanOf32x32SquaredValuesBlocks[2] + meanOf32x32SquaredValuesBlocks[3]) >> 2;
942
943 return (meanOf64x64SquaredValuesBlocks - (meanOf64x64Blocks * meanOf64x64Blocks));
944 }
945
946
947
/*
 * getFilteredTypes
 *   Applies one of seven 3x3 smoothing kernels centred on *ptr and returns
 *   the filtered 8-bit sample.
 *
 *   ptr              - centre sample. Depending on the kernel, the four
 *                      direct neighbours or all eight surrounding samples
 *                      are read, so the caller must guarantee that
 *                      ptr -/+ 1 and ptr -/+ stride are readable.
 *   stride           - distance, in samples, between vertically adjacent rows.
 *   EbHevcFilterType - kernel selector:
 *                        0: cross,  centre 4, arms 1      (divide by 8)
 *                        1: cross,  centre 4, arms 2      (fixed-point divide by 12)
 *                        2: cross,  all taps 4            (divide by 20)
 *                        3: square, centre 4, others 1    (divide by 12)
 *                        4: 1-2-1 / 2-4-2 / 1-2-1         (divide by 16)
 *                        5: square, centre 4, others 2    (divide by 20)
 *                        6: square, all taps 4            (divide by 36)
 *                      Any other value produces 0.
 *
 *   Returns the weighted average clipped to [0, 255].
 */
static EB_U8 getFilteredTypes(EB_U8 *ptr,
    EB_U32 stride,
    EB_U8 EbHevcFilterType)
{
    // Left-most sample of each row of the 3x3 window centred on ptr
    EB_U8 *rowAbove  = ptr - 1 - stride;
    EB_U8 *rowCentre = rowAbove + stride;
    EB_U8 *rowBelow  = rowAbove + 2 * stride;

    EB_U32 filtered = 0;

    switch (EbHevcFilterType) {
    case 0:
        // Luma
        filtered = (rowAbove[1] +
                    rowCentre[0] + 4 * rowCentre[1] + rowCentre[2] +
                    rowBelow[1]) / 8;
        break;

    case 1:
        filtered = (2 * rowAbove[1] +
                    2 * rowCentre[0] + 4 * rowCentre[1] + 2 * rowCentre[2] +
                    2 * rowBelow[1]);

        // Fixed-point form of filtered / 12, written as a rounded
        // (x * 2730) >> 15 to mimic the x86 instruction _mm256_mulhrs_epi16.
        filtered = (((EB_U32)((filtered * 2730) >> 14) + 1) >> 1) & 0xFFFF;
        break;

    case 2:
        filtered = (4 * rowAbove[1] +
                    4 * rowCentre[0] + 4 * rowCentre[1] + 4 * rowCentre[2] +
                    4 * rowBelow[1]) / 20;
        break;

    case 3:
        filtered = (1 * rowAbove[0]  + 1 * rowAbove[1]  + 1 * rowAbove[2] +
                    1 * rowCentre[0] + 4 * rowCentre[1] + 1 * rowCentre[2] +
                    1 * rowBelow[0]  + 1 * rowBelow[1]  + 1 * rowBelow[2]) / 12;
        break;

    case 4:
        // Gaussian matrix
        filtered = (1 * rowAbove[0]  + 2 * rowAbove[1]  + 1 * rowAbove[2] +
                    2 * rowCentre[0] + 4 * rowCentre[1] + 2 * rowCentre[2] +
                    1 * rowBelow[0]  + 2 * rowBelow[1]  + 1 * rowBelow[2]) / 16;
        break;

    case 5:
        filtered = (2 * rowAbove[0]  + 2 * rowAbove[1]  + 2 * rowAbove[2] +
                    2 * rowCentre[0] + 4 * rowCentre[1] + 2 * rowCentre[2] +
                    2 * rowBelow[0]  + 2 * rowBelow[1]  + 2 * rowBelow[2]) / 20;
        break;

    case 6:
        filtered = (4 * rowAbove[0]  + 4 * rowAbove[1]  + 4 * rowAbove[2] +
                    4 * rowCentre[0] + 4 * rowCentre[1] + 4 * rowCentre[2] +
                    4 * rowBelow[0]  + 4 * rowBelow[1]  + 4 * rowBelow[2]) / 36;
        break;

    default:
        // Unknown selector: the result stays 0
        break;
    }

    return (EB_U8)CLIP3EQ(0, 255, filtered);
}
1014
1015
1016 /*******************************************
1017 * noiseExtractLumaStrong
1018 * strong filter Luma.
1019 *******************************************/
/*
 * Writes a smoothed copy of one band of luma rows into denoisedPicturePtr.
 *
 *   inputPicturePtr    - source picture (bufferY is read).
 *   denoisedPicturePtr - destination picture (bufferY is written).
 *   lcuOriginY         - first picture row of the band; the band is
 *                        MIN(MAX_LCU_SIZE, height - lcuOriginY) rows tall.
 *   lcuOriginX         - only selects the first column: when
 *                        lcuOriginX + MAX_LCU_SIZE exceeds the picture width
 *                        the band starts at lcuOriginX, otherwise at column 0.
 *                        In both cases it runs to the last picture column.
 *
 * Samples with a full 3x3 neighbourhood inside the picture are filtered with
 * kernel 4 of getFilteredTypes (1-2-1 / 2-4-2 / 1-2-1). Samples on the first
 * or last picture row, or the first or last picture column, are copied
 * unchanged.
 */
void noiseExtractLumaStrong(
    EbPictureBufferDesc_t *inputPicturePtr,
    EbPictureBufferDesc_t *denoisedPicturePtr,
    EB_U32                 lcuOriginY,
    EB_U32                 lcuOriginX)
{
    const EB_U32 firstColumn = (lcuOriginX + MAX_LCU_SIZE > inputPicturePtr->width) ? lcuOriginX : 0;

    const EB_U32 lumaHeight = inputPicturePtr->height;
    const EB_U32 lumaWidth  = inputPicturePtr->width;
    const EB_U32 rowCount   = MIN(MAX_LCU_SIZE, lumaHeight - lcuOriginY);

    // Source: top-left sample of the band inside the padded input buffer
    const EB_U32 srcStride = inputPicturePtr->strideY;
    const EB_U32 srcOrigin = inputPicturePtr->originX + (inputPicturePtr->originY + lcuOriginY) * inputPicturePtr->strideY;
    EB_U8       *src       = &(inputPicturePtr->bufferY[srcOrigin]);

    // Destination: same position inside the denoised buffer
    const EB_U32 dstOrigin = denoisedPicturePtr->originX + (denoisedPicturePtr->originY + lcuOriginY) * denoisedPicturePtr->strideY;
    const EB_U32 dstStride = denoisedPicturePtr->strideY;
    EB_U8       *dst       = &(denoisedPicturePtr->bufferY[dstOrigin]);

    EB_U32 row;
    EB_U32 col;

    for (row = 0; row < rowCount; ++row) {
        const EB_U32 srcRow = row * srcStride;
        const EB_U32 dstRow = row * dstStride;

        // A row can be filtered only if picture rows exist above and below it
        const int hasRowAbove = (row > 0 || lcuOriginY > 0);
        const int hasRowBelow = (row < rowCount - 1 || lcuOriginY + rowCount < lumaHeight);

        for (col = firstColumn; col < lumaWidth; ++col) {
            if (hasRowAbove && hasRowBelow && col > 0 && col < lumaWidth - 1) {
                dst[col + dstRow] = getFilteredTypes(&src[col + srcRow], srcStride, 4);
            }
            else {
                // Picture border: no complete 3x3 window, pass the sample through
                dst[col + dstRow] = src[col + srcRow];
            }
        }
    }
}
1072
1073 /*******************************************
1074 * noiseExtractChromaStrong
1075 * strong filter chroma.
1076 *******************************************/
/*
 * Writes a smoothed copy of one band of Cb and Cr rows into
 * denoisedPicturePtr, Cb first and then Cr.
 *
 *   inputPicturePtr    - source picture (bufferCb / bufferCr are read).
 *   denoisedPicturePtr - destination picture (bufferCb / bufferCr are written).
 *   lcuOriginY         - first row of the band. It is added to the
 *                        sub-sampled originY and subtracted from the
 *                        sub-sampled height without further scaling, so it is
 *                        used as a chroma-plane row.
 *   lcuOriginX         - only selects the first column (see the note below).
 *
 * The chroma width is the luma width shifted right by 0 for EB_YUV444 and by
 * 1 otherwise; the chroma height is shifted right by 0 when
 * colorFormat >= EB_YUV422 and by 1 otherwise.
 *
 * Samples with a full 3x3 neighbourhood inside the chroma plane are filtered
 * with kernel 6 of getFilteredTypes (3x3 box average). Samples on the first
 * or last plane row, or the first or last plane column, are copied unchanged.
 */
void noiseExtractChromaStrong(
    EbPictureBufferDesc_t *inputPicturePtr,
    EbPictureBufferDesc_t *denoisedPicturePtr,
    EB_U32                 lcuOriginY,
    EB_U32                 lcuOriginX)
{
    // NOTE(review): this test uses the luma width while the column loop below
    // is bounded by the chroma width - confirm against the caller which
    // coordinate space lcuOriginX is expressed in.
    const EB_U32 firstColumn = (lcuOriginX + MAX_LCU_SIZE > inputPicturePtr->width) ? lcuOriginX : 0;

    const EB_U32 colorFormat      = inputPicturePtr->colorFormat;
    const EB_U16 subWidthCMinus1  = (colorFormat == EB_YUV444 ? 1 : 2) - 1;
    const EB_U16 subHeightCMinus1 = (colorFormat >= EB_YUV422 ? 1 : 2) - 1;

    // Plane geometry is the same for Cb and Cr
    const EB_U32 chromaHeight = inputPicturePtr->height >> subHeightCMinus1;
    const EB_U32 chromaWidth  = inputPicturePtr->width >> subWidthCMinus1;
    const EB_U32 rowCount     = MIN(MAX_LCU_SIZE >> subHeightCMinus1, chromaHeight - lcuOriginY);

    // Index 0 is Cb, index 1 is Cr
    EB_U8  *srcPlane[2];
    EB_U8  *dstPlane[2];
    EB_U32  srcStride[2];
    EB_U32  dstStride[2];

    EB_U32  srcOrigin;
    EB_U32  dstOrigin;
    EB_U32  plane;
    EB_U32  row;
    EB_U32  col;

    // Cb
    srcStride[0] = inputPicturePtr->strideCb;
    srcOrigin    = (inputPicturePtr->originX >> subWidthCMinus1) + ((inputPicturePtr->originY >> subHeightCMinus1) + lcuOriginY) * inputPicturePtr->strideCb;
    srcPlane[0]  = &(inputPicturePtr->bufferCb[srcOrigin]);

    dstOrigin    = (denoisedPicturePtr->originX >> subWidthCMinus1) + ((denoisedPicturePtr->originY >> subHeightCMinus1) + lcuOriginY) * denoisedPicturePtr->strideCb;
    dstStride[0] = denoisedPicturePtr->strideCb;
    dstPlane[0]  = &(denoisedPicturePtr->bufferCb[dstOrigin]);

    // Cr
    srcStride[1] = inputPicturePtr->strideCr;
    srcOrigin    = (inputPicturePtr->originX >> subWidthCMinus1) + ((inputPicturePtr->originY >> subHeightCMinus1) + lcuOriginY) * inputPicturePtr->strideCr;
    srcPlane[1]  = &(inputPicturePtr->bufferCr[srcOrigin]);

    dstOrigin    = (denoisedPicturePtr->originX >> subWidthCMinus1) + ((denoisedPicturePtr->originY >> subHeightCMinus1) + lcuOriginY) * denoisedPicturePtr->strideCr;
    dstStride[1] = denoisedPicturePtr->strideCr;
    dstPlane[1]  = &(denoisedPicturePtr->bufferCr[dstOrigin]);

    for (plane = 0; plane < 2; ++plane) {
        EB_U8       *src      = srcPlane[plane];
        EB_U8       *dst      = dstPlane[plane];
        const EB_U32 strideIn = srcStride[plane];
        const EB_U32 strideOut = dstStride[plane];

        for (row = 0; row < rowCount; ++row) {
            const EB_U32 srcRow = row * strideIn;
            const EB_U32 dstRow = row * strideOut;

            // A row can be filtered only if plane rows exist above and below it
            const int hasRowAbove = (row > 0 || lcuOriginY > 0);
            const int hasRowBelow = (row < rowCount - 1 || (lcuOriginY + rowCount) < chromaHeight);

            for (col = firstColumn; col < chromaWidth; ++col) {
                if (hasRowAbove && hasRowBelow && col > 0 && col < chromaWidth - 1) {
                    dst[col + dstRow] = getFilteredTypes(&src[col + srcRow], strideIn, 6);
                }
                else {
                    // Plane border: no complete 3x3 window, pass the sample through
                    dst[col + dstRow] = src[col + srcRow];
                }
            }
        }
    }
}
1161
1162 /*******************************************
1163 * noiseExtractChromaWeak
1164 * weak filter chroma.
1165 *******************************************/
/*
 * Writes a lightly smoothed copy of one band of Cb and Cr rows into
 * denoisedPicturePtr, Cb first and then Cr.
 *
 *   inputPicturePtr    - source picture (bufferCb / bufferCr are read).
 *   denoisedPicturePtr - destination picture (bufferCb / bufferCr are written).
 *   lcuOriginY         - first row of the band. It is added to the
 *                        sub-sampled originY and subtracted from the
 *                        sub-sampled height without further scaling, so it is
 *                        used as a chroma-plane row.
 *   lcuOriginX         - only selects the first column (see the note below).
 *
 * The chroma width is the luma width shifted right by 0 for EB_YUV444 and by
 * 1 otherwise; the chroma height is shifted right by 0 when
 * colorFormat >= EB_YUV422 and by 1 otherwise.
 *
 * Samples with a full 3x3 neighbourhood inside the chroma plane are filtered
 * with kernel 4 of getFilteredTypes (1-2-1 / 2-4-2 / 1-2-1). Samples on the
 * first or last plane row, or the first or last plane column, are copied
 * unchanged.
 */
void noiseExtractChromaWeak(
    EbPictureBufferDesc_t *inputPicturePtr,
    EbPictureBufferDesc_t *denoisedPicturePtr,
    EB_U32                 lcuOriginY,
    EB_U32                 lcuOriginX)
{
    // NOTE(review): this test uses the luma width while the column loop below
    // is bounded by the chroma width - confirm against the caller which
    // coordinate space lcuOriginX is expressed in.
    const EB_U32 startColumn = (lcuOriginX + MAX_LCU_SIZE > inputPicturePtr->width) ? lcuOriginX : 0;

    const EB_U32 colorFormat      = inputPicturePtr->colorFormat;
    const EB_U16 subWidthCMinus1  = (colorFormat == EB_YUV444 ? 1 : 2) - 1;
    const EB_U16 subHeightCMinus1 = (colorFormat >= EB_YUV422 ? 1 : 2) - 1;

    // Plane geometry is the same for Cb and Cr
    const EB_U32 planeHeight = inputPicturePtr->height >> subHeightCMinus1;
    const EB_U32 planeWidth  = inputPicturePtr->width >> subWidthCMinus1;
    const EB_U32 bandHeight  = MIN(MAX_LCU_SIZE >> subHeightCMinus1, planeHeight - lcuOriginY);

    // Index 0 is Cb, index 1 is Cr
    EB_U8  *inPlane[2];
    EB_U8  *outPlane[2];
    EB_U32  inStride[2];
    EB_U32  outStride[2];

    EB_U32  inOrigin;
    EB_U32  outOrigin;
    EB_U32  planeIdx;
    EB_U32  y;
    EB_U32  x;

    // Cb
    inStride[0]  = inputPicturePtr->strideCb;
    inOrigin     = (inputPicturePtr->originX >> subWidthCMinus1) + ((inputPicturePtr->originY >> subHeightCMinus1) + lcuOriginY) * inputPicturePtr->strideCb;
    inPlane[0]   = &(inputPicturePtr->bufferCb[inOrigin]);

    outOrigin    = (denoisedPicturePtr->originX >> subWidthCMinus1) + ((denoisedPicturePtr->originY >> subHeightCMinus1) + lcuOriginY) * denoisedPicturePtr->strideCb;
    outStride[0] = denoisedPicturePtr->strideCb;
    outPlane[0]  = &(denoisedPicturePtr->bufferCb[outOrigin]);

    // Cr
    inStride[1]  = inputPicturePtr->strideCr;
    inOrigin     = (inputPicturePtr->originX >> subWidthCMinus1) + ((inputPicturePtr->originY >> subHeightCMinus1) + lcuOriginY) * inputPicturePtr->strideCr;
    inPlane[1]   = &(inputPicturePtr->bufferCr[inOrigin]);

    outOrigin    = (denoisedPicturePtr->originX >> subWidthCMinus1) + ((denoisedPicturePtr->originY >> subHeightCMinus1) + lcuOriginY) * denoisedPicturePtr->strideCr;
    outStride[1] = denoisedPicturePtr->strideCr;
    outPlane[1]  = &(denoisedPicturePtr->bufferCr[outOrigin]);

    for (planeIdx = 0; planeIdx < 2; ++planeIdx) {
        EB_U8       *in        = inPlane[planeIdx];
        EB_U8       *out       = outPlane[planeIdx];
        const EB_U32 strideIn  = inStride[planeIdx];
        const EB_U32 strideOut = outStride[planeIdx];

        for (y = 0; y < bandHeight; ++y) {
            const EB_U32 inRow  = y * strideIn;
            const EB_U32 outRow = y * strideOut;

            // A row can be filtered only if plane rows exist above and below it
            const int rowAboveExists = (y > 0 || lcuOriginY > 0);
            const int rowBelowExists = (y < bandHeight - 1 || (lcuOriginY + bandHeight) < planeHeight);

            for (x = startColumn; x < planeWidth; ++x) {
                if (rowAboveExists && rowBelowExists && x > 0 && x < planeWidth - 1) {
                    out[x + outRow] = getFilteredTypes(&in[x + inRow], strideIn, 4);
                }
                else {
                    // Plane border: no complete 3x3 window, pass the sample through
                    out[x + outRow] = in[x + inRow];
                }
            }
        }
    }
}
1253
1254 /*******************************************
1255 * noiseExtractLumaWeak
1256 * weak filter Luma and store noise.
1257 *******************************************/
noiseExtractLumaWeak(EbPictureBufferDesc_t * inputPicturePtr,EbPictureBufferDesc_t * denoisedPicturePtr,EbPictureBufferDesc_t * noisePicturePtr,EB_U32 lcuOriginY,EB_U32 lcuOriginX)1258 void noiseExtractLumaWeak(
1259 EbPictureBufferDesc_t *inputPicturePtr,
1260 EbPictureBufferDesc_t *denoisedPicturePtr,
1261 EbPictureBufferDesc_t *noisePicturePtr,
1262 EB_U32 lcuOriginY
1263 , EB_U32 lcuOriginX
1264 )
1265 {
1266 EB_U32 ii, jj;
1267 EB_U32 picHeight, lcuHeight;
1268 EB_U32 picWidth;
1269 EB_U32 inputOriginIndex;
1270 EB_U32 inputOriginIndexPad;
1271 EB_U32 noiseOriginIndex;
1272
1273 EB_U8 *ptrIn;
1274 EB_U32 strideIn;
1275 EB_U8 *ptrDenoised;
1276
1277 EB_U8 *ptrNoise;
1278 EB_U32 strideOut;
1279
1280 EB_U32 idx = (lcuOriginX + MAX_LCU_SIZE > inputPicturePtr->width) ? lcuOriginX : 0;
1281
1282 //Luma
1283 {
1284 picHeight = inputPicturePtr->height;
1285 picWidth = inputPicturePtr->width;
1286 lcuHeight = MIN(MAX_LCU_SIZE, picHeight - lcuOriginY);
1287
1288 strideIn = inputPicturePtr->strideY;
1289 inputOriginIndex = inputPicturePtr->originX + (inputPicturePtr->originY + lcuOriginY) * inputPicturePtr->strideY;
1290 ptrIn = &(inputPicturePtr->bufferY[inputOriginIndex]);
1291
1292 inputOriginIndexPad = denoisedPicturePtr->originX + (denoisedPicturePtr->originY + lcuOriginY) * denoisedPicturePtr->strideY;
1293 strideOut = denoisedPicturePtr->strideY;
1294 ptrDenoised = &(denoisedPicturePtr->bufferY[inputOriginIndexPad]);
1295
1296 noiseOriginIndex = noisePicturePtr->originX + noisePicturePtr->originY * noisePicturePtr->strideY;
1297 ptrNoise = &(noisePicturePtr->bufferY[noiseOriginIndex]);
1298
1299
1300 for (jj = 0; jj < lcuHeight; jj++){
1301 for (ii = idx; ii < picWidth; ii++){
1302
1303 if ((jj>0 || lcuOriginY > 0) && (jj < lcuHeight - 1 || lcuOriginY + lcuHeight < picHeight) && ii>0 && ii < picWidth - 1){
1304
1305 ptrDenoised[ii + jj*strideOut] = getFilteredTypes(&ptrIn[ii + jj*strideIn], strideIn, 0);
1306 ptrNoise[ii + jj*strideOut] = CLIP3EQ(0, 255, ptrIn[ii + jj*strideIn] - ptrDenoised[ii + jj*strideOut]);
1307
1308 }
1309 else{
1310 ptrDenoised[ii + jj*strideOut] = ptrIn[ii + jj*strideIn];
1311 ptrNoise[ii + jj*strideOut] = 0;
1312 }
1313
1314 }
1315 }
1316 }
1317
1318 }
1319
/*******************************************
 * noiseExtractLumaWeakLcu
 *   Weak-filters the luma samples of a single LCU and stores the noise.
 *
 *   inputPicturePtr    : source picture; its luma plane is read.
 *   denoisedPicturePtr : receives the filtered luma at (lcuOriginX, lcuOriginY).
 *   noisePicturePtr    : receives CLIP3EQ(0, 255, source - filtered) at column
 *                        lcuOriginX, rows starting at the noise plane origin
 *                        (no lcuOriginY offset is applied to this plane).
 *   lcuOriginY         : first picture row of the LCU.
 *   lcuOriginX         : first picture column of the LCU.
 *
 *   The LCU is clipped to the picture (lcuWidth x lcuHeight). Samples on the
 *   outer picture border are copied unfiltered and their noise is set to 0.
 *******************************************/
void noiseExtractLumaWeakLcu(
    EbPictureBufferDesc_t *inputPicturePtr,
    EbPictureBufferDesc_t *denoisedPicturePtr,
    EbPictureBufferDesc_t *noisePicturePtr,
    EB_U32                 lcuOriginY,
    EB_U32                 lcuOriginX)
{
    const EB_U32 picHeight = inputPicturePtr->height;
    const EB_U32 picWidth  = inputPicturePtr->width;
    const EB_U32 lcuHeight = MIN(MAX_LCU_SIZE, picHeight - lcuOriginY);
    const EB_U32 lcuWidth  = MIN(MAX_LCU_SIZE, picWidth - lcuOriginX);

    // Each plane is stepped with its own stride. The noise plane used to be
    // stepped with the denoised picture's stride although its origin is
    // computed from noisePicturePtr->strideY.
    const EB_U32 strideIn       = inputPicturePtr->strideY;
    const EB_U32 strideDenoised = denoisedPicturePtr->strideY;
    const EB_U32 strideNoise    = noisePicturePtr->strideY;

    // All three base pointers are already advanced to the LCU's first column,
    // so the column index below is LCU-relative.
    const EB_U32 inputOriginIndex    = inputPicturePtr->originX + lcuOriginX + (inputPicturePtr->originY + lcuOriginY) * inputPicturePtr->strideY;
    const EB_U32 denoisedOriginIndex = denoisedPicturePtr->originX + lcuOriginX + (denoisedPicturePtr->originY + lcuOriginY) * denoisedPicturePtr->strideY;
    const EB_U32 noiseOriginIndex    = noisePicturePtr->originX + lcuOriginX + noisePicturePtr->originY * noisePicturePtr->strideY;

    EB_U8 *ptrIn       = &(inputPicturePtr->bufferY[inputOriginIndex]);
    EB_U8 *ptrDenoised = &(denoisedPicturePtr->bufferY[denoisedOriginIndex]);
    EB_U8 *ptrNoise    = &(noisePicturePtr->bufferY[noiseOriginIndex]);

    EB_U32 row, col;

    for (row = 0; row < lcuHeight; row++) {

        // A row can be filtered only if it is neither the first nor the last row of the picture.
        const EB_BOOL interiorRow =
            (row > 0 || lcuOriginY > 0) &&
            (row < lcuHeight - 1 || lcuOriginY + lcuHeight < picHeight);

        // The scan always starts at LCU column 0. It previously started at the
        // absolute lcuOriginX for an LCU crossing the right picture edge; since
        // lcuWidth <= MAX_LCU_SIZE, that made the loop empty and left such an
        // LCU unprocessed whenever lcuOriginX > 0.
        for (col = 0; col < lcuWidth; col++) {

            const EB_U32 inPos       = col + row * strideIn;
            const EB_U32 denoisedPos = col + row * strideDenoised;
            const EB_U32 noisePos    = col + row * strideNoise;

            // Column test in picture coordinates: skip the first and last picture columns.
            if (interiorRow && (col > 0 || lcuOriginX > 0) && (col + lcuOriginX) < picWidth - 1) {
                ptrDenoised[denoisedPos] = getFilteredTypes(&ptrIn[inPos], strideIn, 0);
                // EB_U8 operands promote to int, so a negative difference is clipped to 0.
                ptrNoise[noisePos] = CLIP3EQ(0, 255, ptrIn[inPos] - ptrDenoised[denoisedPos]);
            }
            else {
                ptrDenoised[denoisedPos] = ptrIn[inPos];
                ptrNoise[noisePos] = 0;
            }
        }
    }
}
1382
ZeroOutChromaBlockMean(PictureParentControlSet_t * pictureControlSetPtr,EB_U32 lcuCodingOrder)1383 static EB_ERRORTYPE ZeroOutChromaBlockMean(
1384 PictureParentControlSet_t *pictureControlSetPtr, // input parameter, Picture Control Set Ptr
1385 EB_U32 lcuCodingOrder // input parameter, LCU address
1386 )
1387 {
1388
1389 EB_ERRORTYPE return_error = EB_ErrorNone;
1390 // 16x16 mean
1391 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_0] = 0;
1392 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_1] = 0;
1393 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_2] = 0;
1394 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_3] = 0;
1395 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_4] = 0;
1396 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_5] = 0;
1397 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_6] = 0;
1398 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_7] = 0;
1399 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_8] = 0;
1400 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_9] = 0;
1401 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_10] = 0;
1402 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_11] = 0;
1403 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_12] = 0;
1404 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_13] = 0;
1405 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_14] = 0;
1406 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_15] = 0;
1407
1408 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_0] = 0;
1409 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_1] = 0;
1410 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_2] = 0;
1411 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_3] = 0;
1412 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_4] = 0;
1413 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_5] = 0;
1414 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_6] = 0;
1415 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_7] = 0;
1416 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_8] = 0;
1417 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_9] = 0;
1418 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_10] = 0;
1419 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_11] = 0;
1420 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_12] = 0;
1421 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_13] = 0;
1422 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_14] = 0;
1423 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_15] = 0;
1424
1425 // 32x32 mean
1426 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_32x32_0] = 0;
1427 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_32x32_1] = 0;
1428 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_32x32_2] = 0;
1429 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_32x32_3] = 0;
1430
1431 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_32x32_0] = 0;
1432 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_32x32_1] = 0;
1433 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_32x32_2] = 0;
1434 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_32x32_3] = 0;
1435
1436 // 64x64 mean
1437 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_64x64] = 0;
1438 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_64x64] = 0;
1439
1440 return return_error;
1441
1442 }
1443
1444 /*******************************************
1445 * ComputeChromaBlockMean
1446 * computes the chroma block mean for 64x64, 32x32 and 16x16 CUs inside the tree block
1447 *******************************************/
ComputeChromaBlockMean(PictureParentControlSet_t * pictureControlSetPtr,EbPictureBufferDesc_t * inputPaddedPicturePtr,EB_U32 lcuCodingOrder,EB_U32 inputCbOriginIndex,EB_U32 inputCrOriginIndex)1448 static EB_ERRORTYPE ComputeChromaBlockMean(
1449 PictureParentControlSet_t *pictureControlSetPtr, // input parameter, Picture Control Set Ptr
1450 EbPictureBufferDesc_t *inputPaddedPicturePtr, // input parameter, Input Padded Picture
1451 EB_U32 lcuCodingOrder, // input parameter, LCU address
1452 EB_U32 inputCbOriginIndex, // input parameter, LCU index, used to point to source/reference samples
1453 EB_U32 inputCrOriginIndex) // input parameter, LCU index, used to point to source/reference samples
1454 {
1455
1456 EB_ERRORTYPE return_error = EB_ErrorNone;
1457
1458 EB_U32 cbBlockIndex, crBlockIndex;
1459
1460 EB_U64 cbMeanOf16x16Blocks[16];
1461 EB_U64 crMeanOf16x16Blocks[16];
1462
1463 EB_U64 cbMeanOf32x32Blocks[4];
1464 EB_U64 crMeanOf32x32Blocks[4];
1465
1466 EB_U64 cbMeanOf64x64Blocks;
1467 EB_U64 crMeanOf64x64Blocks;
1468
1469
1470 // (0,0) 16x16 block
1471 cbBlockIndex = inputCbOriginIndex;
1472 crBlockIndex = inputCrOriginIndex;
1473
1474 const EB_U16 strideCb = inputPaddedPicturePtr->strideCb;
1475 const EB_U16 strideCr = inputPaddedPicturePtr->strideCr;
1476
1477 cbMeanOf16x16Blocks[0] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCb[cbBlockIndex]), strideCb);
1478 crMeanOf16x16Blocks[0] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCr[crBlockIndex]), strideCr);
1479
1480 // (0,1)
1481 cbBlockIndex = cbBlockIndex + 8;
1482 crBlockIndex = crBlockIndex + 8;
1483 cbMeanOf16x16Blocks[1] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCb[cbBlockIndex]), strideCb);
1484 crMeanOf16x16Blocks[1] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCr[crBlockIndex]), strideCr);
1485
1486 // (0,2)
1487 cbBlockIndex = cbBlockIndex + 8;
1488 crBlockIndex = crBlockIndex + 8;
1489 cbMeanOf16x16Blocks[2] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCb[cbBlockIndex]), strideCb);
1490 crMeanOf16x16Blocks[2] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCr[crBlockIndex]), strideCr);
1491
1492 // (0,3)
1493 cbBlockIndex = cbBlockIndex + 8;
1494 crBlockIndex = crBlockIndex + 8;
1495 cbMeanOf16x16Blocks[3] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCb[cbBlockIndex]), strideCb);
1496 crMeanOf16x16Blocks[3] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCr[crBlockIndex]), strideCr);
1497
1498 // (1,0)
1499 cbBlockIndex = inputCbOriginIndex + (strideCb << 3);
1500 crBlockIndex = inputCrOriginIndex + (strideCr << 3);
1501 cbMeanOf16x16Blocks[4] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCb[cbBlockIndex]), strideCb);
1502 crMeanOf16x16Blocks[4] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCr[crBlockIndex]), strideCr);
1503
1504 // (1,1)
1505 cbBlockIndex = cbBlockIndex + 8;
1506 crBlockIndex = crBlockIndex + 8;
1507 cbMeanOf16x16Blocks[5] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCb[cbBlockIndex]), strideCb);
1508 crMeanOf16x16Blocks[5] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCr[crBlockIndex]), strideCr);
1509
1510 // (1,2)
1511 cbBlockIndex = cbBlockIndex + 8;
1512 crBlockIndex = crBlockIndex + 8;
1513 cbMeanOf16x16Blocks[6] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCb[cbBlockIndex]), strideCb);
1514 crMeanOf16x16Blocks[6] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCr[crBlockIndex]), strideCr);
1515
1516 // (1,3)
1517 cbBlockIndex = cbBlockIndex + 8;
1518 crBlockIndex = crBlockIndex + 8;
1519 cbMeanOf16x16Blocks[7] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCb[cbBlockIndex]), strideCb);
1520 crMeanOf16x16Blocks[7] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCr[crBlockIndex]), strideCr);
1521
1522 // (2,0)
1523 cbBlockIndex = inputCbOriginIndex + (strideCb << 4);
1524 crBlockIndex = inputCrOriginIndex + (strideCr << 4);
1525 cbMeanOf16x16Blocks[8] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCb[cbBlockIndex]), strideCb);
1526 crMeanOf16x16Blocks[8] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCr[crBlockIndex]), strideCr);
1527
1528 // (2,1)
1529 cbBlockIndex = cbBlockIndex + 8;
1530 crBlockIndex = crBlockIndex + 8;
1531 cbMeanOf16x16Blocks[9] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCb[cbBlockIndex]), strideCb);
1532 crMeanOf16x16Blocks[9] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCr[crBlockIndex]), strideCr);
1533
1534 // (2,2)
1535 cbBlockIndex = cbBlockIndex + 8;
1536 crBlockIndex = crBlockIndex + 8;
1537 cbMeanOf16x16Blocks[10] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCb[cbBlockIndex]), strideCb);
1538 crMeanOf16x16Blocks[10] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCr[crBlockIndex]), strideCr);
1539
1540 // (2,3)
1541 cbBlockIndex = cbBlockIndex + 8;
1542 crBlockIndex = crBlockIndex + 8;
1543 cbMeanOf16x16Blocks[11] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCb[cbBlockIndex]), strideCb);
1544 crMeanOf16x16Blocks[11] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCr[crBlockIndex]), strideCr);
1545
1546 // (3,0)
1547 cbBlockIndex = inputCbOriginIndex + (strideCb * 24);
1548 crBlockIndex = inputCrOriginIndex + (strideCr * 24);
1549 cbMeanOf16x16Blocks[12] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCb[cbBlockIndex]), strideCb);
1550 crMeanOf16x16Blocks[12] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCr[crBlockIndex]), strideCr);
1551
1552 // (3,1)
1553 cbBlockIndex = cbBlockIndex + 8;
1554 crBlockIndex = crBlockIndex + 8;
1555 cbMeanOf16x16Blocks[13] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCb[cbBlockIndex]), strideCb);
1556 crMeanOf16x16Blocks[13] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCr[crBlockIndex]), strideCr);
1557
1558 // (3,2)
1559 cbBlockIndex = cbBlockIndex + 8;
1560 crBlockIndex = crBlockIndex + 8;
1561 cbMeanOf16x16Blocks[14] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCb[cbBlockIndex]), strideCb);
1562 crMeanOf16x16Blocks[14] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCr[crBlockIndex]), strideCr);
1563
1564 // (3,3)
1565 cbBlockIndex = cbBlockIndex + 8;
1566 crBlockIndex = crBlockIndex + 8;
1567 cbMeanOf16x16Blocks[15] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCb[cbBlockIndex]), strideCb);
1568 crMeanOf16x16Blocks[15] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferCr[crBlockIndex]), strideCr);
1569
1570
1571 // 32x32
1572 cbMeanOf32x32Blocks[0] = (cbMeanOf16x16Blocks[0] + cbMeanOf16x16Blocks[1] + cbMeanOf16x16Blocks[4] + cbMeanOf16x16Blocks[5]) >> 2;
1573 crMeanOf32x32Blocks[0] = (crMeanOf16x16Blocks[0] + crMeanOf16x16Blocks[1] + crMeanOf16x16Blocks[4] + crMeanOf16x16Blocks[5]) >> 2;
1574
1575 cbMeanOf32x32Blocks[1] = (cbMeanOf16x16Blocks[2] + cbMeanOf16x16Blocks[3] + cbMeanOf16x16Blocks[6] + cbMeanOf16x16Blocks[7]) >> 2;
1576 crMeanOf32x32Blocks[1] = (crMeanOf16x16Blocks[2] + crMeanOf16x16Blocks[3] + crMeanOf16x16Blocks[6] + crMeanOf16x16Blocks[7]) >> 2;
1577
1578
1579 cbMeanOf32x32Blocks[2] = (cbMeanOf16x16Blocks[8] + cbMeanOf16x16Blocks[9] + cbMeanOf16x16Blocks[12] + cbMeanOf16x16Blocks[13]) >> 2;
1580 crMeanOf32x32Blocks[2] = (crMeanOf16x16Blocks[8] + crMeanOf16x16Blocks[9] + crMeanOf16x16Blocks[12] + crMeanOf16x16Blocks[13]) >> 2;
1581
1582 cbMeanOf32x32Blocks[3] = (cbMeanOf16x16Blocks[10] + cbMeanOf16x16Blocks[11] + cbMeanOf16x16Blocks[14] + cbMeanOf16x16Blocks[15]) >> 2;
1583 crMeanOf32x32Blocks[3] = (crMeanOf16x16Blocks[10] + crMeanOf16x16Blocks[11] + crMeanOf16x16Blocks[14] + crMeanOf16x16Blocks[15]) >> 2;
1584
1585 // 64x64
1586 cbMeanOf64x64Blocks = (cbMeanOf32x32Blocks[0] + cbMeanOf32x32Blocks[1] + cbMeanOf32x32Blocks[3] + cbMeanOf32x32Blocks[3]) >> 2;
1587 crMeanOf64x64Blocks = (crMeanOf32x32Blocks[0] + crMeanOf32x32Blocks[1] + crMeanOf32x32Blocks[3] + crMeanOf32x32Blocks[3]) >> 2;
1588 // 16x16 mean
1589 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_0] = (EB_U8) (cbMeanOf16x16Blocks[0] >> MEAN_PRECISION);
1590 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_1] = (EB_U8) (cbMeanOf16x16Blocks[1] >> MEAN_PRECISION);
1591 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_2] = (EB_U8) (cbMeanOf16x16Blocks[2] >> MEAN_PRECISION);
1592 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_3] = (EB_U8) (cbMeanOf16x16Blocks[3] >> MEAN_PRECISION);
1593 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_4] = (EB_U8) (cbMeanOf16x16Blocks[4] >> MEAN_PRECISION);
1594 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_5] = (EB_U8) (cbMeanOf16x16Blocks[5] >> MEAN_PRECISION);
1595 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_6] = (EB_U8) (cbMeanOf16x16Blocks[6] >> MEAN_PRECISION);
1596 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_7] = (EB_U8) (cbMeanOf16x16Blocks[7] >> MEAN_PRECISION);
1597 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_8] = (EB_U8) (cbMeanOf16x16Blocks[8] >> MEAN_PRECISION);
1598 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_9] = (EB_U8) (cbMeanOf16x16Blocks[9] >> MEAN_PRECISION);
1599 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_10] = (EB_U8) (cbMeanOf16x16Blocks[10] >> MEAN_PRECISION);
1600 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_11] = (EB_U8) (cbMeanOf16x16Blocks[11] >> MEAN_PRECISION);
1601 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_12] = (EB_U8) (cbMeanOf16x16Blocks[12] >> MEAN_PRECISION);
1602 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_13] = (EB_U8) (cbMeanOf16x16Blocks[13] >> MEAN_PRECISION);
1603 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_14] = (EB_U8) (cbMeanOf16x16Blocks[14] >> MEAN_PRECISION);
1604 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_15] = (EB_U8) (cbMeanOf16x16Blocks[15] >> MEAN_PRECISION);
1605
1606 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_0] = (EB_U8) (crMeanOf16x16Blocks[0] >> MEAN_PRECISION);
1607 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_1] = (EB_U8) (crMeanOf16x16Blocks[1] >> MEAN_PRECISION);
1608 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_2] = (EB_U8) (crMeanOf16x16Blocks[2] >> MEAN_PRECISION);
1609 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_3] = (EB_U8) (crMeanOf16x16Blocks[3] >> MEAN_PRECISION);
1610 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_4] = (EB_U8) (crMeanOf16x16Blocks[4] >> MEAN_PRECISION);
1611 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_5] = (EB_U8) (crMeanOf16x16Blocks[5] >> MEAN_PRECISION);
1612 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_6] = (EB_U8) (crMeanOf16x16Blocks[6] >> MEAN_PRECISION);
1613 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_7] = (EB_U8) (crMeanOf16x16Blocks[7] >> MEAN_PRECISION);
1614 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_8] = (EB_U8) (crMeanOf16x16Blocks[8] >> MEAN_PRECISION);
1615 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_9] = (EB_U8) (crMeanOf16x16Blocks[9] >> MEAN_PRECISION);
1616 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_10] = (EB_U8) (crMeanOf16x16Blocks[10] >> MEAN_PRECISION);
1617 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_11] = (EB_U8) (crMeanOf16x16Blocks[11] >> MEAN_PRECISION);
1618 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_12] = (EB_U8) (crMeanOf16x16Blocks[12] >> MEAN_PRECISION);
1619 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_13] = (EB_U8) (crMeanOf16x16Blocks[13] >> MEAN_PRECISION);
1620 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_14] = (EB_U8) (crMeanOf16x16Blocks[14] >> MEAN_PRECISION);
1621 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_16x16_15] = (EB_U8) (crMeanOf16x16Blocks[15] >> MEAN_PRECISION);
1622
1623 // 32x32 mean
1624 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_32x32_0] = (EB_U8) (cbMeanOf32x32Blocks[0] >> MEAN_PRECISION);
1625 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_32x32_1] = (EB_U8) (cbMeanOf32x32Blocks[1] >> MEAN_PRECISION);
1626 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_32x32_2] = (EB_U8) (cbMeanOf32x32Blocks[2] >> MEAN_PRECISION);
1627 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_32x32_3] = (EB_U8) (cbMeanOf32x32Blocks[3] >> MEAN_PRECISION);
1628
1629 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_32x32_0] = (EB_U8)(crMeanOf32x32Blocks[0] >> MEAN_PRECISION);
1630 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_32x32_1] = (EB_U8)(crMeanOf32x32Blocks[1] >> MEAN_PRECISION);
1631 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_32x32_2] = (EB_U8)(crMeanOf32x32Blocks[2] >> MEAN_PRECISION);
1632 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_32x32_3] = (EB_U8)(crMeanOf32x32Blocks[3] >> MEAN_PRECISION);
1633
1634 // 64x64 mean
1635 pictureControlSetPtr->cbMean[lcuCodingOrder][ME_TIER_ZERO_PU_64x64] = (EB_U8) (cbMeanOf64x64Blocks >> MEAN_PRECISION);
1636 pictureControlSetPtr->crMean[lcuCodingOrder][ME_TIER_ZERO_PU_64x64] = (EB_U8) (crMeanOf64x64Blocks >> MEAN_PRECISION);
1637
1638 return return_error;
1639 }
1640
1641
1642 /*******************************************
1643 * ComputeBlockMeanComputeVariance
1644 * computes the variance and the block mean of all CUs inside the tree block
1645 *******************************************/
ComputeBlockMeanComputeVariance(PictureParentControlSet_t * pictureControlSetPtr,EbPictureBufferDesc_t * inputPaddedPicturePtr,EB_U32 lcuIndex,EB_U32 inputLumaOriginIndex)1646 static EB_ERRORTYPE ComputeBlockMeanComputeVariance(
1647 PictureParentControlSet_t *pictureControlSetPtr, // input parameter, Picture Control Set Ptr
1648 EbPictureBufferDesc_t *inputPaddedPicturePtr, // input parameter, Input Padded Picture
1649 EB_U32 lcuIndex, // input parameter, LCU address
1650 EB_U32 inputLumaOriginIndex) // input parameter, LCU index, used to point to source/reference samples
1651 {
1652
1653 EB_ERRORTYPE return_error = EB_ErrorNone;
1654
1655 EB_U32 blockIndex;
1656
1657 EB_U64 meanOf8x8Blocks[64];
1658 EB_U64 meanOf8x8SquaredValuesBlocks[64];
1659
1660 EB_U64 meanOf16x16Blocks[16];
1661 EB_U64 meanOf16x16SquaredValuesBlocks[16];
1662
1663 EB_U64 meanOf32x32Blocks[4];
1664 EB_U64 meanOf32x32SquaredValuesBlocks[4];
1665
1666 EB_U64 meanOf64x64Blocks;
1667 EB_U64 meanOf64x64SquaredValuesBlocks;
1668
1669 if (pictureControlSetPtr->disableVarianceFlag) {
1670 memset16bit(pictureControlSetPtr->variance[lcuIndex], 125, MAX_ME_PU_COUNT);
1671 EB_MEMSET(pictureControlSetPtr->yMean[lcuIndex], 125, sizeof(EB_U8) * MAX_ME_PU_COUNT);
1672
1673 }
1674 else {
1675
1676 // (0,0)
1677 blockIndex = inputLumaOriginIndex;
1678
1679 const EB_U16 strideY = inputPaddedPicturePtr->strideY;
1680
1681 if (!!(ASM_TYPES & AVX2_MASK)){
1682
1683 ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[0], &meanOf8x8SquaredValuesBlocks[0]);
1684
1685 // (0,1)
1686 blockIndex = blockIndex + 32;
1687
1688
1689 ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[4], &meanOf8x8SquaredValuesBlocks[4]);
1690
1691 // (0,5)
1692 blockIndex = blockIndex + 24;
1693
1694 // (1,0)
1695 blockIndex = inputLumaOriginIndex + (strideY << 3);
1696
1697 ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[8], &meanOf8x8SquaredValuesBlocks[8]);
1698
1699 // (1,1)
1700 blockIndex = blockIndex + 32;
1701
1702 ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[12], &meanOf8x8SquaredValuesBlocks[12]);
1703
1704 // (1,5)
1705 blockIndex = blockIndex + 24;
1706
1707 // (2,0)
1708 blockIndex = inputLumaOriginIndex + (strideY << 4);
1709
1710 ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[16], &meanOf8x8SquaredValuesBlocks[16]);
1711
1712 // (2,1)
1713 blockIndex = blockIndex + 32;
1714
1715 ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[20], &meanOf8x8SquaredValuesBlocks[20]);
1716
1717 // (2,5)
1718 blockIndex = blockIndex + 24;
1719
1720 // (3,0)
1721 blockIndex = inputLumaOriginIndex + (strideY << 3) + (strideY << 4);
1722
1723
1724 ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[24], &meanOf8x8SquaredValuesBlocks[24]);
1725
1726 // (3,1)
1727 blockIndex = blockIndex + 32;
1728
1729 ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[28], &meanOf8x8SquaredValuesBlocks[28]);
1730
1731 // (3,5)
1732 blockIndex = blockIndex + 24;
1733
1734 // (4,0)
1735 blockIndex = inputLumaOriginIndex + (strideY << 5);
1736
1737 ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[32], &meanOf8x8SquaredValuesBlocks[32]);
1738
1739 // (4,1)
1740 blockIndex = blockIndex + 32;
1741
1742 ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[36], &meanOf8x8SquaredValuesBlocks[36]);
1743
1744 // (4,5)
1745 blockIndex = blockIndex + 24;
1746
1747 // (5,0)
1748 blockIndex = inputLumaOriginIndex + (strideY << 3) + (strideY << 5);
1749
1750 ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[40], &meanOf8x8SquaredValuesBlocks[40]);
1751
1752 // (5,1)
1753 blockIndex = blockIndex + 32;
1754
1755 ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[44], &meanOf8x8SquaredValuesBlocks[44]);
1756
1757 // (5,5)
1758 blockIndex = blockIndex + 24;
1759
1760 // (6,0)
1761 blockIndex = inputLumaOriginIndex + (strideY << 4) + (strideY << 5);
1762
1763 ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[48], &meanOf8x8SquaredValuesBlocks[48]);
1764
1765 // (6,1)
1766 blockIndex = blockIndex + 32;
1767
1768 ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[52], &meanOf8x8SquaredValuesBlocks[52]);
1769
1770 // (6,5)
1771 blockIndex = blockIndex + 24;
1772
1773
1774 // (7,0)
1775 blockIndex = inputLumaOriginIndex + (strideY << 3) + (strideY << 4) + (strideY << 5);
1776
1777 ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[56], &meanOf8x8SquaredValuesBlocks[56]);
1778
1779
1780 // (7,1)
1781 blockIndex = blockIndex + 32;
1782
1783 ComputeIntermVarFour8x8_AVX2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY, &meanOf8x8Blocks[60], &meanOf8x8SquaredValuesBlocks[60]);
1784
1785
1786 }
1787 else{
1788 meanOf8x8Blocks[0] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1789 meanOf8x8SquaredValuesBlocks[0] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1790
1791 // (0,1)
1792 blockIndex = blockIndex + 8;
1793 meanOf8x8Blocks[1] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1794 meanOf8x8SquaredValuesBlocks[1] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1795
1796 // (0,2)
1797 blockIndex = blockIndex + 8;
1798 meanOf8x8Blocks[2] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1799 meanOf8x8SquaredValuesBlocks[2] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1800
1801 // (0,3)
1802 blockIndex = blockIndex + 8;
1803 meanOf8x8Blocks[3] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1804 meanOf8x8SquaredValuesBlocks[3] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1805
1806 // (0,4)
1807 blockIndex = blockIndex + 8;
1808 meanOf8x8Blocks[4] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1809 meanOf8x8SquaredValuesBlocks[4] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1810
1811 // (0,5)
1812 blockIndex = blockIndex + 8;
1813 meanOf8x8Blocks[5] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1814 meanOf8x8SquaredValuesBlocks[5] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1815
1816 // (0,6)
1817 blockIndex = blockIndex + 8;
1818 meanOf8x8Blocks[6] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1819 meanOf8x8SquaredValuesBlocks[6] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1820
1821 // (0,7)
1822 blockIndex = blockIndex + 8;
1823 meanOf8x8Blocks[7] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1824 meanOf8x8SquaredValuesBlocks[7] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1825
1826 // (1,0)
1827 blockIndex = inputLumaOriginIndex + (strideY << 3);
1828 meanOf8x8Blocks[8] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1829 meanOf8x8SquaredValuesBlocks[8] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1830
1831 // (1,1)
1832 blockIndex = blockIndex + 8;
1833 meanOf8x8Blocks[9] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1834 meanOf8x8SquaredValuesBlocks[9] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1835
1836 // (1,2)
1837 blockIndex = blockIndex + 8;
1838 meanOf8x8Blocks[10] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1839 meanOf8x8SquaredValuesBlocks[10] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1840
1841 // (1,3)
1842 blockIndex = blockIndex + 8;
1843 meanOf8x8Blocks[11] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1844 meanOf8x8SquaredValuesBlocks[11] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1845
1846 // (1,4)
1847 blockIndex = blockIndex + 8;
1848 meanOf8x8Blocks[12] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1849 meanOf8x8SquaredValuesBlocks[12] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1850
1851 // (1,5)
1852 blockIndex = blockIndex + 8;
1853 meanOf8x8Blocks[13] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1854 meanOf8x8SquaredValuesBlocks[13] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1855
1856 // (1,6)
1857 blockIndex = blockIndex + 8;
1858 meanOf8x8Blocks[14] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1859 meanOf8x8SquaredValuesBlocks[14] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1860
1861 // (1,7)
1862 blockIndex = blockIndex + 8;
1863 meanOf8x8Blocks[15] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1864 meanOf8x8SquaredValuesBlocks[15] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1865
1866 // (2,0)
1867 blockIndex = inputLumaOriginIndex + (strideY << 4);
1868 meanOf8x8Blocks[16] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1869 meanOf8x8SquaredValuesBlocks[16] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1870
1871 // (2,1)
1872 blockIndex = blockIndex + 8;
1873 meanOf8x8Blocks[17] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1874 meanOf8x8SquaredValuesBlocks[17] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1875
1876 // (2,2)
1877 blockIndex = blockIndex + 8;
1878 meanOf8x8Blocks[18] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1879 meanOf8x8SquaredValuesBlocks[18] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1880
1881 // (2,3)
1882 blockIndex = blockIndex + 8;
1883 meanOf8x8Blocks[19] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1884 meanOf8x8SquaredValuesBlocks[19] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1885
1886 /// (2,4)
1887 blockIndex = blockIndex + 8;
1888 meanOf8x8Blocks[20] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1889 meanOf8x8SquaredValuesBlocks[20] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1890
1891 // (2,5)
1892 blockIndex = blockIndex + 8;
1893 meanOf8x8Blocks[21] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1894 meanOf8x8SquaredValuesBlocks[21] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1895
1896 // (2,6)
1897 blockIndex = blockIndex + 8;
1898 meanOf8x8Blocks[22] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1899 meanOf8x8SquaredValuesBlocks[22] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1900
1901 // (2,7)
1902 blockIndex = blockIndex + 8;
1903 meanOf8x8Blocks[23] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1904 meanOf8x8SquaredValuesBlocks[23] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1905
1906 // (3,0)
1907 blockIndex = inputLumaOriginIndex + (strideY << 3) + (strideY << 4);
1908 meanOf8x8Blocks[24] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1909 meanOf8x8SquaredValuesBlocks[24] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1910
1911 // (3,1)
1912 blockIndex = blockIndex + 8;
1913 meanOf8x8Blocks[25] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1914 meanOf8x8SquaredValuesBlocks[25] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1915
1916 // (3,2)
1917 blockIndex = blockIndex + 8;
1918 meanOf8x8Blocks[26] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1919 meanOf8x8SquaredValuesBlocks[26] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1920
1921 // (3,3)
1922 blockIndex = blockIndex + 8;
1923 meanOf8x8Blocks[27] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1924 meanOf8x8SquaredValuesBlocks[27] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1925
1926 // (3,4)
1927 blockIndex = blockIndex + 8;
1928 meanOf8x8Blocks[28] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1929 meanOf8x8SquaredValuesBlocks[28] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1930
1931 // (3,5)
1932 blockIndex = blockIndex + 8;
1933 meanOf8x8Blocks[29] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1934 meanOf8x8SquaredValuesBlocks[29] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1935
1936 // (3,6)
1937 blockIndex = blockIndex + 8;
1938 meanOf8x8Blocks[30] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1939 meanOf8x8SquaredValuesBlocks[30] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1940
1941 // (3,7)
1942 blockIndex = blockIndex + 8;
1943 meanOf8x8Blocks[31] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1944 meanOf8x8SquaredValuesBlocks[31] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1945
1946 // (4,0)
1947 blockIndex = inputLumaOriginIndex + (strideY << 5);
1948 meanOf8x8Blocks[32] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1949 meanOf8x8SquaredValuesBlocks[32] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1950
1951 // (4,1)
1952 blockIndex = blockIndex + 8;
1953 meanOf8x8Blocks[33] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1954 meanOf8x8SquaredValuesBlocks[33] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1955
1956 // (4,2)
1957 blockIndex = blockIndex + 8;
1958 meanOf8x8Blocks[34] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1959 meanOf8x8SquaredValuesBlocks[34] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1960
1961 // (4,3)
1962 blockIndex = blockIndex + 8;
1963 meanOf8x8Blocks[35] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1964 meanOf8x8SquaredValuesBlocks[35] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1965
1966 // (4,4)
1967 blockIndex = blockIndex + 8;
1968 meanOf8x8Blocks[36] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1969 meanOf8x8SquaredValuesBlocks[36] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1970
1971 // (4,5)
1972 blockIndex = blockIndex + 8;
1973 meanOf8x8Blocks[37] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1974 meanOf8x8SquaredValuesBlocks[37] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1975
1976 // (4,6)
1977 blockIndex = blockIndex + 8;
1978 meanOf8x8Blocks[38] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1979 meanOf8x8SquaredValuesBlocks[38] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1980
1981 // (4,7)
1982 blockIndex = blockIndex + 8;
1983 meanOf8x8Blocks[39] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1984 meanOf8x8SquaredValuesBlocks[39] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1985
1986 // (5,0)
1987 blockIndex = inputLumaOriginIndex + (strideY << 3) + (strideY << 5);
1988 meanOf8x8Blocks[40] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1989 meanOf8x8SquaredValuesBlocks[40] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1990
1991 // (5,1)
1992 blockIndex = blockIndex + 8;
1993 meanOf8x8Blocks[41] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1994 meanOf8x8SquaredValuesBlocks[41] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1995
1996 // (5,2)
1997 blockIndex = blockIndex + 8;
1998 meanOf8x8Blocks[42] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
1999 meanOf8x8SquaredValuesBlocks[42] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2000
2001 // (5,3)
2002 blockIndex = blockIndex + 8;
2003 meanOf8x8Blocks[43] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2004 meanOf8x8SquaredValuesBlocks[43] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2005
2006 // (5,4)
2007 blockIndex = blockIndex + 8;
2008 meanOf8x8Blocks[44] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2009 meanOf8x8SquaredValuesBlocks[44] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2010
2011 // (5,5)
2012 blockIndex = blockIndex + 8;
2013 meanOf8x8Blocks[45] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2014 meanOf8x8SquaredValuesBlocks[45] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2015
2016 // (5,6)
2017 blockIndex = blockIndex + 8;
2018 meanOf8x8Blocks[46] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2019 meanOf8x8SquaredValuesBlocks[46] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2020
2021 // (5,7)
2022 blockIndex = blockIndex + 8;
2023 meanOf8x8Blocks[47] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2024 meanOf8x8SquaredValuesBlocks[47] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2025
2026 // (6,0)
2027 blockIndex = inputLumaOriginIndex + (strideY << 4) + (strideY << 5);
2028 meanOf8x8Blocks[48] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2029 meanOf8x8SquaredValuesBlocks[48] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2030
2031 // (6,1)
2032 blockIndex = blockIndex + 8;
2033 meanOf8x8Blocks[49] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2034 meanOf8x8SquaredValuesBlocks[49] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2035
2036 // (6,2)
2037 blockIndex = blockIndex + 8;
2038 meanOf8x8Blocks[50] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2039 meanOf8x8SquaredValuesBlocks[50] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2040
2041 // (6,3)
2042 blockIndex = blockIndex + 8;
2043 meanOf8x8Blocks[51] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2044 meanOf8x8SquaredValuesBlocks[51] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2045
2046 // (6,4)
2047 blockIndex = blockIndex + 8;
2048 meanOf8x8Blocks[52] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2049 meanOf8x8SquaredValuesBlocks[52] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2050
2051 // (6,5)
2052 blockIndex = blockIndex + 8;
2053 meanOf8x8Blocks[53] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2054 meanOf8x8SquaredValuesBlocks[53] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2055
2056 // (6,6)
2057 blockIndex = blockIndex + 8;
2058 meanOf8x8Blocks[54] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2059 meanOf8x8SquaredValuesBlocks[54] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2060
2061 // (6,7)
2062 blockIndex = blockIndex + 8;
2063 meanOf8x8Blocks[55] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2064 meanOf8x8SquaredValuesBlocks[55] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2065
2066 // (7,0)
2067 blockIndex = inputLumaOriginIndex + (strideY << 3) + (strideY << 4) + (strideY << 5);
2068 meanOf8x8Blocks[56] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2069 meanOf8x8SquaredValuesBlocks[56] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2070
2071 // (7,1)
2072 blockIndex = blockIndex + 8;
2073 meanOf8x8Blocks[57] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2074 meanOf8x8SquaredValuesBlocks[57] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2075
2076 // (7,2)
2077 blockIndex = blockIndex + 8;
2078 meanOf8x8Blocks[58] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2079 meanOf8x8SquaredValuesBlocks[58] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2080
2081 // (7,3)
2082 blockIndex = blockIndex + 8;
2083 meanOf8x8Blocks[59] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2084 meanOf8x8SquaredValuesBlocks[59] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2085
2086 // (7,4)
2087 blockIndex = blockIndex + 8;
2088 meanOf8x8Blocks[60] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2089 meanOf8x8SquaredValuesBlocks[60] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2090
2091 // (7,5)
2092 blockIndex = blockIndex + 8;
2093 meanOf8x8Blocks[61] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2094 meanOf8x8SquaredValuesBlocks[61] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2095
2096 // (7,6)
2097 blockIndex = blockIndex + 8;
2098 meanOf8x8Blocks[62] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2099 meanOf8x8SquaredValuesBlocks[62] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2100
2101 // (7,7)
2102 blockIndex = blockIndex + 8;
2103 meanOf8x8Blocks[63] = ComputeSubMean8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2104 meanOf8x8SquaredValuesBlocks[63] = ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(&(inputPaddedPicturePtr->bufferY[blockIndex]), strideY);
2105 }
2106
2107
2108 // 16x16
2109 meanOf16x16Blocks[0] = (meanOf8x8Blocks[0] + meanOf8x8Blocks[1] + meanOf8x8Blocks[8] + meanOf8x8Blocks[9]) >> 2;
2110 meanOf16x16Blocks[1] = (meanOf8x8Blocks[2] + meanOf8x8Blocks[3] + meanOf8x8Blocks[10] + meanOf8x8Blocks[11]) >> 2;
2111 meanOf16x16Blocks[2] = (meanOf8x8Blocks[4] + meanOf8x8Blocks[5] + meanOf8x8Blocks[12] + meanOf8x8Blocks[13]) >> 2;
2112 meanOf16x16Blocks[3] = (meanOf8x8Blocks[6] + meanOf8x8Blocks[7] + meanOf8x8Blocks[14] + meanOf8x8Blocks[15]) >> 2;
2113
2114 meanOf16x16Blocks[4] = (meanOf8x8Blocks[16] + meanOf8x8Blocks[17] + meanOf8x8Blocks[24] + meanOf8x8Blocks[25]) >> 2;
2115 meanOf16x16Blocks[5] = (meanOf8x8Blocks[18] + meanOf8x8Blocks[19] + meanOf8x8Blocks[26] + meanOf8x8Blocks[27]) >> 2;
2116 meanOf16x16Blocks[6] = (meanOf8x8Blocks[20] + meanOf8x8Blocks[21] + meanOf8x8Blocks[28] + meanOf8x8Blocks[29]) >> 2;
2117 meanOf16x16Blocks[7] = (meanOf8x8Blocks[22] + meanOf8x8Blocks[23] + meanOf8x8Blocks[30] + meanOf8x8Blocks[31]) >> 2;
2118
2119 meanOf16x16Blocks[8] = (meanOf8x8Blocks[32] + meanOf8x8Blocks[33] + meanOf8x8Blocks[40] + meanOf8x8Blocks[41]) >> 2;
2120 meanOf16x16Blocks[9] = (meanOf8x8Blocks[34] + meanOf8x8Blocks[35] + meanOf8x8Blocks[42] + meanOf8x8Blocks[43]) >> 2;
2121 meanOf16x16Blocks[10] = (meanOf8x8Blocks[36] + meanOf8x8Blocks[37] + meanOf8x8Blocks[44] + meanOf8x8Blocks[45]) >> 2;
2122 meanOf16x16Blocks[11] = (meanOf8x8Blocks[38] + meanOf8x8Blocks[39] + meanOf8x8Blocks[46] + meanOf8x8Blocks[47]) >> 2;
2123
2124 meanOf16x16Blocks[12] = (meanOf8x8Blocks[48] + meanOf8x8Blocks[49] + meanOf8x8Blocks[56] + meanOf8x8Blocks[57]) >> 2;
2125 meanOf16x16Blocks[13] = (meanOf8x8Blocks[50] + meanOf8x8Blocks[51] + meanOf8x8Blocks[58] + meanOf8x8Blocks[59]) >> 2;
2126 meanOf16x16Blocks[14] = (meanOf8x8Blocks[52] + meanOf8x8Blocks[53] + meanOf8x8Blocks[60] + meanOf8x8Blocks[61]) >> 2;
2127 meanOf16x16Blocks[15] = (meanOf8x8Blocks[54] + meanOf8x8Blocks[55] + meanOf8x8Blocks[62] + meanOf8x8Blocks[63]) >> 2;
2128
2129 meanOf16x16SquaredValuesBlocks[0] = (meanOf8x8SquaredValuesBlocks[0] + meanOf8x8SquaredValuesBlocks[1] + meanOf8x8SquaredValuesBlocks[8] + meanOf8x8SquaredValuesBlocks[9]) >> 2;
2130 meanOf16x16SquaredValuesBlocks[1] = (meanOf8x8SquaredValuesBlocks[2] + meanOf8x8SquaredValuesBlocks[3] + meanOf8x8SquaredValuesBlocks[10] + meanOf8x8SquaredValuesBlocks[11]) >> 2;
2131 meanOf16x16SquaredValuesBlocks[2] = (meanOf8x8SquaredValuesBlocks[4] + meanOf8x8SquaredValuesBlocks[5] + meanOf8x8SquaredValuesBlocks[12] + meanOf8x8SquaredValuesBlocks[13]) >> 2;
2132 meanOf16x16SquaredValuesBlocks[3] = (meanOf8x8SquaredValuesBlocks[6] + meanOf8x8SquaredValuesBlocks[7] + meanOf8x8SquaredValuesBlocks[14] + meanOf8x8SquaredValuesBlocks[15]) >> 2;
2133
2134 meanOf16x16SquaredValuesBlocks[4] = (meanOf8x8SquaredValuesBlocks[16] + meanOf8x8SquaredValuesBlocks[17] + meanOf8x8SquaredValuesBlocks[24] + meanOf8x8SquaredValuesBlocks[25]) >> 2;
2135 meanOf16x16SquaredValuesBlocks[5] = (meanOf8x8SquaredValuesBlocks[18] + meanOf8x8SquaredValuesBlocks[19] + meanOf8x8SquaredValuesBlocks[26] + meanOf8x8SquaredValuesBlocks[27]) >> 2;
2136 meanOf16x16SquaredValuesBlocks[6] = (meanOf8x8SquaredValuesBlocks[20] + meanOf8x8SquaredValuesBlocks[21] + meanOf8x8SquaredValuesBlocks[28] + meanOf8x8SquaredValuesBlocks[29]) >> 2;
2137 meanOf16x16SquaredValuesBlocks[7] = (meanOf8x8SquaredValuesBlocks[22] + meanOf8x8SquaredValuesBlocks[23] + meanOf8x8SquaredValuesBlocks[30] + meanOf8x8SquaredValuesBlocks[31]) >> 2;
2138
2139 meanOf16x16SquaredValuesBlocks[8] = (meanOf8x8SquaredValuesBlocks[32] + meanOf8x8SquaredValuesBlocks[33] + meanOf8x8SquaredValuesBlocks[40] + meanOf8x8SquaredValuesBlocks[41]) >> 2;
2140 meanOf16x16SquaredValuesBlocks[9] = (meanOf8x8SquaredValuesBlocks[34] + meanOf8x8SquaredValuesBlocks[35] + meanOf8x8SquaredValuesBlocks[42] + meanOf8x8SquaredValuesBlocks[43]) >> 2;
2141 meanOf16x16SquaredValuesBlocks[10] = (meanOf8x8SquaredValuesBlocks[36] + meanOf8x8SquaredValuesBlocks[37] + meanOf8x8SquaredValuesBlocks[44] + meanOf8x8SquaredValuesBlocks[45]) >> 2;
2142 meanOf16x16SquaredValuesBlocks[11] = (meanOf8x8SquaredValuesBlocks[38] + meanOf8x8SquaredValuesBlocks[39] + meanOf8x8SquaredValuesBlocks[46] + meanOf8x8SquaredValuesBlocks[47]) >> 2;
2143
2144 meanOf16x16SquaredValuesBlocks[12] = (meanOf8x8SquaredValuesBlocks[48] + meanOf8x8SquaredValuesBlocks[49] + meanOf8x8SquaredValuesBlocks[56] + meanOf8x8SquaredValuesBlocks[57]) >> 2;
2145 meanOf16x16SquaredValuesBlocks[13] = (meanOf8x8SquaredValuesBlocks[50] + meanOf8x8SquaredValuesBlocks[51] + meanOf8x8SquaredValuesBlocks[58] + meanOf8x8SquaredValuesBlocks[59]) >> 2;
2146 meanOf16x16SquaredValuesBlocks[14] = (meanOf8x8SquaredValuesBlocks[52] + meanOf8x8SquaredValuesBlocks[53] + meanOf8x8SquaredValuesBlocks[60] + meanOf8x8SquaredValuesBlocks[61]) >> 2;
2147 meanOf16x16SquaredValuesBlocks[15] = (meanOf8x8SquaredValuesBlocks[54] + meanOf8x8SquaredValuesBlocks[55] + meanOf8x8SquaredValuesBlocks[62] + meanOf8x8SquaredValuesBlocks[63]) >> 2;
2148
2149 // 32x32
2150 meanOf32x32Blocks[0] = (meanOf16x16Blocks[0] + meanOf16x16Blocks[1] + meanOf16x16Blocks[4] + meanOf16x16Blocks[5]) >> 2;
2151 meanOf32x32Blocks[1] = (meanOf16x16Blocks[2] + meanOf16x16Blocks[3] + meanOf16x16Blocks[6] + meanOf16x16Blocks[7]) >> 2;
2152 meanOf32x32Blocks[2] = (meanOf16x16Blocks[8] + meanOf16x16Blocks[9] + meanOf16x16Blocks[12] + meanOf16x16Blocks[13]) >> 2;
2153 meanOf32x32Blocks[3] = (meanOf16x16Blocks[10] + meanOf16x16Blocks[11] + meanOf16x16Blocks[14] + meanOf16x16Blocks[15]) >> 2;
2154
2155 meanOf32x32SquaredValuesBlocks[0] = (meanOf16x16SquaredValuesBlocks[0] + meanOf16x16SquaredValuesBlocks[1] + meanOf16x16SquaredValuesBlocks[4] + meanOf16x16SquaredValuesBlocks[5]) >> 2;
2156 meanOf32x32SquaredValuesBlocks[1] = (meanOf16x16SquaredValuesBlocks[2] + meanOf16x16SquaredValuesBlocks[3] + meanOf16x16SquaredValuesBlocks[6] + meanOf16x16SquaredValuesBlocks[7]) >> 2;
2157 meanOf32x32SquaredValuesBlocks[2] = (meanOf16x16SquaredValuesBlocks[8] + meanOf16x16SquaredValuesBlocks[9] + meanOf16x16SquaredValuesBlocks[12] + meanOf16x16SquaredValuesBlocks[13]) >> 2;
2158 meanOf32x32SquaredValuesBlocks[3] = (meanOf16x16SquaredValuesBlocks[10] + meanOf16x16SquaredValuesBlocks[11] + meanOf16x16SquaredValuesBlocks[14] + meanOf16x16SquaredValuesBlocks[15]) >> 2;
2159
2160 // 64x64
2161 meanOf64x64Blocks = (meanOf32x32Blocks[0] + meanOf32x32Blocks[1] + meanOf32x32Blocks[2] + meanOf32x32Blocks[3]) >> 2;
2162 meanOf64x64SquaredValuesBlocks = (meanOf32x32SquaredValuesBlocks[0] + meanOf32x32SquaredValuesBlocks[1] + meanOf32x32SquaredValuesBlocks[2] + meanOf32x32SquaredValuesBlocks[3]) >> 2;
2163
2164 // 8x8 means
2165 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_0] = (EB_U8)(meanOf8x8Blocks[0] >> MEAN_PRECISION);
2166 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_1] = (EB_U8)(meanOf8x8Blocks[1] >> MEAN_PRECISION);
2167 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_2] = (EB_U8)(meanOf8x8Blocks[2] >> MEAN_PRECISION);
2168 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_3] = (EB_U8)(meanOf8x8Blocks[3] >> MEAN_PRECISION);
2169 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_4] = (EB_U8)(meanOf8x8Blocks[4] >> MEAN_PRECISION);
2170 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_5] = (EB_U8)(meanOf8x8Blocks[5] >> MEAN_PRECISION);
2171 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_6] = (EB_U8)(meanOf8x8Blocks[6] >> MEAN_PRECISION);
2172 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_7] = (EB_U8)(meanOf8x8Blocks[7] >> MEAN_PRECISION);
2173 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_8] = (EB_U8)(meanOf8x8Blocks[8] >> MEAN_PRECISION);
2174 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_9] = (EB_U8)(meanOf8x8Blocks[9] >> MEAN_PRECISION);
2175 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_10] = (EB_U8)(meanOf8x8Blocks[10] >> MEAN_PRECISION);
2176 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_11] = (EB_U8)(meanOf8x8Blocks[11] >> MEAN_PRECISION);
2177 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_12] = (EB_U8)(meanOf8x8Blocks[12] >> MEAN_PRECISION);
2178 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_13] = (EB_U8)(meanOf8x8Blocks[13] >> MEAN_PRECISION);
2179 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_14] = (EB_U8)(meanOf8x8Blocks[14] >> MEAN_PRECISION);
2180 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_15] = (EB_U8)(meanOf8x8Blocks[15] >> MEAN_PRECISION);
2181 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_16] = (EB_U8)(meanOf8x8Blocks[16] >> MEAN_PRECISION);
2182 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_17] = (EB_U8)(meanOf8x8Blocks[17] >> MEAN_PRECISION);
2183 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_18] = (EB_U8)(meanOf8x8Blocks[18] >> MEAN_PRECISION);
2184 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_19] = (EB_U8)(meanOf8x8Blocks[19] >> MEAN_PRECISION);
2185 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_20] = (EB_U8)(meanOf8x8Blocks[20] >> MEAN_PRECISION);
2186 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_21] = (EB_U8)(meanOf8x8Blocks[21] >> MEAN_PRECISION);
2187 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_22] = (EB_U8)(meanOf8x8Blocks[22] >> MEAN_PRECISION);
2188 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_23] = (EB_U8)(meanOf8x8Blocks[23] >> MEAN_PRECISION);
2189 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_24] = (EB_U8)(meanOf8x8Blocks[24] >> MEAN_PRECISION);
2190 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_25] = (EB_U8)(meanOf8x8Blocks[25] >> MEAN_PRECISION);
2191 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_26] = (EB_U8)(meanOf8x8Blocks[26] >> MEAN_PRECISION);
2192 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_27] = (EB_U8)(meanOf8x8Blocks[27] >> MEAN_PRECISION);
2193 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_28] = (EB_U8)(meanOf8x8Blocks[28] >> MEAN_PRECISION);
2194 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_29] = (EB_U8)(meanOf8x8Blocks[29] >> MEAN_PRECISION);
2195 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_30] = (EB_U8)(meanOf8x8Blocks[30] >> MEAN_PRECISION);
2196 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_31] = (EB_U8)(meanOf8x8Blocks[31] >> MEAN_PRECISION);
2197 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_32] = (EB_U8)(meanOf8x8Blocks[32] >> MEAN_PRECISION);
2198 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_33] = (EB_U8)(meanOf8x8Blocks[33] >> MEAN_PRECISION);
2199 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_34] = (EB_U8)(meanOf8x8Blocks[34] >> MEAN_PRECISION);
2200 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_35] = (EB_U8)(meanOf8x8Blocks[35] >> MEAN_PRECISION);
2201 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_36] = (EB_U8)(meanOf8x8Blocks[36] >> MEAN_PRECISION);
2202 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_37] = (EB_U8)(meanOf8x8Blocks[37] >> MEAN_PRECISION);
2203 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_38] = (EB_U8)(meanOf8x8Blocks[38] >> MEAN_PRECISION);
2204 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_39] = (EB_U8)(meanOf8x8Blocks[39] >> MEAN_PRECISION);
2205 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_40] = (EB_U8)(meanOf8x8Blocks[40] >> MEAN_PRECISION);
2206 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_41] = (EB_U8)(meanOf8x8Blocks[41] >> MEAN_PRECISION);
2207 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_42] = (EB_U8)(meanOf8x8Blocks[42] >> MEAN_PRECISION);
2208 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_43] = (EB_U8)(meanOf8x8Blocks[43] >> MEAN_PRECISION);
2209 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_44] = (EB_U8)(meanOf8x8Blocks[44] >> MEAN_PRECISION);
2210 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_45] = (EB_U8)(meanOf8x8Blocks[45] >> MEAN_PRECISION);
2211 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_46] = (EB_U8)(meanOf8x8Blocks[46] >> MEAN_PRECISION);
2212 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_47] = (EB_U8)(meanOf8x8Blocks[47] >> MEAN_PRECISION);
2213 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_48] = (EB_U8)(meanOf8x8Blocks[48] >> MEAN_PRECISION);
2214 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_49] = (EB_U8)(meanOf8x8Blocks[49] >> MEAN_PRECISION);
2215 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_50] = (EB_U8)(meanOf8x8Blocks[50] >> MEAN_PRECISION);
2216 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_51] = (EB_U8)(meanOf8x8Blocks[51] >> MEAN_PRECISION);
2217 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_52] = (EB_U8)(meanOf8x8Blocks[52] >> MEAN_PRECISION);
2218 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_53] = (EB_U8)(meanOf8x8Blocks[53] >> MEAN_PRECISION);
2219 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_54] = (EB_U8)(meanOf8x8Blocks[54] >> MEAN_PRECISION);
2220 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_55] = (EB_U8)(meanOf8x8Blocks[55] >> MEAN_PRECISION);
2221 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_56] = (EB_U8)(meanOf8x8Blocks[56] >> MEAN_PRECISION);
2222 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_57] = (EB_U8)(meanOf8x8Blocks[57] >> MEAN_PRECISION);
2223 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_58] = (EB_U8)(meanOf8x8Blocks[58] >> MEAN_PRECISION);
2224 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_59] = (EB_U8)(meanOf8x8Blocks[59] >> MEAN_PRECISION);
2225 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_60] = (EB_U8)(meanOf8x8Blocks[60] >> MEAN_PRECISION);
2226 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_61] = (EB_U8)(meanOf8x8Blocks[61] >> MEAN_PRECISION);
2227 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_62] = (EB_U8)(meanOf8x8Blocks[62] >> MEAN_PRECISION);
2228 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_8x8_63] = (EB_U8)(meanOf8x8Blocks[63] >> MEAN_PRECISION);
2229
2230 // 16x16 mean
2231 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_16x16_0] = (EB_U8)(meanOf16x16Blocks[0] >> MEAN_PRECISION);
2232 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_16x16_1] = (EB_U8)(meanOf16x16Blocks[1] >> MEAN_PRECISION);
2233 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_16x16_2] = (EB_U8)(meanOf16x16Blocks[2] >> MEAN_PRECISION);
2234 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_16x16_3] = (EB_U8)(meanOf16x16Blocks[3] >> MEAN_PRECISION);
2235 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_16x16_4] = (EB_U8)(meanOf16x16Blocks[4] >> MEAN_PRECISION);
2236 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_16x16_5] = (EB_U8)(meanOf16x16Blocks[5] >> MEAN_PRECISION);
2237 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_16x16_6] = (EB_U8)(meanOf16x16Blocks[6] >> MEAN_PRECISION);
2238 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_16x16_7] = (EB_U8)(meanOf16x16Blocks[7] >> MEAN_PRECISION);
2239 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_16x16_8] = (EB_U8)(meanOf16x16Blocks[8] >> MEAN_PRECISION);
2240 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_16x16_9] = (EB_U8)(meanOf16x16Blocks[9] >> MEAN_PRECISION);
2241 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_16x16_10] = (EB_U8)(meanOf16x16Blocks[10] >> MEAN_PRECISION);
2242 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_16x16_11] = (EB_U8)(meanOf16x16Blocks[11] >> MEAN_PRECISION);
2243 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_16x16_12] = (EB_U8)(meanOf16x16Blocks[12] >> MEAN_PRECISION);
2244 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_16x16_13] = (EB_U8)(meanOf16x16Blocks[13] >> MEAN_PRECISION);
2245 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_16x16_14] = (EB_U8)(meanOf16x16Blocks[14] >> MEAN_PRECISION);
2246 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_16x16_15] = (EB_U8)(meanOf16x16Blocks[15] >> MEAN_PRECISION);
2247
2248 // 32x32 mean
2249 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_32x32_0] = (EB_U8)(meanOf32x32Blocks[0] >> MEAN_PRECISION);
2250 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_32x32_1] = (EB_U8)(meanOf32x32Blocks[1] >> MEAN_PRECISION);
2251 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_32x32_2] = (EB_U8)(meanOf32x32Blocks[2] >> MEAN_PRECISION);
2252 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_32x32_3] = (EB_U8)(meanOf32x32Blocks[3] >> MEAN_PRECISION);
2253
2254 // 64x64 mean
2255 pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_64x64] = (EB_U8)(meanOf64x64Blocks >> MEAN_PRECISION);
2256
2257 // 8x8 variances
2258 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_0] = (EB_U16)((meanOf8x8SquaredValuesBlocks[0] - (meanOf8x8Blocks[0] * meanOf8x8Blocks[0])) >> VARIANCE_PRECISION);
2259 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_1] = (EB_U16)((meanOf8x8SquaredValuesBlocks[1] - (meanOf8x8Blocks[1] * meanOf8x8Blocks[1])) >> VARIANCE_PRECISION);
2260 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_2] = (EB_U16)((meanOf8x8SquaredValuesBlocks[2] - (meanOf8x8Blocks[2] * meanOf8x8Blocks[2])) >> VARIANCE_PRECISION);
2261 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_3] = (EB_U16)((meanOf8x8SquaredValuesBlocks[3] - (meanOf8x8Blocks[3] * meanOf8x8Blocks[3])) >> VARIANCE_PRECISION);
2262 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_4] = (EB_U16)((meanOf8x8SquaredValuesBlocks[4] - (meanOf8x8Blocks[4] * meanOf8x8Blocks[4])) >> VARIANCE_PRECISION);
2263 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_5] = (EB_U16)((meanOf8x8SquaredValuesBlocks[5] - (meanOf8x8Blocks[5] * meanOf8x8Blocks[5])) >> VARIANCE_PRECISION);
2264 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_6] = (EB_U16)((meanOf8x8SquaredValuesBlocks[6] - (meanOf8x8Blocks[6] * meanOf8x8Blocks[6])) >> VARIANCE_PRECISION);
2265 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_7] = (EB_U16)((meanOf8x8SquaredValuesBlocks[7] - (meanOf8x8Blocks[7] * meanOf8x8Blocks[7])) >> VARIANCE_PRECISION);
2266 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_8] = (EB_U16)((meanOf8x8SquaredValuesBlocks[8] - (meanOf8x8Blocks[8] * meanOf8x8Blocks[8])) >> VARIANCE_PRECISION);
2267 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_9] = (EB_U16)((meanOf8x8SquaredValuesBlocks[9] - (meanOf8x8Blocks[9] * meanOf8x8Blocks[9])) >> VARIANCE_PRECISION);
2268 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_10] = (EB_U16)((meanOf8x8SquaredValuesBlocks[10] - (meanOf8x8Blocks[10] * meanOf8x8Blocks[10])) >> VARIANCE_PRECISION);
2269 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_11] = (EB_U16)((meanOf8x8SquaredValuesBlocks[11] - (meanOf8x8Blocks[11] * meanOf8x8Blocks[11])) >> VARIANCE_PRECISION);
2270 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_12] = (EB_U16)((meanOf8x8SquaredValuesBlocks[12] - (meanOf8x8Blocks[12] * meanOf8x8Blocks[12])) >> VARIANCE_PRECISION);
2271 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_13] = (EB_U16)((meanOf8x8SquaredValuesBlocks[13] - (meanOf8x8Blocks[13] * meanOf8x8Blocks[13])) >> VARIANCE_PRECISION);
2272 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_14] = (EB_U16)((meanOf8x8SquaredValuesBlocks[14] - (meanOf8x8Blocks[14] * meanOf8x8Blocks[14])) >> VARIANCE_PRECISION);
2273 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_15] = (EB_U16)((meanOf8x8SquaredValuesBlocks[15] - (meanOf8x8Blocks[15] * meanOf8x8Blocks[15])) >> VARIANCE_PRECISION);
2274 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_16] = (EB_U16)((meanOf8x8SquaredValuesBlocks[16] - (meanOf8x8Blocks[16] * meanOf8x8Blocks[16])) >> VARIANCE_PRECISION);
2275 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_17] = (EB_U16)((meanOf8x8SquaredValuesBlocks[17] - (meanOf8x8Blocks[17] * meanOf8x8Blocks[17])) >> VARIANCE_PRECISION);
2276 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_18] = (EB_U16)((meanOf8x8SquaredValuesBlocks[18] - (meanOf8x8Blocks[18] * meanOf8x8Blocks[18])) >> VARIANCE_PRECISION);
2277 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_19] = (EB_U16)((meanOf8x8SquaredValuesBlocks[19] - (meanOf8x8Blocks[19] * meanOf8x8Blocks[19])) >> VARIANCE_PRECISION);
2278 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_20] = (EB_U16)((meanOf8x8SquaredValuesBlocks[20] - (meanOf8x8Blocks[20] * meanOf8x8Blocks[20])) >> VARIANCE_PRECISION);
2279 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_21] = (EB_U16)((meanOf8x8SquaredValuesBlocks[21] - (meanOf8x8Blocks[21] * meanOf8x8Blocks[21])) >> VARIANCE_PRECISION);
2280 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_22] = (EB_U16)((meanOf8x8SquaredValuesBlocks[22] - (meanOf8x8Blocks[22] * meanOf8x8Blocks[22])) >> VARIANCE_PRECISION);
2281 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_23] = (EB_U16)((meanOf8x8SquaredValuesBlocks[23] - (meanOf8x8Blocks[23] * meanOf8x8Blocks[23])) >> VARIANCE_PRECISION);
2282 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_24] = (EB_U16)((meanOf8x8SquaredValuesBlocks[24] - (meanOf8x8Blocks[24] * meanOf8x8Blocks[24])) >> VARIANCE_PRECISION);
2283 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_25] = (EB_U16)((meanOf8x8SquaredValuesBlocks[25] - (meanOf8x8Blocks[25] * meanOf8x8Blocks[25])) >> VARIANCE_PRECISION);
2284 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_26] = (EB_U16)((meanOf8x8SquaredValuesBlocks[26] - (meanOf8x8Blocks[26] * meanOf8x8Blocks[26])) >> VARIANCE_PRECISION);
2285 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_27] = (EB_U16)((meanOf8x8SquaredValuesBlocks[27] - (meanOf8x8Blocks[27] * meanOf8x8Blocks[27])) >> VARIANCE_PRECISION);
2286 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_28] = (EB_U16)((meanOf8x8SquaredValuesBlocks[28] - (meanOf8x8Blocks[28] * meanOf8x8Blocks[28])) >> VARIANCE_PRECISION);
2287 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_29] = (EB_U16)((meanOf8x8SquaredValuesBlocks[29] - (meanOf8x8Blocks[29] * meanOf8x8Blocks[29])) >> VARIANCE_PRECISION);
2288 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_30] = (EB_U16)((meanOf8x8SquaredValuesBlocks[30] - (meanOf8x8Blocks[30] * meanOf8x8Blocks[30])) >> VARIANCE_PRECISION);
2289 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_31] = (EB_U16)((meanOf8x8SquaredValuesBlocks[31] - (meanOf8x8Blocks[31] * meanOf8x8Blocks[31])) >> VARIANCE_PRECISION);
2290 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_32] = (EB_U16)((meanOf8x8SquaredValuesBlocks[32] - (meanOf8x8Blocks[32] * meanOf8x8Blocks[32])) >> VARIANCE_PRECISION);
2291 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_33] = (EB_U16)((meanOf8x8SquaredValuesBlocks[33] - (meanOf8x8Blocks[33] * meanOf8x8Blocks[33])) >> VARIANCE_PRECISION);
2292 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_34] = (EB_U16)((meanOf8x8SquaredValuesBlocks[34] - (meanOf8x8Blocks[34] * meanOf8x8Blocks[34])) >> VARIANCE_PRECISION);
2293 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_35] = (EB_U16)((meanOf8x8SquaredValuesBlocks[35] - (meanOf8x8Blocks[35] * meanOf8x8Blocks[35])) >> VARIANCE_PRECISION);
2294 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_36] = (EB_U16)((meanOf8x8SquaredValuesBlocks[36] - (meanOf8x8Blocks[36] * meanOf8x8Blocks[36])) >> VARIANCE_PRECISION);
2295 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_37] = (EB_U16)((meanOf8x8SquaredValuesBlocks[37] - (meanOf8x8Blocks[37] * meanOf8x8Blocks[37])) >> VARIANCE_PRECISION);
2296 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_38] = (EB_U16)((meanOf8x8SquaredValuesBlocks[38] - (meanOf8x8Blocks[38] * meanOf8x8Blocks[38])) >> VARIANCE_PRECISION);
2297 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_39] = (EB_U16)((meanOf8x8SquaredValuesBlocks[39] - (meanOf8x8Blocks[39] * meanOf8x8Blocks[39])) >> VARIANCE_PRECISION);
2298 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_40] = (EB_U16)((meanOf8x8SquaredValuesBlocks[40] - (meanOf8x8Blocks[40] * meanOf8x8Blocks[40])) >> VARIANCE_PRECISION);
2299 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_41] = (EB_U16)((meanOf8x8SquaredValuesBlocks[41] - (meanOf8x8Blocks[41] * meanOf8x8Blocks[41])) >> VARIANCE_PRECISION);
2300 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_42] = (EB_U16)((meanOf8x8SquaredValuesBlocks[42] - (meanOf8x8Blocks[42] * meanOf8x8Blocks[42])) >> VARIANCE_PRECISION);
2301 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_43] = (EB_U16)((meanOf8x8SquaredValuesBlocks[43] - (meanOf8x8Blocks[43] * meanOf8x8Blocks[43])) >> VARIANCE_PRECISION);
2302 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_44] = (EB_U16)((meanOf8x8SquaredValuesBlocks[44] - (meanOf8x8Blocks[44] * meanOf8x8Blocks[44])) >> VARIANCE_PRECISION);
2303 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_45] = (EB_U16)((meanOf8x8SquaredValuesBlocks[45] - (meanOf8x8Blocks[45] * meanOf8x8Blocks[45])) >> VARIANCE_PRECISION);
2304 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_46] = (EB_U16)((meanOf8x8SquaredValuesBlocks[46] - (meanOf8x8Blocks[46] * meanOf8x8Blocks[46])) >> VARIANCE_PRECISION);
2305 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_47] = (EB_U16)((meanOf8x8SquaredValuesBlocks[47] - (meanOf8x8Blocks[47] * meanOf8x8Blocks[47])) >> VARIANCE_PRECISION);
2306 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_48] = (EB_U16)((meanOf8x8SquaredValuesBlocks[48] - (meanOf8x8Blocks[48] * meanOf8x8Blocks[48])) >> VARIANCE_PRECISION);
2307 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_49] = (EB_U16)((meanOf8x8SquaredValuesBlocks[49] - (meanOf8x8Blocks[49] * meanOf8x8Blocks[49])) >> VARIANCE_PRECISION);
2308 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_50] = (EB_U16)((meanOf8x8SquaredValuesBlocks[50] - (meanOf8x8Blocks[50] * meanOf8x8Blocks[50])) >> VARIANCE_PRECISION);
2309 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_51] = (EB_U16)((meanOf8x8SquaredValuesBlocks[51] - (meanOf8x8Blocks[51] * meanOf8x8Blocks[51])) >> VARIANCE_PRECISION);
2310 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_52] = (EB_U16)((meanOf8x8SquaredValuesBlocks[52] - (meanOf8x8Blocks[52] * meanOf8x8Blocks[52])) >> VARIANCE_PRECISION);
2311 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_53] = (EB_U16)((meanOf8x8SquaredValuesBlocks[53] - (meanOf8x8Blocks[53] * meanOf8x8Blocks[53])) >> VARIANCE_PRECISION);
2312 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_54] = (EB_U16)((meanOf8x8SquaredValuesBlocks[54] - (meanOf8x8Blocks[54] * meanOf8x8Blocks[54])) >> VARIANCE_PRECISION);
2313 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_55] = (EB_U16)((meanOf8x8SquaredValuesBlocks[55] - (meanOf8x8Blocks[55] * meanOf8x8Blocks[55])) >> VARIANCE_PRECISION);
2314 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_56] = (EB_U16)((meanOf8x8SquaredValuesBlocks[56] - (meanOf8x8Blocks[56] * meanOf8x8Blocks[56])) >> VARIANCE_PRECISION);
2315 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_57] = (EB_U16)((meanOf8x8SquaredValuesBlocks[57] - (meanOf8x8Blocks[57] * meanOf8x8Blocks[57])) >> VARIANCE_PRECISION);
2316 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_58] = (EB_U16)((meanOf8x8SquaredValuesBlocks[58] - (meanOf8x8Blocks[58] * meanOf8x8Blocks[58])) >> VARIANCE_PRECISION);
2317 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_59] = (EB_U16)((meanOf8x8SquaredValuesBlocks[59] - (meanOf8x8Blocks[59] * meanOf8x8Blocks[59])) >> VARIANCE_PRECISION);
2318 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_60] = (EB_U16)((meanOf8x8SquaredValuesBlocks[60] - (meanOf8x8Blocks[60] * meanOf8x8Blocks[60])) >> VARIANCE_PRECISION);
2319 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_61] = (EB_U16)((meanOf8x8SquaredValuesBlocks[61] - (meanOf8x8Blocks[61] * meanOf8x8Blocks[61])) >> VARIANCE_PRECISION);
2320 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_62] = (EB_U16)((meanOf8x8SquaredValuesBlocks[62] - (meanOf8x8Blocks[62] * meanOf8x8Blocks[62])) >> VARIANCE_PRECISION);
2321 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_8x8_63] = (EB_U16)((meanOf8x8SquaredValuesBlocks[63] - (meanOf8x8Blocks[63] * meanOf8x8Blocks[63])) >> VARIANCE_PRECISION);
2322
2323 // 16x16 variances
2324 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_16x16_0] = (EB_U16)((meanOf16x16SquaredValuesBlocks[0] - (meanOf16x16Blocks[0] * meanOf16x16Blocks[0])) >> VARIANCE_PRECISION);
2325 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_16x16_1] = (EB_U16)((meanOf16x16SquaredValuesBlocks[1] - (meanOf16x16Blocks[1] * meanOf16x16Blocks[1])) >> VARIANCE_PRECISION);
2326 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_16x16_2] = (EB_U16)((meanOf16x16SquaredValuesBlocks[2] - (meanOf16x16Blocks[2] * meanOf16x16Blocks[2])) >> VARIANCE_PRECISION);
2327 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_16x16_3] = (EB_U16)((meanOf16x16SquaredValuesBlocks[3] - (meanOf16x16Blocks[3] * meanOf16x16Blocks[3])) >> VARIANCE_PRECISION);
2328 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_16x16_4] = (EB_U16)((meanOf16x16SquaredValuesBlocks[4] - (meanOf16x16Blocks[4] * meanOf16x16Blocks[4])) >> VARIANCE_PRECISION);
2329 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_16x16_5] = (EB_U16)((meanOf16x16SquaredValuesBlocks[5] - (meanOf16x16Blocks[5] * meanOf16x16Blocks[5])) >> VARIANCE_PRECISION);
2330 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_16x16_6] = (EB_U16)((meanOf16x16SquaredValuesBlocks[6] - (meanOf16x16Blocks[6] * meanOf16x16Blocks[6])) >> VARIANCE_PRECISION);
2331 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_16x16_7] = (EB_U16)((meanOf16x16SquaredValuesBlocks[7] - (meanOf16x16Blocks[7] * meanOf16x16Blocks[7])) >> VARIANCE_PRECISION);
2332 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_16x16_8] = (EB_U16)((meanOf16x16SquaredValuesBlocks[8] - (meanOf16x16Blocks[8] * meanOf16x16Blocks[8])) >> VARIANCE_PRECISION);
2333 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_16x16_9] = (EB_U16)((meanOf16x16SquaredValuesBlocks[9] - (meanOf16x16Blocks[9] * meanOf16x16Blocks[9])) >> VARIANCE_PRECISION);
2334 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_16x16_10] = (EB_U16)((meanOf16x16SquaredValuesBlocks[10] - (meanOf16x16Blocks[10] * meanOf16x16Blocks[10])) >> VARIANCE_PRECISION);
2335 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_16x16_11] = (EB_U16)((meanOf16x16SquaredValuesBlocks[11] - (meanOf16x16Blocks[11] * meanOf16x16Blocks[11])) >> VARIANCE_PRECISION);
2336 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_16x16_12] = (EB_U16)((meanOf16x16SquaredValuesBlocks[12] - (meanOf16x16Blocks[12] * meanOf16x16Blocks[12])) >> VARIANCE_PRECISION);
2337 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_16x16_13] = (EB_U16)((meanOf16x16SquaredValuesBlocks[13] - (meanOf16x16Blocks[13] * meanOf16x16Blocks[13])) >> VARIANCE_PRECISION);
2338 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_16x16_14] = (EB_U16)((meanOf16x16SquaredValuesBlocks[14] - (meanOf16x16Blocks[14] * meanOf16x16Blocks[14])) >> VARIANCE_PRECISION);
2339 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_16x16_15] = (EB_U16)((meanOf16x16SquaredValuesBlocks[15] - (meanOf16x16Blocks[15] * meanOf16x16Blocks[15])) >> VARIANCE_PRECISION);
2340
2341 // 32x32 variances
2342 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_32x32_0] = (EB_U16)((meanOf32x32SquaredValuesBlocks[0] - (meanOf32x32Blocks[0] * meanOf32x32Blocks[0])) >> VARIANCE_PRECISION);
2343 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_32x32_1] = (EB_U16)((meanOf32x32SquaredValuesBlocks[1] - (meanOf32x32Blocks[1] * meanOf32x32Blocks[1])) >> VARIANCE_PRECISION);
2344 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_32x32_2] = (EB_U16)((meanOf32x32SquaredValuesBlocks[2] - (meanOf32x32Blocks[2] * meanOf32x32Blocks[2])) >> VARIANCE_PRECISION);
2345 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_32x32_3] = (EB_U16)((meanOf32x32SquaredValuesBlocks[3] - (meanOf32x32Blocks[3] * meanOf32x32Blocks[3])) >> VARIANCE_PRECISION);
2346
2347 // 64x64 variance
2348 pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_64x64] = (EB_U16)((meanOf64x64SquaredValuesBlocks - (meanOf64x64Blocks * meanOf64x64Blocks)) >> VARIANCE_PRECISION);
2349 }
2350 return return_error;
2351 }
2352
DenoiseInputPicture(PictureAnalysisContext_t * contextPtr,SequenceControlSet_t * sequenceControlSetPtr,PictureParentControlSet_t * pictureControlSetPtr,EbPictureBufferDesc_t * inputPicturePtr,EbPictureBufferDesc_t * denoisedPicturePtr)2353 static EB_ERRORTYPE DenoiseInputPicture(
2354 PictureAnalysisContext_t *contextPtr,
2355 SequenceControlSet_t *sequenceControlSetPtr,
2356 PictureParentControlSet_t *pictureControlSetPtr,
2357 EbPictureBufferDesc_t *inputPicturePtr,
2358 EbPictureBufferDesc_t *denoisedPicturePtr)
2359 {
2360 EB_ERRORTYPE return_error = EB_ErrorNone;
2361
2362 EB_U32 lcuIndex;
2363 EB_U32 lcuOriginX;
2364 EB_U32 lcuOriginY;
2365 EB_U16 verticalIdx;
2366 EB_U32 colorFormat = inputPicturePtr->colorFormat;
2367 EB_U16 subWidthCMinus1 = (colorFormat == EB_YUV444 ? 1 : 2) - 1;
2368 EB_U16 subHeightCMinus1 = (colorFormat >= EB_YUV422 ? 1 : 2) - 1;
2369 //use denoised input if the source is extremly noisy
2370 if (pictureControlSetPtr->picNoiseClass >= PIC_NOISE_CLASS_4){
2371
2372 EB_U32 inLumaOffSet = inputPicturePtr->originX + inputPicturePtr->originY * inputPicturePtr->strideY;
2373 EB_U32 inChromaOffSet = (inputPicturePtr->originX >> subWidthCMinus1) + (inputPicturePtr->originY >> subHeightCMinus1) * inputPicturePtr->strideCb;
2374 EB_U32 denLumaOffSet = denoisedPicturePtr->originX + denoisedPicturePtr->originY * denoisedPicturePtr->strideY;
2375 EB_U32 denChromaOffSet = (denoisedPicturePtr->originX >> subWidthCMinus1) + (denoisedPicturePtr->originY >> subHeightCMinus1) * denoisedPicturePtr->strideCb;
2376
2377 //filter Luma
2378 for (lcuIndex = 0; lcuIndex < pictureControlSetPtr->lcuTotalCount; ++lcuIndex) {
2379
2380 LcuParams_t *lcuParams = &sequenceControlSetPtr->lcuParamsArray[lcuIndex];
2381
2382 lcuOriginX = lcuParams->originX;
2383 lcuOriginY = lcuParams->originY;
2384
2385
2386 if (lcuOriginX == 0)
2387 StrongLumaFilter_funcPtrArray[!!(ASM_TYPES & AVX2_MASK)](
2388 inputPicturePtr,
2389 denoisedPicturePtr,
2390 lcuOriginY,
2391 lcuOriginX);
2392
2393 if (lcuOriginX + MAX_LCU_SIZE > inputPicturePtr->width)
2394 {
2395 noiseExtractLumaStrong(
2396 inputPicturePtr,
2397 denoisedPicturePtr,
2398 lcuOriginY,
2399 lcuOriginX);
2400 }
2401
2402 }
2403
2404 //copy Luma
2405 for (verticalIdx = 0; verticalIdx < inputPicturePtr->height; ++verticalIdx) {
2406 EB_MEMCPY(inputPicturePtr->bufferY + inLumaOffSet + verticalIdx * inputPicturePtr->strideY,
2407 denoisedPicturePtr->bufferY + denLumaOffSet + verticalIdx * denoisedPicturePtr->strideY,
2408 sizeof(EB_U8) * inputPicturePtr->width);
2409 }
2410
2411 //copy chroma
2412 for (lcuIndex = 0; lcuIndex < pictureControlSetPtr->lcuTotalCount; ++lcuIndex) {
2413
2414 LcuParams_t *lcuParams = &sequenceControlSetPtr->lcuParamsArray[lcuIndex];
2415
2416 lcuOriginX = lcuParams->originX;
2417 lcuOriginY = lcuParams->originY;
2418
2419 if (lcuOriginX == 0)
2420 StrongChromaFilter_funcPtrArray[!!(ASM_TYPES & AVX2_MASK)](
2421 inputPicturePtr,
2422 denoisedPicturePtr,
2423 lcuOriginY >> subHeightCMinus1,
2424 lcuOriginX >> subWidthCMinus1);
2425
2426 if (lcuOriginX + MAX_LCU_SIZE > inputPicturePtr->width)
2427 {
2428 noiseExtractChromaStrong(
2429 inputPicturePtr,
2430 denoisedPicturePtr,
2431 lcuOriginY >> subHeightCMinus1,
2432 lcuOriginX >> subWidthCMinus1);
2433 }
2434
2435 }
2436
2437 //copy chroma
2438 for (verticalIdx = 0; verticalIdx < inputPicturePtr->height >> subHeightCMinus1; ++verticalIdx) {
2439
2440 EB_MEMCPY(inputPicturePtr->bufferCb + inChromaOffSet + verticalIdx * inputPicturePtr->strideCb,
2441 denoisedPicturePtr->bufferCb + denChromaOffSet + verticalIdx * denoisedPicturePtr->strideCb,
2442 sizeof(EB_U8) * inputPicturePtr->width >> subWidthCMinus1);
2443
2444 EB_MEMCPY(inputPicturePtr->bufferCr + inChromaOffSet + verticalIdx * inputPicturePtr->strideCr,
2445 denoisedPicturePtr->bufferCr + denChromaOffSet + verticalIdx * denoisedPicturePtr->strideCr,
2446 sizeof(EB_U8) * inputPicturePtr->width >> subWidthCMinus1);
2447 }
2448
2449 }
2450 else if (pictureControlSetPtr->picNoiseClass >= PIC_NOISE_CLASS_3_1){
2451
2452 EB_U32 inLumaOffSet = inputPicturePtr->originX + inputPicturePtr->originY * inputPicturePtr->strideY;
2453 EB_U32 inChromaOffSet = (inputPicturePtr->originX >> subWidthCMinus1) + (inputPicturePtr->originY >> subHeightCMinus1) * inputPicturePtr->strideCb;
2454 EB_U32 denLumaOffSet = denoisedPicturePtr->originX + denoisedPicturePtr->originY * denoisedPicturePtr->strideY;
2455 EB_U32 denChromaOffSet = (denoisedPicturePtr->originX >> subWidthCMinus1) + (denoisedPicturePtr->originY >> subHeightCMinus1) * denoisedPicturePtr->strideCb;
2456
2457
2458 for (verticalIdx = 0; verticalIdx < inputPicturePtr->height; ++verticalIdx) {
2459 EB_MEMCPY(inputPicturePtr->bufferY + inLumaOffSet + verticalIdx * inputPicturePtr->strideY,
2460 denoisedPicturePtr->bufferY + denLumaOffSet + verticalIdx * denoisedPicturePtr->strideY,
2461 sizeof(EB_U8) * inputPicturePtr->width);
2462 }
2463
2464 //copy chroma
2465 for (lcuIndex = 0; lcuIndex < pictureControlSetPtr->lcuTotalCount; ++lcuIndex) {
2466
2467 LcuParams_t *lcuParams = &sequenceControlSetPtr->lcuParamsArray[lcuIndex];
2468
2469 lcuOriginX = lcuParams->originX;
2470 lcuOriginY = lcuParams->originY;
2471
2472 if (lcuOriginX == 0)
2473 WeakChromaFilter_funcPtrArray[!!(ASM_TYPES & AVX2_MASK)](
2474 inputPicturePtr,
2475 denoisedPicturePtr,
2476 lcuOriginY >> subHeightCMinus1,
2477 lcuOriginX >> subWidthCMinus1);
2478
2479 if (lcuOriginX + MAX_LCU_SIZE > inputPicturePtr->width)
2480 {
2481 noiseExtractChromaWeak(
2482 inputPicturePtr,
2483 denoisedPicturePtr,
2484 lcuOriginY >> subHeightCMinus1,
2485 lcuOriginX >> subWidthCMinus1);
2486 }
2487
2488 }
2489
2490
2491
2492 for (verticalIdx = 0; verticalIdx < inputPicturePtr->height >> subHeightCMinus1; ++verticalIdx) {
2493
2494 EB_MEMCPY(inputPicturePtr->bufferCb + inChromaOffSet + verticalIdx * inputPicturePtr->strideCb,
2495 denoisedPicturePtr->bufferCb + denChromaOffSet + verticalIdx * denoisedPicturePtr->strideCb,
2496 sizeof(EB_U8) * inputPicturePtr->width >> subWidthCMinus1);
2497
2498 EB_MEMCPY(inputPicturePtr->bufferCr + inChromaOffSet + verticalIdx * inputPicturePtr->strideCr,
2499 denoisedPicturePtr->bufferCr + denChromaOffSet + verticalIdx * denoisedPicturePtr->strideCr,
2500 sizeof(EB_U8) * inputPicturePtr->width >> subWidthCMinus1);
2501 }
2502
2503 }
2504
2505 else if (contextPtr->picNoiseVarianceFloat >= 1.0 && sequenceControlSetPtr->inputResolution == INPUT_SIZE_4K_RANGE) {
2506
2507 //Luma : use filtered only for flatNoise LCUs
2508 for (lcuIndex = 0; lcuIndex < pictureControlSetPtr->lcuTotalCount; ++lcuIndex) {
2509
2510 LcuParams_t *lcuParams = &sequenceControlSetPtr->lcuParamsArray[lcuIndex];
2511
2512 lcuOriginX = lcuParams->originX;
2513 lcuOriginY = lcuParams->originY;
2514
2515 EB_U32 lcuHeight = MIN(MAX_LCU_SIZE, inputPicturePtr->height - lcuOriginY);
2516 EB_U32 lcuWidth = MIN(MAX_LCU_SIZE, inputPicturePtr->width - lcuOriginX);
2517
2518 EB_U32 inLumaOffSet = inputPicturePtr->originX + lcuOriginX + (inputPicturePtr->originY + lcuOriginY) * inputPicturePtr->strideY;
2519 EB_U32 denLumaOffSet = denoisedPicturePtr->originX + lcuOriginX + (denoisedPicturePtr->originY + lcuOriginY) * denoisedPicturePtr->strideY;
2520
2521
2522 if (pictureControlSetPtr->lcuFlatNoiseArray[lcuIndex] == 1){
2523
2524
2525 for (verticalIdx = 0; verticalIdx < lcuHeight; ++verticalIdx) {
2526
2527 EB_MEMCPY(inputPicturePtr->bufferY + inLumaOffSet + verticalIdx * inputPicturePtr->strideY,
2528 denoisedPicturePtr->bufferY + denLumaOffSet + verticalIdx * denoisedPicturePtr->strideY,
2529 sizeof(EB_U8) * lcuWidth);
2530
2531 }
2532 }
2533 }
2534 }
2535
2536 return return_error;
2537 }
2538
DetectInputPictureNoise(PictureAnalysisContext_t * contextPtr,SequenceControlSet_t * sequenceControlSetPtr,PictureParentControlSet_t * pictureControlSetPtr,EbPictureBufferDesc_t * inputPicturePtr,EbPictureBufferDesc_t * noisePicturePtr,EbPictureBufferDesc_t * denoisedPicturePtr)2539 static EB_ERRORTYPE DetectInputPictureNoise(
2540 PictureAnalysisContext_t *contextPtr,
2541 SequenceControlSet_t *sequenceControlSetPtr,
2542 PictureParentControlSet_t *pictureControlSetPtr,
2543 EbPictureBufferDesc_t *inputPicturePtr,
2544 EbPictureBufferDesc_t *noisePicturePtr,
2545 EbPictureBufferDesc_t *denoisedPicturePtr)
2546 {
2547
2548 EB_ERRORTYPE return_error = EB_ErrorNone;
2549 EB_U32 lcuIndex;
2550
2551 EB_U64 picNoiseVariance;
2552
2553 EB_U32 totLcuCount, noiseTh;
2554
2555 EB_U32 lcuOriginX;
2556 EB_U32 lcuOriginY;
2557 EB_U32 inputLumaOriginIndex;
2558
2559 picNoiseVariance = 0;
2560 totLcuCount = 0;
2561
2562 //Variance calc for noise picture
2563 for (lcuIndex = 0; lcuIndex < pictureControlSetPtr->lcuTotalCount; ++lcuIndex) {
2564
2565 LcuParams_t *lcuParams = &sequenceControlSetPtr->lcuParamsArray[lcuIndex];
2566
2567 lcuOriginX = lcuParams->originX;
2568 lcuOriginY = lcuParams->originY;
2569 inputLumaOriginIndex = (noisePicturePtr->originY + lcuOriginY) * noisePicturePtr->strideY +
2570 noisePicturePtr->originX + lcuOriginX;
2571
2572
2573 EB_U32 noiseOriginIndex = noisePicturePtr->originX + lcuOriginX + noisePicturePtr->originY * noisePicturePtr->strideY;
2574
2575 if (lcuOriginX == 0)
2576 WeakLumaFilter_funcPtrArray[!!(ASM_TYPES & AVX2_MASK)](
2577 inputPicturePtr,
2578 denoisedPicturePtr,
2579 noisePicturePtr,
2580 lcuOriginY,
2581 lcuOriginX);
2582
2583 if (lcuOriginX + MAX_LCU_SIZE > inputPicturePtr->width)
2584 {
2585 noiseExtractLumaWeak(
2586 inputPicturePtr,
2587 denoisedPicturePtr,
2588 noisePicturePtr,
2589 lcuOriginY,
2590 lcuOriginX);
2591 }
2592
2593 //do it only for complete 64x64 blocks
2594 if (lcuParams->isCompleteLcu)
2595 {
2596
2597 EB_U64 noiseBlkVar32x32[4], denoiseBlkVar32x32[4];
2598
2599 EB_U64 noiseBlkVar = ComputeVariance64x64(
2600 noisePicturePtr,
2601 noiseOriginIndex,
2602 noiseBlkVar32x32);
2603
2604 EB_U64 noiseBlkVarTh ;
2605 EB_U64 denBlkVarTh = FLAT_MAX_VAR;
2606
2607 if (pictureControlSetPtr->noiseDetectionTh == 1)
2608 noiseBlkVarTh = NOISE_MIN_LEVEL_0;
2609 else
2610 noiseBlkVarTh = NOISE_MIN_LEVEL_1;
2611
2612 picNoiseVariance += (noiseBlkVar >> 16);
2613
2614 EB_U64 denBlkVar = ComputeVariance64x64(
2615 denoisedPicturePtr,
2616 inputLumaOriginIndex,
2617 denoiseBlkVar32x32) >> 16;
2618
2619 if (denBlkVar < denBlkVarTh && noiseBlkVar > noiseBlkVarTh) {
2620 pictureControlSetPtr->lcuFlatNoiseArray[lcuIndex] = 1;
2621 }
2622
2623 totLcuCount++;
2624 }
2625
2626 }
2627
2628 if (totLcuCount > 0) {
2629 contextPtr->picNoiseVarianceFloat = (double)picNoiseVariance / (double)totLcuCount;
2630
2631 picNoiseVariance = picNoiseVariance / totLcuCount;
2632 }
2633
2634 //the variance of a 64x64 noise area tends to be bigger for small resolutions.
2635 if (sequenceControlSetPtr->lumaHeight <= 720)
2636 noiseTh = 25;
2637 else
2638 noiseTh = 0;
2639
2640 if (picNoiseVariance >= 80 + noiseTh)
2641 pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_10;
2642 else if (picNoiseVariance >= 70 + noiseTh)
2643 pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_9;
2644 else if (picNoiseVariance >= 60 + noiseTh)
2645 pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_8;
2646 else if (picNoiseVariance >= 50 + noiseTh)
2647 pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_7;
2648 else if (picNoiseVariance >= 40 + noiseTh)
2649 pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_6;
2650 else if (picNoiseVariance >= 30 + noiseTh)
2651 pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_5;
2652 else if (picNoiseVariance >= 20 + noiseTh)
2653 pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_4;
2654 else if (picNoiseVariance >= 17 + noiseTh)
2655 pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_3_1;
2656 else if (picNoiseVariance >= 10 + noiseTh)
2657 pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_3;
2658 else if (picNoiseVariance >= 5 + noiseTh)
2659 pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_2;
2660 else
2661 pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_1;
2662
2663 if (pictureControlSetPtr->picNoiseClass >= PIC_NOISE_CLASS_4)
2664 pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_3_1;
2665
2666 return return_error;
2667
2668 }
2669
FullSampleDenoise(PictureAnalysisContext_t * contextPtr,SequenceControlSet_t * sequenceControlSetPtr,PictureParentControlSet_t * pictureControlSetPtr,EB_U32 lcuTotalCount,EB_BOOL denoiseFlag)2670 static EB_ERRORTYPE FullSampleDenoise(
2671 PictureAnalysisContext_t *contextPtr,
2672 SequenceControlSet_t *sequenceControlSetPtr,
2673 PictureParentControlSet_t *pictureControlSetPtr,
2674 EB_U32 lcuTotalCount,
2675 EB_BOOL denoiseFlag)
2676 {
2677
2678 EB_ERRORTYPE return_error = EB_ErrorNone;
2679
2680 EB_U32 lcuCodingOrder;
2681 EbPictureBufferDesc_t *inputPicturePtr = pictureControlSetPtr->enhancedPicturePtr;
2682 EbPictureBufferDesc_t *denoisedPicturePtr = contextPtr->denoisedPicturePtr;
2683 EbPictureBufferDesc_t *noisePicturePtr = contextPtr->noisePicturePtr;
2684
2685 //Reset the flat noise flag array to False for both RealTime/HighComplexity Modes
2686 for (lcuCodingOrder = 0; lcuCodingOrder < lcuTotalCount; ++lcuCodingOrder) {
2687 pictureControlSetPtr->lcuFlatNoiseArray[lcuCodingOrder] = 0;
2688 }
2689
2690 pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_INV; //this init is for both REAL-TIME and BEST-QUALITY
2691
2692 DetectInputPictureNoise(
2693 contextPtr,
2694 sequenceControlSetPtr,
2695 pictureControlSetPtr,
2696 inputPicturePtr,
2697 noisePicturePtr,
2698 denoisedPicturePtr);
2699
2700 if (denoiseFlag == EB_TRUE)
2701 {
2702 DenoiseInputPicture(
2703 contextPtr,
2704 sequenceControlSetPtr,
2705 pictureControlSetPtr,
2706 inputPicturePtr,
2707 denoisedPicturePtr);
2708 }
2709
2710 return return_error;
2711
2712 }
2713
SubSampleFilterNoise(SequenceControlSet_t * sequenceControlSetPtr,PictureParentControlSet_t * pictureControlSetPtr,EbPictureBufferDesc_t * inputPicturePtr,EbPictureBufferDesc_t * noisePicturePtr,EbPictureBufferDesc_t * denoisedPicturePtr)2714 static EB_ERRORTYPE SubSampleFilterNoise(
2715 SequenceControlSet_t *sequenceControlSetPtr,
2716 PictureParentControlSet_t *pictureControlSetPtr,
2717 EbPictureBufferDesc_t *inputPicturePtr,
2718 EbPictureBufferDesc_t *noisePicturePtr,
2719 EbPictureBufferDesc_t *denoisedPicturePtr)
2720 {
2721 EB_ERRORTYPE return_error = EB_ErrorNone;
2722
2723 EB_U32 lcuIndex;
2724 EB_U32 lcuOriginX;
2725 EB_U32 lcuOriginY;
2726 EB_U16 verticalIdx;
2727 EB_U32 colorFormat = inputPicturePtr->colorFormat;
2728 EB_U16 subWidthCMinus1 = (colorFormat == EB_YUV444 ? 1 : 2) - 1;
2729 EB_U16 subHeightCMinus1 = (colorFormat >= EB_YUV422 ? 1 : 2) - 1;
2730
2731 if (pictureControlSetPtr->picNoiseClass == PIC_NOISE_CLASS_3_1) {
2732
2733 EB_U32 inLumaOffSet = inputPicturePtr->originX + inputPicturePtr->originY * inputPicturePtr->strideY;
2734 EB_U32 inChromaOffSet = (inputPicturePtr->originX >> subWidthCMinus1) + (inputPicturePtr->originY >> subHeightCMinus1) * inputPicturePtr->strideCb;
2735 EB_U32 denLumaOffSet = denoisedPicturePtr->originX + denoisedPicturePtr->originY * denoisedPicturePtr->strideY;
2736 EB_U32 denChromaOffSet = (denoisedPicturePtr->originX >> subWidthCMinus1) + (denoisedPicturePtr->originY >> subHeightCMinus1) * denoisedPicturePtr->strideCb;
2737
2738
2739 //filter Luma
2740 for (lcuIndex = 0; lcuIndex < pictureControlSetPtr->lcuTotalCount; ++lcuIndex) {
2741
2742 LcuParams_t *lcuParams = &sequenceControlSetPtr->lcuParamsArray[lcuIndex];
2743
2744 lcuOriginX = lcuParams->originX;
2745 lcuOriginY = lcuParams->originY;
2746
2747 if (lcuOriginX == 0)
2748 WeakLumaFilter_funcPtrArray[!!(ASM_TYPES & AVX2_MASK)](
2749 inputPicturePtr,
2750 denoisedPicturePtr,
2751 noisePicturePtr,
2752 lcuOriginY,
2753 lcuOriginX);
2754
2755 if (lcuOriginX + MAX_LCU_SIZE > inputPicturePtr->width)
2756 {
2757 noiseExtractLumaWeak(
2758 inputPicturePtr,
2759 denoisedPicturePtr,
2760 noisePicturePtr,
2761 lcuOriginY,
2762 lcuOriginX);
2763 }
2764 }
2765
2766 //copy luma
2767 for (verticalIdx = 0; verticalIdx < inputPicturePtr->height; ++verticalIdx) {
2768 EB_MEMCPY(inputPicturePtr->bufferY + inLumaOffSet + verticalIdx * inputPicturePtr->strideY,
2769 denoisedPicturePtr->bufferY + denLumaOffSet + verticalIdx * denoisedPicturePtr->strideY,
2770 sizeof(EB_U8) * inputPicturePtr->width);
2771 }
2772
2773 //filter chroma
2774 for (lcuIndex = 0; lcuIndex < pictureControlSetPtr->lcuTotalCount; ++lcuIndex) {
2775
2776 LcuParams_t *lcuParams = &sequenceControlSetPtr->lcuParamsArray[lcuIndex];
2777
2778 lcuOriginX = lcuParams->originX;
2779 lcuOriginY = lcuParams->originY;
2780
2781 if (lcuOriginX == 0)
2782 WeakChromaFilter_funcPtrArray[!!(ASM_TYPES & AVX2_MASK)](
2783 inputPicturePtr,
2784 denoisedPicturePtr,
2785 lcuOriginY >> subHeightCMinus1,
2786 lcuOriginX >> subWidthCMinus1);
2787
2788 if (lcuOriginX + MAX_LCU_SIZE > inputPicturePtr->width)
2789 {
2790 noiseExtractChromaWeak(
2791 inputPicturePtr,
2792 denoisedPicturePtr,
2793 lcuOriginY >> subHeightCMinus1,
2794 lcuOriginX >> subWidthCMinus1);
2795 }
2796
2797 }
2798
2799 //copy chroma
2800 for (verticalIdx = 0; verticalIdx < inputPicturePtr->height >> subHeightCMinus1; ++verticalIdx) {
2801
2802 EB_MEMCPY(inputPicturePtr->bufferCb + inChromaOffSet + verticalIdx * inputPicturePtr->strideCb,
2803 denoisedPicturePtr->bufferCb + denChromaOffSet + verticalIdx * denoisedPicturePtr->strideCb,
2804 sizeof(EB_U8) * inputPicturePtr->width >> subWidthCMinus1);
2805
2806 EB_MEMCPY(inputPicturePtr->bufferCr + inChromaOffSet + verticalIdx * inputPicturePtr->strideCr,
2807 denoisedPicturePtr->bufferCr + denChromaOffSet + verticalIdx * denoisedPicturePtr->strideCr,
2808 sizeof(EB_U8) * inputPicturePtr->width >> subWidthCMinus1);
2809 }
2810
2811 } else if (pictureControlSetPtr->picNoiseClass == PIC_NOISE_CLASS_2){
2812
2813 EB_U32 newTotFN = 0;
2814
2815 //for each LCU ,re check the FN information for only the FNdecim ones
2816 for (lcuIndex = 0; lcuIndex < pictureControlSetPtr->lcuTotalCount; ++lcuIndex) {
2817
2818 LcuParams_t *lcuParams = &sequenceControlSetPtr->lcuParamsArray[lcuIndex];
2819
2820 lcuOriginX = lcuParams->originX;
2821 lcuOriginY = lcuParams->originY;
2822 EB_U32 inputLumaOriginIndex = noisePicturePtr->originX + lcuOriginX + (noisePicturePtr->originY + lcuOriginY) * noisePicturePtr->strideY;
2823 EB_U32 noiseOriginIndex = noisePicturePtr->originX + lcuOriginX + (noisePicturePtr->originY * noisePicturePtr->strideY);
2824
2825 if (lcuParams->isCompleteLcu && pictureControlSetPtr->lcuFlatNoiseArray[lcuIndex] == 1)
2826 {
2827
2828 WeakLumaFilterLcu_funcPtrArray[!!(ASM_TYPES & AVX2_MASK)](
2829 inputPicturePtr,
2830 denoisedPicturePtr,
2831 noisePicturePtr,
2832 lcuOriginY,
2833 lcuOriginX);
2834
2835 if (lcuOriginX + MAX_LCU_SIZE > inputPicturePtr->width)
2836 {
2837 noiseExtractLumaWeakLcu(
2838 inputPicturePtr,
2839 denoisedPicturePtr,
2840 noisePicturePtr,
2841 lcuOriginY,
2842 lcuOriginX);
2843 }
2844
2845 EB_U64 noiseBlkVar32x32[4], denoiseBlkVar32x32[4];
2846 EB_U64 noiseBlkVar = ComputeVariance64x64(
2847 noisePicturePtr, noiseOriginIndex, noiseBlkVar32x32);
2848 EB_U64 denBlkVar = ComputeVariance64x64(
2849 denoisedPicturePtr, inputLumaOriginIndex, denoiseBlkVar32x32) >> 16;
2850
2851 EB_U64 noiseBlkVarTh ;
2852 EB_U64 denBlkVarTh = FLAT_MAX_VAR;
2853
2854 if (pictureControlSetPtr->noiseDetectionTh == 1)
2855 noiseBlkVarTh = NOISE_MIN_LEVEL_0;
2856 else
2857 noiseBlkVarTh = NOISE_MIN_LEVEL_1;
2858
2859 if (denBlkVar<denBlkVarTh && noiseBlkVar> noiseBlkVarTh) {
2860 pictureControlSetPtr->lcuFlatNoiseArray[lcuIndex] = 1;
2861 //SVT_LOG("POC %i (%i,%i) denBlkVar: %i noiseBlkVar :%i\n", pictureControlSetPtr->pictureNumber,lcuOriginX,lcuOriginY, denBlkVar, noiseBlkVar);
2862 newTotFN++;
2863
2864 }
2865 else{
2866 pictureControlSetPtr->lcuFlatNoiseArray[lcuIndex] = 0;
2867 }
2868 }
2869 }
2870
2871 //filter Luma
2872 for (lcuIndex = 0; lcuIndex < pictureControlSetPtr->lcuTotalCount; ++lcuIndex) {
2873
2874 LcuParams_t *lcuParams = &sequenceControlSetPtr->lcuParamsArray[lcuIndex];
2875
2876 lcuOriginX = lcuParams->originX;
2877 lcuOriginY = lcuParams->originY;
2878
2879 if (lcuOriginX + 64 <= inputPicturePtr->width && lcuOriginY + 64 <= inputPicturePtr->height)
2880 {
2881
2882
2883 //use the denoised for FN LCUs
2884 if (pictureControlSetPtr->lcuFlatNoiseArray[lcuIndex] == 1){
2885
2886 EB_U32 lcuHeight = MIN(MAX_LCU_SIZE, inputPicturePtr->height - lcuOriginY);
2887 EB_U32 lcuWidth = MIN(MAX_LCU_SIZE, inputPicturePtr->width - lcuOriginX);
2888
2889 EB_U32 inLumaOffSet = inputPicturePtr->originX + lcuOriginX + (inputPicturePtr->originY + lcuOriginY) * inputPicturePtr->strideY;
2890 EB_U32 denLumaOffSet = denoisedPicturePtr->originX + lcuOriginX + (denoisedPicturePtr->originY + lcuOriginY) * denoisedPicturePtr->strideY;
2891
2892 for (verticalIdx = 0; verticalIdx < lcuHeight; ++verticalIdx) {
2893
2894 EB_MEMCPY(inputPicturePtr->bufferY + inLumaOffSet + verticalIdx * inputPicturePtr->strideY,
2895 denoisedPicturePtr->bufferY + denLumaOffSet + verticalIdx * denoisedPicturePtr->strideY,
2896 sizeof(EB_U8) * lcuWidth);
2897
2898 }
2899 }
2900
2901 }
2902
2903 }
2904
2905 }
2906 return return_error;
2907 }
2908
QuarterSampleDetectNoise(PictureAnalysisContext_t * contextPtr,PictureParentControlSet_t * pictureControlSetPtr,EbPictureBufferDesc_t * quarterDecimatedPicturePtr,EbPictureBufferDesc_t * noisePicturePtr,EbPictureBufferDesc_t * denoisedPicturePtr,EB_U32 pictureWidthInLcu)2909 static EB_ERRORTYPE QuarterSampleDetectNoise(
2910 PictureAnalysisContext_t *contextPtr,
2911 PictureParentControlSet_t *pictureControlSetPtr,
2912 EbPictureBufferDesc_t *quarterDecimatedPicturePtr,
2913 EbPictureBufferDesc_t *noisePicturePtr,
2914 EbPictureBufferDesc_t *denoisedPicturePtr,
2915 EB_U32 pictureWidthInLcu)
2916 {
2917
2918 EB_ERRORTYPE return_error = EB_ErrorNone;
2919
2920 EB_U64 picNoiseVariance;
2921
2922 EB_U32 totLcuCount, noiseTh;
2923
2924 EB_U32 blockIndex;
2925
2926 picNoiseVariance = 0;
2927 totLcuCount = 0;
2928
2929
2930 EB_U16 vert64x64Index;
2931 EB_U16 horz64x64Index;
2932 EB_U32 block64x64X;
2933 EB_U32 block64x64Y;
2934 EB_U32 vert32x32Index;
2935 EB_U32 horz32x32Index;
2936 EB_U32 block32x32X;
2937 EB_U32 block32x32Y;
2938 EB_U32 noiseOriginIndex;
2939 EB_U32 lcuCodingOrder;
2940
2941 // Loop over 64x64 blocks on the downsampled domain (each block would contain 16 LCUs on the full sampled domain)
2942 for (vert64x64Index = 0; vert64x64Index < (quarterDecimatedPicturePtr->height / 64); vert64x64Index++){
2943 for (horz64x64Index = 0; horz64x64Index < (quarterDecimatedPicturePtr->width / 64); horz64x64Index++){
2944
2945 block64x64X = horz64x64Index * 64;
2946 block64x64Y = vert64x64Index * 64;
2947
2948 if (block64x64X == 0)
2949 WeakLumaFilter_funcPtrArray[!!(ASM_TYPES & AVX2_MASK)](
2950 quarterDecimatedPicturePtr,
2951 denoisedPicturePtr,
2952 noisePicturePtr,
2953 block64x64Y,
2954 block64x64X);
2955
2956 if (block64x64Y + MAX_LCU_SIZE > quarterDecimatedPicturePtr->width)
2957 {
2958 noiseExtractLumaWeak(
2959 quarterDecimatedPicturePtr,
2960 denoisedPicturePtr,
2961 noisePicturePtr,
2962 block64x64Y,
2963 block64x64X);
2964 }
2965
2966
2967 // Loop over 32x32 blocks (i.e, 64x64 blocks in full resolution)
2968 for (vert32x32Index = 0; vert32x32Index < 2; vert32x32Index++){
2969 for (horz32x32Index = 0; horz32x32Index < 2; horz32x32Index++){
2970
2971 block32x32X = block64x64X + horz32x32Index * 32;
2972 block32x32Y = block64x64Y + vert32x32Index * 32;
2973
2974 //do it only for complete 32x32 blocks (i.e, complete 64x64 blocks in full resolution)
2975 if ((block32x32X + 32 <= quarterDecimatedPicturePtr->width) && (block32x32Y + 32 <= quarterDecimatedPicturePtr->height))
2976 {
2977
2978 lcuCodingOrder = ((vert64x64Index * 2) + vert32x32Index) * pictureWidthInLcu + ((horz64x64Index * 2) + horz32x32Index);
2979
2980
2981 EB_U64 noiseBlkVar8x8[16], denoiseBlkVar8x8[16];
2982
2983 noiseOriginIndex = noisePicturePtr->originX + block32x32X + noisePicturePtr->originY * noisePicturePtr->strideY;
2984
2985 EB_U64 noiseBlkVar = ComputeVariance32x32(
2986 noisePicturePtr,
2987 noiseOriginIndex,
2988 noiseBlkVar8x8);
2989
2990
2991 picNoiseVariance += (noiseBlkVar >> 16);
2992
2993 blockIndex = (noisePicturePtr->originY + block32x32Y) * noisePicturePtr->strideY + noisePicturePtr->originX + block32x32X;
2994
2995 EB_U64 denBlkVar = ComputeVariance32x32(
2996 denoisedPicturePtr,
2997 blockIndex,
2998 denoiseBlkVar8x8) >> 16;
2999
3000 EB_U64 denBlkVarDecTh;
3001
3002 if (pictureControlSetPtr->noiseDetectionTh == 0){
3003 denBlkVarDecTh = NOISE_MIN_LEVEL_DECIM_1;
3004 }
3005 else{
3006 denBlkVarDecTh = NOISE_MIN_LEVEL_DECIM_0;
3007 }
3008
3009 if (denBlkVar < FLAT_MAX_VAR_DECIM && noiseBlkVar> denBlkVarDecTh) {
3010 pictureControlSetPtr->lcuFlatNoiseArray[lcuCodingOrder] = 1;
3011 }
3012
3013 totLcuCount++;
3014 }
3015 }
3016 }
3017 }
3018 }
3019
3020 if (totLcuCount > 0) {
3021 contextPtr->picNoiseVarianceFloat = (double)picNoiseVariance / (double)totLcuCount;
3022
3023 picNoiseVariance = picNoiseVariance / totLcuCount;
3024 }
3025
3026 //the variance of a 64x64 noise area tends to be bigger for small resolutions.
3027 //if (sequenceControlSetPtr->lumaHeight <= 720)
3028 // noiseTh = 25;
3029 //else if (sequenceControlSetPtr->lumaHeight <= 1080)
3030 // noiseTh = 10;
3031 //else
3032 noiseTh = 0;
3033
3034 //look for extreme noise or big enough flat noisy area to be denoised.
3035 if (picNoiseVariance > 60)
3036 pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_3_1; //Noise+Edge information is too big, so may be this is all noise (action: frame based denoising)
3037 else if (picNoiseVariance >= 10 + noiseTh)
3038 pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_3; //Noise+Edge information is big enough, so there is no big enough flat noisy area (action : no denoising)
3039 else if (picNoiseVariance >= 5 + noiseTh)
3040 pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_2; //Noise+Edge information is relatively small, so there might be a big enough flat noisy area(action : denoising only for FN blocks)
3041 else
3042 pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_1; //Noise+Edge information is very small, so no noise nor edge area (action : no denoising)
3043
3044
3045
3046 return return_error;
3047
3048 }
3049
3050
3051
SubSampleDetectNoise(PictureAnalysisContext_t * contextPtr,SequenceControlSet_t * sequenceControlSetPtr,PictureParentControlSet_t * pictureControlSetPtr,EbPictureBufferDesc_t * sixteenthDecimatedPicturePtr,EbPictureBufferDesc_t * noisePicturePtr,EbPictureBufferDesc_t * denoisedPicturePtr,EB_U32 pictureWidthInLcu)3052 static EB_ERRORTYPE SubSampleDetectNoise(
3053 PictureAnalysisContext_t *contextPtr,
3054 SequenceControlSet_t *sequenceControlSetPtr,
3055 PictureParentControlSet_t *pictureControlSetPtr,
3056 EbPictureBufferDesc_t *sixteenthDecimatedPicturePtr,
3057 EbPictureBufferDesc_t *noisePicturePtr,
3058 EbPictureBufferDesc_t *denoisedPicturePtr,
3059 EB_U32 pictureWidthInLcu)
3060 {
3061
3062 EB_ERRORTYPE return_error = EB_ErrorNone;
3063
3064 EB_U64 picNoiseVariance;
3065
3066 EB_U32 totLcuCount, noiseTh;
3067
3068 EB_U32 blockIndex;
3069
3070 picNoiseVariance = 0;
3071 totLcuCount = 0;
3072
3073
3074 EB_U16 vert64x64Index;
3075 EB_U16 horz64x64Index;
3076 EB_U32 block64x64X;
3077 EB_U32 block64x64Y;
3078 EB_U32 vert16x16Index;
3079 EB_U32 horz16x16Index;
3080 EB_U32 block16x16X;
3081 EB_U32 block16x16Y;
3082 EB_U32 noiseOriginIndex;
3083 EB_U32 lcuCodingOrder;
3084
3085 // Loop over 64x64 blocks on the downsampled domain (each block would contain 16 LCUs on the full sampled domain)
3086 for (vert64x64Index = 0; vert64x64Index < (sixteenthDecimatedPicturePtr->height / 64); vert64x64Index++){
3087 for (horz64x64Index = 0; horz64x64Index < (sixteenthDecimatedPicturePtr->width / 64); horz64x64Index++){
3088
3089 block64x64X = horz64x64Index * 64;
3090 block64x64Y = vert64x64Index * 64;
3091
3092 if (block64x64X == 0)
3093 WeakLumaFilter_funcPtrArray[!!(ASM_TYPES & AVX2_MASK)](
3094 sixteenthDecimatedPicturePtr,
3095 denoisedPicturePtr,
3096 noisePicturePtr,
3097 block64x64Y,
3098 block64x64X);
3099
3100 if (block64x64Y + MAX_LCU_SIZE > sixteenthDecimatedPicturePtr->width)
3101 {
3102 noiseExtractLumaWeak(
3103 sixteenthDecimatedPicturePtr,
3104 denoisedPicturePtr,
3105 noisePicturePtr,
3106 block64x64Y,
3107 block64x64X);
3108 }
3109
3110
3111 // Loop over 16x16 blocks (i.e, 64x64 blocks in full resolution)
3112 for (vert16x16Index = 0; vert16x16Index < 4; vert16x16Index++){
3113 for (horz16x16Index = 0; horz16x16Index < 4; horz16x16Index++){
3114
3115 block16x16X = block64x64X + horz16x16Index * 16;
3116 block16x16Y = block64x64Y + vert16x16Index * 16;
3117
3118 //do it only for complete 16x16 blocks (i.e, complete 64x64 blocks in full resolution)
3119 if (block16x16X + 16 <= sixteenthDecimatedPicturePtr->width && block16x16Y + 16 <= sixteenthDecimatedPicturePtr->height)
3120 {
3121
3122 lcuCodingOrder = ((vert64x64Index * 4) + vert16x16Index) * pictureWidthInLcu + ((horz64x64Index * 4) + horz16x16Index);
3123
3124
3125 EB_U64 noiseBlkVar8x8[4], denoiseBlkVar8x8[4];
3126
3127 noiseOriginIndex = noisePicturePtr->originX + block16x16X + noisePicturePtr->originY * noisePicturePtr->strideY;
3128
3129 EB_U64 noiseBlkVar = ComputeVariance16x16(
3130 noisePicturePtr,
3131 noiseOriginIndex,
3132 noiseBlkVar8x8);
3133
3134
3135 picNoiseVariance += (noiseBlkVar >> 16);
3136
3137 blockIndex = (noisePicturePtr->originY + block16x16Y) * noisePicturePtr->strideY + noisePicturePtr->originX + block16x16X;
3138
3139 EB_U64 denBlkVar = ComputeVariance16x16(
3140 denoisedPicturePtr,
3141 blockIndex,
3142 denoiseBlkVar8x8) >> 16;
3143
3144 EB_U64 noiseBlkVarDecTh ;
3145 EB_U64 denBlkVarDecTh = FLAT_MAX_VAR_DECIM;
3146
3147 if (pictureControlSetPtr->noiseDetectionTh == 1) {
3148 noiseBlkVarDecTh = NOISE_MIN_LEVEL_DECIM_0;
3149 }
3150 else {
3151 noiseBlkVarDecTh = NOISE_MIN_LEVEL_DECIM_1;
3152 }
3153
3154 if (denBlkVar < denBlkVarDecTh && noiseBlkVar> noiseBlkVarDecTh) {
3155 pictureControlSetPtr->lcuFlatNoiseArray[lcuCodingOrder] = 1;
3156 }
3157 totLcuCount++;
3158 }
3159 }
3160 }
3161 }
3162 }
3163
3164 if (totLcuCount > 0) {
3165 contextPtr->picNoiseVarianceFloat = (double)picNoiseVariance / (double)totLcuCount;
3166
3167 picNoiseVariance = picNoiseVariance / totLcuCount;
3168 }
3169
3170 //the variance of a 64x64 noise area tends to be bigger for small resolutions.
3171 if (sequenceControlSetPtr->lumaHeight <= 720)
3172 noiseTh = 25;
3173 else if (sequenceControlSetPtr->lumaHeight <= 1080)
3174 noiseTh = 10;
3175 else
3176 noiseTh = 0;
3177
3178 //look for extreme noise or big enough flat noisy area to be denoised.
3179 if (picNoiseVariance >= 55 + noiseTh)
3180 pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_3_1; //Noise+Edge information is too big, so may be this is all noise (action: frame based denoising)
3181 else if (picNoiseVariance >= 10 + noiseTh)
3182 pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_3; //Noise+Edge information is big enough, so there is no big enough flat noisy area (action : no denoising)
3183 else if (picNoiseVariance >= 5 + noiseTh)
3184 pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_2; //Noise+Edge information is relatively small, so there might be a big enough flat noisy area(action : denoising only for FN blocks)
3185 else
3186 pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_1; //Noise+Edge information is very small, so no noise nor edge area (action : no denoising)
3187
3188 return return_error;
3189
3190 }
3191
QuarterSampleDenoise(PictureAnalysisContext_t * contextPtr,SequenceControlSet_t * sequenceControlSetPtr,PictureParentControlSet_t * pictureControlSetPtr,EbPictureBufferDesc_t * quarterDecimatedPicturePtr,EB_U32 lcuTotalCount,EB_BOOL denoiseFlag,EB_U32 pictureWidthInLcu)3192 static EB_ERRORTYPE QuarterSampleDenoise(
3193 PictureAnalysisContext_t *contextPtr,
3194 SequenceControlSet_t *sequenceControlSetPtr,
3195 PictureParentControlSet_t *pictureControlSetPtr,
3196 EbPictureBufferDesc_t *quarterDecimatedPicturePtr,
3197 EB_U32 lcuTotalCount,
3198 EB_BOOL denoiseFlag,
3199 EB_U32 pictureWidthInLcu)
3200 {
3201
3202 EB_ERRORTYPE return_error = EB_ErrorNone;
3203
3204 EB_U32 lcuCodingOrder;
3205 EbPictureBufferDesc_t *inputPicturePtr = pictureControlSetPtr->enhancedPicturePtr;
3206 EbPictureBufferDesc_t *denoisedPicturePtr = contextPtr->denoisedPicturePtr;
3207 EbPictureBufferDesc_t *noisePicturePtr = contextPtr->noisePicturePtr;
3208
3209 //Reset the flat noise flag array to False for both RealTime/HighComplexity Modes
3210 for (lcuCodingOrder = 0; lcuCodingOrder < lcuTotalCount; ++lcuCodingOrder) {
3211 pictureControlSetPtr->lcuFlatNoiseArray[lcuCodingOrder] = 0;
3212 }
3213
3214 pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_INV; //this init is for both REAL-TIME and BEST-QUALITY
3215
3216 Decimation2D(
3217 &inputPicturePtr->bufferY[inputPicturePtr->originX + inputPicturePtr->originY * inputPicturePtr->strideY],
3218 inputPicturePtr->strideY,
3219 inputPicturePtr->width,
3220 inputPicturePtr->height,
3221 &quarterDecimatedPicturePtr->bufferY[quarterDecimatedPicturePtr->originX + (quarterDecimatedPicturePtr->originY * quarterDecimatedPicturePtr->strideY)],
3222 quarterDecimatedPicturePtr->strideY,
3223 2);
3224
3225
3226 QuarterSampleDetectNoise(
3227 contextPtr,
3228 pictureControlSetPtr,
3229 quarterDecimatedPicturePtr,
3230 noisePicturePtr,
3231 denoisedPicturePtr,
3232 pictureWidthInLcu);
3233
3234 if (denoiseFlag == EB_TRUE) {
3235
3236 // Turn OFF the de-noiser for Class 2 at QP=29 and lower (for Fixed_QP) and at the target rate of 14Mbps and higher (for RC=ON)
3237 if ((pictureControlSetPtr->picNoiseClass == PIC_NOISE_CLASS_3_1) || ((pictureControlSetPtr->picNoiseClass == PIC_NOISE_CLASS_2) && ((sequenceControlSetPtr->staticConfig.rateControlMode == 0 && sequenceControlSetPtr->qp > DENOISER_QP_TH) || (sequenceControlSetPtr->staticConfig.rateControlMode != 0 && sequenceControlSetPtr->staticConfig.targetBitRate < DENOISER_BITRATE_TH)))) {
3238
3239 SubSampleFilterNoise(
3240 sequenceControlSetPtr,
3241 pictureControlSetPtr,
3242 inputPicturePtr,
3243 noisePicturePtr,
3244 denoisedPicturePtr);
3245 }
3246 }
3247
3248 return return_error;
3249
3250 }
3251
3252
HalfSampleDenoise(PictureAnalysisContext_t * contextPtr,SequenceControlSet_t * sequenceControlSetPtr,PictureParentControlSet_t * pictureControlSetPtr,EbPictureBufferDesc_t * sixteenthDecimatedPicturePtr,EB_U32 lcuTotalCount,EB_BOOL denoiseFlag,EB_U32 pictureWidthInLcu)3253 static EB_ERRORTYPE HalfSampleDenoise(
3254 PictureAnalysisContext_t *contextPtr,
3255 SequenceControlSet_t *sequenceControlSetPtr,
3256 PictureParentControlSet_t *pictureControlSetPtr,
3257 EbPictureBufferDesc_t *sixteenthDecimatedPicturePtr,
3258 EB_U32 lcuTotalCount,
3259 EB_BOOL denoiseFlag,
3260 EB_U32 pictureWidthInLcu)
3261 {
3262
3263 EB_ERRORTYPE return_error = EB_ErrorNone;
3264
3265 EB_U32 lcuCodingOrder;
3266 EbPictureBufferDesc_t *inputPicturePtr = pictureControlSetPtr->enhancedPicturePtr;
3267 EbPictureBufferDesc_t *denoisedPicturePtr = contextPtr->denoisedPicturePtr;
3268 EbPictureBufferDesc_t *noisePicturePtr = contextPtr->noisePicturePtr;
3269
3270 //Reset the flat noise flag array to False for both RealTime/HighComplexity Modes
3271 for (lcuCodingOrder = 0; lcuCodingOrder < lcuTotalCount; ++lcuCodingOrder) {
3272 pictureControlSetPtr->lcuFlatNoiseArray[lcuCodingOrder] = 0;
3273 }
3274
3275 pictureControlSetPtr->picNoiseClass = PIC_NOISE_CLASS_INV; //this init is for both REAL-TIME and BEST-QUALITY
3276
3277 Decimation2D(
3278 &inputPicturePtr->bufferY[inputPicturePtr->originX + inputPicturePtr->originY * inputPicturePtr->strideY],
3279 inputPicturePtr->strideY,
3280 inputPicturePtr->width,
3281 inputPicturePtr->height,
3282 &sixteenthDecimatedPicturePtr->bufferY[sixteenthDecimatedPicturePtr->originX + (sixteenthDecimatedPicturePtr->originY * sixteenthDecimatedPicturePtr->strideY)],
3283 sixteenthDecimatedPicturePtr->strideY,
3284 4);
3285
3286 SubSampleDetectNoise(
3287 contextPtr,
3288 sequenceControlSetPtr,
3289 pictureControlSetPtr,
3290 sixteenthDecimatedPicturePtr,
3291 noisePicturePtr,
3292 denoisedPicturePtr,
3293 pictureWidthInLcu);
3294
3295 if (denoiseFlag == EB_TRUE) {
3296
3297 // Turn OFF the de-noiser for Class 2 at QP=29 and lower (for Fixed_QP) and at the target rate of 14Mbps and higher (for RC=ON)
3298 if ((pictureControlSetPtr->picNoiseClass == PIC_NOISE_CLASS_3_1) || ((pictureControlSetPtr->picNoiseClass == PIC_NOISE_CLASS_2) && ((sequenceControlSetPtr->staticConfig.rateControlMode == 0 && sequenceControlSetPtr->qp > DENOISER_QP_TH) || (sequenceControlSetPtr->staticConfig.rateControlMode != 0 && sequenceControlSetPtr->staticConfig.targetBitRate < DENOISER_BITRATE_TH)))) {
3299
3300 SubSampleFilterNoise(
3301 sequenceControlSetPtr,
3302 pictureControlSetPtr,
3303 inputPicturePtr,
3304 noisePicturePtr,
3305 denoisedPicturePtr);
3306 }
3307 }
3308
3309 return return_error;
3310
3311 }
3312
3313
3314 /************************************************
3315 * Set Picture Parameters based on input configuration
3316 ** Setting Number of regions per resolution
3317 ** Setting width and height for subpicture and when picture scan type is 1
3318 ************************************************/
SetPictureParametersForStatisticsGathering(SequenceControlSet_t * sequenceControlSetPtr)3319 static void SetPictureParametersForStatisticsGathering(
3320 SequenceControlSet_t *sequenceControlSetPtr
3321 )
3322 {
3323 sequenceControlSetPtr->pictureAnalysisNumberOfRegionsPerWidth = HIGHER_THAN_CLASS_1_REGION_SPLIT_PER_WIDTH;
3324 sequenceControlSetPtr->pictureAnalysisNumberOfRegionsPerHeight = HIGHER_THAN_CLASS_1_REGION_SPLIT_PER_HEIGHT;
3325 sequenceControlSetPtr->pictureActivityRegionTh = HIGHER_THAN_CLASS_1_PICTURE_ACTIVITY_REGIONS_TH;
3326
3327 return;
3328 }
3329
3330 /************************************************
3331 * Picture Pre Processing Operations *
3332 *** A function that groups all of the Pre proceesing
3333 * operations performed on the input picture
3334 *** Operations included at this point:
3335 ***** Borders preprocessing
3336 ***** Denoising
3337 ************************************************/
PicturePreProcessingOperations(PictureParentControlSet_t * pictureControlSetPtr,PictureAnalysisContext_t * contextPtr,SequenceControlSet_t * sequenceControlSetPtr,EbPictureBufferDesc_t * quarterDecimatedPicturePtr,EbPictureBufferDesc_t * sixteenthDecimatedPicturePtr,EB_U32 lcuTotalCount,EB_U32 pictureWidthInLcu)3338 static void PicturePreProcessingOperations(
3339 PictureParentControlSet_t *pictureControlSetPtr,
3340 PictureAnalysisContext_t *contextPtr,
3341 SequenceControlSet_t *sequenceControlSetPtr,
3342 EbPictureBufferDesc_t *quarterDecimatedPicturePtr,
3343 EbPictureBufferDesc_t *sixteenthDecimatedPicturePtr,
3344 EB_U32 lcuTotalCount,
3345 EB_U32 pictureWidthInLcu)
3346 {
3347 if (pictureControlSetPtr->noiseDetectionMethod == NOISE_DETECT_HALF_PRECISION) {
3348
3349 HalfSampleDenoise(
3350 contextPtr,
3351 sequenceControlSetPtr,
3352 pictureControlSetPtr,
3353 sixteenthDecimatedPicturePtr,
3354 lcuTotalCount,
3355 pictureControlSetPtr->enableDenoiseSrcFlag,
3356 pictureWidthInLcu);
3357 }
3358 else if (pictureControlSetPtr->noiseDetectionMethod == NOISE_DETECT_QUARTER_PRECISION) {
3359 QuarterSampleDenoise(
3360 contextPtr,
3361 sequenceControlSetPtr,
3362 pictureControlSetPtr,
3363 quarterDecimatedPicturePtr,
3364 lcuTotalCount,
3365 pictureControlSetPtr->enableDenoiseSrcFlag,
3366 pictureWidthInLcu);
3367 } else {
3368 FullSampleDenoise(
3369 contextPtr,
3370 sequenceControlSetPtr,
3371 pictureControlSetPtr,
3372 lcuTotalCount,
3373 pictureControlSetPtr->enableDenoiseSrcFlag
3374 );
3375 }
3376 return;
3377
3378 }
3379
3380 /**************************************************************
3381 * Generate picture histogram bins for YUV pixel intensity *
3382 * Calculation is done on a region based (Set previously, resolution dependent)
3383 **************************************************************/
SubSampleLumaGeneratePixelIntensityHistogramBins(SequenceControlSet_t * sequenceControlSetPtr,PictureParentControlSet_t * pictureControlSetPtr,EbPictureBufferDesc_t * inputPicturePtr,EB_U64 * sumAverageIntensityTotalRegionsLuma)3384 static void SubSampleLumaGeneratePixelIntensityHistogramBins(
3385 SequenceControlSet_t *sequenceControlSetPtr,
3386 PictureParentControlSet_t *pictureControlSetPtr,
3387 EbPictureBufferDesc_t *inputPicturePtr,
3388 EB_U64 *sumAverageIntensityTotalRegionsLuma){
3389
3390 EB_U32 regionWidth;
3391 EB_U32 regionHeight;
3392 EB_U32 regionWidthOffset;
3393 EB_U32 regionHeightOffset;
3394 EB_U32 regionInPictureWidthIndex;
3395 EB_U32 regionInPictureHeightIndex;
3396 EB_U32 histogramBin;
3397 EB_U64 sum;
3398
3399 regionWidth = inputPicturePtr->width / sequenceControlSetPtr->pictureAnalysisNumberOfRegionsPerWidth;
3400 regionHeight = inputPicturePtr->height / sequenceControlSetPtr->pictureAnalysisNumberOfRegionsPerHeight;
3401
3402 // Loop over regions inside the picture
3403 for (regionInPictureWidthIndex = 0; regionInPictureWidthIndex < sequenceControlSetPtr->pictureAnalysisNumberOfRegionsPerWidth; regionInPictureWidthIndex++){ // loop over horizontal regions
3404 for (regionInPictureHeightIndex = 0; regionInPictureHeightIndex < sequenceControlSetPtr->pictureAnalysisNumberOfRegionsPerHeight; regionInPictureHeightIndex++){ // loop over vertical regions
3405
3406
3407 // Initialize bins to 1
3408 InitializeBuffer_32bits_funcPtrArray[!!(ASM_TYPES & PREAVX2_MASK)](pictureControlSetPtr->pictureHistogram[regionInPictureWidthIndex][regionInPictureHeightIndex][0], 64, 0, 1);
3409
3410 regionWidthOffset = (regionInPictureWidthIndex == sequenceControlSetPtr->pictureAnalysisNumberOfRegionsPerWidth - 1) ?
3411 inputPicturePtr->width - (sequenceControlSetPtr->pictureAnalysisNumberOfRegionsPerWidth * regionWidth) :
3412 0;
3413
3414 regionHeightOffset = (regionInPictureHeightIndex == sequenceControlSetPtr->pictureAnalysisNumberOfRegionsPerHeight - 1) ?
3415 inputPicturePtr->height - (sequenceControlSetPtr->pictureAnalysisNumberOfRegionsPerHeight * regionHeight) :
3416 0;
3417
3418 // Y Histogram
3419 CalculateHistogram(
3420 &inputPicturePtr->bufferY[(inputPicturePtr->originX + regionInPictureWidthIndex * regionWidth) + ((inputPicturePtr->originY + regionInPictureHeightIndex * regionHeight) * inputPicturePtr->strideY)],
3421 regionWidth + regionWidthOffset,
3422 regionHeight + regionHeightOffset,
3423 inputPicturePtr->strideY,
3424 1,
3425 pictureControlSetPtr->pictureHistogram[regionInPictureWidthIndex][regionInPictureHeightIndex][0],
3426 &sum);
3427
3428 pictureControlSetPtr->averageIntensityPerRegion[regionInPictureWidthIndex][regionInPictureHeightIndex][0] = (EB_U8)((sum + (((regionWidth + regionWidthOffset)*(regionHeight + regionHeightOffset)) >> 1)) / ((regionWidth + regionWidthOffset)*(regionHeight + regionHeightOffset)));
3429 (*sumAverageIntensityTotalRegionsLuma) += (sum << 4);
3430 for (histogramBin = 0; histogramBin < HISTOGRAM_NUMBER_OF_BINS; histogramBin++){ // Loop over the histogram bins
3431 pictureControlSetPtr->pictureHistogram[regionInPictureWidthIndex][regionInPictureHeightIndex][0][histogramBin] =
3432 pictureControlSetPtr->pictureHistogram[regionInPictureWidthIndex][regionInPictureHeightIndex][0][histogramBin] << 4;
3433 }
3434 }
3435 }
3436
3437 return;
3438 }
3439
SubSampleChromaGeneratePixelIntensityHistogramBins(SequenceControlSet_t * sequenceControlSetPtr,PictureParentControlSet_t * pictureControlSetPtr,EbPictureBufferDesc_t * inputPicturePtr,EB_U64 * sumAverageIntensityTotalRegionsCb,EB_U64 * sumAverageIntensityTotalRegionsCr)3440 static void SubSampleChromaGeneratePixelIntensityHistogramBins(
3441 SequenceControlSet_t *sequenceControlSetPtr,
3442 PictureParentControlSet_t *pictureControlSetPtr,
3443 EbPictureBufferDesc_t *inputPicturePtr,
3444 EB_U64 *sumAverageIntensityTotalRegionsCb,
3445 EB_U64 *sumAverageIntensityTotalRegionsCr){
3446
3447 EB_U64 sum;
3448 EB_U32 regionWidth;
3449 EB_U32 regionHeight;
3450 EB_U32 regionWidthOffset;
3451 EB_U32 regionHeightOffset;
3452 EB_U32 regionInPictureWidthIndex;
3453 EB_U32 regionInPictureHeightIndex;
3454
3455 EB_U16 histogramBin;
3456 EB_U8 decimStep = 4;
3457
3458 regionWidth = inputPicturePtr->width / sequenceControlSetPtr->pictureAnalysisNumberOfRegionsPerWidth;
3459 regionHeight = inputPicturePtr->height / sequenceControlSetPtr->pictureAnalysisNumberOfRegionsPerHeight;
3460
3461 // Loop over regions inside the picture
3462 for (regionInPictureWidthIndex = 0; regionInPictureWidthIndex < sequenceControlSetPtr->pictureAnalysisNumberOfRegionsPerWidth; regionInPictureWidthIndex++){ // loop over horizontal regions
3463 for (regionInPictureHeightIndex = 0; regionInPictureHeightIndex < sequenceControlSetPtr->pictureAnalysisNumberOfRegionsPerHeight; regionInPictureHeightIndex++){ // loop over vertical regions
3464
3465
3466 // Initialize bins to 1
3467 InitializeBuffer_32bits_funcPtrArray[!!(ASM_TYPES & PREAVX2_MASK)](pictureControlSetPtr->pictureHistogram[regionInPictureWidthIndex][regionInPictureHeightIndex][1], 64, 0, 1);
3468 InitializeBuffer_32bits_funcPtrArray[!!(ASM_TYPES & PREAVX2_MASK)](pictureControlSetPtr->pictureHistogram[regionInPictureWidthIndex][regionInPictureHeightIndex][2], 64, 0, 1);
3469
3470 regionWidthOffset = (regionInPictureWidthIndex == sequenceControlSetPtr->pictureAnalysisNumberOfRegionsPerWidth - 1) ?
3471 inputPicturePtr->width - (sequenceControlSetPtr->pictureAnalysisNumberOfRegionsPerWidth * regionWidth) :
3472 0;
3473
3474 regionHeightOffset = (regionInPictureHeightIndex == sequenceControlSetPtr->pictureAnalysisNumberOfRegionsPerHeight - 1) ?
3475 inputPicturePtr->height - (sequenceControlSetPtr->pictureAnalysisNumberOfRegionsPerHeight * regionHeight) :
3476 0;
3477
3478
3479 // U Histogram
3480 CalculateHistogram(
3481 &inputPicturePtr->bufferCb[((inputPicturePtr->originX + regionInPictureWidthIndex * regionWidth) >> 1) + (((inputPicturePtr->originY + regionInPictureHeightIndex * regionHeight) >> 1) * inputPicturePtr->strideCb)],
3482 (regionWidth + regionWidthOffset) >> 1,
3483 (regionHeight + regionHeightOffset) >> 1,
3484 inputPicturePtr->strideCb,
3485 decimStep,
3486 pictureControlSetPtr->pictureHistogram[regionInPictureWidthIndex][regionInPictureHeightIndex][1],
3487 &sum);
3488
3489 sum = (sum << decimStep);
3490 *sumAverageIntensityTotalRegionsCb += sum;
3491 pictureControlSetPtr->averageIntensityPerRegion[regionInPictureWidthIndex][regionInPictureHeightIndex][1] = (EB_U8)((sum + (((regionWidth + regionWidthOffset)*(regionHeight + regionHeightOffset)) >> 3)) / (((regionWidth + regionWidthOffset)*(regionHeight + regionHeightOffset)) >> 2));
3492
3493 for (histogramBin = 0; histogramBin < HISTOGRAM_NUMBER_OF_BINS; histogramBin++){ // Loop over the histogram bins
3494 pictureControlSetPtr->pictureHistogram[regionInPictureWidthIndex][regionInPictureHeightIndex][1][histogramBin] =
3495 pictureControlSetPtr->pictureHistogram[regionInPictureWidthIndex][regionInPictureHeightIndex][1][histogramBin] << decimStep;
3496 }
3497
3498 // V Histogram
3499 CalculateHistogram(
3500 &inputPicturePtr->bufferCr[((inputPicturePtr->originX + regionInPictureWidthIndex * regionWidth) >> 1) + (((inputPicturePtr->originY + regionInPictureHeightIndex * regionHeight) >> 1) * inputPicturePtr->strideCr)],
3501 (regionWidth + regionWidthOffset) >> 1,
3502 (regionHeight + regionHeightOffset) >> 1,
3503 inputPicturePtr->strideCr,
3504 decimStep,
3505 pictureControlSetPtr->pictureHistogram[regionInPictureWidthIndex][regionInPictureHeightIndex][2],
3506 &sum);
3507
3508 sum = (sum << decimStep);
3509 *sumAverageIntensityTotalRegionsCr += sum;
3510 pictureControlSetPtr->averageIntensityPerRegion[regionInPictureWidthIndex][regionInPictureHeightIndex][2] = (EB_U8)((sum + (((regionWidth + regionWidthOffset)*(regionHeight + regionHeightOffset)) >> 3)) / (((regionWidth + regionWidthOffset)*(regionHeight + regionHeightOffset)) >> 2));
3511
3512 for (histogramBin = 0; histogramBin < HISTOGRAM_NUMBER_OF_BINS; histogramBin++){ // Loop over the histogram bins
3513 pictureControlSetPtr->pictureHistogram[regionInPictureWidthIndex][regionInPictureHeightIndex][2][histogramBin] =
3514 pictureControlSetPtr->pictureHistogram[regionInPictureWidthIndex][regionInPictureHeightIndex][2][histogramBin] << decimStep;
3515 }
3516 }
3517 }
3518 return;
3519
3520 }
3521
/******************************************************
 * Edge detection on the 16x16 luma/chroma block means
 *
 * Zeroes lcuStatArray[] for the first totalLcuCount LCUs of the picture.
 * Then, only when pictureNumber is a multiple of 4, every LCU that is
 * flagged both potentialLogoLcu and isCompleteLcu gets one gradient per
 * 16x16 CU, derived from the yMean / crMean / cbMean arrays. The gradients
 * are stored in contextPtr->grad, normalised by the largest gradient found
 * in the picture, and thresholded into cuStatArray[].edgeCu (0 or 1).
 ******************************************************/
static void EdgeDetectionMeanLumaChroma16x16(
    SequenceControlSet_t        *sequenceControlSetPtr,
    PictureParentControlSet_t   *pictureControlSetPtr,
    PictureAnalysisContext_t    *contextPtr,
    EB_U32                       totalLcuCount)
{
    EB_U32 lcuIndex;
    EB_U32 cuIdx;
    EB_U32 plane;
    EB_U32 maxGrad = 1;     // starts at 1 so the normalisation never divides by zero

    // The per-LCU statistics are cleared for every picture, analysed or not.
    for (lcuIndex = 0; lcuIndex < totalLcuCount; lcuIndex++) {
        EB_MEMSET(&pictureControlSetPtr->lcuStatArray[lcuIndex], 0, sizeof(LcuStat_t));
    }

    // The values are calculated for every 4th frame only.
    if ((pictureControlSetPtr->pictureNumber & 3) != 0) {
        return;
    }

    // Pass 1: gradient of each 16x16 CU and the picture-wide maximum.
    for (lcuIndex = 0; lcuIndex < totalLcuCount; lcuIndex++) {
        LcuParams_t *lcuParams = &sequenceControlSetPtr->lcuParamsArray[lcuIndex];
        EB_U8       *meanPlanes[3];

        if (!(lcuParams->potentialLogoLcu && lcuParams->isCompleteLcu)) {
            continue;
        }

        meanPlanes[0] = pictureControlSetPtr->yMean[lcuIndex];
        meanPlanes[1] = pictureControlSetPtr->crMean[lcuIndex];
        meanPlanes[2] = pictureControlSetPtr->cbMean[lcuIndex];

        for (cuIdx = RASTER_SCAN_CU_INDEX_16x16_0; cuIdx <= RASTER_SCAN_CU_INDEX_16x16_15; cuIdx++) {
            // Position of the CU inside the 4x4 grid of 16x16 blocks.
            // NOTE(review): the literal 5 is presumably RASTER_SCAN_CU_INDEX_16x16_0 - confirm.
            const EB_U8  cuIn4x4  = (EB_U8)(cuIdx - 5);
            const EB_U8  col      = cuIn4x4 & 3;
            const EB_U8  row      = cuIn4x4 >> 2;
            // Number of neighbours available in each direction (1 on a grid edge, else 2).
            const EB_S32 numCompX = (col != 0) + (col != 3);
            const EB_S32 numCompY = (row != 0) + (row != 3);
            EB_S32       sumX     = 0;
            EB_S32       sumY     = 0;
            EB_U16       gradient;

            // Accumulate absolute mean differences to the existing neighbours
            // over the three planes.
            for (plane = 0; plane < 3; plane++) {
                const EB_U8  *mean   = meanPlanes[plane];
                const EB_S32  centre = (EB_S32)mean[cuIdx];

                if (col != 0) {
                    sumX += ABS(centre - (EB_S32)mean[cuIdx - 1]);
                }
                if (col != 3) {
                    sumX += ABS((EB_S32)mean[cuIdx + 1] - centre);
                }
                if (row != 0) {
                    sumY += ABS(centre - (EB_S32)mean[cuIdx - 4]);
                }
                if (row != 3) {
                    sumY += ABS((EB_S32)mean[cuIdx + 4] - centre);
                }
            }

            // Both averaged sums are non-negative, so no further ABS is needed.
            gradient = (EB_U16)((sumX / numCompX) + (sumY / numCompY));
            contextPtr->grad[lcuIndex][cuIdx] = gradient;

            if (gradient > maxGrad) {
                maxGrad = gradient;
            }
        }
    }

    // Pass 2: normalise by the maximum gradient and threshold into edgeCu.
    for (lcuIndex = 0; lcuIndex < totalLcuCount; lcuIndex++) {
        LcuParams_t *lcuParams  = &sequenceControlSetPtr->lcuParamsArray[lcuIndex];
        LcuStat_t   *lcuStatPtr = &pictureControlSetPtr->lcuStatArray[lcuIndex];

        if (!(lcuParams->potentialLogoLcu && lcuParams->isCompleteLcu)) {
            continue;
        }

        for (cuIdx = RASTER_SCAN_CU_INDEX_16x16_0; cuIdx <= RASTER_SCAN_CU_INDEX_16x16_15; cuIdx++) {
            const EB_U32 scaledGrad = (contextPtr->grad[lcuIndex][cuIdx] * (255 * 3)) / maxGrad;

            if ((EB_U16)MIN(scaledGrad, 255) < 30) {
                lcuStatPtr->cuStatArray[cuIdx].edgeCu = 0;
            }
            else {
                lcuStatPtr->cuStatArray[cuIdx].edgeCu = 1;
            }
        }
    }
}
3623
3624 /******************************************************
3625 * Edge map derivation
3626 ******************************************************/
EdgeDetection(SequenceControlSet_t * sequenceControlSetPtr,PictureParentControlSet_t * pictureControlSetPtr)3627 static void EdgeDetection(
3628 SequenceControlSet_t *sequenceControlSetPtr,
3629 PictureParentControlSet_t *pictureControlSetPtr)
3630 {
3631
3632 EB_U16 *variancePtr;
3633 EB_U64 thrsldLevel0 = (pictureControlSetPtr->picAvgVariance * 70) / 100;
3634 EB_U8 *meanPtr;
3635 EB_U32 pictureWidthInLcu = (sequenceControlSetPtr->lumaWidth + sequenceControlSetPtr->lcuSize - 1) / sequenceControlSetPtr->lcuSize;
3636 EB_U32 pictureHeightInLcu = (sequenceControlSetPtr->lumaHeight + sequenceControlSetPtr->lcuSize - 1) / sequenceControlSetPtr->lcuSize;
3637 EB_U32 neighbourLcuIndex = 0;
3638 EB_U64 similarityCount = 0;
3639 EB_U64 similarityCount0 = 0;
3640 EB_U64 similarityCount1 = 0;
3641 EB_U64 similarityCount2 = 0;
3642 EB_U64 similarityCount3 = 0;
3643 EB_U32 lcu_X = 0;
3644 EB_U32 lcu_Y = 0;
3645 EB_U32 lcuIndex;
3646 EB_BOOL highVarianceLucFlag;
3647
3648 EB_U32 rasterScanCuIndex = 0;
3649 EB_U32 numberOfEdgeLcu = 0;
3650 EB_BOOL highIntensityLcuFlag;
3651
3652 EB_U64 neighbourLcuMean;
3653 EB_S32 i, j;
3654
3655 EB_U8 highIntensityTh = 180;
3656 EB_U8 lowIntensityTh = 120;
3657 EB_U8 highIntensityTh1 = 200;
3658 EB_U8 veryLowIntensityTh = 20;
3659
3660 for (lcuIndex = 0; lcuIndex < pictureControlSetPtr->lcuTotalCount; ++lcuIndex) {
3661
3662 LcuParams_t *lcuParams = &sequenceControlSetPtr->lcuParamsArray[lcuIndex];
3663
3664 lcu_X = lcuParams->horizontalIndex;
3665 lcu_Y = lcuParams->verticalIndex;
3666
3667 EdgeLcuResults_t *edgeResultsPtr = pictureControlSetPtr->edgeResultsPtr;
3668 pictureControlSetPtr->edgeResultsPtr[lcuIndex].edgeBlockNum = 0;
3669 pictureControlSetPtr->edgeResultsPtr[lcuIndex].isolatedHighIntensityLcu = 0;
3670 pictureControlSetPtr->sharpEdgeLcuFlag[lcuIndex] = 0;
3671
3672 if (lcu_X > 0 && lcu_X < (EB_U32)(pictureWidthInLcu - 1) && lcu_Y > 0 && lcu_Y < (EB_U32)(pictureHeightInLcu - 1)){
3673
3674 variancePtr = pictureControlSetPtr->variance[lcuIndex];
3675 meanPtr = pictureControlSetPtr->yMean[lcuIndex];
3676
3677
3678 similarityCount = 0;
3679
3680 highVarianceLucFlag =
3681 (variancePtr[RASTER_SCAN_CU_INDEX_64x64] > thrsldLevel0) ? EB_TRUE : EB_FALSE;
3682 edgeResultsPtr[lcuIndex].edgeBlockNum = highVarianceLucFlag;
3683 if (variancePtr[0] > highIntensityTh1){
3684 EB_U8 sharpEdge = 0;
3685 for (rasterScanCuIndex = RASTER_SCAN_CU_INDEX_16x16_0; rasterScanCuIndex <= RASTER_SCAN_CU_INDEX_16x16_15; rasterScanCuIndex++) {
3686 sharpEdge = (variancePtr[rasterScanCuIndex] < veryLowIntensityTh) ? sharpEdge + 1 : sharpEdge;
3687
3688 }
3689 if (sharpEdge > 4)
3690 {
3691 pictureControlSetPtr->sharpEdgeLcuFlag[lcuIndex] = 1;
3692 }
3693 }
3694
3695
3696 if (lcu_X > 3 && lcu_X < (EB_U32)(pictureWidthInLcu - 4) && lcu_Y > 3 && lcu_Y < (EB_U32)(pictureHeightInLcu - 4)){
3697
3698 highIntensityLcuFlag =
3699 (meanPtr[RASTER_SCAN_CU_INDEX_64x64] > highIntensityTh) ? EB_TRUE : EB_FALSE;
3700
3701 if (highIntensityLcuFlag){
3702
3703 neighbourLcuIndex = lcuIndex - 1;
3704 neighbourLcuMean = pictureControlSetPtr->yMean[neighbourLcuIndex][RASTER_SCAN_CU_INDEX_64x64];
3705
3706 similarityCount0 = (neighbourLcuMean < lowIntensityTh) ? 1 : 0;
3707
3708 neighbourLcuIndex = lcuIndex + 1;
3709
3710 neighbourLcuMean = pictureControlSetPtr->yMean[neighbourLcuIndex][RASTER_SCAN_CU_INDEX_64x64];
3711 similarityCount1 = (neighbourLcuMean < lowIntensityTh) ? 1 : 0;
3712
3713 neighbourLcuIndex = lcuIndex - pictureWidthInLcu;
3714 neighbourLcuMean = pictureControlSetPtr->yMean[neighbourLcuIndex][RASTER_SCAN_CU_INDEX_64x64];
3715 similarityCount2 = (neighbourLcuMean < lowIntensityTh) ? 1 : 0;
3716
3717 neighbourLcuIndex = lcuIndex + pictureWidthInLcu;
3718 neighbourLcuMean = pictureControlSetPtr->yMean[neighbourLcuIndex][RASTER_SCAN_CU_INDEX_64x64];
3719 similarityCount3 = (neighbourLcuMean < lowIntensityTh) ? 1 : 0;
3720
3721 similarityCount = similarityCount0 + similarityCount1 + similarityCount2 + similarityCount3;
3722
3723 if (similarityCount > 0){
3724
3725
3726 for (i = -4; i < 5; i++){
3727 for (j = -4; j < 5; j++){
3728 neighbourLcuIndex = lcuIndex + (i * pictureWidthInLcu) + j;
3729 pictureControlSetPtr->edgeResultsPtr[neighbourLcuIndex].isolatedHighIntensityLcu = 1;
3730 }
3731 }
3732 }
3733 }
3734 }
3735
3736
3737 if (highVarianceLucFlag){
3738 numberOfEdgeLcu += edgeResultsPtr[lcuIndex].edgeBlockNum;
3739 }
3740 }
3741 }
3742
3743 pictureControlSetPtr->lcuBlockPercentage = (EB_U8)((numberOfEdgeLcu * 100) / pictureControlSetPtr->lcuTotalCount);
3744
3745 return;
3746 }
3747
3748 /******************************************************
3749 * Calculate the variance of variance to determine Homogeneous regions. Note: Variance calculation should be on.
3750 ******************************************************/
DetermineHomogeneousRegionInPicture(SequenceControlSet_t * sequenceControlSetPtr,PictureParentControlSet_t * pictureControlSetPtr)3751 static inline void DetermineHomogeneousRegionInPicture(
3752 SequenceControlSet_t *sequenceControlSetPtr,
3753 PictureParentControlSet_t *pictureControlSetPtr)
3754 {
3755
3756 EB_U16 *variancePtr;
3757 EB_U32 lcuIndex;
3758
3759 EB_U32 cuNum, cuSize, cuIndexOffset, cuH, cuW;
3760 EB_U64 nullVarCnt = 0;
3761 EB_U64 veryLowVarCnt = 0;
3762 EB_U64 varLcuCnt = 0;
3763 EB_U32 lcuTotalCount = pictureControlSetPtr->lcuTotalCount;
3764
3765 for (lcuIndex = 0; lcuIndex < lcuTotalCount; ++lcuIndex) {
3766 EB_U64 meanSqrVariance32x32Based[4] = { 0 }, meanVariance32x32Based[4] = { 0 };
3767
3768 EB_U64 meanSqrVariance64x64Based = 0, meanVariance64x64Based = 0;
3769 EB_U64 varOfVar64x64Based = 0;
3770
3771 LcuParams_t *lcuParams = &sequenceControlSetPtr->lcuParamsArray[lcuIndex];
3772
3773 // Initialize
3774 pictureControlSetPtr->lcuHomogeneousAreaArray[lcuIndex] = EB_TRUE;
3775
3776 variancePtr = pictureControlSetPtr->variance[lcuIndex];
3777
3778 if (lcuParams->isCompleteLcu){
3779
3780 nullVarCnt += (variancePtr[ME_TIER_ZERO_PU_64x64] == 0) ? 1 : 0;
3781
3782 varLcuCnt++;
3783
3784 veryLowVarCnt += ((variancePtr[ME_TIER_ZERO_PU_64x64]) < LCU_LOW_VAR_TH) ? 1 : 0;
3785 cuSize = 8;
3786 cuIndexOffset = ME_TIER_ZERO_PU_8x8_0;
3787 cuNum = 64 / cuSize;
3788
3789 //Variance of 8x8 blocks in a 32x32
3790 for (cuH = 0; cuH < (cuNum / 2); cuH++){
3791 for (cuW = 0; cuW < (cuNum / 2); cuW++){
3792
3793 meanSqrVariance32x32Based[0] += (variancePtr[cuIndexOffset + cuH*cuNum + cuW])*(variancePtr[cuIndexOffset + cuH*cuNum + cuW]);
3794 meanVariance32x32Based[0] += (variancePtr[cuIndexOffset + cuH*cuNum + cuW]);
3795
3796 meanSqrVariance32x32Based[1] += (variancePtr[cuIndexOffset + cuH*cuNum + cuW + 4])*(variancePtr[cuIndexOffset + cuH*cuNum + cuW + 4]);
3797 meanVariance32x32Based[1] += (variancePtr[cuIndexOffset + cuH*cuNum + cuW + 4]);
3798
3799 meanSqrVariance32x32Based[2] += (variancePtr[cuIndexOffset + (cuH + 4)*cuNum + cuW])*(variancePtr[cuIndexOffset + (cuH + 4)*cuNum + cuW]);
3800 meanVariance32x32Based[2] += (variancePtr[cuIndexOffset + (cuH + 4)*cuNum + cuW]);
3801
3802 meanSqrVariance32x32Based[3] += (variancePtr[cuIndexOffset + (cuH + 4)*cuNum + cuW + 4])*(variancePtr[cuIndexOffset + (cuH + 4)*cuNum + cuW + 4]);
3803 meanVariance32x32Based[3] += (variancePtr[cuIndexOffset + (cuH + 4)*cuNum + cuW + 4]);
3804
3805 }
3806 }
3807
3808 meanSqrVariance32x32Based[0] = meanSqrVariance32x32Based[0] >> 4;
3809 meanVariance32x32Based[0] = meanVariance32x32Based[0] >> 4;
3810 pictureControlSetPtr->varOfVar32x32BasedLcuArray[lcuIndex][0] = meanSqrVariance32x32Based[0] - meanVariance32x32Based[0] * meanVariance32x32Based[0];
3811
3812 meanSqrVariance32x32Based[1] = meanSqrVariance32x32Based[1] >> 4;
3813 meanVariance32x32Based[1] = meanVariance32x32Based[1] >> 4;
3814 pictureControlSetPtr->varOfVar32x32BasedLcuArray[lcuIndex][1] = meanSqrVariance32x32Based[1] - meanVariance32x32Based[1] * meanVariance32x32Based[1];
3815
3816 meanSqrVariance32x32Based[2] = meanSqrVariance32x32Based[2] >> 4;
3817 meanVariance32x32Based[2] = meanVariance32x32Based[2] >> 4;
3818 pictureControlSetPtr->varOfVar32x32BasedLcuArray[lcuIndex][2] = meanSqrVariance32x32Based[2] - meanVariance32x32Based[2] * meanVariance32x32Based[2];
3819
3820 meanSqrVariance32x32Based[3] = meanSqrVariance32x32Based[3] >> 4;
3821 meanVariance32x32Based[3] = meanVariance32x32Based[3] >> 4;
3822 pictureControlSetPtr->varOfVar32x32BasedLcuArray[lcuIndex][3] = meanSqrVariance32x32Based[3] - meanVariance32x32Based[3] * meanVariance32x32Based[3];
3823
3824 // Compute the 64x64 based variance of variance
3825 {
3826 EB_U32 varIndex;
3827 // Loop over all 8x8s in a 64x64
3828 for (varIndex = ME_TIER_ZERO_PU_8x8_0; varIndex <= ME_TIER_ZERO_PU_8x8_63; varIndex++) {
3829 meanSqrVariance64x64Based += variancePtr[varIndex] * variancePtr[varIndex];
3830 meanVariance64x64Based += variancePtr[varIndex];
3831 }
3832
3833 meanSqrVariance64x64Based = meanSqrVariance64x64Based >> 6;
3834 meanVariance64x64Based = meanVariance64x64Based >> 6;
3835
3836 // Compute variance
3837 varOfVar64x64Based = meanSqrVariance64x64Based - meanVariance64x64Based * meanVariance64x64Based;
3838
3839 // Turn off detail preservation if the varOfVar is greater than a threshold
3840 if (varOfVar64x64Based > VAR_BASED_DETAIL_PRESERVATION_SELECTOR_THRSLHD)
3841 {
3842 pictureControlSetPtr->lcuHomogeneousAreaArray[lcuIndex] = EB_FALSE;
3843 }
3844 }
3845
3846 }
3847 else{
3848
3849 // Should be re-calculated and scaled properly
3850 pictureControlSetPtr->varOfVar32x32BasedLcuArray[lcuIndex][0] = 0xFFFFFFFFFFFFFFFF;
3851 pictureControlSetPtr->varOfVar32x32BasedLcuArray[lcuIndex][1] = 0xFFFFFFFFFFFFFFFF;
3852 pictureControlSetPtr->varOfVar32x32BasedLcuArray[lcuIndex][2] = 0xFFFFFFFFFFFFFFFF;
3853 pictureControlSetPtr->varOfVar32x32BasedLcuArray[lcuIndex][3] = 0xFFFFFFFFFFFFFFFF;
3854 }
3855 }
3856 pictureControlSetPtr->veryLowVarPicFlag = EB_FALSE;
3857 if (varLcuCnt > 0) {
3858 if (((veryLowVarCnt * 100) / varLcuCnt) > PIC_LOW_VAR_PERCENTAGE_TH) {
3859 pictureControlSetPtr->veryLowVarPicFlag = EB_TRUE;
3860 }
3861 }
3862
3863 pictureControlSetPtr->logoPicFlag = EB_FALSE;
3864 if (varLcuCnt > 0) {
3865 if (((veryLowVarCnt * 100) / varLcuCnt) > 80) {
3866 pictureControlSetPtr->logoPicFlag = EB_TRUE;
3867 }
3868 }
3869
3870 return;
3871 }
3872
3873 /************************************************
3874 * ComputePictureSpatialStatistics
3875 ** Compute Block Variance
3876 ** Compute Picture Variance
3877 ** Compute Block Mean for all blocks in the picture
3878 ************************************************/
ComputePictureSpatialStatistics(SequenceControlSet_t * sequenceControlSetPtr,PictureParentControlSet_t * pictureControlSetPtr,PictureAnalysisContext_t * contextPtr,EbPictureBufferDesc_t * inputPicturePtr,EbPictureBufferDesc_t * inputPaddedPicturePtr,EB_U32 lcuTotalCount)3879 static void ComputePictureSpatialStatistics(
3880 SequenceControlSet_t *sequenceControlSetPtr,
3881 PictureParentControlSet_t *pictureControlSetPtr,
3882 PictureAnalysisContext_t *contextPtr,
3883 EbPictureBufferDesc_t *inputPicturePtr,
3884 EbPictureBufferDesc_t *inputPaddedPicturePtr,
3885 EB_U32 lcuTotalCount)
3886 {
3887 EB_U32 lcuIndex;
3888 EB_U32 lcuOriginX; // to avoid using child PCS
3889 EB_U32 lcuOriginY;
3890 EB_U32 inputLumaOriginIndex;
3891 EB_U32 inputCbOriginIndex;
3892 EB_U32 inputCrOriginIndex;
3893 EB_U64 picTotVariance;
3894
3895 // Variance
3896 picTotVariance = 0;
3897
3898 for (lcuIndex = 0; lcuIndex < pictureControlSetPtr->lcuTotalCount; ++lcuIndex) {
3899 LcuParams_t *lcuParams = &sequenceControlSetPtr->lcuParamsArray[lcuIndex];
3900
3901 lcuOriginX = lcuParams->originX;
3902 lcuOriginY = lcuParams->originY;
3903 inputLumaOriginIndex = (inputPaddedPicturePtr->originY + lcuOriginY) * inputPaddedPicturePtr->strideY +
3904 inputPaddedPicturePtr->originX + lcuOriginX;
3905
3906 inputCbOriginIndex = ((inputPicturePtr->originY + lcuOriginY) >> 1) * inputPicturePtr->strideCb + ((inputPicturePtr->originX + lcuOriginX) >> 1);
3907 inputCrOriginIndex = ((inputPicturePtr->originY + lcuOriginY) >> 1) * inputPicturePtr->strideCr + ((inputPicturePtr->originX + lcuOriginX) >> 1);
3908
3909 ComputeBlockMeanComputeVariance(
3910 pictureControlSetPtr,
3911 inputPaddedPicturePtr,
3912 lcuIndex,
3913 inputLumaOriginIndex);
3914
3915 if (lcuParams->isCompleteLcu){
3916
3917 ComputeChromaBlockMean(
3918 pictureControlSetPtr,
3919 inputPicturePtr,
3920 lcuIndex,
3921 inputCbOriginIndex,
3922 inputCrOriginIndex);
3923 }
3924 else{
3925 ZeroOutChromaBlockMean(
3926 pictureControlSetPtr,
3927 lcuIndex);
3928 }
3929
3930 picTotVariance += (pictureControlSetPtr->variance[lcuIndex][RASTER_SCAN_CU_INDEX_64x64]);
3931 }
3932
3933 pictureControlSetPtr->picAvgVariance = (EB_U16) (picTotVariance / lcuTotalCount);
3934 // Calculate the variance of variance to determine Homogeneous regions. Note: Variance calculation should be on.
3935 DetermineHomogeneousRegionInPicture(
3936 sequenceControlSetPtr,
3937 pictureControlSetPtr);
3938
3939 EdgeDetectionMeanLumaChroma16x16(
3940 sequenceControlSetPtr,
3941 pictureControlSetPtr,
3942 contextPtr,
3943 sequenceControlSetPtr->lcuTotalCount);
3944
3945 EdgeDetection(
3946 sequenceControlSetPtr,
3947 pictureControlSetPtr);
3948
3949
3950 return;
3951 }
3952
/************************************************
 * CalculateInputAverageIntensity
 *
 * Fills pictureControlSetPtr->averageIntensity[].
 *  - scdMode == SCD_MODE_0: only the luma entry [0] is written; it is
 *    derived by accumulating ComputeSubMean8x8_SSE2_INTRIN over every
 *    complete 8x8 block of the luma plane.
 *  - otherwise: entries [0], [1], [2] are the rounded averages obtained
 *    from the luma / Cb / Cr sums passed in, divided by width*height for
 *    luma and by a quarter of that for each chroma plane.
 ************************************************/
static void CalculateInputAverageIntensity(
    SequenceControlSet_t        *sequenceControlSetPtr,
    PictureParentControlSet_t   *pictureControlSetPtr,
    EbPictureBufferDesc_t       *inputPicturePtr,
    EB_U64                       sumAverageIntensityTotalRegionsLuma,
    EB_U64                       sumAverageIntensityTotalRegionsCb,
    EB_U64                       sumAverageIntensityTotalRegionsCr)
{

    if (sequenceControlSetPtr->scdMode == SCD_MODE_0) {
        EB_U16 blockIndexInWidth;
        EB_U16 blockIndexInHeight;
        EB_U64 mean = 0;

        const EB_U16 strideY = inputPicturePtr->strideY;

        // bufferY starts at the top-left corner of the padded buffer; the
        // picture itself starts originX / originY samples further in, as for
        // every other plane access in this file.
        const EB_U32 lumaOriginIndex = inputPicturePtr->originX + inputPicturePtr->originY * strideY;

        // Loop over 8x8 blocks and calculates the mean value
        for (blockIndexInHeight = 0; blockIndexInHeight < inputPicturePtr->height >> 3; ++blockIndexInHeight) {
            for (blockIndexInWidth = 0; blockIndexInWidth < inputPicturePtr->width >> 3; ++blockIndexInWidth) {
                mean += ComputeSubMean8x8_SSE2_INTRIN(
                    &(inputPicturePtr->bufferY[lumaOriginIndex + (blockIndexInWidth << 3) + (blockIndexInHeight << 3) * strideY]),
                    strideY);
            }
        }

        // Rounded average over (height * width) / 64 blocks, then drop the
        // MEAN_PRECISION fractional bits with rounding.
        mean = ((mean + ((inputPicturePtr->height * inputPicturePtr->width) >> 7)) / ((inputPicturePtr->height * inputPicturePtr->width) >> 6));
        mean = (mean + (1 << (MEAN_PRECISION - 1))) >> MEAN_PRECISION;
        pictureControlSetPtr->averageIntensity[0] = (EB_U8)mean;
    }
    else {
        // Sample counts: full resolution for luma, a quarter for each chroma
        // plane. The product is formed in 64 bits so it cannot overflow.
        const EB_U64 lumaSampleCount   = (EB_U64)inputPicturePtr->width * inputPicturePtr->height;
        const EB_U64 chromaSampleCount = lumaSampleCount >> 2;

        // Adding half of the divisor rounds to nearest.
        pictureControlSetPtr->averageIntensity[0] = (EB_U8)((sumAverageIntensityTotalRegionsLuma + (lumaSampleCount >> 1)) / lumaSampleCount);
        pictureControlSetPtr->averageIntensity[1] = (EB_U8)((sumAverageIntensityTotalRegionsCb + (lumaSampleCount >> 3)) / chromaSampleCount);
        pictureControlSetPtr->averageIntensity[2] = (EB_U8)((sumAverageIntensityTotalRegionsCr + (lumaSampleCount >> 3)) / chromaSampleCount);
    }

    return;
}
3989
3990 /************************************************
3991 * Gathering statistics per picture
3992 ** Calculating the pixel intensity histogram bins per picture needed for SCD
3993 ** Computing Picture Variance
3994 ************************************************/
GatheringPictureStatistics(SequenceControlSet_t * sequenceControlSetPtr,PictureParentControlSet_t * pictureControlSetPtr,PictureAnalysisContext_t * contextPtr,EbPictureBufferDesc_t * inputPicturePtr,EbPictureBufferDesc_t * inputPaddedPicturePtr,EbPictureBufferDesc_t * sixteenthDecimatedPicturePtr,EB_U32 lcuTotalCount)3995 static void GatheringPictureStatistics(
3996 SequenceControlSet_t *sequenceControlSetPtr,
3997 PictureParentControlSet_t *pictureControlSetPtr,
3998 PictureAnalysisContext_t *contextPtr,
3999 EbPictureBufferDesc_t *inputPicturePtr,
4000 EbPictureBufferDesc_t *inputPaddedPicturePtr,
4001 EbPictureBufferDesc_t *sixteenthDecimatedPicturePtr,
4002 EB_U32 lcuTotalCount)
4003 {
4004
4005 EB_U64 sumAverageIntensityTotalRegionsLuma = 0;
4006 EB_U64 sumAverageIntensityTotalRegionsCb = 0;
4007 EB_U64 sumAverageIntensityTotalRegionsCr = 0;
4008
4009 // Histogram bins
4010 // Use 1/16 Luma for Histogram generation
4011 // 1/16 input ready
4012 SubSampleLumaGeneratePixelIntensityHistogramBins(
4013 sequenceControlSetPtr,
4014 pictureControlSetPtr,
4015 sixteenthDecimatedPicturePtr,
4016 &sumAverageIntensityTotalRegionsLuma);
4017
4018 // Use 1/4 Chroma for Histogram generation
4019 // 1/4 input not ready => perform operation on the fly
4020 SubSampleChromaGeneratePixelIntensityHistogramBins(
4021 sequenceControlSetPtr,
4022 pictureControlSetPtr,
4023 inputPicturePtr,
4024 &sumAverageIntensityTotalRegionsCb,
4025 &sumAverageIntensityTotalRegionsCr);
4026
4027 // Calculate the LUMA average intensity
4028 CalculateInputAverageIntensity(
4029 sequenceControlSetPtr,
4030 pictureControlSetPtr,
4031 inputPicturePtr,
4032 sumAverageIntensityTotalRegionsLuma,
4033 sumAverageIntensityTotalRegionsCb,
4034 sumAverageIntensityTotalRegionsCr);
4035
4036 ComputePictureSpatialStatistics(
4037 sequenceControlSetPtr,
4038 pictureControlSetPtr,
4039 contextPtr,
4040 inputPicturePtr,
4041 inputPaddedPicturePtr,
4042 lcuTotalCount);
4043
4044 return;
4045 }
4046
4047 /************************************************
4048 * Pad Picture at the right and bottom sides
4049 ** To match a multiple of min CU size in width and height
4050 ************************************************/
PadPictureToMultipleOfMinCuSizeDimensions(SequenceControlSet_t * sequenceControlSetPtr,EbPictureBufferDesc_t * inputPicturePtr)4051 static void PadPictureToMultipleOfMinCuSizeDimensions(
4052 SequenceControlSet_t *sequenceControlSetPtr,
4053 EbPictureBufferDesc_t *inputPicturePtr)
4054 {
4055 EB_BOOL is16BitInput = (EB_BOOL)(sequenceControlSetPtr->staticConfig.encoderBitDepth > EB_8BIT);
4056 EB_U32 colorFormat = inputPicturePtr->colorFormat;
4057 EB_U16 subWidthCMinus1 = (colorFormat == EB_YUV444 ? 1 : 2) - 1;
4058 EB_U16 subHeightCMinus1 = (colorFormat >= EB_YUV422 ? 1 : 2) - 1;
4059
4060 // Input Picture Padding
4061 PadInputPicture(
4062 &inputPicturePtr->bufferY[inputPicturePtr->originX + (inputPicturePtr->originY * inputPicturePtr->strideY)],
4063 inputPicturePtr->strideY,
4064 (inputPicturePtr->width - sequenceControlSetPtr->padRight),
4065 (inputPicturePtr->height - sequenceControlSetPtr->padBottom),
4066 sequenceControlSetPtr->padRight,
4067 sequenceControlSetPtr->padBottom);
4068
4069 PadInputPicture(
4070 &inputPicturePtr->bufferCb[(inputPicturePtr->originX >> subWidthCMinus1) + ((inputPicturePtr->originY >> subHeightCMinus1) * inputPicturePtr->strideCb)],
4071 inputPicturePtr->strideCb,
4072 (inputPicturePtr->width - sequenceControlSetPtr->padRight) >> subWidthCMinus1,
4073 (inputPicturePtr->height - sequenceControlSetPtr->padBottom) >> subHeightCMinus1,
4074 sequenceControlSetPtr->padRight >> subWidthCMinus1,
4075 sequenceControlSetPtr->padBottom >> subHeightCMinus1);
4076
4077 PadInputPicture(
4078 &inputPicturePtr->bufferCr[(inputPicturePtr->originX >> subWidthCMinus1) + ((inputPicturePtr->originY >> subHeightCMinus1) * inputPicturePtr->strideCr)],
4079 inputPicturePtr->strideCr,
4080 (inputPicturePtr->width - sequenceControlSetPtr->padRight) >> subWidthCMinus1,
4081 (inputPicturePtr->height - sequenceControlSetPtr->padBottom) >> subHeightCMinus1,
4082 sequenceControlSetPtr->padRight >> subWidthCMinus1,
4083 sequenceControlSetPtr->padBottom >> subHeightCMinus1);
4084
4085 if (is16BitInput) {
4086 PadInputPicture(
4087 &inputPicturePtr->bufferBitIncY[inputPicturePtr->originX + (inputPicturePtr->originY * inputPicturePtr->strideBitIncY)],
4088 inputPicturePtr->strideBitIncY,
4089 (inputPicturePtr->width - sequenceControlSetPtr->padRight),
4090 (inputPicturePtr->height - sequenceControlSetPtr->padBottom),
4091 sequenceControlSetPtr->padRight,
4092 sequenceControlSetPtr->padBottom);
4093
4094 PadInputPicture(
4095 &inputPicturePtr->bufferBitIncCb[(inputPicturePtr->originX >> subWidthCMinus1) + ((inputPicturePtr->originY >> subHeightCMinus1) * inputPicturePtr->strideBitIncCb)],
4096 inputPicturePtr->strideBitIncCb,
4097 (inputPicturePtr->width - sequenceControlSetPtr->padRight) >> subWidthCMinus1,
4098 (inputPicturePtr->height - sequenceControlSetPtr->padBottom) >> subHeightCMinus1,
4099 sequenceControlSetPtr->padRight >> subWidthCMinus1,
4100 sequenceControlSetPtr->padBottom >> subHeightCMinus1);
4101
4102 PadInputPicture(
4103 &inputPicturePtr->bufferBitIncCr[(inputPicturePtr->originX >> subWidthCMinus1) + ((inputPicturePtr->originY >> subHeightCMinus1) * inputPicturePtr->strideBitIncCr)],
4104 inputPicturePtr->strideBitIncCr,
4105 (inputPicturePtr->width - sequenceControlSetPtr->padRight) >> subWidthCMinus1,
4106 (inputPicturePtr->height - sequenceControlSetPtr->padBottom) >> subHeightCMinus1,
4107 sequenceControlSetPtr->padRight >> subWidthCMinus1,
4108 sequenceControlSetPtr->padBottom >> subHeightCMinus1);
4109
4110 }
4111
4112 return;
4113 }
4114
4115 /************************************************
4116 * Pad Picture at the right and bottom sides
4117 ** To complete border LCU smaller than LCU size
4118 ************************************************/
PadPictureToMultipleOfLcuDimensions(EbPictureBufferDesc_t * inputPaddedPicturePtr)4119 static void PadPictureToMultipleOfLcuDimensions(
4120 EbPictureBufferDesc_t *inputPaddedPicturePtr
4121 )
4122 {
4123
4124 // Generate Padding
4125 GeneratePadding(
4126 &inputPaddedPicturePtr->bufferY[0],
4127 inputPaddedPicturePtr->strideY,
4128 inputPaddedPicturePtr->width,
4129 inputPaddedPicturePtr->height,
4130 inputPaddedPicturePtr->originX,
4131 inputPaddedPicturePtr->originY);
4132
4133 return;
4134 }
4135
4136 /************************************************
4137 * 1/4 & 1/16 input picture decimation
4138 ************************************************/
DecimateInputPicture(SequenceControlSet_t * sequenceControlSetPtr,PictureParentControlSet_t * pictureControlSetPtr,EbPictureBufferDesc_t * inputPaddedPicturePtr,EbPictureBufferDesc_t * quarterDecimatedPicturePtr,EbPictureBufferDesc_t * sixteenthDecimatedPicturePtr)4139 static void DecimateInputPicture(
4140 SequenceControlSet_t *sequenceControlSetPtr,
4141 PictureParentControlSet_t *pictureControlSetPtr,
4142 EbPictureBufferDesc_t *inputPaddedPicturePtr,
4143 EbPictureBufferDesc_t *quarterDecimatedPicturePtr,
4144 EbPictureBufferDesc_t *sixteenthDecimatedPicturePtr) {
4145
4146 // Decimate input picture for HME L1
4147 EB_BOOL preformQuarterPellDecimationFlag;
4148 if (sequenceControlSetPtr->staticConfig.speedControlFlag){
4149 preformQuarterPellDecimationFlag = EB_TRUE;
4150 }
4151 else{
4152 if (pictureControlSetPtr->enableHmeLevel1Flag == 1){
4153 preformQuarterPellDecimationFlag = EB_TRUE;
4154 }
4155 else{
4156 preformQuarterPellDecimationFlag = EB_FALSE;
4157 }
4158 }
4159
4160 if (preformQuarterPellDecimationFlag) {
4161 Decimation2D(
4162 &inputPaddedPicturePtr->bufferY[inputPaddedPicturePtr->originX + inputPaddedPicturePtr->originY * inputPaddedPicturePtr->strideY],
4163 inputPaddedPicturePtr->strideY,
4164 inputPaddedPicturePtr->width ,
4165 inputPaddedPicturePtr->height,
4166 &quarterDecimatedPicturePtr->bufferY[quarterDecimatedPicturePtr->originX+quarterDecimatedPicturePtr->originY*quarterDecimatedPicturePtr->strideY],
4167 quarterDecimatedPicturePtr->strideY,
4168 2);
4169
4170 GeneratePadding(
4171 &quarterDecimatedPicturePtr->bufferY[0],
4172 quarterDecimatedPicturePtr->strideY,
4173 quarterDecimatedPicturePtr->width,
4174 quarterDecimatedPicturePtr->height,
4175 quarterDecimatedPicturePtr->originX,
4176 quarterDecimatedPicturePtr->originY);
4177
4178 }
4179
4180 // Decimate input picture for HME L0
4181 // Sixteenth Input Picture Decimation
4182 Decimation2D(
4183 &inputPaddedPicturePtr->bufferY[inputPaddedPicturePtr->originX + inputPaddedPicturePtr->originY * inputPaddedPicturePtr->strideY],
4184 inputPaddedPicturePtr->strideY,
4185 inputPaddedPicturePtr->width ,
4186 inputPaddedPicturePtr->height ,
4187 &sixteenthDecimatedPicturePtr->bufferY[sixteenthDecimatedPicturePtr->originX+sixteenthDecimatedPicturePtr->originY*sixteenthDecimatedPicturePtr->strideY],
4188 sixteenthDecimatedPicturePtr->strideY,
4189 4);
4190
4191 GeneratePadding(
4192 &sixteenthDecimatedPicturePtr->bufferY[0],
4193 sixteenthDecimatedPicturePtr->strideY,
4194 sixteenthDecimatedPicturePtr->width,
4195 sixteenthDecimatedPicturePtr->height,
4196 sixteenthDecimatedPicturePtr->originX,
4197 sixteenthDecimatedPicturePtr->originY);
4198 }
4199
4200 /************************************************
4201 * Picture Analysis Kernel
4202 * The Picture Analysis Process pads & decimates the input pictures.
4203 * The Picture Analysis also includes creating an n-bin Histogram,
4204 * gathering picture 1st and 2nd moment statistics for each 8x8 block,
4205 * which are used to compute variance.
4206 * The Picture Analysis process is multithreaded, so pictures can be
4207 * processed out of order as long as all inputs are available.
4208 ************************************************/
PictureAnalysisKernel(void * inputPtr)4209 void* PictureAnalysisKernel(void *inputPtr)
4210 {
4211 PictureAnalysisContext_t *contextPtr = (PictureAnalysisContext_t*)inputPtr;
4212 PictureParentControlSet_t *pictureControlSetPtr;
4213 SequenceControlSet_t *sequenceControlSetPtr;
4214
4215 EbObjectWrapper_t *inputResultsWrapperPtr;
4216 ResourceCoordinationResults_t *inputResultsPtr;
4217 EbObjectWrapper_t *outputResultsWrapperPtr;
4218 PictureAnalysisResults_t *outputResultsPtr;
4219 EbPaReferenceObject_t *paReferenceObject;
4220
4221 EbPictureBufferDesc_t *inputPaddedPicturePtr;
4222 EbPictureBufferDesc_t *quarterDecimatedPicturePtr;
4223 EbPictureBufferDesc_t *sixteenthDecimatedPicturePtr;
4224 EbPictureBufferDesc_t *inputPicturePtr;
4225
4226 // Variance
4227 EB_U32 pictureWidthInLcu;
4228 EB_U32 pictureHeighInLcu;
4229 EB_U32 lcuTotalCount;
4230
4231 for (;;) {
4232
4233 // Get Input Full Object
4234 EbGetFullObject(
4235 contextPtr->resourceCoordinationResultsInputFifoPtr,
4236 &inputResultsWrapperPtr);
4237 EB_CHECK_END_OBJ(inputResultsWrapperPtr);
4238
4239 inputResultsPtr = (ResourceCoordinationResults_t*)inputResultsWrapperPtr->objectPtr;
4240 pictureControlSetPtr = (PictureParentControlSet_t*)inputResultsPtr->pictureControlSetWrapperPtr->objectPtr;
4241 sequenceControlSetPtr = (SequenceControlSet_t*)pictureControlSetPtr->sequenceControlSetWrapperPtr->objectPtr;
4242 inputPicturePtr = pictureControlSetPtr->enhancedPicturePtr;
4243 #if DEADLOCK_DEBUG
4244 if ((pictureControlSetPtr->pictureNumber >= MIN_POC) && (pictureControlSetPtr->pictureNumber <= MAX_POC))
4245 SVT_LOG("POC %lu PA IN \n", pictureControlSetPtr->pictureNumber);
4246 #endif
4247 paReferenceObject = (EbPaReferenceObject_t*)pictureControlSetPtr->paReferencePictureWrapperPtr->objectPtr;
4248 inputPaddedPicturePtr = (EbPictureBufferDesc_t*)paReferenceObject->inputPaddedPicturePtr;
4249 quarterDecimatedPicturePtr = (EbPictureBufferDesc_t*)paReferenceObject->quarterDecimatedPicturePtr;
4250 sixteenthDecimatedPicturePtr = (EbPictureBufferDesc_t*)paReferenceObject->sixteenthDecimatedPicturePtr;
4251
4252 // Variance
4253 pictureWidthInLcu = (sequenceControlSetPtr->lumaWidth + sequenceControlSetPtr->lcuSize - 1) / sequenceControlSetPtr->lcuSize;
4254 pictureHeighInLcu = (sequenceControlSetPtr->lumaHeight + sequenceControlSetPtr->lcuSize - 1) / sequenceControlSetPtr->lcuSize;
4255 lcuTotalCount = pictureWidthInLcu * pictureHeighInLcu;
4256
4257 // Pad pictures to multiple min cu size
4258 PadPictureToMultipleOfMinCuSizeDimensions(
4259 sequenceControlSetPtr,
4260 inputPicturePtr);
4261
4262 // Backup the Y component data from input picture into PA reference picture, to work arond the race condition that
4263 // the input picture buffer pointed by PA reference picture (in ResourceCoordination) would be updated even though
4264 // it's still being referenced.
4265 EB_U8 *pa = inputPaddedPicturePtr->bufferY + inputPaddedPicturePtr->originX + inputPaddedPicturePtr->originY * inputPaddedPicturePtr->strideY;
4266 EB_U8 *in = inputPicturePtr->bufferY + inputPicturePtr->originX + inputPicturePtr->originY * inputPicturePtr->strideY;
4267 for (EB_U32 row = 0; row < inputPicturePtr->height; row++) {
4268 EB_MEMCPY(pa + row * inputPaddedPicturePtr->strideY, in + row * inputPicturePtr->strideY, sizeof(EB_U8) * inputPicturePtr->width);
4269 }
4270
4271 // Set picture parameters to account for subpicture, picture scantype, and set regions by resolutions
4272 SetPictureParametersForStatisticsGathering(
4273 sequenceControlSetPtr);
4274
4275
4276 // Pre processing operations performed on the input picture
4277 PicturePreProcessingOperations(
4278 pictureControlSetPtr,
4279 contextPtr,
4280 sequenceControlSetPtr,
4281 quarterDecimatedPicturePtr,
4282 sixteenthDecimatedPicturePtr,
4283 lcuTotalCount,
4284 pictureWidthInLcu);
4285
4286 if (inputPicturePtr->colorFormat >= EB_YUV422) {
4287 // Jing: Do the conversion of 422/444=>420 here since it's multi-threaded kernel
4288 // Reuse the Y, only add cb/cr in the newly created buffer desc
4289 // NOTE: since denoise may change the src, so this part is after PicturePreProcessingOperations()
4290 pictureControlSetPtr->chromaDownSamplePicturePtr->bufferY = inputPicturePtr->bufferY;
4291 DownSampleChroma(inputPicturePtr, pictureControlSetPtr->chromaDownSamplePicturePtr);
4292 } else {
4293 pictureControlSetPtr->chromaDownSamplePicturePtr = inputPicturePtr;
4294 }
4295
4296 // Pad input picture to complete border LCUs
4297 PadPictureToMultipleOfLcuDimensions(
4298 inputPaddedPicturePtr
4299 );
4300
4301 // 1/4 & 1/16 input picture decimation
4302 DecimateInputPicture(
4303 sequenceControlSetPtr,
4304 pictureControlSetPtr,
4305 inputPaddedPicturePtr,
4306 quarterDecimatedPicturePtr,
4307 sixteenthDecimatedPicturePtr);
4308
4309 // Gathering statistics of input picture, including Variance Calculation, Histogram Bins
4310 GatheringPictureStatistics(
4311 sequenceControlSetPtr,
4312 pictureControlSetPtr,
4313 contextPtr,
4314 pictureControlSetPtr->chromaDownSamplePicturePtr, //420 inputPicturePtr
4315 inputPaddedPicturePtr,
4316 sixteenthDecimatedPicturePtr,
4317 lcuTotalCount);
4318
4319
4320 // Hold the 64x64 variance and mean in the reference frame
4321 EB_U32 lcuIndex;
4322 for (lcuIndex = 0; lcuIndex < pictureControlSetPtr->lcuTotalCount; ++lcuIndex){
4323 paReferenceObject->variance[lcuIndex] = pictureControlSetPtr->variance[lcuIndex][ME_TIER_ZERO_PU_64x64];
4324 paReferenceObject->yMean[lcuIndex] = pictureControlSetPtr->yMean[lcuIndex][ME_TIER_ZERO_PU_64x64];
4325
4326 }
4327
4328 // Get Empty Results Object
4329 EbGetEmptyObject(
4330 contextPtr->pictureAnalysisResultsOutputFifoPtr,
4331 &outputResultsWrapperPtr);
4332
4333 outputResultsPtr = (PictureAnalysisResults_t*)outputResultsWrapperPtr->objectPtr;
4334 outputResultsPtr->pictureControlSetWrapperPtr = inputResultsPtr->pictureControlSetWrapperPtr;
4335
4336 // Release the Input Results
4337 EbReleaseObject(inputResultsWrapperPtr);
4338
4339 #if LATENCY_PROFILE
4340 double latency = 0.0;
4341 EB_U64 finishTimeSeconds = 0;
4342 EB_U64 finishTimeuSeconds = 0;
4343 EbHevcFinishTime((uint64_t*)&finishTimeSeconds, (uint64_t*)&finishTimeuSeconds);
4344
4345 EbHevcComputeOverallElapsedTimeMs(
4346 pictureControlSetPtr->startTimeSeconds,
4347 pictureControlSetPtr->startTimeuSeconds,
4348 finishTimeSeconds,
4349 finishTimeuSeconds,
4350 &latency);
4351
4352 SVT_LOG("POC %lld PA OUT, decoder order %d, latency %3.3f \n",
4353 pictureControlSetPtr->pictureNumber,
4354 pictureControlSetPtr->decodeOrder,
4355 latency);
4356 #endif
4357 // Post the Full Results Object
4358 EbPostFullObject(outputResultsWrapperPtr);
4359
4360 #if DEADLOCK_DEBUG
4361 if ((pictureControlSetPtr->pictureNumber >= MIN_POC) && (pictureControlSetPtr->pictureNumber <= MAX_POC))
4362 SVT_LOG("POC %lu PA OUT \n", pictureControlSetPtr->pictureNumber);
4363 #endif
4364 }
4365 return EB_NULL;
4366 }
4367
UnusedVariablevoidFunc_PA()4368 void UnusedVariablevoidFunc_PA()
4369 {
4370 (void)SadCalculation_8x8_16x16_funcPtrArray;
4371 (void)SadCalculation_32x32_64x64_funcPtrArray;
4372 }
4373