1 /*
2 * Copyright(c) 2018 Intel Corporation
* SPDX-License-Identifier: BSD-2-Clause-Patent
4 */
5 
6 #ifndef EbPictureOperators_h
7 #define EbPictureOperators_h
8 
9 #include "EbPictureOperators_C.h"
10 #include "EbPictureOperators_SSE2.h"
11 #include "EbPictureOperators_SSE4_1.h"
12 #include "EbPictureOperators_AVX2.h"
13 #include "EbHmCode.h"
14 #include "EbDefinitions.h"
15 #include "EbPictureBufferDesc.h"
16 #include "EbSequenceControlSet.h"
17 #include "EbUnPackProcess.h"
18 #ifdef __cplusplus
19 extern "C" {
20 #endif
21 
22 extern void PictureAddition(
23     EB_U8  *predPtr,
24     EB_U32  predStride,
25     EB_S16 *residualPtr,
26     EB_U32  residualStride,
27     EB_U8  *reconPtr,
28     EB_U32  reconStride,
29     EB_U32  width,
30     EB_U32  height);
31 
32 void PictureAddition16bit_TEST(
33     EB_U16  *predPtr,
34     EB_U32  predStride,
35     EB_S16 *residualPtr,
36     EB_U32  residualStride,
37     EB_U16  *reconPtr,
38     EB_U32  reconStride,
39     EB_U32  width,
40     EB_U32  height);
41 
42 extern EB_ERRORTYPE PictureCopy8Bit(
43     EbPictureBufferDesc_t   *src,
44     EB_U32                   srcLumaOriginIndex,
45     EB_U32                   srcChromaOriginIndex,
46     EbPictureBufferDesc_t   *dst,
47     EB_U32                   dstLumaOriginIndex,
48     EB_U32                   dstChromaOriginIndex,
49     EB_U32                   areaWidth,
50     EB_U32                   areaHeight,
51     EB_U32                   chromaAreaWidth,
52     EB_U32                   chromaAreaHeight,
53     EB_U32                   componentMask);
54 
55 extern EB_ERRORTYPE PictureFastDistortion(
56     EbPictureBufferDesc_t   *input,
57     EB_U32                   inputLumaOriginIndex,
58     EB_U32                   inputChromaOriginIndex,
59     EbPictureBufferDesc_t   *pred,
60     EB_U32                   predLumaOriginIndex,
61     EB_U32                   predChromaOriginIndex,
62     EB_U32                   size,
63     EB_U32                   componentMask,
64     EB_U64                   lumaDistortion[DIST_CALC_TOTAL],
65     EB_U64                   chromaDistortion[DIST_CALC_TOTAL]);
66 
67 EB_ERRORTYPE PictureFullDistortion_R(
68     EbPictureBufferDesc_t   *coeff,
69     EB_U32                   coeffLumaOriginIndex,
70     EB_U32                   coeffChromaOriginIndex,
71     EbPictureBufferDesc_t   *reconCoeff,
72     EB_U32                   areaSize,
73     EB_U32                   chromaAreaSize,
74     EB_U32                   componentMask,
75     EB_U64                   lumaDistortion[DIST_CALC_TOTAL],
76     EB_U64                   cbDistortion[DIST_CALC_TOTAL],
77     EB_U64                   crDistortion[DIST_CALC_TOTAL],
78 	EB_U32                   *countNonZeroCoeffs,
79 	EB_MODETYPE				 mode);
80 
81 extern EB_ERRORTYPE PictureFullDistortionLuma(
82     EbPictureBufferDesc_t   *coeff,
83     EB_U32                   coeffLumaOriginIndex,
84     EbPictureBufferDesc_t   *reconCoeff,
85     EB_U32                   reconCoeffLumaOriginIndex,
86     EB_U32                   areaSize,
87     EB_U64                   lumaDistortion[DIST_CALC_TOTAL],
88 	EB_U32                   countNonZeroCoeffsY,
89 	EB_MODETYPE				 mode);
90 
91 extern EB_ERRORTYPE PictureFullDistortionChroma(
92     EbPictureBufferDesc_t   *coeff,
93     EB_U32                   coeffCbOriginIndex,
94 	EB_U32                   coeffCrOriginIndex,
95     EbPictureBufferDesc_t   *reconCoeff,
96     EB_U32                   reconCoeffCbOriginIndex,
97 	EB_U32                   reconCoeffCrOriginIndex,
98     EB_U32                   areaSize,
99     EB_U64                   cbDistortion[DIST_CALC_TOTAL],
100 	EB_U64                   crDistortion[DIST_CALC_TOTAL],
101 	EB_U32                   countNonZeroCoeffsCb,
102 	EB_U32                   countNonZeroCoeffsCr,
103 	EB_MODETYPE				 mode);
104 
105 extern EB_U64 ComputeNxMSatdSadLCU(
106     EB_U8  *src,        // input parameter, source samples Ptr
107     EB_U32  srcStride,  // input parameter, source stride
108     EB_U32  width,      // input parameter, block width (N)
109     EB_U32  height);    // input parameter, block height (M)
110 
111 //Residual Data
112 extern void PictureResidual(
113     EB_U8   *input,
114     EB_U32   inputStride,
115     EB_U8   *pred,
116     EB_U32   predStride,
117     EB_S16  *residual,
118     EB_U32   residualStride,
119     EB_U32   areaWidth,
120     EB_U32   areaHeight);
121 extern void PictureSubSampledResidual(
122     EB_U8   *input,
123     EB_U32   inputStride,
124     EB_U8   *pred,
125     EB_U32   predStride,
126     EB_S16  *residual,
127     EB_U32   residualStride,
128     EB_U32   areaWidth,
129     EB_U32   areaHeight,
130     EB_U8    lastLine);
131 extern void PictureResidual16bit(
132     EB_U16   *input,
133     EB_U32   inputStride,
134     EB_U16   *pred,
135     EB_U32   predStride,
136     EB_S16  *residual,
137     EB_U32   residualStride,
138     EB_U32   areaWidth,
139     EB_U32   areaHeight);
140 
141 void CompressedPackBlk(
142 	EB_U8     *in8BitBuffer,
143 	EB_U32     in8Stride,
144 	EB_U8     *innBitBuffer,
145 	EB_U32     innStride,
146 	EB_U16    *out16BitBuffer,
147 	EB_U32     outStride,
148 	EB_U32     width,
149 	EB_U32     height
150 	);
151 void Conv2bToCPackLcu(
152 	const EB_U8     *innBitBuffer,
153 	EB_U32     innStride,
154 	EB_U8     *inCompnBitBuffer,
155 	EB_U32     outStride,
156 	EB_U8    *localCache,
157 	EB_U32     width,
158 	EB_U32     height);
159 
160 void CompressedPackLcu(
161     EB_U8     *in8BitBuffer,
162     EB_U32     in8Stride,
163     EB_U8     *innBitBuffer,
164     EB_U32     innStride,
165     EB_U16    *out16BitBuffer,
166     EB_U32     outStride,
167     EB_U32     width,
168     EB_U32     height);
169 
170 void Pack2D_SRC(
171    EB_U8     *in8BitBuffer,
172    EB_U32     in8Stride,
173    EB_U8     *innBitBuffer,
174    EB_U32     innStride,
175    EB_U16    *out16BitBuffer,
176    EB_U32     outStride,
177    EB_U32     width,
178    EB_U32     height);
179 
/*
 * UnPack2D (declaration only): takes and returns an untyped pointer.
 * NOTE(review): the void *(void *) shape matches a thread entry point;
 * confirm how it is launched.
 */
void *UnPack2D(void *context);
181 
182 void extract8Bitdata(
183     EB_U16      *in16BitBuffer,
184     EB_U32       inStride,
185     EB_U8       *out8BitBuffer,
186     EB_U32       out8Stride,
187     EB_U32       width,
188     EB_U32       height
189     );
190 void UnpackL0L1Avg(
191         EB_U16 *ref16L0,
192         EB_U32  refL0Stride,
193         EB_U16 *ref16L1,
194         EB_U32  refL1Stride,
195         EB_U8  *dstPtr,
196         EB_U32  dstStride,
197         EB_U32  width,
198         EB_U32  height);
199 void Extract8BitdataSafeSub(
200     EB_U16      *in16BitBuffer,
201     EB_U32       inStride,
202     EB_U8       *out8BitBuffer,
203     EB_U32       out8Stride,
204     EB_U32       width,
205     EB_U32       height
206     );
207 void UnpackL0L1AvgSafeSub(
208         EB_U16 *ref16L0,
209         EB_U32  refL0Stride,
210         EB_U16 *ref16L1,
211         EB_U32  refL1Stride,
212         EB_U8  *dstPtr,
213         EB_U32  dstStride,
214         EB_U32  width,
215         EB_U32  height);
216 
217 void EbHevcMemcpy16bit(
218     EB_U16                     * outPtr,
219     EB_U16                     * inPtr,
220     EB_U64                       numOfElements );
221 void memset16bit(
222     EB_U16                     * inPtr,
223     EB_U16                       value,
224     EB_U64                       numOfElements );
225 
/*
 * No-op placeholder. It is cast to EB_ADDDKERNEL_TYPE to fill the unused
 * slots of AdditionKernel_funcPtrArray. Declared with an explicit (void)
 * parameter list so the definition is a real prototype.
 * NOTE(review): calling it through the cast pointer with arguments is not
 * well-defined C; the placeholder slots are presumably never invoked.
 */
static void PictureAdditionVoidFunc(void) {}
/*
 * No-op placeholder. It is cast to EB_PICCOPY_TYPE to fill the unused slots
 * of PicCopyKernel_funcPtrArray. Declared with an explicit (void) parameter
 * list so the definition is a real prototype.
 * NOTE(review): the placeholder slots are presumably never invoked.
 */
static void PicCopyVoidFunc(void) {}
/*
 * No-op placeholder. It is cast to the residual kernel pointer types to fill
 * the unused slots of ResidualKernel_funcPtrArray and
 * ResidualKernelSubSampled_funcPtrArray. Declared with an explicit (void)
 * parameter list so the definition is a real prototype.
 * NOTE(review): the placeholder slots are presumably never invoked.
 */
static void PicResdVoidFunc(void) {}
/*
 * No-op placeholder. It is cast to EB_ZEROCOEFF_TYPE to fill the unused slot
 * of PicZeroOutCoef_funcPtrArray. Declared with an explicit (void) parameter
 * list so the definition is a real prototype.
 * NOTE(review): the placeholder slot is presumably never invoked.
 */
static void PicZeroOutCoefVoidFunc(void) {}
/*
 * No-op placeholder. It is cast to EB_FULLDIST_TYPE to fill the unused slots
 * of FullDistortionIntrinsic_funcPtrArray. Declared with an explicit (void)
 * parameter list so the definition is a real prototype.
 * NOTE(review): the placeholder slots are presumably never invoked.
 */
static void FullDistortionVoidFunc(void) {}
231 
232 EB_S32  sumResidual( EB_S16 * inPtr,
233                      EB_U32   size,
234                      EB_U32   strideIn );
235 
236 
237 typedef EB_S32(*EB_SUM_RES)(
238                      EB_S16 * inPtr,
239                      EB_U32   size,
240                      EB_U32   strideIn );
241 
242 static EB_SUM_RES FUNC_TABLE SumResidual_funcPtrArray[EB_ASM_TYPE_TOTAL] = {
243 	// C_DEFAULT
244 	sumResidual,
245 	// AVX2
246 	sumResidual8bit_AVX2_INTRIN,
247 };
248 
249 void memset16bitBlock (
250                     EB_S16 * inPtr,
251                     EB_U32   strideIn,
252                     EB_U32   size,
253                     EB_S16   value
254     );
255 
256 typedef void(*EB_MEMSET16bitBLK)(
257                     EB_S16 * inPtr,
258                     EB_U32   strideIn,
259                     EB_U32   size,
260                     EB_S16   value );
261 
262 static EB_MEMSET16bitBLK FUNC_TABLE memset16bitBlock_funcPtrArray[EB_ASM_TYPE_TOTAL] = {
263 	// C_DEFAULT
264 	memset16bitBlock,
265 	// AVX2
266 	memset16bitBlock_AVX2_INTRIN,
267 };
268 
269 /***************************************
270 * Function Types
271 ***************************************/
272 typedef void(*EB_ADDDKERNEL_TYPE)(
273     EB_U8  *predPtr,
274     EB_U32  predStride,
275     EB_S16 *residualPtr,
276     EB_U32  residualStride,
277     EB_U8  *reconPtr,
278     EB_U32  reconStride,
279     EB_U32  width,
280     EB_U32  height);
281 
282 typedef void(*EB_ADDDKERNEL_TYPE_16BIT)(
283     EB_U16  *predPtr,
284     EB_U32  predStride,
285     EB_S16 *residualPtr,
286     EB_U32  residualStride,
287     EB_U16  *reconPtr,
288     EB_U32  reconStride,
289     EB_U32  width,
290     EB_U32  height);
291 
292 typedef void(*EB_PICCOPY_TYPE)(
293     EB_BYTE                  src,
294     EB_U32                   srcStride,
295     EB_BYTE                  dst,
296     EB_U32                   dstStride,
297     EB_U32                   areaWidth,
298     EB_U32                   areaHeight);
299 
300 
301 
302 typedef void(*EB_RESDKERNEL_TYPE)(
303     EB_U8   *input,
304     EB_U32   inputStride,
305     EB_U8   *pred,
306     EB_U32   predStride,
307     EB_S16  *residual,
308     EB_U32   residualStride,
309     EB_U32   areaWidth,
310     EB_U32   areaHeight);
311 
312 typedef void(*EB_RESDKERNEL_TYPE_16BIT)(
313     EB_U16   *input,
314     EB_U32   inputStride,
315     EB_U16   *pred,
316     EB_U32   predStride,
317     EB_S16  *residual,
318     EB_U32   residualStride,
319     EB_U32   areaWidth,
320     EB_U32   areaHeight);
321 
322 typedef void(*EB_ZEROCOEFF_TYPE)(
323     EB_S16*                  coeffbuffer,
324     EB_U32                   coeffStride,
325     EB_U32                   coeffOriginIndex,
326     EB_U32                   areaWidth,
327     EB_U32                   areaHeight);
328 
329 typedef void(*EB_FULLDIST_TYPE)(
330     EB_S16  *coeff,
331     EB_U32   coeffStride,
332     EB_S16  *reconCoeff,
333     EB_U32   reconCoeffStride,
334     EB_U64   distortionResult[DIST_CALC_TOTAL],
335     EB_U32   areaWidth,
336     EB_U32   areaHeight);
337 
338 typedef EB_U64(*EB_SATD_TYPE)(
339     EB_S16 *diff);
340 
341 typedef EB_U64(*EB_SATD_U8_TYPE)(
342 	EB_U8 *diff,
343 	EB_U64 *dcValue,
344 	EB_U32  srcStride);
345 
346 /***************************************
347 * Function Tables
348 ***************************************/
349 static EB_ADDDKERNEL_TYPE FUNC_TABLE AdditionKernel_funcPtrArray[EB_ASM_TYPE_TOTAL][9] = {
350         // C_DEFAULT
351         {
352             /*0 4x4   */    PictureAdditionKernel,
353             /*1 8x8   */    PictureAdditionKernel,
354             /*2 16x16 */    PictureAdditionKernel,
355             /*3       */    (EB_ADDDKERNEL_TYPE)PictureAdditionVoidFunc,
356             /*4 32x32 */    PictureAdditionKernel,
357             /*5       */    (EB_ADDDKERNEL_TYPE)PictureAdditionVoidFunc,
358             /*6       */    (EB_ADDDKERNEL_TYPE)PictureAdditionVoidFunc,
359             /*7       */    (EB_ADDDKERNEL_TYPE)PictureAdditionVoidFunc,
360             /*8 64x64 */    PictureAdditionKernel
361         },
362         // AVX2
363         {
364 			/*0 4x4   */    PictureAdditionKernel4x4_SSE_INTRIN,
365 			/*1 8x8   */    PictureAdditionKernel8x8_SSE2_INTRIN,
366 			/*2 16x16 */    PictureAdditionKernel16x16_SSE2_INTRIN,
367             /*3       */    (EB_ADDDKERNEL_TYPE)PictureAdditionVoidFunc,
368 			/*4 32x32 */    PictureAdditionKernel32x32_SSE2_INTRIN,
369             /*5       */    (EB_ADDDKERNEL_TYPE)PictureAdditionVoidFunc,
370             /*6       */    (EB_ADDDKERNEL_TYPE)PictureAdditionVoidFunc,
371             /*7       */    (EB_ADDDKERNEL_TYPE)PictureAdditionVoidFunc,
372 			/*8 64x64 */    PictureAdditionKernel64x64_SSE2_INTRIN,
373         },
374 };
375 
376 static EB_ADDDKERNEL_TYPE_16BIT FUNC_TABLE AdditionKernel_funcPtrArray16bit[EB_ASM_TYPE_TOTAL] = {
377 	// C_DEFAULT
378     PictureAdditionKernel16bit,
379 	// AVX2
380 	PictureAdditionKernel16bit_SSE2_INTRIN,
381 };
382 
383 static EB_PICCOPY_TYPE FUNC_TABLE PicCopyKernel_funcPtrArray[EB_ASM_TYPE_TOTAL][9] = {
384         // C_DEFAULT
385         {
386             /*0 4x4   */     CopyKernel8Bit,
387             /*1 8x8   */     CopyKernel8Bit,
388             /*2 16x16 */     CopyKernel8Bit,
389             /*3       */     (EB_PICCOPY_TYPE)PicCopyVoidFunc,
390             /*4 32x32 */     CopyKernel8Bit,
391             /*5       */     (EB_PICCOPY_TYPE)PicCopyVoidFunc,
392             /*6       */     (EB_PICCOPY_TYPE)PicCopyVoidFunc,
393             /*7       */     (EB_PICCOPY_TYPE)PicCopyVoidFunc,
394             /*8 64x64 */     CopyKernel8Bit
395         },
396         // AVX2
397         {
398 			/*0 4x4   */     PictureCopyKernel4x4_SSE_INTRIN,
399 			/*1 8x8   */     PictureCopyKernel8x8_SSE2_INTRIN,
400 			/*2 16x16 */     PictureCopyKernel16x16_SSE2_INTRIN,
401             /*3       */     (EB_PICCOPY_TYPE)PicCopyVoidFunc,
402 			/*4 32x32 */     PictureCopyKernel32x32_SSE2_INTRIN,
403             /*5       */     (EB_PICCOPY_TYPE)PicCopyVoidFunc,
404             /*6       */     (EB_PICCOPY_TYPE)PicCopyVoidFunc,
405             /*7       */     (EB_PICCOPY_TYPE)PicCopyVoidFunc,
406 			/*8 64x64 */     PictureCopyKernel64x64_SSE2_INTRIN,
407         },
408 };
409 
410 
411 typedef void(*EB_RESDKERNELSUBSAMPLED_TYPE)(
412     EB_U8   *input,
413     EB_U32   inputStride,
414     EB_U8   *pred,
415     EB_U32   predStride,
416     EB_S16  *residual,
417     EB_U32   residualStride,
418     EB_U32   areaWidth,
419     EB_U32   areaHeight ,
420     EB_U8    lastLine
421     );
422 static EB_RESDKERNELSUBSAMPLED_TYPE FUNC_TABLE ResidualKernelSubSampled_funcPtrArray[EB_ASM_TYPE_TOTAL][9] = {
423 	// C_DEFAULT
424 	{
425 		/*0 4x4  */     ResidualKernelSubSampled,
426 		/*1 8x8  */     ResidualKernelSubSampled,
427 		/*2 16x16 */    ResidualKernelSubSampled,
428 		/*3  */         (EB_RESDKERNELSUBSAMPLED_TYPE)PicResdVoidFunc,
429 		/*4 32x32 */    ResidualKernelSubSampled,
430 		/*5      */     (EB_RESDKERNELSUBSAMPLED_TYPE)PicResdVoidFunc,
431 		/*6  */         (EB_RESDKERNELSUBSAMPLED_TYPE)PicResdVoidFunc,
432 		/*7      */     (EB_RESDKERNELSUBSAMPLED_TYPE)PicResdVoidFunc,
433 		/*8 64x64 */    ResidualKernelSubSampled
434 	},
435 	// AVX2
436 	{
437 		/*0 4x4  */     ResidualKernelSubSampled4x4_SSE_INTRIN,
438 		/*1 8x8  */     ResidualKernelSubSampled8x8_SSE2_INTRIN,
439 		/*2 16x16 */    ResidualKernelSubSampled16x16_SSE2_INTRIN,
440 		/*3  */         (EB_RESDKERNELSUBSAMPLED_TYPE)PicResdVoidFunc,
441 		/*4 32x32 */    ResidualKernelSubSampled32x32_SSE2_INTRIN,
442 		/*5      */     (EB_RESDKERNELSUBSAMPLED_TYPE)PicResdVoidFunc,
443 		/*6  */         (EB_RESDKERNELSUBSAMPLED_TYPE)PicResdVoidFunc,
444 		/*7      */     (EB_RESDKERNELSUBSAMPLED_TYPE)PicResdVoidFunc,
445 		/*8 64x64 */    ResidualKernelSubSampled64x64_SSE2_INTRIN,
446 	},
447 };
448 
449 static EB_RESDKERNEL_TYPE FUNC_TABLE ResidualKernel_funcPtrArray[EB_ASM_TYPE_TOTAL][9] = {
450 	// C_DEFAULT
451 	{
452         /*0 4x4  */     ResidualKernel,
453         /*1 8x8  */     ResidualKernel,
454         /*2 16x16 */    ResidualKernel,
455         /*3  */         (EB_RESDKERNEL_TYPE)PicResdVoidFunc,
456         /*4 32x32 */    ResidualKernel,
457         /*5      */     (EB_RESDKERNEL_TYPE)PicResdVoidFunc,
458         /*6  */         (EB_RESDKERNEL_TYPE)PicResdVoidFunc,
459         /*7      */     (EB_RESDKERNEL_TYPE)PicResdVoidFunc,
460         /*8 64x64 */    ResidualKernel
461 	},
462 	// AVX2
463 	{
464 		/*0 4x4  */     ResidualKernel4x4_SSE_INTRIN,
465 		/*1 8x8  */     ResidualKernel8x8_SSE2_INTRIN,
466 		/*2 16x16 */    ResidualKernel16x16_SSE2_INTRIN,
467 		/*3  */         (EB_RESDKERNEL_TYPE)PicResdVoidFunc,
468 		/*4 32x32 */    ResidualKernel32x32_SSE2_INTRIN,
469 		/*5      */     (EB_RESDKERNEL_TYPE)PicResdVoidFunc,
470 		/*6  */         (EB_RESDKERNEL_TYPE)PicResdVoidFunc,
471 		/*7      */     (EB_RESDKERNEL_TYPE)PicResdVoidFunc,
472 		/*8 64x64 */    ResidualKernel64x64_SSE2_INTRIN,
473 	},
474 };
475 
476 
477 static EB_RESDKERNEL_TYPE_16BIT FUNC_TABLE ResidualKernel_funcPtrArray16Bit[EB_ASM_TYPE_TOTAL] = {
478     // C_DEFAULT
479     ResidualKernel16bit,
480     // AVX2
481     ResidualKernel16bit_SSE2_INTRIN
482 };
483 
484 static EB_ZEROCOEFF_TYPE FUNC_TABLE PicZeroOutCoef_funcPtrArray[EB_ASM_TYPE_TOTAL][5] = {
485         // C_DEFAULT
486         {
487             /*0 4x4   */     ZeroOutCoeffKernel,
488             /*1 8x8   */     ZeroOutCoeffKernel,
489             /*2 16x16 */     ZeroOutCoeffKernel,
490             /*3       */     (EB_ZEROCOEFF_TYPE)PicZeroOutCoefVoidFunc,
491             /*4 32x32 */     ZeroOutCoeffKernel
492         },
493         // AVX2
494         {
495             /*0 4x4   */     ZeroOutCoeff4x4_SSE,
496             /*1 8x8   */     ZeroOutCoeff8x8_SSE2,
497             /*2 16x16 */     ZeroOutCoeff16x16_SSE2,
498             /*3       */     (EB_ZEROCOEFF_TYPE)PicZeroOutCoefVoidFunc,
499             /*4 32x32 */     ZeroOutCoeff32x32_SSE2
500         },
501 };
502 
503 static EB_FULLDIST_TYPE FUNC_TABLE FullDistortionIntrinsic_funcPtrArray[EB_ASM_TYPE_TOTAL][2][2][9] = {
504     // C_DEFAULT
505     // It was found that the SSE2 intrinsic code is much faster (~2x) than the SSE4.1 code
506     {
507         {
508             {
509                 /*0 4x4   */    FullDistortionKernelCbfZero_32bit,
510                 /*1 8x8   */    FullDistortionKernelCbfZero_32bit,
511                 /*2 16x16 */    FullDistortionKernelCbfZero_32bit,
512                 /*3       */    (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
513                 /*4 32x32 */    FullDistortionKernelCbfZero_32bit,
514                 /*5       */    (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
515                 /*6       */    (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
516                 /*7       */    (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
517                 /*8 64x64 */    FullDistortionKernelCbfZero_32bit,
518             },
519             {
520                 /*0 4x4   */    FullDistortionKernelCbfZero_32bit,
521                 /*1 8x8   */    FullDistortionKernelCbfZero_32bit,
522                 /*2 16x16 */    FullDistortionKernelCbfZero_32bit,
523                 /*3       */    (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
524                 /*4 32x32 */    FullDistortionKernelCbfZero_32bit,
525                 /*5       */    (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
526                 /*6       */    (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
527                 /*7       */    (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
528                 /*8 64x64 */    FullDistortionKernelCbfZero_32bit,
529             }
530         },
531         {
532             {
533                 /*0 4x4   */    FullDistortionKernel_32bit,
534                 /*1 8x8   */    FullDistortionKernel_32bit,
535                 /*2 16x16 */    FullDistortionKernel_32bit,
536                 /*3       */    (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
537                 /*4 32x32 */    FullDistortionKernel_32bit,
538                 /*5       */    (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
539                 /*6       */    (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
540                 /*7       */    (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
541                 /*8 64x64 */    FullDistortionKernel_32bit,
542             },
543             {
544                 /*0 4x4   */    FullDistortionKernelIntra_32bit,
545                 /*1 8x8   */    FullDistortionKernelIntra_32bit,
546                 /*2 16x16 */    FullDistortionKernelIntra_32bit,
547                 /*3       */    (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
548                 /*4 32x32 */    FullDistortionKernelIntra_32bit,
549                 /*5       */    (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
550                 /*6       */    (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
551                 /*7       */    (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
552                 /*8 64x64 */    FullDistortionKernelIntra_32bit,
553             }
554 
555         }
556     },
557     // AVX2
558     // It was found that the SSE2 intrinsic code is much faster (~2x) than the SSE4.1 code
559     {
560         {
561             {
562                 /*0 4x4   */    FullDistortionKernelCbfZero4x4_32bit_BT_SSE2,
563                 /*1 8x8   */    FullDistortionKernelCbfZero8x8_32bit_BT_SSE2,
564                 /*2 16x16 */    FullDistortionKernelCbfZero16MxN_32bit_BT_SSE2,
565                 /*3       */    (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
566                 /*4 32x32 */    FullDistortionKernelCbfZero16MxN_32bit_BT_SSE2,
567                 /*5       */    (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
568                 /*6       */    (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
569                 /*7       */    (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
570                 /*8 64x64 */    FullDistortionKernelCbfZero16MxN_32bit_BT_SSE2,
571             },
572             {
573                 /*0 4x4   */    FullDistortionKernelCbfZero4x4_32bit_BT_SSE2,
574                 /*1 8x8   */    FullDistortionKernelCbfZero8x8_32bit_BT_SSE2,
575                 /*2 16x16 */    FullDistortionKernelCbfZero16MxN_32bit_BT_SSE2,
576                 /*3       */    (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
577                 /*4 32x32 */    FullDistortionKernelCbfZero16MxN_32bit_BT_SSE2,
578                 /*5       */    (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
579                 /*6       */    (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
580                 /*7       */    (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
581                 /*8 64x64 */    FullDistortionKernelCbfZero16MxN_32bit_BT_SSE2,
582             }
583         },
584         {
585             {
586                 /*0 4x4   */    FullDistortionKernel4x4_32bit_BT_SSE2,
587                 /*1 8x8   */    FullDistortionKernel8x8_32bit_BT_SSE2,
588                 /*2 16x16 */    FullDistortionKernel16MxN_32bit_BT_SSE2,
589                 /*3       */    (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
590                 /*4 32x32 */    FullDistortionKernel16MxN_32bit_BT_SSE2,
591                 /*5       */    (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
592                 /*6       */    (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
593                 /*7       */    (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
594                 /*8 64x64 */    FullDistortionKernel16MxN_32bit_BT_SSE2,
595             },
596             {
597                 /*0 4x4   */    FullDistortionKernelIntra4x4_32bit_BT_SSE2,
598                 /*1 8x8   */    FullDistortionKernelIntra8x8_32bit_BT_SSE2,
599                 /*2 16x16 */    FullDistortionKernelIntra16MxN_32bit_BT_SSE2,
600                 /*3       */    (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
601                 /*4 32x32 */    FullDistortionKernelIntra16MxN_32bit_BT_SSE2,
602                 /*5       */    (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
603                 /*6       */    (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
604                 /*7       */    (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
605                 /*8 64x64 */    FullDistortionKernelIntra16MxN_32bit_BT_SSE2,
606             }
607         }
608     },
609 };
610 
611 static EB_SATD_TYPE FUNC_TABLE Compute8x8Satd_funcPtrArray[EB_ASM_TYPE_TOTAL] = {
612     // C_DEFAULT
613     Compute8x8Satd,
614     // ASM_AVX2
615     Compute8x8Satd_SSE4
616 };
617 
618 static EB_SATD_U8_TYPE FUNC_TABLE Compute8x8Satd_U8_funcPtrArray[EB_ASM_TYPE_TOTAL] = {
619 	// C_DEFAULT
620     Compute8x8Satd_U8,
621 	// ASM_AVX2
622 	Compute8x8Satd_U8_SSE4
623 };
624 
625 typedef EB_U64(*EB_SPATIALFULLDIST_TYPE)(
626     EB_U8   *input,
627     EB_U32   inputStride,
628     EB_U8   *recon,
629     EB_U32   reconStride,
630     EB_U32   areaWidth,
631     EB_U32   areaHeight);
632 
633 static EB_SPATIALFULLDIST_TYPE FUNC_TABLE SpatialFullDistortionKernel_funcPtrArray[EB_ASM_TYPE_TOTAL][5] = {
634     // C_DEFAULT
635     {
636         // 4x4
637         SpatialFullDistortionKernel,
638         // 8x8
639         SpatialFullDistortionKernel,
640         // 16x16
641         SpatialFullDistortionKernel,
642         // 32x32
643         SpatialFullDistortionKernel,
644         // 64x64
645         SpatialFullDistortionKernel
646     },
647     // ASM_AVX2
648     {
649         // 4x4
650         SpatialFullDistortionKernel4x4_SSSE3_INTRIN,
651         // 8x8
652         SpatialFullDistortionKernel8x8_SSSE3_INTRIN,
653         // 16x16
654         SpatialFullDistortionKernel16MxN_SSSE3_INTRIN,
655         // 32x32
656         SpatialFullDistortionKernel16MxN_SSSE3_INTRIN,
657         // 64x64
658         SpatialFullDistortionKernel16MxN_SSSE3_INTRIN
659     },
660 };
661 
662 
663 
664 
665 #ifdef __cplusplus
666 }
667 #endif
668 #endif // EbPictureOperators_h
669