/*
* Copyright(c) 2018 Intel Corporation
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
5
6 #ifndef EbPictureOperators_h
7 #define EbPictureOperators_h
8
9 #include "EbPictureOperators_C.h"
10 #include "EbPictureOperators_SSE2.h"
11 #include "EbPictureOperators_SSE4_1.h"
12 #include "EbPictureOperators_AVX2.h"
13 #include "EbHmCode.h"
14 #include "EbDefinitions.h"
15 #include "EbPictureBufferDesc.h"
16 #include "EbSequenceControlSet.h"
17 #include "EbUnPackProcess.h"
18 #ifdef __cplusplus
19 extern "C" {
20 #endif
21
22 extern void PictureAddition(
23 EB_U8 *predPtr,
24 EB_U32 predStride,
25 EB_S16 *residualPtr,
26 EB_U32 residualStride,
27 EB_U8 *reconPtr,
28 EB_U32 reconStride,
29 EB_U32 width,
30 EB_U32 height);
31
32 void PictureAddition16bit_TEST(
33 EB_U16 *predPtr,
34 EB_U32 predStride,
35 EB_S16 *residualPtr,
36 EB_U32 residualStride,
37 EB_U16 *reconPtr,
38 EB_U32 reconStride,
39 EB_U32 width,
40 EB_U32 height);
41
42 extern EB_ERRORTYPE PictureCopy8Bit(
43 EbPictureBufferDesc_t *src,
44 EB_U32 srcLumaOriginIndex,
45 EB_U32 srcChromaOriginIndex,
46 EbPictureBufferDesc_t *dst,
47 EB_U32 dstLumaOriginIndex,
48 EB_U32 dstChromaOriginIndex,
49 EB_U32 areaWidth,
50 EB_U32 areaHeight,
51 EB_U32 chromaAreaWidth,
52 EB_U32 chromaAreaHeight,
53 EB_U32 componentMask);
54
55 extern EB_ERRORTYPE PictureFastDistortion(
56 EbPictureBufferDesc_t *input,
57 EB_U32 inputLumaOriginIndex,
58 EB_U32 inputChromaOriginIndex,
59 EbPictureBufferDesc_t *pred,
60 EB_U32 predLumaOriginIndex,
61 EB_U32 predChromaOriginIndex,
62 EB_U32 size,
63 EB_U32 componentMask,
64 EB_U64 lumaDistortion[DIST_CALC_TOTAL],
65 EB_U64 chromaDistortion[DIST_CALC_TOTAL]);
66
67 EB_ERRORTYPE PictureFullDistortion_R(
68 EbPictureBufferDesc_t *coeff,
69 EB_U32 coeffLumaOriginIndex,
70 EB_U32 coeffChromaOriginIndex,
71 EbPictureBufferDesc_t *reconCoeff,
72 EB_U32 areaSize,
73 EB_U32 chromaAreaSize,
74 EB_U32 componentMask,
75 EB_U64 lumaDistortion[DIST_CALC_TOTAL],
76 EB_U64 cbDistortion[DIST_CALC_TOTAL],
77 EB_U64 crDistortion[DIST_CALC_TOTAL],
78 EB_U32 *countNonZeroCoeffs,
79 EB_MODETYPE mode);
80
81 extern EB_ERRORTYPE PictureFullDistortionLuma(
82 EbPictureBufferDesc_t *coeff,
83 EB_U32 coeffLumaOriginIndex,
84 EbPictureBufferDesc_t *reconCoeff,
85 EB_U32 reconCoeffLumaOriginIndex,
86 EB_U32 areaSize,
87 EB_U64 lumaDistortion[DIST_CALC_TOTAL],
88 EB_U32 countNonZeroCoeffsY,
89 EB_MODETYPE mode);
90
91 extern EB_ERRORTYPE PictureFullDistortionChroma(
92 EbPictureBufferDesc_t *coeff,
93 EB_U32 coeffCbOriginIndex,
94 EB_U32 coeffCrOriginIndex,
95 EbPictureBufferDesc_t *reconCoeff,
96 EB_U32 reconCoeffCbOriginIndex,
97 EB_U32 reconCoeffCrOriginIndex,
98 EB_U32 areaSize,
99 EB_U64 cbDistortion[DIST_CALC_TOTAL],
100 EB_U64 crDistortion[DIST_CALC_TOTAL],
101 EB_U32 countNonZeroCoeffsCb,
102 EB_U32 countNonZeroCoeffsCr,
103 EB_MODETYPE mode);
104
105 extern EB_U64 ComputeNxMSatdSadLCU(
106 EB_U8 *src, // input parameter, source samples Ptr
107 EB_U32 srcStride, // input parameter, source stride
108 EB_U32 width, // input parameter, block width (N)
109 EB_U32 height); // input parameter, block height (M)
110
111 //Residual Data
112 extern void PictureResidual(
113 EB_U8 *input,
114 EB_U32 inputStride,
115 EB_U8 *pred,
116 EB_U32 predStride,
117 EB_S16 *residual,
118 EB_U32 residualStride,
119 EB_U32 areaWidth,
120 EB_U32 areaHeight);
121 extern void PictureSubSampledResidual(
122 EB_U8 *input,
123 EB_U32 inputStride,
124 EB_U8 *pred,
125 EB_U32 predStride,
126 EB_S16 *residual,
127 EB_U32 residualStride,
128 EB_U32 areaWidth,
129 EB_U32 areaHeight,
130 EB_U8 lastLine);
131 extern void PictureResidual16bit(
132 EB_U16 *input,
133 EB_U32 inputStride,
134 EB_U16 *pred,
135 EB_U32 predStride,
136 EB_S16 *residual,
137 EB_U32 residualStride,
138 EB_U32 areaWidth,
139 EB_U32 areaHeight);
140
141 void CompressedPackBlk(
142 EB_U8 *in8BitBuffer,
143 EB_U32 in8Stride,
144 EB_U8 *innBitBuffer,
145 EB_U32 innStride,
146 EB_U16 *out16BitBuffer,
147 EB_U32 outStride,
148 EB_U32 width,
149 EB_U32 height
150 );
151 void Conv2bToCPackLcu(
152 const EB_U8 *innBitBuffer,
153 EB_U32 innStride,
154 EB_U8 *inCompnBitBuffer,
155 EB_U32 outStride,
156 EB_U8 *localCache,
157 EB_U32 width,
158 EB_U32 height);
159
160 void CompressedPackLcu(
161 EB_U8 *in8BitBuffer,
162 EB_U32 in8Stride,
163 EB_U8 *innBitBuffer,
164 EB_U32 innStride,
165 EB_U16 *out16BitBuffer,
166 EB_U32 outStride,
167 EB_U32 width,
168 EB_U32 height);
169
170 void Pack2D_SRC(
171 EB_U8 *in8BitBuffer,
172 EB_U32 in8Stride,
173 EB_U8 *innBitBuffer,
174 EB_U32 innStride,
175 EB_U16 *out16BitBuffer,
176 EB_U32 outStride,
177 EB_U32 width,
178 EB_U32 height);
179
180 void* UnPack2D(void *context);
181
182 void extract8Bitdata(
183 EB_U16 *in16BitBuffer,
184 EB_U32 inStride,
185 EB_U8 *out8BitBuffer,
186 EB_U32 out8Stride,
187 EB_U32 width,
188 EB_U32 height
189 );
190 void UnpackL0L1Avg(
191 EB_U16 *ref16L0,
192 EB_U32 refL0Stride,
193 EB_U16 *ref16L1,
194 EB_U32 refL1Stride,
195 EB_U8 *dstPtr,
196 EB_U32 dstStride,
197 EB_U32 width,
198 EB_U32 height);
199 void Extract8BitdataSafeSub(
200 EB_U16 *in16BitBuffer,
201 EB_U32 inStride,
202 EB_U8 *out8BitBuffer,
203 EB_U32 out8Stride,
204 EB_U32 width,
205 EB_U32 height
206 );
207 void UnpackL0L1AvgSafeSub(
208 EB_U16 *ref16L0,
209 EB_U32 refL0Stride,
210 EB_U16 *ref16L1,
211 EB_U32 refL1Stride,
212 EB_U8 *dstPtr,
213 EB_U32 dstStride,
214 EB_U32 width,
215 EB_U32 height);
216
217 void EbHevcMemcpy16bit(
218 EB_U16 * outPtr,
219 EB_U16 * inPtr,
220 EB_U64 numOfElements );
221 void memset16bit(
222 EB_U16 * inPtr,
223 EB_U16 value,
224 EB_U64 numOfElements );
225
/* No-op placeholder kernels. They are cast into the function-pointer
 * dispatch tables below at block-size indices that are never selected,
 * so a stray lookup calls a harmless stub instead of a NULL pointer. */
static void PictureAdditionVoidFunc(void){}
static void PicCopyVoidFunc(void){}
static void PicResdVoidFunc(void){}
static void PicZeroOutCoefVoidFunc(void){}
static void FullDistortionVoidFunc(void){}
231
232 EB_S32 sumResidual( EB_S16 * inPtr,
233 EB_U32 size,
234 EB_U32 strideIn );
235
236
237 typedef EB_S32(*EB_SUM_RES)(
238 EB_S16 * inPtr,
239 EB_U32 size,
240 EB_U32 strideIn );
241
242 static EB_SUM_RES FUNC_TABLE SumResidual_funcPtrArray[EB_ASM_TYPE_TOTAL] = {
243 // C_DEFAULT
244 sumResidual,
245 // AVX2
246 sumResidual8bit_AVX2_INTRIN,
247 };
248
249 void memset16bitBlock (
250 EB_S16 * inPtr,
251 EB_U32 strideIn,
252 EB_U32 size,
253 EB_S16 value
254 );
255
256 typedef void(*EB_MEMSET16bitBLK)(
257 EB_S16 * inPtr,
258 EB_U32 strideIn,
259 EB_U32 size,
260 EB_S16 value );
261
262 static EB_MEMSET16bitBLK FUNC_TABLE memset16bitBlock_funcPtrArray[EB_ASM_TYPE_TOTAL] = {
263 // C_DEFAULT
264 memset16bitBlock,
265 // AVX2
266 memset16bitBlock_AVX2_INTRIN,
267 };
268
269 /***************************************
270 * Function Types
271 ***************************************/
272 typedef void(*EB_ADDDKERNEL_TYPE)(
273 EB_U8 *predPtr,
274 EB_U32 predStride,
275 EB_S16 *residualPtr,
276 EB_U32 residualStride,
277 EB_U8 *reconPtr,
278 EB_U32 reconStride,
279 EB_U32 width,
280 EB_U32 height);
281
282 typedef void(*EB_ADDDKERNEL_TYPE_16BIT)(
283 EB_U16 *predPtr,
284 EB_U32 predStride,
285 EB_S16 *residualPtr,
286 EB_U32 residualStride,
287 EB_U16 *reconPtr,
288 EB_U32 reconStride,
289 EB_U32 width,
290 EB_U32 height);
291
292 typedef void(*EB_PICCOPY_TYPE)(
293 EB_BYTE src,
294 EB_U32 srcStride,
295 EB_BYTE dst,
296 EB_U32 dstStride,
297 EB_U32 areaWidth,
298 EB_U32 areaHeight);
299
300
301
302 typedef void(*EB_RESDKERNEL_TYPE)(
303 EB_U8 *input,
304 EB_U32 inputStride,
305 EB_U8 *pred,
306 EB_U32 predStride,
307 EB_S16 *residual,
308 EB_U32 residualStride,
309 EB_U32 areaWidth,
310 EB_U32 areaHeight);
311
312 typedef void(*EB_RESDKERNEL_TYPE_16BIT)(
313 EB_U16 *input,
314 EB_U32 inputStride,
315 EB_U16 *pred,
316 EB_U32 predStride,
317 EB_S16 *residual,
318 EB_U32 residualStride,
319 EB_U32 areaWidth,
320 EB_U32 areaHeight);
321
322 typedef void(*EB_ZEROCOEFF_TYPE)(
323 EB_S16* coeffbuffer,
324 EB_U32 coeffStride,
325 EB_U32 coeffOriginIndex,
326 EB_U32 areaWidth,
327 EB_U32 areaHeight);
328
329 typedef void(*EB_FULLDIST_TYPE)(
330 EB_S16 *coeff,
331 EB_U32 coeffStride,
332 EB_S16 *reconCoeff,
333 EB_U32 reconCoeffStride,
334 EB_U64 distortionResult[DIST_CALC_TOTAL],
335 EB_U32 areaWidth,
336 EB_U32 areaHeight);
337
338 typedef EB_U64(*EB_SATD_TYPE)(
339 EB_S16 *diff);
340
341 typedef EB_U64(*EB_SATD_U8_TYPE)(
342 EB_U8 *diff,
343 EB_U64 *dcValue,
344 EB_U32 srcStride);
345
346 /***************************************
347 * Function Tables
348 ***************************************/
349 static EB_ADDDKERNEL_TYPE FUNC_TABLE AdditionKernel_funcPtrArray[EB_ASM_TYPE_TOTAL][9] = {
350 // C_DEFAULT
351 {
352 /*0 4x4 */ PictureAdditionKernel,
353 /*1 8x8 */ PictureAdditionKernel,
354 /*2 16x16 */ PictureAdditionKernel,
355 /*3 */ (EB_ADDDKERNEL_TYPE)PictureAdditionVoidFunc,
356 /*4 32x32 */ PictureAdditionKernel,
357 /*5 */ (EB_ADDDKERNEL_TYPE)PictureAdditionVoidFunc,
358 /*6 */ (EB_ADDDKERNEL_TYPE)PictureAdditionVoidFunc,
359 /*7 */ (EB_ADDDKERNEL_TYPE)PictureAdditionVoidFunc,
360 /*8 64x64 */ PictureAdditionKernel
361 },
362 // AVX2
363 {
364 /*0 4x4 */ PictureAdditionKernel4x4_SSE_INTRIN,
365 /*1 8x8 */ PictureAdditionKernel8x8_SSE2_INTRIN,
366 /*2 16x16 */ PictureAdditionKernel16x16_SSE2_INTRIN,
367 /*3 */ (EB_ADDDKERNEL_TYPE)PictureAdditionVoidFunc,
368 /*4 32x32 */ PictureAdditionKernel32x32_SSE2_INTRIN,
369 /*5 */ (EB_ADDDKERNEL_TYPE)PictureAdditionVoidFunc,
370 /*6 */ (EB_ADDDKERNEL_TYPE)PictureAdditionVoidFunc,
371 /*7 */ (EB_ADDDKERNEL_TYPE)PictureAdditionVoidFunc,
372 /*8 64x64 */ PictureAdditionKernel64x64_SSE2_INTRIN,
373 },
374 };
375
376 static EB_ADDDKERNEL_TYPE_16BIT FUNC_TABLE AdditionKernel_funcPtrArray16bit[EB_ASM_TYPE_TOTAL] = {
377 // C_DEFAULT
378 PictureAdditionKernel16bit,
379 // AVX2
380 PictureAdditionKernel16bit_SSE2_INTRIN,
381 };
382
383 static EB_PICCOPY_TYPE FUNC_TABLE PicCopyKernel_funcPtrArray[EB_ASM_TYPE_TOTAL][9] = {
384 // C_DEFAULT
385 {
386 /*0 4x4 */ CopyKernel8Bit,
387 /*1 8x8 */ CopyKernel8Bit,
388 /*2 16x16 */ CopyKernel8Bit,
389 /*3 */ (EB_PICCOPY_TYPE)PicCopyVoidFunc,
390 /*4 32x32 */ CopyKernel8Bit,
391 /*5 */ (EB_PICCOPY_TYPE)PicCopyVoidFunc,
392 /*6 */ (EB_PICCOPY_TYPE)PicCopyVoidFunc,
393 /*7 */ (EB_PICCOPY_TYPE)PicCopyVoidFunc,
394 /*8 64x64 */ CopyKernel8Bit
395 },
396 // AVX2
397 {
398 /*0 4x4 */ PictureCopyKernel4x4_SSE_INTRIN,
399 /*1 8x8 */ PictureCopyKernel8x8_SSE2_INTRIN,
400 /*2 16x16 */ PictureCopyKernel16x16_SSE2_INTRIN,
401 /*3 */ (EB_PICCOPY_TYPE)PicCopyVoidFunc,
402 /*4 32x32 */ PictureCopyKernel32x32_SSE2_INTRIN,
403 /*5 */ (EB_PICCOPY_TYPE)PicCopyVoidFunc,
404 /*6 */ (EB_PICCOPY_TYPE)PicCopyVoidFunc,
405 /*7 */ (EB_PICCOPY_TYPE)PicCopyVoidFunc,
406 /*8 64x64 */ PictureCopyKernel64x64_SSE2_INTRIN,
407 },
408 };
409
410
411 typedef void(*EB_RESDKERNELSUBSAMPLED_TYPE)(
412 EB_U8 *input,
413 EB_U32 inputStride,
414 EB_U8 *pred,
415 EB_U32 predStride,
416 EB_S16 *residual,
417 EB_U32 residualStride,
418 EB_U32 areaWidth,
419 EB_U32 areaHeight ,
420 EB_U8 lastLine
421 );
422 static EB_RESDKERNELSUBSAMPLED_TYPE FUNC_TABLE ResidualKernelSubSampled_funcPtrArray[EB_ASM_TYPE_TOTAL][9] = {
423 // C_DEFAULT
424 {
425 /*0 4x4 */ ResidualKernelSubSampled,
426 /*1 8x8 */ ResidualKernelSubSampled,
427 /*2 16x16 */ ResidualKernelSubSampled,
428 /*3 */ (EB_RESDKERNELSUBSAMPLED_TYPE)PicResdVoidFunc,
429 /*4 32x32 */ ResidualKernelSubSampled,
430 /*5 */ (EB_RESDKERNELSUBSAMPLED_TYPE)PicResdVoidFunc,
431 /*6 */ (EB_RESDKERNELSUBSAMPLED_TYPE)PicResdVoidFunc,
432 /*7 */ (EB_RESDKERNELSUBSAMPLED_TYPE)PicResdVoidFunc,
433 /*8 64x64 */ ResidualKernelSubSampled
434 },
435 // AVX2
436 {
437 /*0 4x4 */ ResidualKernelSubSampled4x4_SSE_INTRIN,
438 /*1 8x8 */ ResidualKernelSubSampled8x8_SSE2_INTRIN,
439 /*2 16x16 */ ResidualKernelSubSampled16x16_SSE2_INTRIN,
440 /*3 */ (EB_RESDKERNELSUBSAMPLED_TYPE)PicResdVoidFunc,
441 /*4 32x32 */ ResidualKernelSubSampled32x32_SSE2_INTRIN,
442 /*5 */ (EB_RESDKERNELSUBSAMPLED_TYPE)PicResdVoidFunc,
443 /*6 */ (EB_RESDKERNELSUBSAMPLED_TYPE)PicResdVoidFunc,
444 /*7 */ (EB_RESDKERNELSUBSAMPLED_TYPE)PicResdVoidFunc,
445 /*8 64x64 */ ResidualKernelSubSampled64x64_SSE2_INTRIN,
446 },
447 };
448
449 static EB_RESDKERNEL_TYPE FUNC_TABLE ResidualKernel_funcPtrArray[EB_ASM_TYPE_TOTAL][9] = {
450 // C_DEFAULT
451 {
452 /*0 4x4 */ ResidualKernel,
453 /*1 8x8 */ ResidualKernel,
454 /*2 16x16 */ ResidualKernel,
455 /*3 */ (EB_RESDKERNEL_TYPE)PicResdVoidFunc,
456 /*4 32x32 */ ResidualKernel,
457 /*5 */ (EB_RESDKERNEL_TYPE)PicResdVoidFunc,
458 /*6 */ (EB_RESDKERNEL_TYPE)PicResdVoidFunc,
459 /*7 */ (EB_RESDKERNEL_TYPE)PicResdVoidFunc,
460 /*8 64x64 */ ResidualKernel
461 },
462 // AVX2
463 {
464 /*0 4x4 */ ResidualKernel4x4_SSE_INTRIN,
465 /*1 8x8 */ ResidualKernel8x8_SSE2_INTRIN,
466 /*2 16x16 */ ResidualKernel16x16_SSE2_INTRIN,
467 /*3 */ (EB_RESDKERNEL_TYPE)PicResdVoidFunc,
468 /*4 32x32 */ ResidualKernel32x32_SSE2_INTRIN,
469 /*5 */ (EB_RESDKERNEL_TYPE)PicResdVoidFunc,
470 /*6 */ (EB_RESDKERNEL_TYPE)PicResdVoidFunc,
471 /*7 */ (EB_RESDKERNEL_TYPE)PicResdVoidFunc,
472 /*8 64x64 */ ResidualKernel64x64_SSE2_INTRIN,
473 },
474 };
475
476
477 static EB_RESDKERNEL_TYPE_16BIT FUNC_TABLE ResidualKernel_funcPtrArray16Bit[EB_ASM_TYPE_TOTAL] = {
478 // C_DEFAULT
479 ResidualKernel16bit,
480 // AVX2
481 ResidualKernel16bit_SSE2_INTRIN
482 };
483
484 static EB_ZEROCOEFF_TYPE FUNC_TABLE PicZeroOutCoef_funcPtrArray[EB_ASM_TYPE_TOTAL][5] = {
485 // C_DEFAULT
486 {
487 /*0 4x4 */ ZeroOutCoeffKernel,
488 /*1 8x8 */ ZeroOutCoeffKernel,
489 /*2 16x16 */ ZeroOutCoeffKernel,
490 /*3 */ (EB_ZEROCOEFF_TYPE)PicZeroOutCoefVoidFunc,
491 /*4 32x32 */ ZeroOutCoeffKernel
492 },
493 // AVX2
494 {
495 /*0 4x4 */ ZeroOutCoeff4x4_SSE,
496 /*1 8x8 */ ZeroOutCoeff8x8_SSE2,
497 /*2 16x16 */ ZeroOutCoeff16x16_SSE2,
498 /*3 */ (EB_ZEROCOEFF_TYPE)PicZeroOutCoefVoidFunc,
499 /*4 32x32 */ ZeroOutCoeff32x32_SSE2
500 },
501 };
502
503 static EB_FULLDIST_TYPE FUNC_TABLE FullDistortionIntrinsic_funcPtrArray[EB_ASM_TYPE_TOTAL][2][2][9] = {
504 // C_DEFAULT
505 // It was found that the SSE2 intrinsic code is much faster (~2x) than the SSE4.1 code
506 {
507 {
508 {
509 /*0 4x4 */ FullDistortionKernelCbfZero_32bit,
510 /*1 8x8 */ FullDistortionKernelCbfZero_32bit,
511 /*2 16x16 */ FullDistortionKernelCbfZero_32bit,
512 /*3 */ (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
513 /*4 32x32 */ FullDistortionKernelCbfZero_32bit,
514 /*5 */ (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
515 /*6 */ (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
516 /*7 */ (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
517 /*8 64x64 */ FullDistortionKernelCbfZero_32bit,
518 },
519 {
520 /*0 4x4 */ FullDistortionKernelCbfZero_32bit,
521 /*1 8x8 */ FullDistortionKernelCbfZero_32bit,
522 /*2 16x16 */ FullDistortionKernelCbfZero_32bit,
523 /*3 */ (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
524 /*4 32x32 */ FullDistortionKernelCbfZero_32bit,
525 /*5 */ (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
526 /*6 */ (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
527 /*7 */ (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
528 /*8 64x64 */ FullDistortionKernelCbfZero_32bit,
529 }
530 },
531 {
532 {
533 /*0 4x4 */ FullDistortionKernel_32bit,
534 /*1 8x8 */ FullDistortionKernel_32bit,
535 /*2 16x16 */ FullDistortionKernel_32bit,
536 /*3 */ (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
537 /*4 32x32 */ FullDistortionKernel_32bit,
538 /*5 */ (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
539 /*6 */ (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
540 /*7 */ (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
541 /*8 64x64 */ FullDistortionKernel_32bit,
542 },
543 {
544 /*0 4x4 */ FullDistortionKernelIntra_32bit,
545 /*1 8x8 */ FullDistortionKernelIntra_32bit,
546 /*2 16x16 */ FullDistortionKernelIntra_32bit,
547 /*3 */ (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
548 /*4 32x32 */ FullDistortionKernelIntra_32bit,
549 /*5 */ (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
550 /*6 */ (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
551 /*7 */ (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
552 /*8 64x64 */ FullDistortionKernelIntra_32bit,
553 }
554
555 }
556 },
557 // AVX2
558 // It was found that the SSE2 intrinsic code is much faster (~2x) than the SSE4.1 code
559 {
560 {
561 {
562 /*0 4x4 */ FullDistortionKernelCbfZero4x4_32bit_BT_SSE2,
563 /*1 8x8 */ FullDistortionKernelCbfZero8x8_32bit_BT_SSE2,
564 /*2 16x16 */ FullDistortionKernelCbfZero16MxN_32bit_BT_SSE2,
565 /*3 */ (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
566 /*4 32x32 */ FullDistortionKernelCbfZero16MxN_32bit_BT_SSE2,
567 /*5 */ (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
568 /*6 */ (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
569 /*7 */ (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
570 /*8 64x64 */ FullDistortionKernelCbfZero16MxN_32bit_BT_SSE2,
571 },
572 {
573 /*0 4x4 */ FullDistortionKernelCbfZero4x4_32bit_BT_SSE2,
574 /*1 8x8 */ FullDistortionKernelCbfZero8x8_32bit_BT_SSE2,
575 /*2 16x16 */ FullDistortionKernelCbfZero16MxN_32bit_BT_SSE2,
576 /*3 */ (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
577 /*4 32x32 */ FullDistortionKernelCbfZero16MxN_32bit_BT_SSE2,
578 /*5 */ (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
579 /*6 */ (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
580 /*7 */ (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
581 /*8 64x64 */ FullDistortionKernelCbfZero16MxN_32bit_BT_SSE2,
582 }
583 },
584 {
585 {
586 /*0 4x4 */ FullDistortionKernel4x4_32bit_BT_SSE2,
587 /*1 8x8 */ FullDistortionKernel8x8_32bit_BT_SSE2,
588 /*2 16x16 */ FullDistortionKernel16MxN_32bit_BT_SSE2,
589 /*3 */ (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
590 /*4 32x32 */ FullDistortionKernel16MxN_32bit_BT_SSE2,
591 /*5 */ (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
592 /*6 */ (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
593 /*7 */ (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
594 /*8 64x64 */ FullDistortionKernel16MxN_32bit_BT_SSE2,
595 },
596 {
597 /*0 4x4 */ FullDistortionKernelIntra4x4_32bit_BT_SSE2,
598 /*1 8x8 */ FullDistortionKernelIntra8x8_32bit_BT_SSE2,
599 /*2 16x16 */ FullDistortionKernelIntra16MxN_32bit_BT_SSE2,
600 /*3 */ (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
601 /*4 32x32 */ FullDistortionKernelIntra16MxN_32bit_BT_SSE2,
602 /*5 */ (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
603 /*6 */ (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
604 /*7 */ (EB_FULLDIST_TYPE)FullDistortionVoidFunc,
605 /*8 64x64 */ FullDistortionKernelIntra16MxN_32bit_BT_SSE2,
606 }
607 }
608 },
609 };
610
611 static EB_SATD_TYPE FUNC_TABLE Compute8x8Satd_funcPtrArray[EB_ASM_TYPE_TOTAL] = {
612 // C_DEFAULT
613 Compute8x8Satd,
614 // ASM_AVX2
615 Compute8x8Satd_SSE4
616 };
617
618 static EB_SATD_U8_TYPE FUNC_TABLE Compute8x8Satd_U8_funcPtrArray[EB_ASM_TYPE_TOTAL] = {
619 // C_DEFAULT
620 Compute8x8Satd_U8,
621 // ASM_AVX2
622 Compute8x8Satd_U8_SSE4
623 };
624
625 typedef EB_U64(*EB_SPATIALFULLDIST_TYPE)(
626 EB_U8 *input,
627 EB_U32 inputStride,
628 EB_U8 *recon,
629 EB_U32 reconStride,
630 EB_U32 areaWidth,
631 EB_U32 areaHeight);
632
633 static EB_SPATIALFULLDIST_TYPE FUNC_TABLE SpatialFullDistortionKernel_funcPtrArray[EB_ASM_TYPE_TOTAL][5] = {
634 // C_DEFAULT
635 {
636 // 4x4
637 SpatialFullDistortionKernel,
638 // 8x8
639 SpatialFullDistortionKernel,
640 // 16x16
641 SpatialFullDistortionKernel,
642 // 32x32
643 SpatialFullDistortionKernel,
644 // 64x64
645 SpatialFullDistortionKernel
646 },
647 // ASM_AVX2
648 {
649 // 4x4
650 SpatialFullDistortionKernel4x4_SSSE3_INTRIN,
651 // 8x8
652 SpatialFullDistortionKernel8x8_SSSE3_INTRIN,
653 // 16x16
654 SpatialFullDistortionKernel16MxN_SSSE3_INTRIN,
655 // 32x32
656 SpatialFullDistortionKernel16MxN_SSSE3_INTRIN,
657 // 64x64
658 SpatialFullDistortionKernel16MxN_SSSE3_INTRIN
659 },
660 };
661
662
663
664
665 #ifdef __cplusplus
666 }
667 #endif
668 #endif // EbPictureOperators_h
669