/*
* Copyright(c) 2018 Intel Corporation
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/

#include "EbDefinitions.h"
#include "emmintrin.h"
#include "EbComputeMean_SSE2.h"

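// Sums the squared values of rows 0, 2, 4 and 6 of an 8x8 block (the odd rows are
// intentionally skipped, see the commented-out loads below) and returns the result
// scaled so that it equals the mean of the 32 squared samples shifted left by 16 bits.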
EB_U64 ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(
    EB_U8 * inputSamples,      // input parameter, input samples Ptr
    EB_U16  inputStride)       // input parameter, input stride
{
    __m128i xmm0, xmm_blockMean, xmm_input;

    xmm0 = _mm_setzero_si128();
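
    // Each processed row: load 8 bytes, zero-extend them to 16-bit lanes, then square
    // and pairwise-add them with _mm_madd_epi16 into four 32-bit partial sums.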
    xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)inputSamples), xmm0);
    xmm_blockMean = _mm_madd_epi16(xmm_input, xmm_input);

    /*xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(inputSamples + inputStride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));*/

    xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(inputSamples + 2 * inputStride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));

    /*xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(inputSamples + 3 * inputStride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));*/

    xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(inputSamples + 4 * inputStride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));

    //xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(inputSamples + 5 * inputStride)), xmm0);
    //xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));

    xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(inputSamples + 6 * inputStride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));

    /*xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(inputSamples + 7 * inputStride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));*/

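    // Fold the four 32-bit partial sums down into the low 32-bit lane.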
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_srli_si128(xmm_blockMean, 8));
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_srli_si128(xmm_blockMean, 4));

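    // Only 32 samples were accumulated; shifting the sum left by 11 returns the
    // mean of the squared values scaled by 2^16 (sum * 65536 / 32).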
    return (EB_U64)_mm_cvtsi128_si32(xmm_blockMean) << 11;
}

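// Sums rows 0, 2, 4 and 6 of an 8x8 block (the odd rows are skipped, see the
// commented-out loads below) and returns the 32-sample mean scaled by 2^8.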
EB_U64 ComputeSubMean8x8_SSE2_INTRIN(
    EB_U8 * inputSamples,      // input parameter, input samples Ptr
    EB_U16  inputStride)       // input parameter, input stride
{
    __m128i xmm0 = _mm_setzero_si128(), xmm1, xmm3, xmm_sum1, xmm_sum2;

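    // With only the low 8 bytes loaded, _mm_sad_epu8 against zero leaves each
    // row's byte sum in the low 64-bit lane.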
    xmm1 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(inputSamples)), xmm0);
    //xmm2 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(inputSamples + inputStride)), xmm0);
    xmm3 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(inputSamples + 2 * inputStride)), xmm0);
    //xmm4 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(inputSamples + 3 * inputStride)), xmm0);
    xmm_sum1 = _mm_add_epi16(xmm1, xmm3);

    inputSamples += 4 * inputStride;
    xmm1 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(inputSamples)), xmm0);
    //xmm2 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(inputSamples + inputStride)), xmm0);
    xmm3 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(inputSamples + 2 * inputStride)), xmm0);
    //xmm4 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(inputSamples + 3 * inputStride)), xmm0);
    xmm_sum2 = _mm_add_epi16(xmm1, xmm3);
    xmm_sum2 = _mm_add_epi16(xmm_sum1, xmm_sum2);

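    // 32 samples were summed; shifting left by 3 returns the mean scaled by 2^8 (sum * 256 / 32).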
    return (EB_U64)_mm_cvtsi128_si32(xmm_sum2) << 3;
}

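// Computes the mean of squared values over a full 8x8 block; the width and height
// arguments are unused since the block size is fixed. The result is the mean of
// the 64 squared samples scaled by 2^16.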
EB_U64 ComputeMeanOfSquaredValues8x8_SSE2_INTRIN(
    EB_U8 * inputSamples,      // input parameter, input samples Ptr
    EB_U32  inputStride,       // input parameter, input stride
    EB_U32  inputAreaWidth,    // input parameter, input area width
    EB_U32  inputAreaHeight)   // input parameter, input area height
{
    __m128i xmm0, xmm_blockMean, xmm_input;
    (void)inputAreaWidth;
    (void)inputAreaHeight;

    xmm0 = _mm_setzero_si128();
    xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)inputSamples), xmm0);
    xmm_blockMean = _mm_madd_epi16(xmm_input, xmm_input);

    xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(inputSamples + inputStride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));

    xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(inputSamples + 2 * inputStride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));

    xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(inputSamples + 3 * inputStride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));

    xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(inputSamples + 4 * inputStride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));

    xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(inputSamples + 5 * inputStride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));

    xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(inputSamples + 6 * inputStride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));

    xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(inputSamples + 7 * inputStride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));

    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_srli_si128(xmm_blockMean, 8));
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_srli_si128(xmm_blockMean, 4));

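    // All 64 samples were accumulated; shifting the sum left by 10 returns the
    // mean of the squared values scaled by 2^16 (sum * 65536 / 64).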
    return (EB_U64)_mm_cvtsi128_si32(xmm_blockMean) << 10;
}