/*
* Copyright(c) 2018 Intel Corporation
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/

#include "EbDefinitions.h"
#include <emmintrin.h>
#include "EbComputeMean_SSE2.h"
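
/*
* Sub-sampled sum of squared values over an 8x8 block: rows 0, 2, 4 and 6
* are squared and accumulated (a 2:1 vertical subsample, 32 samples), and
* the sum is returned << 11. Since sum * 2^11 == (sum / 32) * 2^16, the
* result is the exact block mean of the squared values in Q16 fixed point.
*/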
EB_U64 ComputeSubdMeanOfSquaredValues8x8_SSE2_INTRIN(
    EB_U8 *  inputSamples,      // input parameter, input samples Ptr
    EB_U16   inputStride)       // input parameter, input stride
{
    __m128i xmm0, xmm_blockMean, xmm_input;

    xmm0 = _mm_setzero_si128();

    // Row 0: widen 8 bytes to 16 bits, then square-and-accumulate pairs
    xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)inputSamples), xmm0);
    xmm_blockMean = _mm_madd_epi16(xmm_input, xmm_input);

    // Odd rows (1, 3, 5, 7) are intentionally skipped: 2:1 vertical subsampling
    /*xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(inputSamples + inputStride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));*/

    xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(inputSamples + 2 * inputStride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));

    /*xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(inputSamples + 3 * inputStride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));*/

    xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(inputSamples + 4 * inputStride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));

    /*xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(inputSamples + 5 * inputStride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));*/

    xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(inputSamples + 6 * inputStride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));

    /*xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(inputSamples + 7 * inputStride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));*/

    // Reduce the four 32-bit partial sums to a single sum
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_srli_si128(xmm_blockMean, 8));
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_srli_si128(xmm_blockMean, 4));

    // sum * 2^11 == (sum / 32) * 2^16: the Q16 mean of the 32 squared samples
    return (EB_U64)_mm_cvtsi128_si32(xmm_blockMean) << 11;
}
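
/*
* Sub-sampled sum over an 8x8 block: rows 0, 2, 4 and 6 are summed with
* _mm_sad_epu8 against zero (32 samples) and the sum is returned << 3.
* Since sum * 2^3 == (sum / 32) * 2^8, the result is the exact block mean
* in Q8 fixed point.
*/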
EB_U64 ComputeSubMean8x8_SSE2_INTRIN(
    EB_U8 *  inputSamples,      // input parameter, input samples Ptr
    EB_U16   inputStride)       // input parameter, input stride
{
    __m128i xmm0 = _mm_setzero_si128(), xmm1, xmm3, xmm_sum1, xmm_sum2;

    // Rows 0 and 2: _mm_sad_epu8 against zero yields the sum of the 8 bytes
    xmm1 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(inputSamples)), xmm0);
    //xmm2 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(inputSamples + inputStride)), xmm0);
    xmm3 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(inputSamples + 2 * inputStride)), xmm0);
    //xmm4 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(inputSamples + 3 * inputStride)), xmm0);
    xmm_sum1 = _mm_add_epi16(xmm1, xmm3);

    // Rows 4 and 6 (odd rows are skipped: 2:1 vertical subsampling)
    inputSamples += 4 * inputStride;
    xmm1 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(inputSamples)), xmm0);
    //xmm2 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(inputSamples + inputStride)), xmm0);
    xmm3 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(inputSamples + 2 * inputStride)), xmm0);
    //xmm4 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(inputSamples + 3 * inputStride)), xmm0);
    xmm_sum2 = _mm_add_epi16(xmm1, xmm3);
    xmm_sum2 = _mm_add_epi16(xmm_sum1, xmm_sum2);

    // sum * 2^3 == (sum / 32) * 2^8: the Q8 mean of the 32 samples
    return (EB_U64)_mm_cvtsi128_si32(xmm_sum2) << 3;
}
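
/*
* Full (non-subsampled) mean of squared values over an 8x8 block: all eight
* rows are squared and accumulated (64 samples) and the sum is returned
* << 10. Since sum * 2^10 == (sum / 64) * 2^16, the result is the exact
* block mean of the squared values in Q16 fixed point. inputAreaWidth and
* inputAreaHeight are unused; the kernel is hard-wired to 8x8.
*/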
EB_U64 ComputeMeanOfSquaredValues8x8_SSE2_INTRIN(
    EB_U8 *  inputSamples,      // input parameter, input samples Ptr
    EB_U32   inputStride,       // input parameter, input stride
    EB_U32   inputAreaWidth,    // input parameter, input area width
    EB_U32   inputAreaHeight)   // input parameter, input area height
{
    __m128i xmm0, xmm_blockMean, xmm_input;
    (void)inputAreaWidth;
    (void)inputAreaHeight;
    xmm0 = _mm_setzero_si128();

    // Row 0: widen 8 bytes to 16 bits, then square-and-accumulate pairs
    xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)inputSamples), xmm0);
    xmm_blockMean = _mm_madd_epi16(xmm_input, xmm_input);

    // Rows 1..7
    xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(inputSamples + inputStride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));

    xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(inputSamples + 2 * inputStride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));

    xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(inputSamples + 3 * inputStride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));

    xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(inputSamples + 4 * inputStride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));

    xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(inputSamples + 5 * inputStride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));

    xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(inputSamples + 6 * inputStride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));

    xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(inputSamples + 7 * inputStride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));

    // Reduce the four 32-bit partial sums to a single sum
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_srli_si128(xmm_blockMean, 8));
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_srli_si128(xmm_blockMean, 4));

    // sum * 2^10 == (sum / 64) * 2^16: the Q16 mean of the 64 squared samples
    return (EB_U64)_mm_cvtsi128_si32(xmm_blockMean) << 10;
}
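
/*
* Illustrative sketch only (not part of the original file): the Q16 results
* above combine into an 8x8 variance via E[x^2] - E[x]^2. The helper name
* below is hypothetical, and a plain-C mean is used so the sketch stays
* self-contained. All intermediate values fit comfortably in 64 bits.
*/
static EB_U64 Compute8x8VarianceSketch(
    EB_U8 *  inputSamples,      // top-left sample of the 8x8 block
    EB_U32   inputStride)       // stride in samples
{
    EB_U32 row, col;
    EB_U64 sum = 0;
    EB_U64 meanQ16, meanOfSquaresQ16;

    // Plain-C block mean, scaled to match the convention above:
    // sum * 2^10 == (sum / 64) * 2^16, i.e. the exact mean in Q16
    for (row = 0; row < 8; ++row) {
        for (col = 0; col < 8; ++col) {
            sum += inputSamples[row * inputStride + col];
        }
    }
    meanQ16 = sum << 10;

    meanOfSquaresQ16 = ComputeMeanOfSquaredValues8x8_SSE2_INTRIN(
        inputSamples, inputStride, 8, 8);

    // var = E[x^2] - E[x]^2; meanQ16 * meanQ16 is Q32, so shift back to Q16
    return meanOfSquaresQ16 - ((meanQ16 * meanQ16) >> 16);
}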