1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
10 */
11
12 #include "EbDefinitions.h"
13 #include "aom_dsp_rtcd.h"
14
15 #ifdef __cplusplus
16 extern "C" {
17 #endif
/*
 * Pointer to a fixed-size high bit-depth block kernel.
 *
 * The helpers in this file call such a kernel once per sub-block, then read
 * the values it stored through `sse` and `sum`; the uint32_t return value is
 * ignored by every caller in this file.
 */
typedef uint32_t (*HighVarianceFn)(const uint16_t *src,
                                   int32_t         src_stride,
                                   const uint16_t *ref,
                                   int32_t         ref_stride,
                                   uint32_t       *sse,
                                   int32_t        *sum);
20
/*
 * Fixed-size block kernels (4x4, 8x8 and 16x16). They are only declared
 * here; their definitions are not part of this file (presumably SSE2
 * assembly/intrinsics elsewhere in the project - confirm).
 *
 * Within this file they are used as HighVarianceFn callbacks and called
 * directly by the get<S>x<S>var wrappers. Callers consume the values written
 * through `sse` and `sum` and discard the return value. The variance wrappers
 * compute `sse - ((sum * sum) >> log2(w * h))` from those outputs, so `sse`
 * is expected to be the sum of squared differences and `sum` the sum of
 * differences between `src` and `ref`.
 */
uint32_t svt_aom_highbd_calc4x4var_sse2(const uint16_t *src,
                                        int32_t         src_stride,
                                        const uint16_t *ref,
                                        int32_t         ref_stride,
                                        uint32_t       *sse,
                                        int32_t        *sum);

uint32_t svt_aom_highbd_calc8x8var_sse2(const uint16_t *src,
                                        int32_t         src_stride,
                                        const uint16_t *ref,
                                        int32_t         ref_stride,
                                        uint32_t       *sse,
                                        int32_t        *sum);

uint32_t svt_aom_highbd_calc16x16var_sse2(const uint16_t *src,
                                          int32_t         src_stride,
                                          const uint16_t *ref,
                                          int32_t         ref_stride,
                                          uint32_t       *sse,
                                          int32_t        *sum);
32
33 #ifdef __cplusplus
34 }
35 #endif // __cplusplus
/*
 * Tiles a w x h area with block_size x block_size sub-blocks, runs `var_fn`
 * on each tile and adds up the per-tile results.
 *
 * src, ref               - top-left samples of the two areas being compared
 * src_stride, ref_stride - distance between consecutive rows, in samples
 * w, h                   - area size; stepped through in block_size increments
 * sse, sum               - outputs: totals of the per-tile `sse` / `sum` values,
 *                          stored without any scaling
 * var_fn                 - kernel invoked for every tile
 * block_size             - edge length of the square tile handled by `var_fn`
 */
static void highbd_8_variance_sse2(const uint16_t *src, int32_t src_stride, const uint16_t *ref,
                                   int32_t ref_stride, int32_t w, int32_t h, uint32_t *sse,
                                   int32_t *sum, HighVarianceFn var_fn, int32_t block_size) {
    uint32_t sse_total = 0;
    int32_t  sum_total = 0;

    for (int32_t row = 0; row < h; row += block_size) {
        /* First sample of the current tile row in each buffer. */
        const uint16_t *src_row = src + src_stride * row;
        const uint16_t *ref_row = ref + ref_stride * row;

        for (int32_t col = 0; col < w; col += block_size) {
            uint32_t tile_sse;
            int32_t  tile_sum;

            var_fn(src_row + col, src_stride, ref_row + col, ref_stride, &tile_sse, &tile_sum);
            sse_total += tile_sse;
            sum_total += tile_sum;
        }
    }

    *sse = sse_total;
    *sum = sum_total;
}
59
/*
 * Tiles a w x h area with block_size x block_size sub-blocks, runs `var_fn`
 * on each tile, adds up the per-tile results and stores them rounded:
 * the sum total is rounded right by 2 bits and the sse total by 4 bits
 * (presumably to bring 10-bit statistics to the 8-bit scale - confirm).
 *
 * src, ref               - top-left samples of the two areas being compared
 * src_stride, ref_stride - distance between consecutive rows, in samples
 * w, h                   - area size; stepped through in block_size increments
 * sse, sum               - outputs: rounded totals described above
 * var_fn                 - kernel invoked for every tile
 * block_size             - edge length of the square tile handled by `var_fn`
 */
static void highbd_10_variance_sse2(const uint16_t *src, int32_t src_stride, const uint16_t *ref,
                                    int32_t ref_stride, int32_t w, int32_t h, uint32_t *sse,
                                    int32_t *sum, HighVarianceFn var_fn, int32_t block_size) {
    /* The sse total is kept in 64 bits until it has been scaled down. */
    uint64_t sse_total = 0;
    int32_t  sum_total = 0;

    for (int32_t row = 0; row < h; row += block_size) {
        /* First sample of the current tile row in each buffer. */
        const uint16_t *src_row = src + src_stride * row;
        const uint16_t *ref_row = ref + ref_stride * row;

        for (int32_t col = 0; col < w; col += block_size) {
            uint32_t tile_sse;
            int32_t  tile_sum;

            var_fn(src_row + col, src_stride, ref_row + col, ref_stride, &tile_sse, &tile_sum);
            sse_total += tile_sse;
            sum_total += tile_sum;
        }
    }

    *sum = ROUND_POWER_OF_TWO(sum_total, 2);
    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_total, 4);
}
84
/*
 * Tiles a w x h area with block_size x block_size sub-blocks, runs `var_fn`
 * on each tile, adds up the per-tile results and stores them rounded:
 * the sum total is rounded right by 4 bits and the sse total by 8 bits
 * (presumably to bring 12-bit statistics to the 8-bit scale - confirm).
 *
 * src, ref               - top-left samples of the two areas being compared
 * src_stride, ref_stride - distance between consecutive rows, in samples
 * w, h                   - area size; stepped through in block_size increments
 * sse, sum               - outputs: rounded totals described above
 * var_fn                 - kernel invoked for every tile
 * block_size             - edge length of the square tile handled by `var_fn`
 */
static void highbd_12_variance_sse2(const uint16_t *src, int32_t src_stride, const uint16_t *ref,
                                    int32_t ref_stride, int32_t w, int32_t h, uint32_t *sse,
                                    int32_t *sum, HighVarianceFn var_fn, int32_t block_size) {
    /* The sse total is kept in 64 bits until it has been scaled down. */
    uint64_t sse_total = 0;
    int32_t  sum_total = 0;

    for (int32_t row = 0; row < h; row += block_size) {
        /* First sample of the current tile row in each buffer. */
        const uint16_t *src_row = src + src_stride * row;
        const uint16_t *ref_row = ref + ref_stride * row;

        for (int32_t col = 0; col < w; col += block_size) {
            uint32_t tile_sse;
            int32_t  tile_sum;

            var_fn(src_row + col, src_stride, ref_row + col, ref_stride, &tile_sse, &tile_sum);
            sse_total += tile_sse;
            sum_total += tile_sum;
        }
    }

    *sum = ROUND_POWER_OF_TWO(sum_total, 4);
    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_total, 8);
}
109
/*
 * HIGH_GET_VAR(S) defines three wrappers around svt_aom_highbd_calc<S>x<S>var_sse2
 * that take the buffers as uint8_t pointers and convert them with
 * CONVERT_TO_SHORTPTR before calling the kernel:
 *
 *   svt_aom_highbd_get<S>x<S>var_sse2    - stores the kernel's *sse / *sum as is;
 *   svt_aom_highbd_10_get<S>x<S>var_sse2 - then rounds *sum right by 2 bits and
 *                                          *sse right by 4 bits;
 *   svt_aom_highbd_12_get<S>x<S>var_sse2 - then rounds *sum right by 4 bits and
 *                                          *sse right by 8 bits.
 *
 * The kernel's return value is discarded. The converted pointers are kept
 * const-qualified: the inputs are const and the kernel takes const pointers,
 * so there is no reason to drop the qualifier in between.
 */
#define HIGH_GET_VAR(S) \
    void svt_aom_highbd_get##S##x##S##var_sse2(const uint8_t *src8, \
                                               int32_t        src_stride, \
                                               const uint8_t *ref8, \
                                               int32_t        ref_stride, \
                                               uint32_t      *sse, \
                                               int32_t       *sum) { \
        const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
        const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
        svt_aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, sum); \
    } \
    \
    void svt_aom_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, \
                                                  int32_t        src_stride, \
                                                  const uint8_t *ref8, \
                                                  int32_t        ref_stride, \
                                                  uint32_t      *sse, \
                                                  int32_t       *sum) { \
        const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
        const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
        svt_aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, sum); \
        *sum = ROUND_POWER_OF_TWO(*sum, 2); \
        *sse = ROUND_POWER_OF_TWO(*sse, 4); \
    } \
    \
    void svt_aom_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, \
                                                  int32_t        src_stride, \
                                                  const uint8_t *ref8, \
                                                  int32_t        ref_stride, \
                                                  uint32_t      *sse, \
                                                  int32_t       *sum) { \
        const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
        const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
        svt_aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, sum); \
        *sum = ROUND_POWER_OF_TWO(*sum, 4); \
        *sse = ROUND_POWER_OF_TWO(*sse, 8); \
    }

HIGH_GET_VAR(16);
HIGH_GET_VAR(8);

#undef HIGH_GET_VAR
152
153 #define VAR_FN(w, h, block_size, shift) \
154 uint32_t svt_aom_highbd_8_variance##w##x##h##_sse2(const uint8_t *src8, \
155 int32_t src_stride, \
156 const uint8_t *ref8, \
157 int32_t ref_stride, \
158 uint32_t * sse) { \
159 int32_t sum; \
160 uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
161 uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
162 highbd_8_variance_sse2(src, \
163 src_stride, \
164 ref, \
165 ref_stride, \
166 w, \
167 h, \
168 sse, \
169 &sum, \
170 svt_aom_highbd_calc##block_size##x##block_size##var_sse2, \
171 block_size); \
172 return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \
173 } \
174 \
175 uint32_t svt_aom_highbd_10_variance##w##x##h##_sse2(const uint8_t *src8, \
176 int32_t src_stride, \
177 const uint8_t *ref8, \
178 int32_t ref_stride, \
179 uint32_t * sse) { \
180 int32_t sum; \
181 int64_t var; \
182 uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
183 uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
184 highbd_10_variance_sse2(src, \
185 src_stride, \
186 ref, \
187 ref_stride, \
188 w, \
189 h, \
190 sse, \
191 &sum, \
192 svt_aom_highbd_calc##block_size##x##block_size##var_sse2, \
193 block_size); \
194 var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
195 return (var >= 0) ? (uint32_t)var : 0; \
196 } \
197 \
198 uint32_t svt_aom_highbd_12_variance##w##x##h##_sse2(const uint8_t *src8, \
199 int32_t src_stride, \
200 const uint8_t *ref8, \
201 int32_t ref_stride, \
202 uint32_t * sse) { \
203 int32_t sum; \
204 int64_t var; \
205 uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
206 uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
207 highbd_12_variance_sse2(src, \
208 src_stride, \
209 ref, \
210 ref_stride, \
211 w, \
212 h, \
213 sse, \
214 &sum, \
215 svt_aom_highbd_calc##block_size##x##block_size##var_sse2, \
216 block_size); \
217 var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
218 return (var >= 0) ? (uint32_t)var : 0; \
219 }
220
221 VAR_FN(64, 64, 16, 12);
222 VAR_FN(64, 32, 16, 11);
223 VAR_FN(32, 64, 16, 11);
224 VAR_FN(32, 32, 16, 10);
225 VAR_FN(32, 16, 16, 9);
226 VAR_FN(16, 32, 16, 9);
227 VAR_FN(16, 16, 16, 8);
228 VAR_FN(16, 8, 8, 7);
229 VAR_FN(8, 16, 8, 7);
230 VAR_FN(8, 8, 8, 6);
231 VAR_FN(16, 4, 4, 6);
232 VAR_FN(8, 32, 8, 8);
233 VAR_FN(32, 8, 8, 8);
234 VAR_FN(16, 64, 16, 10);
235 VAR_FN(64, 16, 16, 10);
236
237 #undef VAR_FN
238
svt_aom_highbd_8_mse16x16_sse2(const uint8_t * src8,int32_t src_stride,const uint8_t * ref8,int32_t ref_stride,uint32_t * sse)239 void svt_aom_highbd_8_mse16x16_sse2(const uint8_t *src8, int32_t src_stride, const uint8_t *ref8,
240 int32_t ref_stride, uint32_t *sse) {
241 int32_t sum;
242 uint16_t *src = CONVERT_TO_SHORTPTR(src8);
243 uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
244
245 /*TODO: Remove calculate unused sum.*/
246 highbd_8_variance_sse2(
247 src, src_stride, ref, ref_stride, 16, 16, sse, &sum, svt_aom_highbd_calc16x16var_sse2, 16);
248 }
249