/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <immintrin.h>

#include "config/aom_config.h"

#include "aom_ports/mem.h"
#include "aom/aom_integer.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/obmc_intrinsic_sse4.h"
#include "aom_dsp/x86/synonyms.h"
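
// The kernels below compute the variance of an OBMC (overlapped block motion
// compensation) prediction: per pixel, diff = wsrc - mask * pre is formed and
// rounded by 12 bits (wsrc and mask are pre-scaled by AV1's 64 * 64 = 4096
// OBMC blending weights, hence the shift), then sum(diff) and sum(diff^2) are
// accumulated so the wrappers can apply the usual variance formula
// sse - sum^2 / (w * h).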

////////////////////////////////////////////////////////////////////////////////
// 8 bit
////////////////////////////////////////////////////////////////////////////////

void aom_var_filter_block2d_bil_first_pass_ssse3(
    const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
    unsigned int pixel_step, unsigned int output_height,
    unsigned int output_width, const uint8_t *filter);

void aom_var_filter_block2d_bil_second_pass_ssse3(
    const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
    unsigned int pixel_step, unsigned int output_height,
    unsigned int output_width, const uint8_t *filter);
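
// These are the SSSE3 bilinear filter passes shared with the regular
// sub-pixel variance code (defined alongside the SSSE3 variance
// implementation); the sub-pixel OBMC variants below reuse them.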

static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
                                     const int32_t *wsrc, const int32_t *mask,
                                     unsigned int *const sse, int *const sum,
                                     const int w, const int h) {
  const int pre_step = pre_stride - w;
  int n = 0;
  __m128i v_sum_d = _mm_setzero_si128();
  __m128i v_sse_d = _mm_setzero_si128();

  assert(w >= 8);
  assert(IS_POWER_OF_TWO(w));
  assert(IS_POWER_OF_TWO(h));

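  // Each iteration consumes 8 pixels as two groups of 4, widened from bytes
  // to 32-bit lanes so the multiply against the 32-bit mask can use pmaddwd.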
  do {
    const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
    const __m128i v_m1_d = xx_load_128(mask + n + 4);
    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
    const __m128i v_p0_b = xx_loadl_32(pre + n);
    const __m128i v_m0_d = xx_load_128(mask + n);
    const __m128i v_w0_d = xx_load_128(wsrc + n);

    const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
    const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);

    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);

    const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12);
    const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12);
    const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d);
    const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);

    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d);
    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);

    n += 8;

    if (n % w == 0) pre += pre_step;
  } while (n < w * h);

  *sum = xx_hsum_epi32_si32(v_sum_d);
  *sse = xx_hsum_epi32_si32(v_sse_d);
}

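// Each instantiation computes variance = sse - sum^2 / (W * H). W is a macro
// argument, so the W == 4 branch (which uses the shared obmc_variance_w4
// helper from obmc_intrinsic_sse4.h) is resolved at compile time.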
#define OBMCVARWXH(W, H)                                               \
  unsigned int aom_obmc_variance##W##x##H##_sse4_1(                    \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,         \
      const int32_t *mask, unsigned int *sse) {                        \
    int sum;                                                           \
    if (W == 4) {                                                      \
      obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H);     \
    } else {                                                           \
      obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \
    }                                                                  \
    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));      \
  }

OBMCVARWXH(128, 128)
OBMCVARWXH(128, 64)
OBMCVARWXH(64, 128)
OBMCVARWXH(64, 64)
OBMCVARWXH(64, 32)
OBMCVARWXH(32, 64)
OBMCVARWXH(32, 32)
OBMCVARWXH(32, 16)
OBMCVARWXH(16, 32)
OBMCVARWXH(16, 16)
OBMCVARWXH(16, 8)
OBMCVARWXH(8, 16)
OBMCVARWXH(8, 8)
OBMCVARWXH(8, 4)
OBMCVARWXH(4, 8)
OBMCVARWXH(4, 4)
OBMCVARWXH(4, 16)
OBMCVARWXH(16, 4)
OBMCVARWXH(8, 32)
OBMCVARWXH(32, 8)
OBMCVARWXH(16, 64)
OBMCVARWXH(64, 16)

#include "config/aom_dsp_rtcd.h"

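// Sub-pixel variants: interpolate the prediction at (xoffset, yoffset) with
// the two-tap bilinear filters (a horizontal pass into an intermediate
// 16-bit buffer, then a vertical pass back down to 8 bits), then run the
// full-pel OBMC variance on the filtered block.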
#define OBMC_SUBPIX_VAR(W, H)                                                \
  uint32_t aom_obmc_sub_pixel_variance##W##x##H##_sse4_1(                    \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,          \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {         \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint8_t temp2[H * W];                                                    \
                                                                             \
    aom_var_filter_block2d_bil_first_pass_ssse3(                             \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_var_filter_block2d_bil_second_pass_ssse3(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_obmc_variance##W##x##H##_sse4_1(temp2, W, wsrc, mask, sse);   \
  }

OBMC_SUBPIX_VAR(128, 128)
OBMC_SUBPIX_VAR(128, 64)
OBMC_SUBPIX_VAR(64, 128)
OBMC_SUBPIX_VAR(64, 64)
OBMC_SUBPIX_VAR(64, 32)
OBMC_SUBPIX_VAR(32, 64)
OBMC_SUBPIX_VAR(32, 32)
OBMC_SUBPIX_VAR(32, 16)
OBMC_SUBPIX_VAR(16, 32)
OBMC_SUBPIX_VAR(16, 16)
OBMC_SUBPIX_VAR(16, 8)
OBMC_SUBPIX_VAR(8, 16)
OBMC_SUBPIX_VAR(8, 8)
OBMC_SUBPIX_VAR(8, 4)
OBMC_SUBPIX_VAR(4, 8)
OBMC_SUBPIX_VAR(4, 4)
OBMC_SUBPIX_VAR(4, 16)
OBMC_SUBPIX_VAR(16, 4)
OBMC_SUBPIX_VAR(8, 32)
OBMC_SUBPIX_VAR(32, 8)
OBMC_SUBPIX_VAR(16, 64)
OBMC_SUBPIX_VAR(64, 16)

////////////////////////////////////////////////////////////////////////////////
// High bit-depth
////////////////////////////////////////////////////////////////////////////////
#if CONFIG_AV1_HIGHBITDEPTH
static INLINE void hbd_obmc_variance_w4(
    const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
    const int32_t *mask, uint64_t *const sse, int64_t *const sum,
    const int h) {
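  // pre8 actually points at 16-bit pixels wrapped with CONVERT_TO_BYTEPTR();
  // unwrap it before use.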
  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
  const int pre_step = pre_stride - 4;
  int n = 0;
  __m128i v_sum_d = _mm_setzero_si128();
  __m128i v_sse_d = _mm_setzero_si128();

  assert(IS_POWER_OF_TWO(h));

  do {
    const __m128i v_p_w = xx_loadl_64(pre + n);
    const __m128i v_m_d = xx_load_128(mask + n);
    const __m128i v_w_d = xx_load_128(wsrc + n);

    const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);

    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
    const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
    const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);

    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);

    n += 4;

    if (n % 4 == 0) pre += pre_step;
  } while (n < 4 * h);

  *sum = xx_hsum_epi32_si32(v_sum_d);
  *sse = xx_hsum_epi32_si32(v_sse_d);
}

static INLINE void hbd_obmc_variance_w8n(
    const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
    const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int w,
    const int h) {
  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
  const int pre_step = pre_stride - w;
  int n = 0;
  __m128i v_sum_d = _mm_setzero_si128();
  __m128i v_sse_d = _mm_setzero_si128();

  assert(w >= 8);
  assert(IS_POWER_OF_TWO(w));
  assert(IS_POWER_OF_TWO(h));

  do {
    const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
    const __m128i v_m1_d = xx_load_128(mask + n + 4);
    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
    const __m128i v_p0_w = xx_loadl_64(pre + n);
    const __m128i v_m0_d = xx_load_128(mask + n);
    const __m128i v_w0_d = xx_load_128(wsrc + n);

    const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
    const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);

    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);

    const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12);
    const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12);
    const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d);
    const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);

    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d);
    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);

    n += 8;

    if (n % w == 0) pre += pre_step;
  } while (n < w * h);

  *sum += xx_hsum_epi32_si64(v_sum_d);
  *sse += xx_hsum_epi32_si64(v_sse_d);
}

static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
                                        const int32_t *wsrc,
                                        const int32_t *mask, int w, int h,
                                        unsigned int *sse, int *sum) {
  int64_t sum64 = 0;
  uint64_t sse64 = 0;
  if (w == 4) {
    hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
  } else {
    hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
  }
  *sum = (int)sum64;
  *sse = (unsigned int)sse64;
}

static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
                                           const int32_t *wsrc,
                                           const int32_t *mask, int w, int h,
                                           unsigned int *sse, int *sum) {
  int64_t sum64 = 0;
  uint64_t sse64 = 0;
  if (w == 4) {
    hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
  } else if (w < 128 || h < 128) {
    hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
  } else {
    assert(w == 128 && h == 128);

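    // Split 128x128 blocks into 64-row slices: the w8n kernel accumulates
    // squared differences in signed 32-bit lanes, which could overflow over
    // a full 128x128 high bit-depth block.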
    do {
      hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w,
                            64);
      pre8 += 64 * pre_stride;
      wsrc += 64 * w;
      mask += 64 * w;
      h -= 64;
    } while (h > 0);
  }
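  // Normalize back to the 8-bit domain: for 10-bit input the sum carries 2
  // extra bits of precision and the sse 4 extra bits.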
  *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
}

static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
                                           const int32_t *wsrc,
                                           const int32_t *mask, int w, int h,
                                           unsigned int *sse, int *sum) {
  int64_t sum64 = 0;
  uint64_t sse64 = 0;
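  // At 12 bits the squared differences are larger, so cap the number of
  // pixels accumulated per hbd_obmc_variance_w8n call to keep its 32-bit
  // lanes from overflowing.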
  int max_pel_allowed_per_ovf = 512;
  if (w == 4) {
    hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
  } else if (w * h <= max_pel_allowed_per_ovf) {
    hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
  } else {
    int h_per_ovf = max_pel_allowed_per_ovf / w;

    assert(max_pel_allowed_per_ovf % w == 0);
    do {
      hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w,
                            h_per_ovf);
      pre8 += h_per_ovf * pre_stride;
      wsrc += h_per_ovf * w;
      mask += h_per_ovf * w;
      h -= h_per_ovf;
    } while (h > 0);
  }
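  // Normalize back to the 8-bit domain: for 12-bit input the sum carries 4
  // extra bits of precision and the sse 8 extra bits.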
  *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
}

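// As with OBMCVARWXH, each wrapper computes sse - sum^2 / (W * H). The 10-
// and 12-bit variants clamp a negative result (possible after the rounding
// above) to zero before returning it as an unsigned value.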
#define HBD_OBMCVARWXH(W, H)                                                 \
  unsigned int aom_highbd_obmc_variance##W##x##H##_sse4_1(                   \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,               \
      const int32_t *mask, unsigned int *sse) {                              \
    int sum;                                                                 \
    highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);      \
    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));            \
  }                                                                          \
                                                                             \
  unsigned int aom_highbd_10_obmc_variance##W##x##H##_sse4_1(                \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,               \
      const int32_t *mask, unsigned int *sse) {                              \
    int sum;                                                                 \
    int64_t var;                                                             \
    highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);   \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));                \
    return (var >= 0) ? (uint32_t)var : 0;                                   \
  }                                                                          \
                                                                             \
  unsigned int aom_highbd_12_obmc_variance##W##x##H##_sse4_1(                \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,               \
      const int32_t *mask, unsigned int *sse) {                              \
    int sum;                                                                 \
    int64_t var;                                                             \
    highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);   \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));                \
    return (var >= 0) ? (uint32_t)var : 0;                                   \
  }

HBD_OBMCVARWXH(128, 128)
HBD_OBMCVARWXH(128, 64)
HBD_OBMCVARWXH(64, 128)
HBD_OBMCVARWXH(64, 64)
HBD_OBMCVARWXH(64, 32)
HBD_OBMCVARWXH(32, 64)
HBD_OBMCVARWXH(32, 32)
HBD_OBMCVARWXH(32, 16)
HBD_OBMCVARWXH(16, 32)
HBD_OBMCVARWXH(16, 16)
HBD_OBMCVARWXH(16, 8)
HBD_OBMCVARWXH(8, 16)
HBD_OBMCVARWXH(8, 8)
HBD_OBMCVARWXH(8, 4)
HBD_OBMCVARWXH(4, 8)
HBD_OBMCVARWXH(4, 4)
HBD_OBMCVARWXH(4, 16)
HBD_OBMCVARWXH(16, 4)
HBD_OBMCVARWXH(8, 32)
HBD_OBMCVARWXH(32, 8)
HBD_OBMCVARWXH(16, 64)
HBD_OBMCVARWXH(64, 16)
#endif  // CONFIG_AV1_HIGHBITDEPTH