/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <immintrin.h>

#include "config/aom_config.h"

#include "aom_ports/mem.h"
#include "aom/aom_integer.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/obmc_intrinsic_sse4.h"
#include "aom_dsp/x86/synonyms.h"

////////////////////////////////////////////////////////////////////////////////
// 8 bit
////////////////////////////////////////////////////////////////////////////////

void aom_var_filter_block2d_bil_first_pass_ssse3(
    const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
    unsigned int pixel_step, unsigned int output_height,
    unsigned int output_width, const uint8_t *filter);

void aom_var_filter_block2d_bil_second_pass_ssse3(
    const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
    unsigned int pixel_step, unsigned int output_height,
    unsigned int output_width, const uint8_t *filter);

static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
                                     const int32_t *wsrc, const int32_t *mask,
                                     unsigned int *const sse, int *const sum,
                                     const int w, const int h) {
  const int pre_step = pre_stride - w;
  int n = 0;
  __m128i v_sum_d = _mm_setzero_si128();
  __m128i v_sse_d = _mm_setzero_si128();

  assert(w >= 8);
  assert(IS_POWER_OF_TWO(w));
  assert(IS_POWER_OF_TWO(h));

  do {
    const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
    const __m128i v_m1_d = xx_load_128(mask + n + 4);
    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
    const __m128i v_p0_b = xx_loadl_32(pre + n);
    const __m128i v_m0_d = xx_load_128(mask + n);
    const __m128i v_w0_d = xx_load_128(wsrc + n);

    const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
    const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
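    // Note: with each 32-bit lane holding a non-negative value that fits in
    // 15 bits, the upper 16-bit half of every lane is zero, so pmaddwd's
    // per-lane result (lo*lo + hi*hi) reduces to the plain 32-bit product.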
    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);

    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);

    const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12);
    const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12);
    const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d);
    const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);

    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d);
    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);

    n += 8;

    if (n % w == 0) pre += pre_step;
  } while (n < w * h);

  *sum = xx_hsum_epi32_si32(v_sum_d);
  *sse = xx_hsum_epi32_si32(v_sse_d);
}

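// Variance is computed as SSE - (sum * sum) / (W * H); the squared sum is
// widened to int64_t so that large blocks cannot overflow.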
#define OBMCVARWXH(W, H)                                               \
  unsigned int aom_obmc_variance##W##x##H##_sse4_1(                    \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,         \
      const int32_t *mask, unsigned int *sse) {                        \
    int sum;                                                           \
    if (W == 4) {                                                      \
      obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H);     \
    } else {                                                           \
      obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \
    }                                                                  \
    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));      \
  }

OBMCVARWXH(128, 128)
OBMCVARWXH(128, 64)
OBMCVARWXH(64, 128)
OBMCVARWXH(64, 64)
OBMCVARWXH(64, 32)
OBMCVARWXH(32, 64)
OBMCVARWXH(32, 32)
OBMCVARWXH(32, 16)
OBMCVARWXH(16, 32)
OBMCVARWXH(16, 16)
OBMCVARWXH(16, 8)
OBMCVARWXH(8, 16)
OBMCVARWXH(8, 8)
OBMCVARWXH(8, 4)
OBMCVARWXH(4, 8)
OBMCVARWXH(4, 4)
OBMCVARWXH(4, 16)
OBMCVARWXH(16, 4)
OBMCVARWXH(8, 32)
OBMCVARWXH(32, 8)
OBMCVARWXH(16, 64)
OBMCVARWXH(64, 16)

#include "config/aom_dsp_rtcd.h"

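// Sub-pixel variance: the first (horizontal, pixel_step 1) bilinear pass
// writes H + 1 rows of W 16-bit intermediates into fdata3, and the second
// (vertical, pixel_step W) pass reduces them to the H x W 8-bit block in
// temp2 before the OBMC variance is taken.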
#define OBMC_SUBPIX_VAR(W, H)                                                \
  uint32_t aom_obmc_sub_pixel_variance##W##x##H##_sse4_1(                    \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,          \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {         \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint8_t temp2[H * W];                                                    \
                                                                             \
    aom_var_filter_block2d_bil_first_pass_ssse3(                             \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_var_filter_block2d_bil_second_pass_ssse3(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_obmc_variance##W##x##H##_sse4_1(temp2, W, wsrc, mask, sse);   \
  }

OBMC_SUBPIX_VAR(128, 128)
OBMC_SUBPIX_VAR(128, 64)
OBMC_SUBPIX_VAR(64, 128)
OBMC_SUBPIX_VAR(64, 64)
OBMC_SUBPIX_VAR(64, 32)
OBMC_SUBPIX_VAR(32, 64)
OBMC_SUBPIX_VAR(32, 32)
OBMC_SUBPIX_VAR(32, 16)
OBMC_SUBPIX_VAR(16, 32)
OBMC_SUBPIX_VAR(16, 16)
OBMC_SUBPIX_VAR(16, 8)
OBMC_SUBPIX_VAR(8, 16)
OBMC_SUBPIX_VAR(8, 8)
OBMC_SUBPIX_VAR(8, 4)
OBMC_SUBPIX_VAR(4, 8)
OBMC_SUBPIX_VAR(4, 4)
OBMC_SUBPIX_VAR(4, 16)
OBMC_SUBPIX_VAR(16, 4)
OBMC_SUBPIX_VAR(8, 32)
OBMC_SUBPIX_VAR(32, 8)
OBMC_SUBPIX_VAR(16, 64)
OBMC_SUBPIX_VAR(64, 16)

////////////////////////////////////////////////////////////////////////////////
// High bit-depth
////////////////////////////////////////////////////////////////////////////////
#if CONFIG_AV1_HIGHBITDEPTH
static INLINE void hbd_obmc_variance_w4(
    const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
    const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int h) {
  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
  const int pre_step = pre_stride - 4;
  int n = 0;
  __m128i v_sum_d = _mm_setzero_si128();
  __m128i v_sse_d = _mm_setzero_si128();

  assert(IS_POWER_OF_TWO(h));

  do {
    const __m128i v_p_w = xx_loadl_64(pre + n);
    const __m128i v_m_d = xx_load_128(mask + n);
    const __m128i v_w_d = xx_load_128(wsrc + n);

    const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);

    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
    const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
    const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);

    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);

    n += 4;

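    // n advances by exactly one 4-pixel row per iteration, so the step to the
    // next input row below is taken on every pass through the loop.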
    if (n % 4 == 0) pre += pre_step;
  } while (n < 4 * h);

  *sum = xx_hsum_epi32_si32(v_sum_d);
  *sse = xx_hsum_epi32_si32(v_sse_d);
}

static INLINE void hbd_obmc_variance_w8n(
    const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
    const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int w,
    const int h) {
  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
  const int pre_step = pre_stride - w;
  int n = 0;
  __m128i v_sum_d = _mm_setzero_si128();
  __m128i v_sse_d = _mm_setzero_si128();

  assert(w >= 8);
  assert(IS_POWER_OF_TWO(w));
  assert(IS_POWER_OF_TWO(h));

  do {
    const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
    const __m128i v_m1_d = xx_load_128(mask + n + 4);
    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
    const __m128i v_p0_w = xx_loadl_64(pre + n);
    const __m128i v_m0_d = xx_load_128(mask + n);
    const __m128i v_w0_d = xx_load_128(wsrc + n);

    const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
    const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);

    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);

    const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12);
    const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12);
    const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d);
    const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);

    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d);
    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);

    n += 8;

    if (n % w == 0) pre += pre_step;
  } while (n < w * h);

  *sum += xx_hsum_epi32_si64(v_sum_d);
  *sse += xx_hsum_epi32_si64(v_sse_d);
}

static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
                                        const int32_t *wsrc,
                                        const int32_t *mask, int w, int h,
                                        unsigned int *sse, int *sum) {
  int64_t sum64 = 0;
  uint64_t sse64 = 0;
  if (w == 4) {
    hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
  } else {
    hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
  }
  *sum = (int)sum64;
  *sse = (unsigned int)sse64;
}

static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
                                           const int32_t *wsrc,
                                           const int32_t *mask, int w, int h,
                                           unsigned int *sse, int *sum) {
  int64_t sum64 = 0;
  uint64_t sse64 = 0;
  if (w == 4) {
    hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
  } else if (w < 128 || h < 128) {
    hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
  } else {
    assert(w == 128 && h == 128);

    do {
      hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w,
                            64);
      pre8 += 64 * pre_stride;
      wsrc += 64 * w;
      mask += 64 * w;
      h -= 64;
    } while (h > 0);
  }
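  // Normalize the 10-bit accumulations back toward the 8-bit scale: sum is
  // rounded down by 2 bits and SSE by 4 bits before narrowing to int /
  // unsigned int.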
  *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
}

static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
                                           const int32_t *wsrc,
                                           const int32_t *mask, int w, int h,
                                           unsigned int *sse, int *sum) {
  int64_t sum64 = 0;
  uint64_t sse64 = 0;
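  // Limit the number of pixels handled per hbd_obmc_variance_w8n call; with
  // 12-bit input the squared differences could otherwise overflow the
  // kernel's 32-bit accumulators before they are spilled to the 64-bit
  // totals.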
  int max_pel_allowed_per_ovf = 512;
  if (w == 4) {
    hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
  } else if (w * h <= max_pel_allowed_per_ovf) {
    hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
  } else {
    int h_per_ovf = max_pel_allowed_per_ovf / w;

    assert(max_pel_allowed_per_ovf % w == 0);
    do {
      hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w,
                            h_per_ovf);
      pre8 += h_per_ovf * pre_stride;
      wsrc += h_per_ovf * w;
      mask += h_per_ovf * w;
      h -= h_per_ovf;
    } while (h > 0);
  }
  *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
}

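// In the 10- and 12-bit variants, sum and SSE are rounded independently, so
// SSE - sum^2 / N can come out marginally negative; such results are clamped
// to 0.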
#define HBD_OBMCVARWXH(W, H)                                               \
  unsigned int aom_highbd_obmc_variance##W##x##H##_sse4_1(                 \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);    \
    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));          \
  }                                                                        \
                                                                           \
  unsigned int aom_highbd_10_obmc_variance##W##x##H##_sse4_1(              \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    int64_t var;                                                           \
    highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }                                                                        \
                                                                           \
  unsigned int aom_highbd_12_obmc_variance##W##x##H##_sse4_1(              \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    int64_t var;                                                           \
    highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }

HBD_OBMCVARWXH(128, 128)
HBD_OBMCVARWXH(128, 64)
HBD_OBMCVARWXH(64, 128)
HBD_OBMCVARWXH(64, 64)
HBD_OBMCVARWXH(64, 32)
HBD_OBMCVARWXH(32, 64)
HBD_OBMCVARWXH(32, 32)
HBD_OBMCVARWXH(32, 16)
HBD_OBMCVARWXH(16, 32)
HBD_OBMCVARWXH(16, 16)
HBD_OBMCVARWXH(16, 8)
HBD_OBMCVARWXH(8, 16)
HBD_OBMCVARWXH(8, 8)
HBD_OBMCVARWXH(8, 4)
HBD_OBMCVARWXH(4, 8)
HBD_OBMCVARWXH(4, 4)
HBD_OBMCVARWXH(4, 16)
HBD_OBMCVARWXH(16, 4)
HBD_OBMCVARWXH(8, 32)
HBD_OBMCVARWXH(32, 8)
HBD_OBMCVARWXH(16, 64)
HBD_OBMCVARWXH(64, 16)
#endif  // CONFIG_AV1_HIGHBITDEPTH