/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <emmintrin.h>

#include "config/av1_rtcd.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve_common_intrin.h"
#include "av1/common/convolve.h"

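// Load the 8-tap subpel kernel selected by subpel_q4 and broadcast each
// adjacent pair of 16-bit taps across a full register. This layout matches the
// interleaved sample pairs that _mm_madd_epi16 consumes in convolve() below.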
static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
                                  const int subpel_q4,
                                  __m128i *const coeffs /* [4] */) {
  const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);
  const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
  // coeffs 0 1 0 1 2 3 2 3
  const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
  // coeffs 4 5 4 5 6 7 6 7
  const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);

  coeffs[0] = _mm_unpacklo_epi64(tmp_0, tmp_0);  // coeffs 0 1 0 1 0 1 0 1
  coeffs[1] = _mm_unpackhi_epi64(tmp_0, tmp_0);  // coeffs 2 3 2 3 2 3 2 3
  coeffs[2] = _mm_unpacklo_epi64(tmp_1, tmp_1);  // coeffs 4 5 4 5 4 5 4 5
  coeffs[3] = _mm_unpackhi_epi64(tmp_1, tmp_1);  // coeffs 6 7 6 7 6 7 6 7
}

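// 8-tap filter sum for four output pixels: each madd multiplies an interleaved
// pair of 16-bit samples by the matching pair of taps and adds them; the four
// partial results are then accumulated in 32 bits to avoid overflow.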
static INLINE __m128i convolve(const __m128i *const s,
                               const __m128i *const coeffs) {
  const __m128i d0 = _mm_madd_epi16(s[0], coeffs[0]);
  const __m128i d1 = _mm_madd_epi16(s[1], coeffs[1]);
  const __m128i d2 = _mm_madd_epi16(s[2], coeffs[2]);
  const __m128i d3 = _mm_madd_epi16(s[3], coeffs[3]);
  const __m128i d = _mm_add_epi32(_mm_add_epi32(d0, d1), _mm_add_epi32(d2, d3));
  return d;
}

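// Horizontal helper: zero-extend the low eight bytes of each interleaved
// source register to 16 bits before filtering.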
static INLINE __m128i convolve_lo_x(const __m128i *const s,
                                    const __m128i *const coeffs) {
  __m128i ss[4];
  ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
  ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128());
  ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
  ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128());
  return convolve(ss, coeffs);
}

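// Vertical helper: s[0], s[2], s[4] and s[6] each hold two adjacent rows
// interleaved byte-wise; zero-extending their low halves yields the sample
// pairs for the four low-index output pixels.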
static INLINE __m128i convolve_lo_y(const __m128i *const s,
                                    const __m128i *const coeffs) {
  __m128i ss[4];
  ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
  ss[1] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
  ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128());
  ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128());
  return convolve(ss, coeffs);
}

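// Same as convolve_lo_y, but consumes the high halves of the interleaved row
// registers to produce the sums for the four high-index output pixels.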
static INLINE __m128i convolve_hi_y(const __m128i *const s,
                                    const __m128i *const coeffs) {
  __m128i ss[4];
  ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128());
  ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128());
  ss[2] = _mm_unpackhi_epi8(s[4], _mm_setzero_si128());
  ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128());
  return convolve(ss, coeffs);
}

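// Vertical-only, single-reference convolution: each output pixel is the dot
// product of the interpolation filter with a column of source pixels, rounded
// once by FILTER_BITS and saturated to 8 bits. Rows are processed two at a
// time so six of the eight interleaved row registers carry over between
// iterations.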
void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *filter_params_x,
                            const InterpFilterParams *filter_params_y,
                            const int subpel_x_q4, const int subpel_y_q4,
                            ConvolveParams *conv_params) {
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const uint8_t *src_ptr = src - fo_vert * src_stride;
  const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
  const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS);
  __m128i coeffs[4];

  (void)filter_params_x;
  (void)subpel_x_q4;
  (void)conv_params;

  assert(conv_params->round_0 <= FILTER_BITS);
  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));

  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs);

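  // Narrow blocks (w == 2 or w == 4): load 4 bytes per row and keep all work
  // in the low halves of the registers, writing two output rows per iteration.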
  if (w <= 4) {
    __m128i s[8], src6, res, res_round, res16;
    uint32_t res_int;
    src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 6 * src_stride));
    s[0] = _mm_unpacklo_epi8(
        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 0 * src_stride)),
        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)));
    s[1] = _mm_unpacklo_epi8(
        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)),
        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)));
    s[2] = _mm_unpacklo_epi8(
        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)),
        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)));
    s[3] = _mm_unpacklo_epi8(
        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)),
        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)));
    s[4] = _mm_unpacklo_epi8(
        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)),
        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)));
    s[5] = _mm_unpacklo_epi8(
        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)), src6);

    do {
      s[6] = _mm_unpacklo_epi8(
          src6, _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)));
      src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 8 * src_stride));
      s[7] = _mm_unpacklo_epi8(
          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)), src6);

      res = convolve_lo_y(s + 0, coeffs);
      res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
      res16 = _mm_packs_epi32(res_round, res_round);
      res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));

      if (w == 2)
        *(uint16_t *)dst = res_int;
      else
        *(uint32_t *)dst = res_int;

      src_ptr += src_stride;
      dst += dst_stride;

      res = convolve_lo_y(s + 1, coeffs);
      res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
      res16 = _mm_packs_epi32(res_round, res_round);
      res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));

      if (w == 2)
        *(uint16_t *)dst = res_int;
      else
        *(uint32_t *)dst = res_int;

      src_ptr += src_stride;
      dst += dst_stride;

      s[0] = s[2];
      s[1] = s[3];
      s[2] = s[4];
      s[3] = s[5];
      s[4] = s[6];
      s[5] = s[7];
      h -= 2;
    } while (h);
  } else {
    assert(!(w % 8));
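    // Wide blocks: walk the frame in 8-pixel-wide column tiles, producing two
    // output rows per inner-loop iteration.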
    int j = 0;
    do {
      __m128i s[8], src6, res_lo, res_hi;
      __m128i res_lo_round, res_hi_round, res16, res;
      const uint8_t *data = &src_ptr[j];

      src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
      s[0] = _mm_unpacklo_epi8(
          _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
          _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
      s[1] = _mm_unpacklo_epi8(
          _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
          _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
      s[2] = _mm_unpacklo_epi8(
          _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
          _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
      s[3] = _mm_unpacklo_epi8(
          _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
          _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
      s[4] = _mm_unpacklo_epi8(
          _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
          _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
      s[5] = _mm_unpacklo_epi8(
          _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6);

      int i = 0;
      do {
        data = &src_ptr[i * src_stride + j];
        s[6] = _mm_unpacklo_epi8(
            src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
        src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
        s[7] = _mm_unpacklo_epi8(
            _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6);

        res_lo = convolve_lo_y(s, coeffs);  // Filter low index pixels
        res_hi = convolve_hi_y(s, coeffs);  // Filter high index pixels

        res_lo_round =
            _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
        res_hi_round =
            _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);

        res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
        res = _mm_packus_epi16(res16, res16);

        _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
        i++;

        res_lo = convolve_lo_y(s + 1, coeffs);  // Filter low index pixels
        res_hi = convolve_hi_y(s + 1, coeffs);  // Filter high index pixels

        res_lo_round =
            _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
        res_hi_round =
            _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);

        res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
        res = _mm_packus_epi16(res16, res16);

        _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
        i++;

        s[0] = s[2];
        s[1] = s[3];
        s[2] = s[4];
        s[3] = s[5];
        s[4] = s[6];
        s[5] = s[7];
      } while (i < h);
      j += 8;
    } while (j < w);
  }
}

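// Horizontal-only, single-reference convolution: each output pixel is the dot
// product of the interpolation filter with a run of horizontally adjacent
// source pixels. Two rounding stages are applied (conv_params->round_0, then
// the remaining FILTER_BITS - round_0 bits) before saturating to 8 bits.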
void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *filter_params_x,
                            const InterpFilterParams *filter_params_y,
                            const int subpel_x_q4, const int subpel_y_q4,
                            ConvolveParams *conv_params) {
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const uint8_t *src_ptr = src - fo_horiz;
  const int bits = FILTER_BITS - conv_params->round_0;
  const __m128i round_0_const =
      _mm_set1_epi32((1 << conv_params->round_0) >> 1);
  const __m128i round_const = _mm_set1_epi32((1 << bits) >> 1);
  const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0);
  const __m128i round_shift = _mm_cvtsi32_si128(bits);
  __m128i coeffs[4];

  (void)filter_params_y;
  (void)subpel_y_q4;

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  prepare_coeffs(filter_params_x, subpel_x_q4, coeffs);

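  // Narrow blocks (w == 2 or w == 4): a single 16-byte load per row provides
  // every shifted copy needed to filter up to four output pixels.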
  if (w <= 4) {
    do {
      const __m128i data = _mm_loadu_si128((__m128i *)src_ptr);
      __m128i s[4];

      s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1));
      s[1] =
          _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3));
      s[2] =
          _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5));
      s[3] =
          _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7));
      const __m128i res_lo = convolve_lo_x(s, coeffs);
      __m128i res_lo_round =
          _mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift);
      res_lo_round =
          _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const), round_shift);

      const __m128i res16 = _mm_packs_epi32(res_lo_round, res_lo_round);
      const __m128i res = _mm_packus_epi16(res16, res16);

      uint32_t r = _mm_cvtsi128_si32(res);
      if (w == 2)
        *(uint16_t *)dst = r;
      else
        *(uint32_t *)dst = r;

      src_ptr += src_stride;
      dst += dst_stride;
    } while (--h);
  } else {
    assert(!(w % 8));
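    // Wide blocks: one 16-byte load yields eight output pixels per iteration.
    // Even- and odd-index pixels are filtered separately and re-interleaved
    // before packing down to 8 bits.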
    int i = 0;
    do {
      int j = 0;
      do {
        const __m128i data =
            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
        __m128i s[4];

        // Filter even-index pixels
        s[0] = data;
        s[1] = _mm_srli_si128(data, 2);
        s[2] = _mm_srli_si128(data, 4);
        s[3] = _mm_srli_si128(data, 6);
        const __m128i res_even = convolve_lo_x(s, coeffs);

        // Filter odd-index pixels
        s[0] = _mm_srli_si128(data, 1);
        s[1] = _mm_srli_si128(data, 3);
        s[2] = _mm_srli_si128(data, 5);
        s[3] = _mm_srli_si128(data, 7);
        const __m128i res_odd = convolve_lo_x(s, coeffs);

        // Rearrange pixels back into the order 0 ... 7
        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
        __m128i res_lo_round =
            _mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift);
        res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
                                     round_shift);
        __m128i res_hi_round =
            _mm_sra_epi32(_mm_add_epi32(res_hi, round_0_const), round_0_shift);
        res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const),
                                     round_shift);

        const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
        const __m128i res = _mm_packus_epi16(res16, res16);

        _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
        j += 8;
      } while (j < w);
    } while (++i < h);
  }
}