/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>
#include <assert.h>

#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/x86/convolve_avx2.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "av1/common/convolve.h"

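// High bit-depth separable 2D convolution. The block is processed in
// 8-pixel-wide column strips: the horizontal pass writes rounded 16-bit
// intermediates into im_block, and the vertical pass rounds, clips to the
// bit depth and stores two output rows per iteration.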
void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
                                    const InterpFilterParams *filter_params_y,
                                    const int subpel_x_q4,
                                    const int subpel_y_q4,
                                    ConvolveParams *conv_params, int bd) {
  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
  int im_h = h + filter_params_y->taps - 1;
  int im_stride = 8;
  int i, j;
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;

  // Check that, even with 12-bit input, the intermediate values will fit
  // into an unsigned 16-bit intermediate array.
  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);

  __m256i s[8], coeffs_y[4], coeffs_x[4];

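  // round_const_x combines the rounding term for round_0 with an offset of
  // 1 << (bd + FILTER_BITS - 1) that keeps the horizontal intermediates
  // nonnegative; round_const_y subtracts that offset as propagated through the
  // vertical filter (1 << (bd + 2 * FILTER_BITS - round_0 - 1)).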
  const __m256i round_const_x = _mm256_set1_epi32(
      ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);

  const __m256i round_const_y = _mm256_set1_epi32(
      ((1 << conv_params->round_1) >> 1) -
      (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
  const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);

  const int bits =
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
  const __m256i clip_pixel =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();

  prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);

  for (j = 0; j < w; j += 8) {
    /* Horizontal filter */
    {
      for (i = 0; i < im_h; i += 2) {
        const __m256i row0 =
            _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
        __m256i row1 = _mm256_set1_epi16(0);
        if (i + 1 < im_h)
          row1 =
              _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);

        const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
        const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);

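        // r0 carries pixels 0..7 of row i (lane 0) and row i + 1 (lane 1),
        // r1 carries pixels 8..15, so each _mm256_alignr_epi8 below selects a
        // shifted window of source pixels for both rows simultaneously.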
        // even pixels
        s[0] = _mm256_alignr_epi8(r1, r0, 0);
        s[1] = _mm256_alignr_epi8(r1, r0, 4);
        s[2] = _mm256_alignr_epi8(r1, r0, 8);
        s[3] = _mm256_alignr_epi8(r1, r0, 12);

        __m256i res_even = convolve(s, coeffs_x);
        res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
                                    round_shift_x);

        // odd pixels
        s[0] = _mm256_alignr_epi8(r1, r0, 2);
        s[1] = _mm256_alignr_epi8(r1, r0, 6);
        s[2] = _mm256_alignr_epi8(r1, r0, 10);
        s[3] = _mm256_alignr_epi8(r1, r0, 14);

        __m256i res_odd = convolve(s, coeffs_x);
        res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
                                   round_shift_x);

        __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
        __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
        __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);

        _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
      }
    }

    /* Vertical filter */
    {
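      // im_stride is 8 int16_t (16 bytes), so each 256-bit load below spans
      // two consecutive intermediate rows: lane 0 holds row k, lane 1 holds
      // row k + 1. The unpacks build the 8-tap sliding window needed to
      // produce two output rows per loop iteration.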
      __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
      __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
      __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
      __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
      __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
      __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));

      s[0] = _mm256_unpacklo_epi16(s0, s1);
      s[1] = _mm256_unpacklo_epi16(s2, s3);
      s[2] = _mm256_unpacklo_epi16(s4, s5);

      s[4] = _mm256_unpackhi_epi16(s0, s1);
      s[5] = _mm256_unpackhi_epi16(s2, s3);
      s[6] = _mm256_unpackhi_epi16(s4, s5);

      for (i = 0; i < h; i += 2) {
        const int16_t *data = &im_block[i * im_stride];

        const __m256i s6 =
            _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
        const __m256i s7 =
            _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));

        s[3] = _mm256_unpacklo_epi16(s6, s7);
        s[7] = _mm256_unpackhi_epi16(s6, s7);

        const __m256i res_a = convolve(s, coeffs_y);
        __m256i res_a_round = _mm256_sra_epi32(
            _mm256_add_epi32(res_a, round_const_y), round_shift_y);

        res_a_round = _mm256_sra_epi32(
            _mm256_add_epi32(res_a_round, round_const_bits), round_shift_bits);

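        // res_a covers the first 4 output columns; res_b (below) covers the
        // remaining 4. Lane 0 of each result is output row i, lane 1 is
        // row i + 1, and the store path depends on how many columns remain.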
        if (w - j > 4) {
          const __m256i res_b = convolve(s + 4, coeffs_y);
          __m256i res_b_round = _mm256_sra_epi32(
              _mm256_add_epi32(res_b, round_const_y), round_shift_y);
          res_b_round =
              _mm256_sra_epi32(_mm256_add_epi32(res_b_round, round_const_bits),
                               round_shift_bits);

          __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
          res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
          res_16bit = _mm256_max_epi16(res_16bit, zero);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
                           _mm256_castsi256_si128(res_16bit));
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           _mm256_extracti128_si256(res_16bit, 1));
        } else if (w == 4) {
          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
          res_a_round = _mm256_max_epi16(res_a_round, zero);

          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
                           _mm256_castsi256_si128(res_a_round));
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           _mm256_extracti128_si256(res_a_round, 1));
        } else {
          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
          res_a_round = _mm256_max_epi16(res_a_round, zero);

          xx_storel_32((__m128i *)&dst[i * dst_stride + j],
                       _mm256_castsi256_si128(res_a_round));
          xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride],
                       _mm256_extracti128_si256(res_a_round, 1));
        }

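        // Advance the vertical sliding window by two rows.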
        s[0] = s[1];
        s[1] = s[2];
        s[2] = s[3];

        s[4] = s[5];
        s[5] = s[6];
        s[6] = s[7];
      }
    }
  }
}

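// Copy one row of 64 16-bit pixels using unaligned 256-bit loads and stores.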
static INLINE void copy_64(const uint16_t *src, uint16_t *dst) {
  __m256i s[4];
  s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
  s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
  s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
  s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
  _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
  _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
  _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]);
  _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]);
}

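// Copy one row of 128 16-bit pixels using unaligned 256-bit loads and stores.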
static INLINE void copy_128(const uint16_t *src, uint16_t *dst) {
  __m256i s[8];
  s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
  s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
  s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
  s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
  s[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
  s[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 16));
  s[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 16));
  s[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 16));

  _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
  _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
  _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]);
  _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]);
  _mm256_storeu_si256((__m256i *)(dst + 4 * 16), s[4]);
  _mm256_storeu_si256((__m256i *)(dst + 5 * 16), s[5]);
  _mm256_storeu_si256((__m256i *)(dst + 6 * 16), s[6]);
  _mm256_storeu_si256((__m256i *)(dst + 7 * 16), s[7]);
}

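// Copy variant of the highbd 2D convolution: all filter parameters are
// ignored, so the function only dispatches on the block width and copies two
// source rows per loop iteration.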
void av1_highbd_convolve_2d_copy_sr_avx2(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
  (void)filter_params_x;
  (void)filter_params_y;
  (void)subpel_x_q4;
  (void)subpel_y_q4;
  (void)conv_params;
  (void)bd;

  if (w >= 16) {
    assert(!((intptr_t)dst % 16));
    assert(!(dst_stride % 16));
  }

  if (w == 2) {
    do {
      memcpy(dst, src, 2 * sizeof(*src));
      src += src_stride;
      dst += dst_stride;
      memcpy(dst, src, 2 * sizeof(*src));
      src += src_stride;
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 4) {
    do {
      __m128i s[2];
      s[0] = _mm_loadl_epi64((__m128i *)src);
      src += src_stride;
      s[1] = _mm_loadl_epi64((__m128i *)src);
      src += src_stride;
      _mm_storel_epi64((__m128i *)dst, s[0]);
      dst += dst_stride;
      _mm_storel_epi64((__m128i *)dst, s[1]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 8) {
    do {
      __m128i s[2];
      s[0] = _mm_loadu_si128((__m128i *)src);
      src += src_stride;
      s[1] = _mm_loadu_si128((__m128i *)src);
      src += src_stride;
      _mm_store_si128((__m128i *)dst, s[0]);
      dst += dst_stride;
      _mm_store_si128((__m128i *)dst, s[1]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 16) {
    do {
      __m256i s[2];
      s[0] = _mm256_loadu_si256((__m256i *)src);
      src += src_stride;
      s[1] = _mm256_loadu_si256((__m256i *)src);
      src += src_stride;
      _mm256_storeu_si256((__m256i *)dst, s[0]);
      dst += dst_stride;
      _mm256_storeu_si256((__m256i *)dst, s[1]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 32) {
    do {
      __m256i s[4];
      s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
      s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
      src += src_stride;
      s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
      s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
      src += src_stride;
      _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
      _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
      dst += dst_stride;
      _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[2]);
      _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[3]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 64) {
    do {
      copy_64(src, dst);
      src += src_stride;
      dst += dst_stride;
      copy_64(src, dst);
      src += src_stride;
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else {
    do {
      copy_128(src, dst);
      src += src_stride;
      dst += dst_stride;
      copy_128(src, dst);
      src += src_stride;
      dst += dst_stride;
      h -= 2;
    } while (h);
  }
}