1 /*
2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <immintrin.h>
13 #include <assert.h>
14
15 #include "config/aom_dsp_rtcd.h"
16
17 #include "aom_dsp/x86/convolve_avx2.h"
18 #include "aom_dsp/x86/synonyms.h"
19 #include "aom_dsp/aom_dsp_common.h"
20 #include "aom_dsp/aom_filter.h"
21 #include "av1/common/convolve.h"
22
// High-bitdepth 2-D subpel convolution (horizontal FIR then vertical FIR),
// AVX2. The frame is processed in 8-pixel-wide columns: the horizontal pass
// writes rounded intermediate values into im_block (stride 8 int16_t), and
// the vertical pass consumes im_block and writes clipped pixels to dst.
// conv_params->round_0 / round_1 control the two rounding stages; bd is the
// bit depth (8, 10 or 12) used for clipping and the intermediate offset.
void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
                                    const InterpFilterParams *filter_params_y,
                                    const int subpel_x_q4,
                                    const int subpel_y_q4,
                                    ConvolveParams *conv_params, int bd) {
  // Intermediate buffer for one 8-wide column: h + taps - 1 filtered rows.
  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
  int im_h = h + filter_params_y->taps - 1;
  int im_stride = 8;
  int i, j;
  // Filter offsets: center the taps on the source pixel.
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;

  // Check that, even with 12-bit input, the intermediate values will fit
  // into an unsigned 16-bit intermediate array.
  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);

  __m256i s[8], coeffs_y[4], coeffs_x[4];

  // Horizontal rounding constant also adds (1 << (bd + FILTER_BITS - 1)) so
  // the intermediate values are non-negative; the vertical constant below
  // subtracts the shifted offset back out.
  const __m256i round_const_x = _mm256_set1_epi32(
      ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);

  const __m256i round_const_y = _mm256_set1_epi32(
      ((1 << conv_params->round_1) >> 1) -
      (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
  const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);

  // Final shift restoring the result to pixel precision.
  const int bits =
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
  const __m256i clip_pixel =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();

  prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);

  // Process the block one 8-pixel-wide column at a time.
  for (j = 0; j < w; j += 8) {
    /* Horizontal filter */
    {
      // Two source rows per iteration; the last row of an odd im_h is padded
      // with a zero row (guard below) so the pairing stays valid.
      for (i = 0; i < im_h; i += 2) {
        const __m256i row0 =
            _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
        __m256i row1 = _mm256_set1_epi16(0);
        if (i + 1 < im_h)
          row1 =
              _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);

        // Interleave the two rows across 128-bit lanes:
        // r0 = { row0 pixels 0..7, row1 pixels 0..7 },
        // r1 = { row0 pixels 8..15, row1 pixels 8..15 }.
        const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
        const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);

        // even pixels
        // alignr by 2*k bytes selects the k-pixel-shifted 8-pixel window of
        // each row, giving the tap inputs for the even output pixels.
        s[0] = _mm256_alignr_epi8(r1, r0, 0);
        s[1] = _mm256_alignr_epi8(r1, r0, 4);
        s[2] = _mm256_alignr_epi8(r1, r0, 8);
        s[3] = _mm256_alignr_epi8(r1, r0, 12);

        __m256i res_even = convolve(s, coeffs_x);
        res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
                                    round_shift_x);

        // odd pixels
        s[0] = _mm256_alignr_epi8(r1, r0, 2);
        s[1] = _mm256_alignr_epi8(r1, r0, 6);
        s[2] = _mm256_alignr_epi8(r1, r0, 10);
        s[3] = _mm256_alignr_epi8(r1, r0, 14);

        __m256i res_odd = convolve(s, coeffs_x);
        res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
                                   round_shift_x);

        // Pack 32-bit results to 16 bits and re-interleave even/odd back
        // into pixel order; each 128-bit lane now holds one filtered row.
        __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
        __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
        __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);

        _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
      }
    }

    /* Vertical filter */
    {
      // Prime the 8-tap sliding window with the first six intermediate rows.
      // s[0..2] hold the low (pixels 0..3) interleaved row pairs, s[4..6]
      // the high (pixels 4..7) halves; s[3]/s[7] are filled per iteration.
      __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
      __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
      __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
      __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
      __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
      __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));

      s[0] = _mm256_unpacklo_epi16(s0, s1);
      s[1] = _mm256_unpacklo_epi16(s2, s3);
      s[2] = _mm256_unpacklo_epi16(s4, s5);

      s[4] = _mm256_unpackhi_epi16(s0, s1);
      s[5] = _mm256_unpackhi_epi16(s2, s3);
      s[6] = _mm256_unpackhi_epi16(s4, s5);

      // Produce two output rows per iteration.
      for (i = 0; i < h; i += 2) {
        const int16_t *data = &im_block[i * im_stride];

        const __m256i s6 =
            _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
        const __m256i s7 =
            _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));

        s[3] = _mm256_unpacklo_epi16(s6, s7);
        s[7] = _mm256_unpackhi_epi16(s6, s7);

        // res_a covers output pixels 0..3 of both rows.
        const __m256i res_a = convolve(s, coeffs_y);
        __m256i res_a_round = _mm256_sra_epi32(
            _mm256_add_epi32(res_a, round_const_y), round_shift_y);

        res_a_round = _mm256_sra_epi32(
            _mm256_add_epi32(res_a_round, round_const_bits), round_shift_bits);

        if (w - j > 4) {
          // Full 8-wide column: also compute pixels 4..7 and store 8 pixels
          // per row (one 128-bit lane per output row).
          const __m256i res_b = convolve(s + 4, coeffs_y);
          __m256i res_b_round = _mm256_sra_epi32(
              _mm256_add_epi32(res_b, round_const_y), round_shift_y);
          res_b_round =
              _mm256_sra_epi32(_mm256_add_epi32(res_b_round, round_const_bits),
                               round_shift_bits);

          __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
          res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
          res_16bit = _mm256_max_epi16(res_16bit, zero);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
                           _mm256_castsi256_si128(res_16bit));
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           _mm256_extracti128_si256(res_16bit, 1));
        } else if (w == 4) {
          // 4-wide block: store 4 pixels (64 bits) per row.
          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
          res_a_round = _mm256_max_epi16(res_a_round, zero);

          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
                           _mm256_castsi256_si128(res_a_round));
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           _mm256_extracti128_si256(res_a_round, 1));
        } else {
          // 2-wide block: store 2 pixels (32 bits) per row.
          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
          res_a_round = _mm256_max_epi16(res_a_round, zero);

          xx_storel_32((__m128i *)&dst[i * dst_stride + j],
                       _mm256_castsi256_si128(res_a_round));
          xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride],
                       _mm256_extracti128_si256(res_a_round, 1));
        }

        // Slide the tap window down by two rows.
        s[0] = s[1];
        s[1] = s[2];
        s[2] = s[3];

        s[4] = s[5];
        s[5] = s[6];
        s[6] = s[7];
      }
    }
  }
}
188
copy_64(const uint16_t * src,uint16_t * dst)189 static INLINE void copy_64(const uint16_t *src, uint16_t *dst) {
190 __m256i s[4];
191 s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
192 s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
193 s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
194 s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
195 _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
196 _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
197 _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]);
198 _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]);
199 }
200
copy_128(const uint16_t * src,uint16_t * dst)201 static INLINE void copy_128(const uint16_t *src, uint16_t *dst) {
202 __m256i s[8];
203 s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
204 s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
205 s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
206 s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
207 s[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
208 s[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 16));
209 s[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 16));
210 s[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 16));
211
212 _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
213 _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
214 _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]);
215 _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]);
216 _mm256_storeu_si256((__m256i *)(dst + 4 * 16), s[4]);
217 _mm256_storeu_si256((__m256i *)(dst + 5 * 16), s[5]);
218 _mm256_storeu_si256((__m256i *)(dst + 6 * 16), s[6]);
219 _mm256_storeu_si256((__m256i *)(dst + 7 * 16), s[7]);
220 }
221
// Copy-only "convolution": src is replicated to dst unchanged. Called for
// full-pel motion where no filtering is required; all filter arguments are
// ignored. h is assumed even (two rows are moved per loop iteration).
void av1_highbd_convolve_2d_copy_sr_avx2(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
  (void)filter_params_x;
  (void)filter_params_y;
  (void)subpel_x_q4;
  (void)subpel_y_q4;
  (void)conv_params;
  (void)bd;

  // Wide blocks use aligned stores on dst; callers must guarantee alignment.
  if (w >= 16) {
    assert(!((intptr_t)dst % 16));
    assert(!(dst_stride % 16));
  }

  // One loop for all widths; w is invariant so the switch predicts perfectly.
  do {
    switch (w) {
      case 2:
        memcpy(dst, src, 2 * sizeof(*src));
        memcpy(dst + dst_stride, src + src_stride, 2 * sizeof(*src));
        break;
      case 4: {
        const __m128i r0 = _mm_loadl_epi64((const __m128i *)src);
        const __m128i r1 = _mm_loadl_epi64((const __m128i *)(src + src_stride));
        _mm_storel_epi64((__m128i *)dst, r0);
        _mm_storel_epi64((__m128i *)(dst + dst_stride), r1);
        break;
      }
      case 8: {
        const __m128i r0 = _mm_loadu_si128((const __m128i *)src);
        const __m128i r1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
        // Aligned stores, as in the original 8-wide path.
        _mm_store_si128((__m128i *)dst, r0);
        _mm_store_si128((__m128i *)(dst + dst_stride), r1);
        break;
      }
      case 16: {
        const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
        const __m256i r1 =
            _mm256_loadu_si256((const __m256i *)(src + src_stride));
        _mm256_storeu_si256((__m256i *)dst, r0);
        _mm256_storeu_si256((__m256i *)(dst + dst_stride), r1);
        break;
      }
      case 32: {
        __m256i r[4];
        r[0] = _mm256_loadu_si256((const __m256i *)(src + 0 * 16));
        r[1] = _mm256_loadu_si256((const __m256i *)(src + 1 * 16));
        r[2] = _mm256_loadu_si256((const __m256i *)(src + src_stride + 0 * 16));
        r[3] = _mm256_loadu_si256((const __m256i *)(src + src_stride + 1 * 16));
        _mm256_storeu_si256((__m256i *)(dst + 0 * 16), r[0]);
        _mm256_storeu_si256((__m256i *)(dst + 1 * 16), r[1]);
        _mm256_storeu_si256((__m256i *)(dst + dst_stride + 0 * 16), r[2]);
        _mm256_storeu_si256((__m256i *)(dst + dst_stride + 1 * 16), r[3]);
        break;
      }
      case 64:
        copy_64(src, dst);
        copy_64(src + src_stride, dst + dst_stride);
        break;
      default:  // w == 128
        copy_128(src, dst);
        copy_128(src + src_stride, dst + dst_stride);
        break;
    }
    src += 2 * src_stride;
    dst += 2 * dst_stride;
    h -= 2;
  } while (h);
}
327