1 /*
2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <immintrin.h>
13
14 #include "config/av1_rtcd.h"
15
16 #include "aom_dsp/aom_dsp_common.h"
17 #include "aom_dsp/x86/convolve_common_intrin.h"
18 #include "aom_dsp/x86/convolve_avx2.h"
19 #include "aom_dsp/x86/synonyms.h"
20
/*
 * av1_convolve_y_sr_avx2() - AVX2 vertical (y-direction) single-reference
 * convolution of 8-bit pixels.
 *
 * \param src             Source pixel buffer; rows above/below the block are
 *                        read according to the filter's vertical offset.
 * \param src_stride      Source buffer stride in pixels.
 * \param dst             8-bit destination buffer (w x h output pixels).
 * \param dst_stride      Destination buffer stride in pixels.
 * \param w, h            Output block width and height. h is processed two
 *                        rows per inner-loop iteration (assumed even).
 * \param filter_params_y Vertical interpolation filter description.
 * \param subpel_y_qn     Sub-pixel phase; masked with SUBPEL_MASK to select
 *                        the filter kernel.
 *
 * The effective tap count (4, 6, 8 or 12) is detected from the kernel's
 * zero coefficients and a dedicated code path is taken for each.  All paths
 * keep the unpacked source rows in s[] as a sliding window: each iteration
 * computes two output rows and loads only the two newest source rows, then
 * rotates the window down at the end of the loop body.
 */
void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *filter_params_y,
                            const int subpel_y_qn) {
  int i, j, vert_tap = SUBPEL_TAPS;
  // right shift is F-1 because we are already dividing
  // filter co-efficients by 2
  const int right_shift_bits = (FILTER_BITS - 1);
  __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits);
  __m256i right_shift_const = _mm256_set1_epi16((1 << right_shift_bits) >> 1);

  __m256i coeffs[6], s[12];
  __m128i d[10];

  // Condition for checking valid vert_filt taps
  // (outer taps all zero -> the kernel degenerates to fewer taps).
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  if (filter_params_y->taps == 12) {
    vert_tap = 12;
  } else if (!(filter[0] | filter[1] | filter[6] | filter[7])) {
    vert_tap = 4;
  } else if (!(filter[0] | filter[7])) {
    vert_tap = 6;
  }

  // Load/duplicate the filter coefficients in the layout the matching
  // convolve_* helper expects.
  if (vert_tap == 6)
    prepare_coeffs_6t_lowbd(filter_params_y, subpel_y_qn, coeffs);
  else if (vert_tap == 12) {
    prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs);
  } else {
    prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
  }

  // vert_filt as 4 tap
  if (vert_tap == 4) {
    const int fo_vert = 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride;
    // Process the block in 16-pixel-wide columns.
    for (j = 0; j < w; j += 16) {
      const uint8_t *data = &src_ptr[j];
      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
      d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));

      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      const __m256i src_34a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);

      // Interleave adjacent row pairs byte-wise so that
      // convolve_lowbd_4tap() can use maddubs-style pairwise products.
      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);

      s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
      s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        // d[4]/d[5] act as a two-entry rotating buffer holding the most
        // recently loaded source rows.
        d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
        const __m256i src_45a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);

        d[4] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
        const __m256i src_56a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[4]), 0x20);

        s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
        s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);

        // coeffs + 1 skips the (zero) outer-tap coefficient pair.
        const __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1);
        /* rounding code */
        // shift by F - 1
        const __m256i res_16b_lo = _mm256_sra_epi16(
            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
        // 8 bit conversion and saturation to uint8
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 8) {
          // Full 16-pixel column: also convolve the high byte halves.
          const __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1);

          /* rounding code */
          // shift by F - 1
          const __m256i res_16b_hi = _mm256_sra_epi16(
              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
          // 8 bit conversion and saturation to uint8
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);

          const __m128i res_0 = _mm256_castsi256_si128(res_a);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          // Narrow tail: store 8, 4 or 2 pixels per row.
          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 4) {
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                             res_1);
          } else if (w - j > 2) {
            xx_storel_32(&dst[i * dst_stride + j], res_0);
            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
          } else {
            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
            __m128i *const p_1 =
                (__m128i *)&dst[i * dst_stride + j + dst_stride];
            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
        // Rotate the sliding window down by one pair of rows.
        s[0] = s[1];
        s[1] = s[2];

        s[3] = s[4];
        s[4] = s[5];
      }
    }
  } else if (vert_tap == 6) {
    const int fo_vert = vert_tap / 2 - 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride;

    for (j = 0; j < w; j += 16) {
      const uint8_t *data = &src_ptr[j];
      // src6 carries the most recently loaded row across loop iterations.
      __m256i src6;

      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      src6 = _mm256_castsi128_si256(
          _mm_loadu_si128((__m128i *)(data + 4 * src_stride)));
      const __m256i src_34a =
          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[3]), src6, 0x20);

      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);

      s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
      s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        const __m256i src_45a = _mm256_permute2x128_si256(
            src6,
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
            0x20);

        src6 = _mm256_castsi128_si256(
            _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
        // NOTE(review): row (data + 5 * src_stride) is loaded twice here
        // instead of being reused; redundant but harmless.
        const __m256i src_56a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
            src6, 0x20);

        s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
        s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);

        const __m256i res_lo = convolve_lowbd_6tap(s, coeffs);

        /* rounding code */
        // shift by F - 1
        const __m256i res_16b_lo = _mm256_sra_epi16(
            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
        // 8 bit conversion and saturation to uint8
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 8) {
          const __m256i res_hi = convolve_lowbd_6tap(s + 3, coeffs);

          /* rounding code */
          // shift by F - 1
          const __m256i res_16b_hi = _mm256_sra_epi16(
              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
          // 8 bit conversion and saturation to uint8
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);

          const __m128i res_0 = _mm256_castsi256_si128(res_a);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          // Narrow tail: store 8, 4 or 2 pixels per row.
          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 4) {
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                             res_1);
          } else if (w - j > 2) {
            xx_storel_32(&dst[i * dst_stride + j], res_0);
            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
          } else {
            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
            __m128i *const p_1 =
                (__m128i *)&dst[i * dst_stride + j + dst_stride];
            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
        // Rotate the sliding window down by one pair of rows.
        s[0] = s[1];
        s[1] = s[2];
        s[3] = s[4];
        s[4] = s[5];
      }
    }
  } else if (vert_tap == 12) {  // vert_tap == 12
    const int fo_vert = filter_params_y->taps / 2 - 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride;
    const __m256i v_zero = _mm256_setzero_si256();
    // The 12-tap path accumulates in 32 bits, so the full FILTER_BITS
    // rounding shift is applied here (coefficients are not pre-halved).
    right_shift = _mm_cvtsi32_si128(FILTER_BITS);
    right_shift_const = _mm256_set1_epi32((1 << FILTER_BITS) >> 1);

    // 8-pixel-wide columns (rows are loaded as 64-bit halves).
    for (j = 0; j < w; j += 8) {
      const uint8_t *data = &src_ptr[j];
      __m256i src10;

      d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadl_epi64((__m128i *)(data + 3 * src_stride));
      d[4] = _mm_loadl_epi64((__m128i *)(data + 4 * src_stride));
      d[5] = _mm_loadl_epi64((__m128i *)(data + 5 * src_stride));
      d[6] = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
      d[7] = _mm_loadl_epi64((__m128i *)(data + 7 * src_stride));
      d[8] = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
      d[9] = _mm_loadl_epi64((__m128i *)(data + 9 * src_stride));
      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      const __m256i src_34a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);

      const __m256i src_45a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);

      const __m256i src_56a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[6]), 0x20);

      const __m256i src_67a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[6]), _mm256_castsi128_si256(d[7]), 0x20);

      const __m256i src_78a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[7]), _mm256_castsi128_si256(d[8]), 0x20);

      const __m256i src_89a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[8]), _mm256_castsi128_si256(d[9]), 0x20);

      src10 = _mm256_castsi128_si256(
          _mm_loadl_epi64((__m128i *)(data + 10 * src_stride)));
      const __m256i src_910a =
          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[9]), src10, 0x20);

      // Widen bytes to 16 bits (zero-extended) so the filter products can
      // be accumulated in 32 bits by convolve_12taps().
      const __m256i src_01 = _mm256_unpacklo_epi8(src_01a, v_zero);
      const __m256i src_12 = _mm256_unpacklo_epi8(src_12a, v_zero);
      const __m256i src_23 = _mm256_unpacklo_epi8(src_23a, v_zero);
      const __m256i src_34 = _mm256_unpacklo_epi8(src_34a, v_zero);
      const __m256i src_45 = _mm256_unpacklo_epi8(src_45a, v_zero);
      const __m256i src_56 = _mm256_unpacklo_epi8(src_56a, v_zero);
      const __m256i src_67 = _mm256_unpacklo_epi8(src_67a, v_zero);
      const __m256i src_78 = _mm256_unpacklo_epi8(src_78a, v_zero);
      const __m256i src_89 = _mm256_unpacklo_epi8(src_89a, v_zero);
      const __m256i src_910 = _mm256_unpacklo_epi8(src_910a, v_zero);

      s[0] = _mm256_unpacklo_epi16(src_01, src_12);
      s[1] = _mm256_unpacklo_epi16(src_23, src_34);
      s[2] = _mm256_unpacklo_epi16(src_45, src_56);
      s[3] = _mm256_unpacklo_epi16(src_67, src_78);
      s[4] = _mm256_unpacklo_epi16(src_89, src_910);

      s[6] = _mm256_unpackhi_epi16(src_01, src_12);
      s[7] = _mm256_unpackhi_epi16(src_23, src_34);
      s[8] = _mm256_unpackhi_epi16(src_45, src_56);
      s[9] = _mm256_unpackhi_epi16(src_67, src_78);
      s[10] = _mm256_unpackhi_epi16(src_89, src_910);

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        const __m256i src_1011a = _mm256_permute2x128_si256(
            src10,
            _mm256_castsi128_si256(
                _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))),
            0x20);

        src10 = _mm256_castsi128_si256(
            _mm_loadl_epi64((__m128i *)(data + 12 * src_stride)));

        const __m256i src_1112a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))),
            src10, 0x20);

        const __m256i src_1011 = _mm256_unpacklo_epi8(src_1011a, v_zero);
        const __m256i src_1112 = _mm256_unpacklo_epi8(src_1112a, v_zero);

        s[5] = _mm256_unpacklo_epi16(src_1011, src_1112);
        s[11] = _mm256_unpackhi_epi16(src_1011, src_1112);

        const __m256i res_lo = convolve_12taps(s, coeffs);

        const __m256i res_32b_lo = _mm256_sra_epi32(
            _mm256_add_epi32(res_lo, right_shift_const), right_shift);
        // 8 bit conversion and saturation to uint8
        __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 4) {
          // Full 8-pixel column: also convolve the high pixel halves.
          const __m256i res_hi = convolve_12taps(s + 6, coeffs);

          const __m256i res_32b_hi = _mm256_sra_epi32(
              _mm256_add_epi32(res_hi, right_shift_const), right_shift);
          __m256i res_16b_hi = _mm256_packs_epi32(res_32b_hi, res_32b_hi);
          // 8 bit conversion and saturation to uint8
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi32(res_8b_lo, res_8b_hi);

          const __m128i res_0 = _mm256_extracti128_si256(res_a, 0);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          // Narrow tail: store 4 or 2 pixels per row.
          const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 2) {
            *(uint32_t *)&dst[i * dst_stride + j] =
                (uint32_t)_mm_cvtsi128_si32(res_0);
            *(uint32_t *)&dst[i * dst_stride + j + dst_stride] =
                (uint32_t)_mm_cvtsi128_si32(res_1);
          } else {
            *(uint16_t *)&dst[i * dst_stride + j] =
                (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)&dst[i * dst_stride + j + dst_stride] =
                (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
        // Rotate the sliding window down by one pair of rows.
        s[0] = s[1];
        s[1] = s[2];
        s[2] = s[3];
        s[3] = s[4];
        s[4] = s[5];

        s[6] = s[7];
        s[7] = s[8];
        s[8] = s[9];
        s[9] = s[10];
        s[10] = s[11];
      }
    }
  } else {
    // Default path: full 8-tap (SUBPEL_TAPS) vertical filter.
    const int fo_vert = filter_params_y->taps / 2 - 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride;

    for (j = 0; j < w; j += 16) {
      const uint8_t *data = &src_ptr[j];
      // src6 carries the most recently loaded row across loop iterations.
      __m256i src6;

      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
      d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
      d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      const __m256i src_34a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);

      const __m256i src_45a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);

      src6 = _mm256_castsi128_si256(
          _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
      const __m256i src_56a =
          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[5]), src6, 0x20);

      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
      s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);

      s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
      s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
      s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        const __m256i src_67a = _mm256_permute2x128_si256(
            src6,
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
            0x20);

        src6 = _mm256_castsi128_si256(
            _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
        // NOTE(review): row (data + 7 * src_stride) is loaded twice here
        // instead of being reused; redundant but harmless.
        const __m256i src_78a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
            src6, 0x20);

        s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
        s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);

        const __m256i res_lo = convolve_lowbd(s, coeffs);

        /* rounding code */
        // shift by F - 1
        const __m256i res_16b_lo = _mm256_sra_epi16(
            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
        // 8 bit conversion and saturation to uint8
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 8) {
          const __m256i res_hi = convolve_lowbd(s + 4, coeffs);

          /* rounding code */
          // shift by F - 1
          const __m256i res_16b_hi = _mm256_sra_epi16(
              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
          // 8 bit conversion and saturation to uint8
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);

          const __m128i res_0 = _mm256_castsi256_si128(res_a);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          // Narrow tail: store 8, 4 or 2 pixels per row.
          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 4) {
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                             res_1);
          } else if (w - j > 2) {
            xx_storel_32(&dst[i * dst_stride + j], res_0);
            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
          } else {
            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
            __m128i *const p_1 =
                (__m128i *)&dst[i * dst_stride + j + dst_stride];
            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
        // Rotate the sliding window down by one pair of rows.
        s[0] = s[1];
        s[1] = s[2];
        s[2] = s[3];

        s[4] = s[5];
        s[5] = s[6];
        s[6] = s[7];
      }
    }
  }
}
520
/*
 * av1_convolve_x_sr_avx2() - AVX2 horizontal (x-direction) single-reference
 * convolution of 8-bit pixels.
 *
 * \param src             Source pixel buffer; pixels left/right of the block
 *                        are read according to the filter's horizontal offset.
 * \param src_stride      Source buffer stride in pixels.
 * \param dst             8-bit destination buffer (w x h output pixels).
 * \param dst_stride      Destination buffer stride in pixels.
 * \param w, h            Output block width and height. Narrow blocks
 *                        (w <= 8, or w <= 4 for 12-tap) process two rows per
 *                        iteration (h assumed even in those paths).
 * \param filter_params_x Horizontal interpolation filter description.
 * \param subpel_x_qn     Sub-pixel phase; masked with SUBPEL_MASK to select
 *                        the filter kernel.
 * \param conv_params     Supplies round_0; the result is rounded twice:
 *                        first by (round_0 - 1) -- the "-1" because the
 *                        16-bit paths use coefficients pre-divided by 2 --
 *                        then by bits = FILTER_BITS - round_0.
 *
 * The effective tap count (4, 6, 8 or 12) is detected from the kernel's
 * zero coefficients and a dedicated code path is taken for each.
 */
void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *filter_params_x,
                            const int subpel_x_qn,
                            ConvolveParams *conv_params) {
  const int bits = FILTER_BITS - conv_params->round_0;
  const __m128i round_shift = _mm_cvtsi32_si128(bits);
  __m256i round_0_const =
      _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
  __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
  __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1);
  int i, horiz_tap = SUBPEL_TAPS;

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
  assert(conv_params->round_0 > 0);

  // filt[] holds shuffle control data for the convolve_lowbd_x* helpers
  // (filt_global_avx2 is declared in convolve_avx2.h).
  __m256i coeffs[6], filt[4];
  filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
  filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));

  // Detect the effective tap count from the kernel's zero coefficients.
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  if (filter_params_x->taps == 12) {
    horiz_tap = 12;
  } else if (!(filter[0] | filter[1] | filter[6] | filter[7])) {
    horiz_tap = 4;
  } else if (!(filter[0] | filter[7])) {
    horiz_tap = 6;
  }

  // Load/duplicate the filter coefficients in the layout the matching
  // convolve_* helper expects.
  if (horiz_tap == 6)
    prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs);
  else if (horiz_tap == 12) {
    prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs);
  } else {
    prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
  }

  // horz_filt as 4 tap
  if (horiz_tap == 4) {
    const int fo_horiz = 1;
    const uint8_t *const src_ptr = src - fo_horiz;
    if (w <= 8) {
      // Narrow block: two rows per iteration, one row per 128-bit lane.
      for (i = 0; i < h; i += 2) {
        const __m256i data = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
            _mm256_castsi128_si256(_mm_loadu_si128(
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
            0x20);

        // coeffs + 1 skips the (zero) outer-tap coefficient pair.
        __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                   round_0_shift);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                   round_shift);

        /* rounding code */
        // 8 bit conversion and saturation to uint8
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);

        if (w > 4) {
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
        } else if (w > 2) {
          xx_storel_32(&dst[i * dst_stride], res_0);
          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
        } else {
          __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
          __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
          *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
          *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
        }
      }
    } else {
      // Wide block: one row per iteration, 16 output pixels per step.
      for (i = 0; i < h; ++i) {
        for (int j = 0; j < w; j += 16) {
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
          // 18 19 20 21 22 23
          const __m256i data = _mm256_inserti128_si256(
              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
              1);

          __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                     round_0_shift);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                     round_shift);

          /* rounding code */
          // 8 bit conversion and saturation to uint8
          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

          // Store values into the destination buffer
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
          __m128i res = _mm256_castsi256_si128(res_8b);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
        }
      }
    }
  } else if (horiz_tap == 6) {
    const int fo_horiz = horiz_tap / 2 - 1;
    const uint8_t *const src_ptr = src - fo_horiz;
    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));

    if (w <= 8) {
      // Narrow block: two rows per iteration, one row per 128-bit lane.
      for (i = 0; i < h; i += 2) {
        const __m256i data = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
            _mm256_castsi128_si256(_mm_loadu_si128(
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
            0x20);

        __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                   round_0_shift);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                   round_shift);

        /* rounding code */
        // 8 bit conversion and saturation to uint8
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
        if (w > 4) {
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
        } else if (w > 2) {
          xx_storel_32(&dst[i * dst_stride], res_0);
          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
        } else {
          __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
          __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
          // NOTE(review): the sibling branches cast to (uint16_t)
          // explicitly; the implicit narrowing here is equivalent but
          // inconsistent -- consider adding the cast for uniformity.
          *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
          *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
        }
      }
    } else {
      // Wide block: one row per iteration, 16 output pixels per step.
      for (i = 0; i < h; ++i) {
        for (int j = 0; j < w; j += 16) {
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
          // 18 19 20 21 22 23
          const __m256i data = _mm256_inserti128_si256(
              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
              1);

          __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                     round_0_shift);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                     round_shift);

          /* rounding code */
          // 8 bit conversion and saturation to uint8
          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

          // Store values into the destination buffer
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
          __m128i res = _mm256_castsi256_si128(res_8b);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
        }
      }
    }
  } else if (horiz_tap == 12) {  // horiz_tap == 12
    const int fo_horiz = filter_params_x->taps / 2 - 1;
    const uint8_t *const src_ptr = src - fo_horiz;
    const __m256i v_zero = _mm256_setzero_si256();
    // The 12-tap path accumulates in 32 bits with full-precision
    // coefficients, so the first rounding stage uses round_0 (not
    // round_0 - 1) and 32-bit rounding constants.
    round_0_const = _mm256_set1_epi32((1 << (conv_params->round_0)) >> 1);
    round_const = _mm256_set1_epi32((1 << bits) >> 1);
    round_0_shift = _mm_cvtsi32_si128(conv_params->round_0);
    __m256i s[6];

    if (w <= 4) {
      // Narrow block: two rows per iteration, one row per 128-bit lane.
      for (i = 0; i < h; i += 2) {
        const __m256i data = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
            _mm256_castsi128_si256(_mm_loadu_si128(
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
            0x20);
        // row0 0..7 row1 0..7
        const __m256i s_16l = _mm256_unpacklo_epi8(data, v_zero);
        // row0 8..F row1 8..F
        const __m256i s_16h = _mm256_unpackhi_epi8(data, v_zero);

        // Duplicate each 16-bit pixel, then use alignr to build the six
        // shifted pixel-pair vectors convolve_12taps() consumes.
        // row0 00 00 01 01 .. 03 03 row1 00 00 01 01 .. 03 03
        const __m256i s_ll = _mm256_unpacklo_epi16(s_16l, s_16l);
        // row0 04 04 .. 07 07 row1 04 04 .. 07 07
        const __m256i s_lh = _mm256_unpackhi_epi16(s_16l, s_16l);

        // row0 08 08 09 09 .. 0B 0B row1 08 08 09 09 .. 0B 0B
        const __m256i s_hl = _mm256_unpacklo_epi16(s_16h, s_16h);
        // row0 0C 0C .. 0F 0F row1 0C 0C .. 0F 0F
        const __m256i s_hh = _mm256_unpackhi_epi16(s_16h, s_16h);

        // 00 01 01 02 02 03 03 04 10 11 11 12 12 13 13 14
        s[0] = _mm256_alignr_epi8(s_lh, s_ll, 2);
        // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16
        s[1] = _mm256_alignr_epi8(s_lh, s_ll, 10);
        // 04 05 05 06 06 07 07 08 14 15 15 16 16 17 17 18
        s[2] = _mm256_alignr_epi8(s_hl, s_lh, 2);
        // 06 07 07 08 08 09 09 0A 16 17 17 18 18 19 19 1A
        s[3] = _mm256_alignr_epi8(s_hl, s_lh, 10);
        // 08 09 09 0A 0A 0B 0B 0C 18 19 19 1A 1A 1B 1B 1C
        s[4] = _mm256_alignr_epi8(s_hh, s_hl, 2);
        // 0A 0B 0B 0C 0C 0D 0D 0E 1A 1B 1B 1C 1C 1D 1D 1E
        s[5] = _mm256_alignr_epi8(s_hh, s_hl, 10);

        const __m256i res_lo = convolve_12taps(s, coeffs);

        __m256i res_32b_lo = _mm256_sra_epi32(
            _mm256_add_epi32(res_lo, round_0_const), round_0_shift);

        // 00 01 02 03 10 12 13 14
        res_32b_lo = _mm256_sra_epi32(_mm256_add_epi32(res_32b_lo, round_const),
                                      round_shift);
        // 8 bit conversion and saturation to uint8
        // 00 01 02 03 00 01 02 03 10 11 12 13 10 11 12 13
        __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
        // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03
        // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03
        const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
        // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13
        const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
        if (w > 2) {
          // 00 01 02 03
          *(uint32_t *)&dst[i * dst_stride] =
              (uint32_t)_mm_cvtsi128_si32(res_0);
          // 10 11 12 13
          *(uint32_t *)&dst[i * dst_stride + dst_stride] =
              (uint32_t)_mm_cvtsi128_si32(res_1);
        } else {
          // 00 01
          *(uint16_t *)&dst[i * dst_stride] =
              (uint16_t)_mm_cvtsi128_si32(res_0);
          // 10 11
          *(uint16_t *)&dst[i * dst_stride + dst_stride] =
              (uint16_t)_mm_cvtsi128_si32(res_1);
        }
      }
    } else {
      // Wide block: one row per iteration, 8 output pixels per step
      // (4 per 128-bit lane, built from two overlapping 4-pixel loads).
      for (i = 0; i < h; i++) {
        for (int j = 0; j < w; j += 8) {
          const __m256i data = _mm256_permute2x128_si256(
              _mm256_castsi128_si256(
                  _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),
              _mm256_castsi128_si256(_mm_loadu_si128(
                  (__m128i *)(&src_ptr[i * src_stride + j + 4]))),
              0x20);
          // row0 0..7 4..B
          const __m256i s_16l = _mm256_unpacklo_epi8(data, v_zero);
          // row0 8..F C..13
          const __m256i s_16h = _mm256_unpackhi_epi8(data, v_zero);

          // row0 00 00 01 01 .. 03 03 04 04 05 05 .. 07 07
          const __m256i s_ll = _mm256_unpacklo_epi16(s_16l, s_16l);
          // row0 04 04 .. 07 07 08 08 .. 0B 0B
          const __m256i s_lh = _mm256_unpackhi_epi16(s_16l, s_16l);

          // row0 08 08 09 09 .. 0B 0B 0C 0C 0D 0D .. 0F 0F
          const __m256i s_hl = _mm256_unpacklo_epi16(s_16h, s_16h);
          // row0 0C 0C 0D 0D .. 0F 0F 10 10 11 11 .. 13 13
          const __m256i s_hh = _mm256_unpackhi_epi16(s_16h, s_16h);

          s[0] = _mm256_alignr_epi8(s_lh, s_ll, 2);
          s[1] = _mm256_alignr_epi8(s_lh, s_ll, 10);
          s[2] = _mm256_alignr_epi8(s_hl, s_lh, 2);
          s[3] = _mm256_alignr_epi8(s_hl, s_lh, 10);
          s[4] = _mm256_alignr_epi8(s_hh, s_hl, 2);
          s[5] = _mm256_alignr_epi8(s_hh, s_hl, 10);

          const __m256i res_lo = convolve_12taps(s, coeffs);

          __m256i res_32b_lo = _mm256_sra_epi32(
              _mm256_add_epi32(res_lo, round_0_const), round_0_shift);

          res_32b_lo = _mm256_sra_epi32(
              _mm256_add_epi32(res_32b_lo, round_const), round_shift);
          // 8 bit conversion and saturation to uint8
          __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
          __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
          const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          *(uint32_t *)&dst[i * dst_stride + j] =
              (uint32_t)_mm_cvtsi128_si32(res_0);
          *(uint32_t *)&dst[i * dst_stride + j + 4] =
              (uint32_t)_mm_cvtsi128_si32(res_1);
        }
      }
    }
  } else {
    // Default path: full 8-tap (SUBPEL_TAPS) horizontal filter.
    const int fo_horiz = filter_params_x->taps / 2 - 1;
    const uint8_t *const src_ptr = src - fo_horiz;
    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));

    if (w <= 8) {
      // Narrow block: two rows per iteration, one row per 128-bit lane.
      for (i = 0; i < h; i += 2) {
        const __m256i data = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
            _mm256_castsi128_si256(_mm_loadu_si128(
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
            0x20);

        __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                   round_0_shift);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                   round_shift);

        /* rounding code */
        // 8 bit conversion and saturation to uint8
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
        if (w > 4) {
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
        } else if (w > 2) {
          xx_storel_32(&dst[i * dst_stride], res_0);
          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
        } else {
          __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
          __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
          // NOTE(review): the sibling branches cast to (uint16_t)
          // explicitly; the implicit narrowing here is equivalent but
          // inconsistent -- consider adding the cast for uniformity.
          *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
          *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
        }
      }
    } else {
      // Wide block: one row per iteration, 16 output pixels per step.
      for (i = 0; i < h; ++i) {
        for (int j = 0; j < w; j += 16) {
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
          // 18 19 20 21 22 23
          const __m256i data = _mm256_inserti128_si256(
              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
              1);

          __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                     round_0_shift);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                     round_shift);

          /* rounding code */
          // 8 bit conversion and saturation to uint8
          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

          // Store values into the destination buffer
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
          __m128i res = _mm256_castsi256_si128(res_8b);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
        }
      }
    }
  }
}
908