/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>
#include <assert.h>

#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/x86/convolve_avx2.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "av1/common/convolve.h"

void av1_highbd_convolve_2d_sr_ssse3(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd);

void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
                                    const InterpFilterParams *filter_params_y,
                                    const int subpel_x_qn,
                                    const int subpel_y_qn,
                                    ConvolveParams *conv_params, int bd) {
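  // The AVX2 path below handles filters of up to 8 taps; 12-tap filters are
  // delegated to the SSSE3 implementation.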
  if (filter_params_x->taps == 12) {
    av1_highbd_convolve_2d_sr_ssse3(src, src_stride, dst, dst_stride, w, h,
                                    filter_params_x, filter_params_y,
                                    subpel_x_qn, subpel_y_qn, conv_params, bd);
    return;
  }

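  // Intermediate buffer for the horizontal pass: one 8-pixel-wide column
  // strip, im_h = h + taps - 1 rows tall to cover the vertical filter's
  // support.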
  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
  int im_h = h + filter_params_y->taps - 1;
  int im_stride = 8;
  int i, j;
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;

  // Check that, even with 12-bit input, the intermediate values will fit
  // into an unsigned 16-bit intermediate array.
  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);

  __m256i s[8], coeffs_y[4], coeffs_x[4];

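  // Horizontal-pass rounding. The (1 << (bd + FILTER_BITS - 1)) offset keeps
  // the intermediate sums non-negative (see the assert above).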
  const __m256i round_const_x = _mm256_set1_epi32(
      ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);

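  // Vertical-pass rounding; the subtracted term removes the offset that was
  // added in the horizontal pass.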
  const __m256i round_const_y = _mm256_set1_epi32(
      ((1 << conv_params->round_1) >> 1) -
      (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
  const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);

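  // Final rounding shift back to the output depth, plus clamping to the
  // valid pixel range for bd (255, 1023 or 4095).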
  const int bits =
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
  const __m256i clip_pixel =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();

  prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);

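  // Process the frame in strips of 8 columns: filter each strip horizontally
  // into im_block, then vertically from im_block into dst.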
  for (j = 0; j < w; j += 8) {
    /* Horizontal filter */
    {
      for (i = 0; i < im_h; i += 2) {
        const __m256i row0 =
            _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
        // When im_h is odd, the last iteration has no second row; use zeros.
        __m256i row1 = _mm256_set1_epi16(0);
        if (i + 1 < im_h)
          row1 =
              _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);

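        // r0 holds pixels 0..7 of rows i and i + 1 (one row per 128-bit
        // lane) and r1 holds pixels 8..15, so _mm256_alignr_epi8 can slide
        // the filter window across both rows at once.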
        const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
        const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);

        // even pixels
        s[0] = _mm256_alignr_epi8(r1, r0, 0);
        s[1] = _mm256_alignr_epi8(r1, r0, 4);
        s[2] = _mm256_alignr_epi8(r1, r0, 8);
        s[3] = _mm256_alignr_epi8(r1, r0, 12);

        __m256i res_even = convolve(s, coeffs_x);
        res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
                                    round_shift_x);

        // odd pixels
        s[0] = _mm256_alignr_epi8(r1, r0, 2);
        s[1] = _mm256_alignr_epi8(r1, r0, 6);
        s[2] = _mm256_alignr_epi8(r1, r0, 10);
        s[3] = _mm256_alignr_epi8(r1, r0, 14);

        __m256i res_odd = convolve(s, coeffs_x);
        res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
                                   round_shift_x);

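        // Narrow to 16 bits and re-interleave the even/odd outputs back into
        // pixel order before storing to the intermediate buffer.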
        __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
        __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
        __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);

        _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
      }
    }

    /* Vertical filter */
    {
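      // Prime the sliding window for the vertical pass. Consecutive loads
      // overlap by one row of im_block, so after the unpacks lane 0 carries
      // the taps for output row i and lane 1 the taps for row i + 1.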
      __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
      __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
      __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
      __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
      __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
      __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));

      s[0] = _mm256_unpacklo_epi16(s0, s1);
      s[1] = _mm256_unpacklo_epi16(s2, s3);
      s[2] = _mm256_unpacklo_epi16(s4, s5);

      s[4] = _mm256_unpackhi_epi16(s0, s1);
      s[5] = _mm256_unpackhi_epi16(s2, s3);
      s[6] = _mm256_unpackhi_epi16(s4, s5);

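      // s[0..3] hold the interleaved taps for the four low columns and
      // s[4..7] for the four high columns; each iteration below filters two
      // output rows.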
      for (i = 0; i < h; i += 2) {
        const int16_t *data = &im_block[i * im_stride];

        const __m256i s6 =
            _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
        const __m256i s7 =
            _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));

        s[3] = _mm256_unpacklo_epi16(s6, s7);
        s[7] = _mm256_unpackhi_epi16(s6, s7);

        const __m256i res_a = convolve(s, coeffs_y);
        __m256i res_a_round = _mm256_sra_epi32(
            _mm256_add_epi32(res_a, round_const_y), round_shift_y);

        res_a_round = _mm256_sra_epi32(
            _mm256_add_epi32(res_a_round, round_const_bits), round_shift_bits);

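        // Store two output rows; the branches handle strip widths of 8, 4
        // and 2 pixels respectively.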
        if (w - j > 4) {
          const __m256i res_b = convolve(s + 4, coeffs_y);
          __m256i res_b_round = _mm256_sra_epi32(
              _mm256_add_epi32(res_b, round_const_y), round_shift_y);
          res_b_round =
              _mm256_sra_epi32(_mm256_add_epi32(res_b_round, round_const_bits),
                               round_shift_bits);

          __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
          res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
          res_16bit = _mm256_max_epi16(res_16bit, zero);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
                           _mm256_castsi256_si128(res_16bit));
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           _mm256_extracti128_si256(res_16bit, 1));
        } else if (w == 4) {
          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
          res_a_round = _mm256_max_epi16(res_a_round, zero);

          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
                           _mm256_castsi256_si128(res_a_round));
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           _mm256_extracti128_si256(res_a_round, 1));
        } else {
          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
          res_a_round = _mm256_max_epi16(res_a_round, zero);

          xx_storel_32((__m128i *)&dst[i * dst_stride + j],
                       _mm256_castsi256_si128(res_a_round));
          xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride],
                       _mm256_extracti128_si256(res_a_round, 1));
        }

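        // Slide the vertical tap window down by two rows for the next
        // iteration.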
        s[0] = s[1];
        s[1] = s[2];
        s[2] = s[3];

        s[4] = s[5];
        s[5] = s[6];
        s[6] = s[7];
      }
    }
  }
}