/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <tmmintrin.h>
#include <smmintrin.h>
#include <assert.h>

#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve_sse2.h"
#include "aom_dsp/x86/convolve_sse4_1.h"
#include "av1/common/convolve.h"

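// Distance-weighted compound "copy" path for high bit-depth blocks that need
// no subpel filtering: the source is only shifted up to the compound
// prediction precision before being stored to, or averaged with, the
// compound buffer.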
void av1_highbd_dist_wtd_convolve_2d_copy_sse4_1(const uint16_t *src,
                                                  int src_stride, uint16_t *dst0,
                                                  int dst_stride0, int w, int h,
                                                  ConvolveParams *conv_params,
                                                  int bd) {
  CONV_BUF_TYPE *dst = conv_params->dst;
  int dst_stride = conv_params->dst_stride;

  const int bits =
      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
  const __m128i left_shift = _mm_cvtsi32_si128(bits);
  const int do_average = conv_params->do_average;
  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m128i wt0 = _mm_set1_epi32(w0);
  const __m128i wt1 = _mm_set1_epi32(w1);
  const __m128i zero = _mm_setzero_si128();
  int i, j;

  const int offset_0 =
      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
  const __m128i offset_const = _mm_set1_epi32(offset);
  const __m128i offset_const_16b = _mm_set1_epi16(offset);
  const int rounding_shift =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1);
  const __m128i clip_pixel_to_bd =
      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));

  assert(bits <= 4);

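  // Widths that are a multiple of 8 are processed 8 pixels per row; otherwise
  // the width is a multiple of 4 and two rows of 4 samples are packed into a
  // single 128-bit register per iteration.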
  if (!(w % 8)) {
    for (i = 0; i < h; i += 1) {
      for (j = 0; j < w; j += 8) {
        const __m128i src_16bit =
            _mm_loadu_si128((__m128i *)(&src[i * src_stride + j]));
        const __m128i res = _mm_sll_epi16(src_16bit, left_shift);
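        // First pass (do_average == 0): add the compound offset and store the
        // up-shifted prediction in the 16-bit compound buffer.  Second pass
        // (do_average == 1): combine it with the prediction already in the
        // buffer, remove the offset, round, clip to the bit depth and write
        // the final pixels to dst0.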
        if (do_average) {
          const __m128i data_0 =
              _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));

          const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero);
          const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero);

          const __m128i res_32b_lo = _mm_unpacklo_epi16(res, zero);
          const __m128i res_unsigned_lo =
              _mm_add_epi32(res_32b_lo, offset_const);

          const __m128i comp_avg_res_lo =
              highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0,
                                     &wt1, use_dist_wtd_comp_avg);

          const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero);
          const __m128i res_unsigned_hi =
              _mm_add_epi32(res_32b_hi, offset_const);

          const __m128i comp_avg_res_hi =
              highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0,
                                     &wt1, use_dist_wtd_comp_avg);

          const __m128i round_result_lo = highbd_convolve_rounding_sse2(
              &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
          const __m128i round_result_hi = highbd_convolve_rounding_sse2(
              &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);

          const __m128i res_16b =
              _mm_packus_epi32(round_result_lo, round_result_hi);
          const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);

          _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
        } else {
          const __m128i res_unsigned_16b =
              _mm_adds_epu16(res, offset_const_16b);

          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]),
                          res_unsigned_16b);
        }
      }
    }
  } else if (!(w % 4)) {
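    // 4-wide path: two rows per iteration, with row i in the low 64 bits of
    // each register and row i + 1 in the high 64 bits.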
    for (i = 0; i < h; i += 2) {
      for (j = 0; j < w; j += 4) {
        const __m128i src_row_0 =
            _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j]));
        const __m128i src_row_1 =
            _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j + src_stride]));
        const __m128i src_10 = _mm_unpacklo_epi64(src_row_0, src_row_1);

        const __m128i res = _mm_sll_epi16(src_10, left_shift);

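        // As in the 8-wide path: either average against the two rows already
        // in the compound buffer and write the final, clipped pixels, or
        // store the offset prediction for the second pass.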
        if (do_average) {
          const __m128i data_0 =
              _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
          const __m128i data_1 = _mm_loadl_epi64(
              (__m128i *)(&dst[i * dst_stride + j + dst_stride]));

          const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero);
          const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero);

          const __m128i res_32b = _mm_unpacklo_epi16(res, zero);
          const __m128i res_unsigned_lo = _mm_add_epi32(res_32b, offset_const);

          const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero);
          const __m128i res_unsigned_hi =
              _mm_add_epi32(res_32b_hi, offset_const);

          const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
              &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg);
          const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
              &data_ref_1, &res_unsigned_hi, &wt0, &wt1, use_dist_wtd_comp_avg);

          const __m128i round_result_lo = highbd_convolve_rounding_sse2(
              &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
          const __m128i round_result_hi = highbd_convolve_rounding_sse2(
              &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);

          const __m128i res_16b =
              _mm_packus_epi32(round_result_lo, round_result_hi);
          const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);

          const __m128i res_1 = _mm_srli_si128(res_clip, 8);

          _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
          _mm_storel_epi64(
              (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
        } else {
          const __m128i res_unsigned_16b =
              _mm_adds_epu16(res, offset_const_16b);

          const __m128i res_1 = _mm_srli_si128(res_unsigned_16b, 8);

          _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]),
                           res_unsigned_16b);
          _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
                           res_1);
        }
      }
    }
  }
}

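// Distance-weighted compound 2D convolution (horizontal then vertical 8-tap
// pass) for high bit-depth input.  The horizontal pass writes a 16-bit
// intermediate block; the vertical pass either stores the offset result into
// the compound buffer or averages it with the existing prediction and writes
// the final pixels to dst0.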
void av1_highbd_dist_wtd_convolve_2d_sse4_1(
    const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  DECLARE_ALIGNED(16, int16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
  CONV_BUF_TYPE *dst = conv_params->dst;
  int dst_stride = conv_params->dst_stride;
  int im_h = h + filter_params_y->taps - 1;
  int im_stride = MAX_SB_SIZE;
  int i, j;
  const int do_average = conv_params->do_average;
  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
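  // src_ptr points at the top-left sample of the filter support: fo_vert rows
  // above and fo_horiz columns to the left of the first output pixel.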

  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m128i wt0 = _mm_set1_epi32(w0);
  const __m128i wt1 = _mm_set1_epi32(w1);

  const int offset_0 =
      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
  const __m128i offset_const = _mm_set1_epi32(offset);
  const int rounding_shift =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1);
  const __m128i clip_pixel_to_bd =
      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));

  // Check that, even with 12-bit input, the intermediate values will fit
  // into an unsigned 16-bit intermediate array.
  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);

  /* Horizontal filter */
  {
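    // The horizontal pass filters im_h = h + taps - 1 rows so the vertical
    // pass has the extra rows it needs above and below the block.  Results
    // are shifted right by round_0 and kept as 16-bit values in im_block.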
    const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
        filter_params_x, subpel_x_qn & SUBPEL_MASK);
    const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);

    // coeffs 0 1 0 1 2 3 2 3
    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
    // coeffs 4 5 4 5 6 7 6 7
    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);

    // coeffs 0 1 0 1 0 1 0 1
    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
    // coeffs 2 3 2 3 2 3 2 3
    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
    // coeffs 4 5 4 5 4 5 4 5
    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
    // coeffs 6 7 6 7 6 7 6 7
    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);

    const __m128i round_const = _mm_set1_epi32(
        ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
    const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);

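    // Each output sample needs 8 consecutive input samples.  Two 8-sample
    // loads plus _mm_alignr_epi8 provide the shifted windows, and
    // _mm_madd_epi16 multiplies adjacent sample pairs by the matching
    // coefficient pair and accumulates into 32 bits.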
    for (i = 0; i < im_h; ++i) {
      for (j = 0; j < w; j += 8) {
        const __m128i data =
            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
        const __m128i data2 =
            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]);

        // Filter even-index pixels
        const __m128i res_0 = _mm_madd_epi16(data, coeff_01);
        const __m128i res_2 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23);
        const __m128i res_4 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45);
        const __m128i res_6 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67);

        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
                                         _mm_add_epi32(res_2, res_6));
        res_even =
            _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);

        // Filter odd-index pixels
        const __m128i res_1 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01);
        const __m128i res_3 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23);
        const __m128i res_5 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45);
        const __m128i res_7 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67);

        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
                                        _mm_add_epi32(res_3, res_7));
        res_odd =
            _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);

        // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
        __m128i res = _mm_packs_epi32(res_even, res_odd);
        _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
      }
    }
  }

  /* Vertical filter */
  {
    const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
        filter_params_y, subpel_y_qn & SUBPEL_MASK);
    const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);

    // coeffs 0 1 0 1 2 3 2 3
    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
    // coeffs 4 5 4 5 6 7 6 7
    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);

    // coeffs 0 1 0 1 0 1 0 1
    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
    // coeffs 2 3 2 3 2 3 2 3
    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
    // coeffs 4 5 4 5 4 5 4 5
    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
    // coeffs 6 7 6 7 6 7 6 7
    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);

    const __m128i round_const = _mm_set1_epi32(
        ((1 << conv_params->round_1) >> 1) -
        (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
    const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);

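    // The vertical pass interleaves pairs of intermediate rows so that
    // _mm_madd_epi16 applies two taps at once.  The negative term in
    // round_const removes the offset added by the horizontal stage, scaled
    // through the vertical filter gain.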
    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; j += 8) {
        // Filter even-index pixels
        const int16_t *data = &im_block[i * im_stride + j];
        const __m128i src_0 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
                               *(__m128i *)(data + 1 * im_stride));
        const __m128i src_2 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
                               *(__m128i *)(data + 3 * im_stride));
        const __m128i src_4 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
                               *(__m128i *)(data + 5 * im_stride));
        const __m128i src_6 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
                               *(__m128i *)(data + 7 * im_stride));

        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);

        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
                                               _mm_add_epi32(res_4, res_6));

        // Filter odd-index pixels
        const __m128i src_1 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
                               *(__m128i *)(data + 1 * im_stride));
        const __m128i src_3 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
                               *(__m128i *)(data + 3 * im_stride));
        const __m128i src_5 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
                               *(__m128i *)(data + 5 * im_stride));
        const __m128i src_7 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
                               *(__m128i *)(data + 7 * im_stride));

        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);

        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
                                              _mm_add_epi32(res_5, res_7));

        // Rearrange pixels back into the order 0 ... 7
        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);

        const __m128i res_lo_round =
            _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);

        const __m128i res_unsigned_lo =
            _mm_add_epi32(res_lo_round, offset_const);

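        // Blocks narrower than 8 produce only 4 valid results per row, so
        // only the low half is computed and stored; wider blocks also round,
        // offset and store the high 4 results.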
        if (w < 8) {
          if (do_average) {
            const __m128i data_0 =
                _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));

            const __m128i data_ref_0 = _mm_cvtepu16_epi32(data_0);

            const __m128i comp_avg_res =
                highbd_comp_avg_sse4_1(&data_ref_0, &res_unsigned_lo, &wt0,
                                       &wt1, use_dist_wtd_comp_avg);

            const __m128i round_result = highbd_convolve_rounding_sse2(
                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);

            const __m128i res_16b =
                _mm_packus_epi32(round_result, round_result);
            const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);

            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
          } else {
            const __m128i res_16b =
                _mm_packus_epi32(res_unsigned_lo, res_unsigned_lo);
            _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_16b);
          }
        } else {
          const __m128i res_hi_round =
              _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);

          const __m128i res_unsigned_hi =
              _mm_add_epi32(res_hi_round, offset_const);

          if (do_average) {
            const __m128i data_lo =
                _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
            const __m128i data_hi =
                _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j + 4]));

            const __m128i data_ref_0_lo = _mm_cvtepu16_epi32(data_lo);
            const __m128i data_ref_0_hi = _mm_cvtepu16_epi32(data_hi);

            const __m128i comp_avg_res_lo =
                highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0,
                                       &wt1, use_dist_wtd_comp_avg);
            const __m128i comp_avg_res_hi =
                highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0,
                                       &wt1, use_dist_wtd_comp_avg);

            const __m128i round_result_lo =
                highbd_convolve_rounding_sse2(&comp_avg_res_lo, &offset_const,
                                              &rounding_const, rounding_shift);
            const __m128i round_result_hi =
                highbd_convolve_rounding_sse2(&comp_avg_res_hi, &offset_const,
                                              &rounding_const, rounding_shift);

            const __m128i res_16b =
                _mm_packus_epi32(round_result_lo, round_result_hi);
            const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);

            _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
          } else {
            const __m128i res_16b =
                _mm_packus_epi32(res_unsigned_lo, res_unsigned_hi);
            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16b);
          }
        }
      }
    }
  }
}