/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <tmmintrin.h>
#include <assert.h>

#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/x86/convolve_sse2.h"
#include "aom_dsp/x86/convolve_common_intrin.h"

void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_y,
                                    const int subpel_y_qn, int bd) {
  int i, j;
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const uint16_t *const src_ptr = src - fo_vert * src_stride;
  const int bits = FILTER_BITS;

  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
  const __m128i clip_pixel =
      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m128i zero = _mm_setzero_si128();
  if (filter_params_y->taps == 12) {
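    // 12-tap vertical path. s[] holds the interleaved sliding window:
    // s[0..5]  low halves (pixels 0-3) of row pairs (0,1)..(10,11) for row i,
    // s[6..11] high halves (pixels 4-7) of the same pairs, and
    // s[12..23] the same layout built from rows 1..12 for row i + 1.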
    __m128i s[24], coeffs_y[6];

    prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs_y);

    for (j = 0; j < w; j += 8) {
      const uint16_t *data = &src_ptr[j];
      /* Vertical filter */
      __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
      __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
      __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
      __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
      __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
      __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
      __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
      __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride));
      __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride));
      __m128i s9 = _mm_loadu_si128((__m128i *)(data + 9 * src_stride));
      __m128i s10 = _mm_loadu_si128((__m128i *)(data + 10 * src_stride));

      s[0] = _mm_unpacklo_epi16(s0, s1);
      s[1] = _mm_unpacklo_epi16(s2, s3);
      s[2] = _mm_unpacklo_epi16(s4, s5);
      s[3] = _mm_unpacklo_epi16(s6, s7);
      s[4] = _mm_unpacklo_epi16(s8, s9);

      s[6] = _mm_unpackhi_epi16(s0, s1);
      s[7] = _mm_unpackhi_epi16(s2, s3);
      s[8] = _mm_unpackhi_epi16(s4, s5);
      s[9] = _mm_unpackhi_epi16(s6, s7);
      s[10] = _mm_unpackhi_epi16(s8, s9);

      s[12] = _mm_unpacklo_epi16(s1, s2);
      s[13] = _mm_unpacklo_epi16(s3, s4);
      s[14] = _mm_unpacklo_epi16(s5, s6);
      s[15] = _mm_unpacklo_epi16(s7, s8);
      s[16] = _mm_unpacklo_epi16(s9, s10);

      s[18] = _mm_unpackhi_epi16(s1, s2);
      s[19] = _mm_unpackhi_epi16(s3, s4);
      s[20] = _mm_unpackhi_epi16(s5, s6);
      s[21] = _mm_unpackhi_epi16(s7, s8);
      s[22] = _mm_unpackhi_epi16(s9, s10);

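      // Each iteration filters two output rows, so only the two newest source
      // rows of the window (offsets 11 and 12) have to be loaded.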
      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];

        __m128i s11 = _mm_loadu_si128((__m128i *)(data + 11 * src_stride));
        __m128i s12 = _mm_loadu_si128((__m128i *)(data + 12 * src_stride));

        s[5] = _mm_unpacklo_epi16(s10, s11);
        s[11] = _mm_unpackhi_epi16(s10, s11);

        s[17] = _mm_unpacklo_epi16(s11, s12);
        s[23] = _mm_unpackhi_epi16(s11, s12);

        const __m128i res_a0 = convolve_12tap(s, coeffs_y);
        __m128i res_a_round0 = _mm_sra_epi32(
            _mm_add_epi32(res_a0, round_const_bits), round_shift_bits);

        const __m128i res_a1 = convolve_12tap(s + 12, coeffs_y);
        __m128i res_a_round1 = _mm_sra_epi32(
            _mm_add_epi32(res_a1, round_const_bits), round_shift_bits);

        if (w - j > 4) {
          const __m128i res_b0 = convolve_12tap(s + 6, coeffs_y);
          __m128i res_b_round0 = _mm_sra_epi32(
              _mm_add_epi32(res_b0, round_const_bits), round_shift_bits);

          const __m128i res_b1 = convolve_12tap(s + 18, coeffs_y);
          __m128i res_b_round1 = _mm_sra_epi32(
              _mm_add_epi32(res_b1, round_const_bits), round_shift_bits);

          __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0);
          res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel);
          res_16bit0 = _mm_max_epi16(res_16bit0, zero);

          __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1);
          res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel);
          res_16bit1 = _mm_max_epi16(res_16bit1, zero);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_16bit1);
        } else if (w == 4) {
          res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
          res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
          res_a_round0 = _mm_max_epi16(res_a_round0, zero);

          res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
          res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
          res_a_round1 = _mm_max_epi16(res_a_round1, zero);

          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0);
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_a_round1);
        } else {
          res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
          res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
          res_a_round0 = _mm_max_epi16(res_a_round0, zero);

          res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
          res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
          res_a_round1 = _mm_max_epi16(res_a_round1, zero);

          *((uint32_t *)(&dst[i * dst_stride + j])) =
              _mm_cvtsi128_si32(res_a_round0);

          *((uint32_t *)(&dst[i * dst_stride + j + dst_stride])) =
              _mm_cvtsi128_si32(res_a_round1);
        }

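        // Slide the interleaved window down by two rows for the next pair of
        // output rows.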
        s[0] = s[1];
        s[1] = s[2];
        s[2] = s[3];
        s[3] = s[4];
        s[4] = s[5];

        s[6] = s[7];
        s[7] = s[8];
        s[8] = s[9];
        s[9] = s[10];
        s[10] = s[11];

        s[12] = s[13];
        s[13] = s[14];
        s[14] = s[15];
        s[15] = s[16];
        s[16] = s[17];

        s[18] = s[19];
        s[19] = s[20];
        s[20] = s[21];
        s[21] = s[22];
        s[22] = s[23];

        s10 = s12;
      }
    }
  } else {
    __m128i s[16], coeffs_y[4];
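    // Default path for filters with at most 8 taps. s[0..3]/s[4..7] hold the
    // low/high halves of interleaved row pairs (0,1)..(6,7) for output row i;
    // s[8..15] hold the same layout shifted down one row for row i + 1.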

    prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);

    for (j = 0; j < w; j += 8) {
      const uint16_t *data = &src_ptr[j];
      /* Vertical filter */
      {
        __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
        __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
        __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
        __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
        __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
        __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
        __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));

        s[0] = _mm_unpacklo_epi16(s0, s1);
        s[1] = _mm_unpacklo_epi16(s2, s3);
        s[2] = _mm_unpacklo_epi16(s4, s5);

        s[4] = _mm_unpackhi_epi16(s0, s1);
        s[5] = _mm_unpackhi_epi16(s2, s3);
        s[6] = _mm_unpackhi_epi16(s4, s5);

        s[0 + 8] = _mm_unpacklo_epi16(s1, s2);
        s[1 + 8] = _mm_unpacklo_epi16(s3, s4);
        s[2 + 8] = _mm_unpacklo_epi16(s5, s6);

        s[4 + 8] = _mm_unpackhi_epi16(s1, s2);
        s[5 + 8] = _mm_unpackhi_epi16(s3, s4);
        s[6 + 8] = _mm_unpackhi_epi16(s5, s6);

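        // Two output rows per iteration; only source rows 7 and 8 of the
        // current window are new.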
        for (i = 0; i < h; i += 2) {
          data = &src_ptr[i * src_stride + j];

          __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride));
          __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride));

          s[3] = _mm_unpacklo_epi16(s6, s7);
          s[7] = _mm_unpackhi_epi16(s6, s7);

          s[3 + 8] = _mm_unpacklo_epi16(s7, s8);
          s[7 + 8] = _mm_unpackhi_epi16(s7, s8);

          const __m128i res_a0 = convolve(s, coeffs_y);
          __m128i res_a_round0 = _mm_sra_epi32(
              _mm_add_epi32(res_a0, round_const_bits), round_shift_bits);

          const __m128i res_a1 = convolve(s + 8, coeffs_y);
          __m128i res_a_round1 = _mm_sra_epi32(
              _mm_add_epi32(res_a1, round_const_bits), round_shift_bits);

          if (w - j > 4) {
            const __m128i res_b0 = convolve(s + 4, coeffs_y);
            __m128i res_b_round0 = _mm_sra_epi32(
                _mm_add_epi32(res_b0, round_const_bits), round_shift_bits);

            const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y);
            __m128i res_b_round1 = _mm_sra_epi32(
                _mm_add_epi32(res_b1, round_const_bits), round_shift_bits);

            __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0);
            res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel);
            res_16bit0 = _mm_max_epi16(res_16bit0, zero);

            __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1);
            res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel);
            res_16bit1 = _mm_max_epi16(res_16bit1, zero);

            _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0);
            _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                             res_16bit1);
          } else if (w == 4) {
            res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
            res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
            res_a_round0 = _mm_max_epi16(res_a_round0, zero);

            res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
            res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
            res_a_round1 = _mm_max_epi16(res_a_round1, zero);

            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0);
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                             res_a_round1);
          } else {
            res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
            res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
            res_a_round0 = _mm_max_epi16(res_a_round0, zero);

            res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
            res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
            res_a_round1 = _mm_max_epi16(res_a_round1, zero);

            *((uint32_t *)(&dst[i * dst_stride + j])) =
                _mm_cvtsi128_si32(res_a_round0);

            *((uint32_t *)(&dst[i * dst_stride + j + dst_stride])) =
                _mm_cvtsi128_si32(res_a_round1);
          }

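          // Slide the window down by two rows for the next pair of output
          // rows.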
          s[0] = s[1];
          s[1] = s[2];
          s[2] = s[3];

          s[4] = s[5];
          s[5] = s[6];
          s[6] = s[7];

          s[0 + 8] = s[1 + 8];
          s[1 + 8] = s[2 + 8];
          s[2 + 8] = s[3 + 8];

          s[4 + 8] = s[5 + 8];
          s[5 + 8] = s[6 + 8];
          s[6 + 8] = s[7 + 8];

          s6 = s8;
        }
      }
    }
  }
}

void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
                                    const int subpel_x_qn,
                                    ConvolveParams *conv_params, int bd) {
  int i, j;
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const uint16_t *const src_ptr = src - fo_horiz;

  // Check that, even with 12-bit input, the intermediate values will fit
  // into an unsigned 16-bit intermediate array.
  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);

  const __m128i round_const_x =
      _mm_set1_epi32(((1 << conv_params->round_0) >> 1));
  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);

  const int bits = FILTER_BITS - conv_params->round_0;

  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
  const __m128i clip_pixel =
      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m128i zero = _mm_setzero_si128();

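  // Even and odd output pixels are convolved separately from byte-shifted
  // windows of the source row (_mm_alignr_epi8) and interleaved afterwards.
  // Results are rounded by round_0 and then by the remaining
  // FILTER_BITS - round_0 bits before clipping to the bit depth.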
  if (filter_params_x->taps == 12) {
    __m128i s[6], coeffs_x[6];
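    // For 12 taps, each s[k] pairs the samples hit by taps 2k and 2k + 1 so
    // that convolve_12tap can accumulate them with _mm_madd_epi16.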

    prepare_coeffs_12tap(filter_params_x, subpel_x_qn, coeffs_x);

    for (j = 0; j < w; j += 8) {
      /* Horizontal filter */
      {
        for (i = 0; i < h; i += 1) {
          const __m128i row00 =
              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
          const __m128i row01 =
              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
          const __m128i row02 =
              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 16)]);

          // even pixels
          s[0] = _mm_alignr_epi8(row01, row00, 0);
          s[1] = _mm_alignr_epi8(row01, row00, 4);
          s[2] = _mm_alignr_epi8(row01, row00, 8);
          s[3] = _mm_alignr_epi8(row01, row00, 12);
          s[4] = _mm_alignr_epi8(row02, row01, 0);
          s[5] = _mm_alignr_epi8(row02, row01, 4);

          __m128i res_even = convolve_12tap(s, coeffs_x);
          res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
                                   round_shift_x);
          res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_bits),
                                   round_shift_bits);

          // odd pixels
          s[0] = _mm_alignr_epi8(row01, row00, 2);
          s[1] = _mm_alignr_epi8(row01, row00, 6);
          s[2] = _mm_alignr_epi8(row01, row00, 10);
          s[3] = _mm_alignr_epi8(row01, row00, 14);
          s[4] = _mm_alignr_epi8(row02, row01, 2);
          s[5] = _mm_alignr_epi8(row02, row01, 6);

          __m128i res_odd = convolve_12tap(s, coeffs_x);
          res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x),
                                  round_shift_x);
          res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_bits),
                                  round_shift_bits);

          __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
          __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
          __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);

          res = _mm_min_epi16(res, clip_pixel);
          res = _mm_max_epi16(res, zero);

          if (w - j > 4) {
            _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
          } else if (w == 4) {
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res);
          } else {
            *((uint32_t *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res);
          }
        }
      }
    }
  } else {
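    // Filters with at most 8 taps: four sample pairs per output pixel, using
    // the same even/odd scheme as above.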
    __m128i s[4], coeffs_x[4];
    prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);

    for (j = 0; j < w; j += 8) {
      /* Horizontal filter */
      {
        for (i = 0; i < h; i += 1) {
          const __m128i row00 =
              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
          const __m128i row01 =
              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);

          // even pixels
          s[0] = _mm_alignr_epi8(row01, row00, 0);
          s[1] = _mm_alignr_epi8(row01, row00, 4);
          s[2] = _mm_alignr_epi8(row01, row00, 8);
          s[3] = _mm_alignr_epi8(row01, row00, 12);

          __m128i res_even = convolve(s, coeffs_x);
          res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
                                   round_shift_x);

          // odd pixels
          s[0] = _mm_alignr_epi8(row01, row00, 2);
          s[1] = _mm_alignr_epi8(row01, row00, 6);
          s[2] = _mm_alignr_epi8(row01, row00, 10);
          s[3] = _mm_alignr_epi8(row01, row00, 14);

          __m128i res_odd = convolve(s, coeffs_x);
          res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x),
                                  round_shift_x);

          res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_bits),
                                   round_shift_bits);
          res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_bits),
                                  round_shift_bits);

          __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
          __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
          __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);

          res = _mm_min_epi16(res, clip_pixel);
          res = _mm_max_epi16(res, zero);

          if (w - j > 4) {
            _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
          } else if (w == 4) {
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res);
          } else {
            *((uint32_t *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res);
          }
        }
      }
    }
  }
}