1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <smmintrin.h>
13 
14 #include "config/av1_rtcd.h"
15 
16 #include "av1/common/warped_motion.h"
17 
// Byte shuffle that splits 8 packed uint16 pixels into their low bytes
// (lanes 0-7) and their high bytes (lanes 8-15). This lets the 8-bit
// boundary-padding shuffles (warp_pad_left / warp_pad_right) be applied to
// 16-bit pixel data: see the out-of-boundary path in
// av1_highbd_warp_affine_sse4_1, which de-interleaves with this mask, pads,
// then re-interleaves with _mm_unpacklo/hi_epi8.
static const uint8_t warp_highbd_arrange_bytes[16] = {
  0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
};

// Shuffle masks for the alpha == 0 horizontal case, where all 8 output
// pixels share one 8-tap filter. Each mask broadcasts one 32-bit tap pair
// of the loaded filter register across all four lanes:
//   mask0 -> taps 0/1, mask1 -> taps 2/3, mask2 -> taps 4/5, mask3 -> taps 6/7.
static const uint8_t highbd_shuffle_alpha0_mask0[16] = {
  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
};
static const uint8_t highbd_shuffle_alpha0_mask1[16] = {
  4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7
};
static const uint8_t highbd_shuffle_alpha0_mask2[16] = {
  8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11
};
static const uint8_t highbd_shuffle_alpha0_mask3[16] = {
  12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15
};
34 
highbd_prepare_horizontal_filter_coeff(int alpha,int sx,__m128i * coeff)35 static INLINE void highbd_prepare_horizontal_filter_coeff(int alpha, int sx,
36                                                           __m128i *coeff) {
37   // Filter even-index pixels
38   const __m128i tmp_0 = _mm_loadu_si128(
39       (__m128i *)(warped_filter + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
40   const __m128i tmp_2 = _mm_loadu_si128(
41       (__m128i *)(warped_filter + ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
42   const __m128i tmp_4 = _mm_loadu_si128(
43       (__m128i *)(warped_filter + ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
44   const __m128i tmp_6 = _mm_loadu_si128(
45       (__m128i *)(warped_filter + ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));
46 
47   // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
48   const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
49   // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6
50   const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
51   // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2
52   const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
53   // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6
54   const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
55 
56   // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
57   coeff[0] = _mm_unpacklo_epi64(tmp_8, tmp_10);
58   // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
59   coeff[2] = _mm_unpackhi_epi64(tmp_8, tmp_10);
60   // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
61   coeff[4] = _mm_unpacklo_epi64(tmp_12, tmp_14);
62   // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
63   coeff[6] = _mm_unpackhi_epi64(tmp_12, tmp_14);
64 
65   // Filter odd-index pixels
66   const __m128i tmp_1 = _mm_loadu_si128(
67       (__m128i *)(warped_filter + ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
68   const __m128i tmp_3 = _mm_loadu_si128(
69       (__m128i *)(warped_filter + ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
70   const __m128i tmp_5 = _mm_loadu_si128(
71       (__m128i *)(warped_filter + ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
72   const __m128i tmp_7 = _mm_loadu_si128(
73       (__m128i *)(warped_filter + ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));
74 
75   const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
76   const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
77   const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
78   const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
79 
80   coeff[1] = _mm_unpacklo_epi64(tmp_9, tmp_11);
81   coeff[3] = _mm_unpackhi_epi64(tmp_9, tmp_11);
82   coeff[5] = _mm_unpacklo_epi64(tmp_13, tmp_15);
83   coeff[7] = _mm_unpackhi_epi64(tmp_13, tmp_15);
84 }
85 
highbd_prepare_horizontal_filter_coeff_alpha0(int sx,__m128i * coeff)86 static INLINE void highbd_prepare_horizontal_filter_coeff_alpha0(
87     int sx, __m128i *coeff) {
88   // Filter coeff
89   const __m128i tmp_0 = _mm_loadu_si128(
90       (__m128i *)(warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
91 
92   coeff[0] = _mm_shuffle_epi8(
93       tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask0));
94   coeff[2] = _mm_shuffle_epi8(
95       tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask1));
96   coeff[4] = _mm_shuffle_epi8(
97       tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask2));
98   coeff[6] = _mm_shuffle_epi8(
99       tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask3));
100 
101   coeff[1] = coeff[0];
102   coeff[3] = coeff[2];
103   coeff[5] = coeff[4];
104   coeff[7] = coeff[6];
105 }
106 
highbd_filter_src_pixels(const __m128i * src,const __m128i * src2,__m128i * tmp,__m128i * coeff,const int offset_bits_horiz,const int reduce_bits_horiz,int k)107 static INLINE void highbd_filter_src_pixels(
108     const __m128i *src, const __m128i *src2, __m128i *tmp, __m128i *coeff,
109     const int offset_bits_horiz, const int reduce_bits_horiz, int k) {
110   const __m128i src_1 = *src;
111   const __m128i src2_1 = *src2;
112 
113   const __m128i round_const = _mm_set1_epi32((1 << offset_bits_horiz) +
114                                              ((1 << reduce_bits_horiz) >> 1));
115 
116   const __m128i res_0 = _mm_madd_epi16(src_1, coeff[0]);
117   const __m128i res_2 =
118       _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 4), coeff[2]);
119   const __m128i res_4 =
120       _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 8), coeff[4]);
121   const __m128i res_6 =
122       _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 12), coeff[6]);
123 
124   __m128i res_even =
125       _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6));
126   res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const),
127                            _mm_cvtsi32_si128(reduce_bits_horiz));
128 
129   const __m128i res_1 =
130       _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 2), coeff[1]);
131   const __m128i res_3 =
132       _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 6), coeff[3]);
133   const __m128i res_5 =
134       _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 10), coeff[5]);
135   const __m128i res_7 =
136       _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 14), coeff[7]);
137 
138   __m128i res_odd =
139       _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7));
140   res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const),
141                           _mm_cvtsi32_si128(reduce_bits_horiz));
142 
143   // Combine results into one register.
144   // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7
145   // as this order helps with the vertical filter.
146   tmp[k + 7] = _mm_packs_epi32(res_even, res_odd);
147 }
148 
highbd_horiz_filter(const __m128i * src,const __m128i * src2,__m128i * tmp,int sx,int alpha,int k,const int offset_bits_horiz,const int reduce_bits_horiz)149 static INLINE void highbd_horiz_filter(const __m128i *src, const __m128i *src2,
150                                        __m128i *tmp, int sx, int alpha, int k,
151                                        const int offset_bits_horiz,
152                                        const int reduce_bits_horiz) {
153   __m128i coeff[8];
154   highbd_prepare_horizontal_filter_coeff(alpha, sx, coeff);
155   highbd_filter_src_pixels(src, src2, tmp, coeff, offset_bits_horiz,
156                            reduce_bits_horiz, k);
157 }
158 
highbd_warp_horizontal_filter_alpha0_beta0(const uint16_t * ref,__m128i * tmp,int stride,int32_t ix4,int32_t iy4,int32_t sx4,int alpha,int beta,int p_height,int height,int i,const int offset_bits_horiz,const int reduce_bits_horiz)159 static INLINE void highbd_warp_horizontal_filter_alpha0_beta0(
160     const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
161     int32_t sx4, int alpha, int beta, int p_height, int height, int i,
162     const int offset_bits_horiz, const int reduce_bits_horiz) {
163   (void)beta;
164   (void)alpha;
165   int k;
166 
167   __m128i coeff[8];
168   highbd_prepare_horizontal_filter_coeff_alpha0(sx4, coeff);
169 
170   for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
171     int iy = iy4 + k;
172     if (iy < 0)
173       iy = 0;
174     else if (iy > height - 1)
175       iy = height - 1;
176 
177     // Load source pixels
178     const __m128i src =
179         _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
180     const __m128i src2 =
181         _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
182     highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
183                              reduce_bits_horiz, k);
184   }
185 }
186 
highbd_warp_horizontal_filter_alpha0(const uint16_t * ref,__m128i * tmp,int stride,int32_t ix4,int32_t iy4,int32_t sx4,int alpha,int beta,int p_height,int height,int i,const int offset_bits_horiz,const int reduce_bits_horiz)187 static INLINE void highbd_warp_horizontal_filter_alpha0(
188     const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
189     int32_t sx4, int alpha, int beta, int p_height, int height, int i,
190     const int offset_bits_horiz, const int reduce_bits_horiz) {
191   (void)alpha;
192   int k;
193   for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
194     int iy = iy4 + k;
195     if (iy < 0)
196       iy = 0;
197     else if (iy > height - 1)
198       iy = height - 1;
199     int sx = sx4 + beta * (k + 4);
200 
201     // Load source pixels
202     const __m128i src =
203         _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
204     const __m128i src2 =
205         _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
206 
207     __m128i coeff[8];
208     highbd_prepare_horizontal_filter_coeff_alpha0(sx, coeff);
209     highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
210                              reduce_bits_horiz, k);
211   }
212 }
213 
highbd_warp_horizontal_filter_beta0(const uint16_t * ref,__m128i * tmp,int stride,int32_t ix4,int32_t iy4,int32_t sx4,int alpha,int beta,int p_height,int height,int i,const int offset_bits_horiz,const int reduce_bits_horiz)214 static INLINE void highbd_warp_horizontal_filter_beta0(
215     const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
216     int32_t sx4, int alpha, int beta, int p_height, int height, int i,
217     const int offset_bits_horiz, const int reduce_bits_horiz) {
218   (void)beta;
219   int k;
220   __m128i coeff[8];
221   highbd_prepare_horizontal_filter_coeff(alpha, sx4, coeff);
222 
223   for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
224     int iy = iy4 + k;
225     if (iy < 0)
226       iy = 0;
227     else if (iy > height - 1)
228       iy = height - 1;
229 
230     // Load source pixels
231     const __m128i src =
232         _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
233     const __m128i src2 =
234         _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
235     highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
236                              reduce_bits_horiz, k);
237   }
238 }
239 
highbd_warp_horizontal_filter(const uint16_t * ref,__m128i * tmp,int stride,int32_t ix4,int32_t iy4,int32_t sx4,int alpha,int beta,int p_height,int height,int i,const int offset_bits_horiz,const int reduce_bits_horiz)240 static INLINE void highbd_warp_horizontal_filter(
241     const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
242     int32_t sx4, int alpha, int beta, int p_height, int height, int i,
243     const int offset_bits_horiz, const int reduce_bits_horiz) {
244   int k;
245   for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
246     int iy = iy4 + k;
247     if (iy < 0)
248       iy = 0;
249     else if (iy > height - 1)
250       iy = height - 1;
251     int sx = sx4 + beta * (k + 4);
252 
253     // Load source pixels
254     const __m128i src =
255         _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
256     const __m128i src2 =
257         _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
258 
259     highbd_horiz_filter(&src, &src2, tmp, sx, alpha, k, offset_bits_horiz,
260                         reduce_bits_horiz);
261   }
262 }
263 
highbd_prepare_warp_horizontal_filter(const uint16_t * ref,__m128i * tmp,int stride,int32_t ix4,int32_t iy4,int32_t sx4,int alpha,int beta,int p_height,int height,int i,const int offset_bits_horiz,const int reduce_bits_horiz)264 static INLINE void highbd_prepare_warp_horizontal_filter(
265     const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
266     int32_t sx4, int alpha, int beta, int p_height, int height, int i,
267     const int offset_bits_horiz, const int reduce_bits_horiz) {
268   if (alpha == 0 && beta == 0)
269     highbd_warp_horizontal_filter_alpha0_beta0(
270         ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
271         offset_bits_horiz, reduce_bits_horiz);
272 
273   else if (alpha == 0 && beta != 0)
274     highbd_warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha,
275                                          beta, p_height, height, i,
276                                          offset_bits_horiz, reduce_bits_horiz);
277 
278   else if (alpha != 0 && beta == 0)
279     highbd_warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha,
280                                         beta, p_height, height, i,
281                                         offset_bits_horiz, reduce_bits_horiz);
282   else
283     highbd_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
284                                   p_height, height, i, offset_bits_horiz,
285                                   reduce_bits_horiz);
286 }
287 
// SSE4.1 high-bitdepth affine warp.
// Applies the affine model 'mat' to the reference frame 'ref' and produces
// the warped prediction block 'pred' (p_width x p_height at p_col/p_row),
// or, when conv_params->is_compound, intermediate values in
// conv_params->dst (with optional distance-weighted averaging against the
// other prediction already in pred).
//
// The block is processed in 8x8 tiles. For each tile:
//   1. A horizontal 8-tap pass filters 15 source rows into 'tmp'
//      (int16 columns, stored in 0 2 4 6 1 3 5 7 order).
//   2. A vertical 8-tap pass combines 8 of those rows per output row,
//      rounds, clamps to [0, 2^bd - 1] (or applies the compound offsets),
//      and stores the result.
// alpha/beta are the horizontal filter deltas (per pixel / per row);
// gamma/delta are the vertical ones (per pixel / per row).
void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
                                   int width, int height, int stride,
                                   uint16_t *pred, int p_col, int p_row,
                                   int p_width, int p_height, int p_stride,
                                   int subsampling_x, int subsampling_y, int bd,
                                   ConvolveParams *conv_params, int16_t alpha,
                                   int16_t beta, int16_t gamma, int16_t delta) {
  // Horizontal-pass output: 15 rows (8 output rows need 15 source rows for
  // the 8-tap vertical filter), 8 int16 columns per row.
  __m128i tmp[15];
  int i, j, k;
  // Extra horizontal shift at high bit depth so the int16 intermediates in
  // 'tmp' cannot overflow (kicks in when bd + FILTER_BITS - round_0 > 14).
  const int reduce_bits_horiz =
      conv_params->round_0 +
      AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0);
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
  assert(!(bd == 12 && reduce_bits_horiz < 5));
  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));

  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const __m128i clip_pixel =
      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert);
  const __m128i reduce_bits_vert_const =
      _mm_set1_epi32(((1 << reduce_bits_vert) >> 1));
  const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert);
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  // Removes the compound offsets before the final rounding in the
  // do_average path.
  const __m128i res_sub_const =
      _mm_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) -
                     (1 << (offset_bits - conv_params->round_1 - 1)));
  __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
  __m128i round_bits_const = _mm_set1_epi32(((1 << round_bits) >> 1));

  // Distance weights for jnt (distance-weighted) compound averaging.
  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m128i wt0 = _mm_set1_epi32(w0);
  const __m128i wt1 = _mm_set1_epi32(w1);

  /* Note: For this code to work, the left/right frame borders need to be
  extended by at least 13 pixels each. By the time we get here, other
  code will have set up this border, but we allow an explicit check
  for debugging purposes.
  */
  /*for (i = 0; i < height; ++i) {
  for (j = 0; j < 13; ++j) {
  assert(ref[i * stride - 13 + j] == ref[i * stride]);
  assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
  }
  }*/

  for (i = 0; i < p_height; i += 8) {
    for (j = 0; j < p_width; j += 8) {
      // Map the center of this 8x8 tile through the affine model to find
      // the source position (integer part ix4/iy4, fractional sx4/sy4).
      const int32_t src_x = (p_col + j + 4) << subsampling_x;
      const int32_t src_y = (p_row + i + 4) << subsampling_y;
      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
      const int32_t x4 = dst_x >> subsampling_x;
      const int32_t y4 = dst_y >> subsampling_y;

      int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      // Add in all the constant terms, including rounding and offset
      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);

      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      // If the block is aligned such that, after clamping, every sample
      // would be taken from the leftmost/rightmost column, then we can
      // skip the expensive horizontal filter.
      if (ix4 <= -7) {
        // Entire window clamps to the leftmost column: the filtered value
        // is just that pixel times the filter's DC gain, plus the
        // horizontal offset, so compute it directly.
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] = _mm_set1_epi16(
              (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
              ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)));
        }
      } else if (ix4 >= width + 6) {
        // Entire window clamps to the rightmost column; same shortcut.
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] =
              _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
                             ref[iy * stride + (width - 1)] *
                                 (1 << (FILTER_BITS - reduce_bits_horiz)));
        }
      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
        // Window partially overlaps the left/right frame edge: load, then
        // replicate the edge pixels into the out-of-bounds lanes before
        // filtering.
        const int out_of_boundary_left = -(ix4 - 6);
        const int out_of_boundary_right = (ix4 + 8) - width;

        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          int sx = sx4 + beta * (k + 4);

          // Load source pixels
          const __m128i src =
              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
          const __m128i src2 =
              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));

          // Split each uint16 into low/high bytes so the 8-bit padding
          // shuffles (warp_pad_left/right) can be reused on 16-bit data.
          const __m128i src_01 = _mm_shuffle_epi8(
              src, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes));
          const __m128i src2_01 = _mm_shuffle_epi8(
              src2, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes));

          __m128i src_lo = _mm_unpacklo_epi64(src_01, src2_01);
          __m128i src_hi = _mm_unpackhi_epi64(src_01, src2_01);

          if (out_of_boundary_left >= 0) {
            const __m128i shuffle_reg_left =
                _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
            src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_left);
            src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_left);
          }

          if (out_of_boundary_right >= 0) {
            const __m128i shuffle_reg_right = _mm_loadu_si128(
                (__m128i *)warp_pad_right[out_of_boundary_right]);
            src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_right);
            src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_right);
          }

          // Re-interleave low/high bytes back into uint16 pixels.
          const __m128i src_padded = _mm_unpacklo_epi8(src_lo, src_hi);
          const __m128i src2_padded = _mm_unpackhi_epi8(src_lo, src_hi);

          highbd_horiz_filter(&src_padded, &src2_padded, tmp, sx, alpha, k,
                              offset_bits_horiz, reduce_bits_horiz);
        }
      } else {
        // Fully in-bounds: dispatch to the appropriate specialization.
        highbd_prepare_warp_horizontal_filter(
            ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
            offset_bits_horiz, reduce_bits_horiz);
      }

      // Vertical filter
      for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
        int sy = sy4 + delta * (k + 4);

        // Load from tmp and rearrange pairs of consecutive rows into the
        // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
        const __m128i *src = tmp + (k + 4);
        const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
        const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
        const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
        const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);

        // Filter even-index pixels
        const __m128i tmp_0 = _mm_loadu_si128(
            (__m128i *)(warped_filter +
                        ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_2 = _mm_loadu_si128(
            (__m128i *)(warped_filter +
                        ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_4 = _mm_loadu_si128(
            (__m128i *)(warped_filter +
                        ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_6 = _mm_loadu_si128(
            (__m128i *)(warped_filter +
                        ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));

        // Transpose the filters into tap-pair registers (same scheme as the
        // horizontal coefficient prep).
        const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
        const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
        const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
        const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);

        const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
        const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
        const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
        const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);

        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);

        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
                                               _mm_add_epi32(res_4, res_6));

        // Filter odd-index pixels
        const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
        const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
        const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
        const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);

        const __m128i tmp_1 = _mm_loadu_si128(
            (__m128i *)(warped_filter +
                        ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_3 = _mm_loadu_si128(
            (__m128i *)(warped_filter +
                        ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_5 = _mm_loadu_si128(
            (__m128i *)(warped_filter +
                        ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_7 = _mm_loadu_si128(
            (__m128i *)(warped_filter +
                        ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));

        const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
        const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
        const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
        const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);

        const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
        const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
        const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
        const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);

        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);

        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
                                              _mm_add_epi32(res_5, res_7));

        // Rearrange pixels back into the order 0 ... 7
        __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
        __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);

        if (conv_params->is_compound) {
          // Compound path: write 16-bit intermediates to conv_params->dst,
          // optionally averaging (equal or jnt-weighted) with the values
          // already there, clipping, and storing the average to pred.
          __m128i *const p =
              (__m128i *)&conv_params
                  ->dst[(i + k + 4) * conv_params->dst_stride + j];
          res_lo = _mm_add_epi32(res_lo, res_add_const);
          res_lo = _mm_sra_epi32(_mm_add_epi32(res_lo, reduce_bits_vert_const),
                                 reduce_bits_vert_shift);

          if (conv_params->do_average) {
            __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
            __m128i p_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p));

            if (conv_params->use_jnt_comp_avg) {
              // Distance-weighted average: (p * w0 + res * w1) >> bits.
              res_lo = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0),
                                     _mm_mullo_epi32(res_lo, wt1));
              res_lo = _mm_srai_epi32(res_lo, DIST_PRECISION_BITS);
            } else {
              res_lo = _mm_srai_epi32(_mm_add_epi32(p_32, res_lo), 1);
            }

            // Remove the compound offsets, final round, pack and clip.
            __m128i res32_lo = _mm_add_epi32(res_lo, res_sub_const);
            res32_lo = _mm_sra_epi32(_mm_add_epi32(res32_lo, round_bits_const),
                                     round_bits_shift);

            __m128i res16_lo = _mm_packus_epi32(res32_lo, res32_lo);
            res16_lo = _mm_min_epi16(res16_lo, clip_pixel);
            _mm_storel_epi64(dst16, res16_lo);
          } else {
            res_lo = _mm_packus_epi32(res_lo, res_lo);
            _mm_storel_epi64(p, res_lo);
          }
          if (p_width > 4) {
            // Same processing for the upper four pixels of the row.
            __m128i *const p4 =
                (__m128i *)&conv_params
                    ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];

            res_hi = _mm_add_epi32(res_hi, res_add_const);
            res_hi =
                _mm_sra_epi32(_mm_add_epi32(res_hi, reduce_bits_vert_const),
                              reduce_bits_vert_shift);
            if (conv_params->do_average) {
              __m128i *const dst16_4 =
                  (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
              __m128i p4_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p4));

              if (conv_params->use_jnt_comp_avg) {
                res_hi = _mm_add_epi32(_mm_mullo_epi32(p4_32, wt0),
                                       _mm_mullo_epi32(res_hi, wt1));
                res_hi = _mm_srai_epi32(res_hi, DIST_PRECISION_BITS);
              } else {
                res_hi = _mm_srai_epi32(_mm_add_epi32(p4_32, res_hi), 1);
              }

              __m128i res32_hi = _mm_add_epi32(res_hi, res_sub_const);
              res32_hi = _mm_sra_epi32(
                  _mm_add_epi32(res32_hi, round_bits_const), round_bits_shift);
              __m128i res16_hi = _mm_packus_epi32(res32_hi, res32_hi);
              res16_hi = _mm_min_epi16(res16_hi, clip_pixel);
              _mm_storel_epi64(dst16_4, res16_hi);
            } else {
              res_hi = _mm_packus_epi32(res_hi, res_hi);
              _mm_storel_epi64(p4, res_hi);
            }
          }
        } else {
          // Round and pack into 8 bits
          const __m128i round_const =
              _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
                             ((1 << reduce_bits_vert) >> 1));

          const __m128i res_lo_round = _mm_srai_epi32(
              _mm_add_epi32(res_lo, round_const), reduce_bits_vert);
          const __m128i res_hi_round = _mm_srai_epi32(
              _mm_add_epi32(res_hi, round_const), reduce_bits_vert);

          __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
          // Clamp res_16bit to the range [0, 2^bd - 1]
          const __m128i max_val = _mm_set1_epi16((1 << bd) - 1);
          const __m128i zero = _mm_setzero_si128();
          res_16bit = _mm_max_epi16(_mm_min_epi16(res_16bit, max_val), zero);

          // Store, blending with 'pred' if needed
          __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];

          // Note: If we're outputting a 4x4 block, we need to be very careful
          // to only output 4 pixels at this point, to avoid encode/decode
          // mismatches when encoding with multiple threads.
          if (p_width == 4) {
            _mm_storel_epi64(p, res_16bit);
          } else {
            _mm_storeu_si128(p, res_16bit);
          }
        }
      }
    }
  }
}
625