1 /*
2  *
3  * Copyright (c) 2018, Alliance for Open Media. All rights reserved
4  *
5  * This source code is subject to the terms of the BSD 2 Clause License and
6  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
7  * was not distributed with this source code in the LICENSE file, you can
8  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
9  * Media Patent License 1.0 was not distributed with this source code in the
10  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
11  */
12 
13 #include <assert.h>
14 #include <arm_neon.h>
15 
16 #include "config/av1_rtcd.h"
17 
18 #include "aom_dsp/aom_dsp_common.h"
19 #include "aom_ports/mem.h"
20 #include "av1/common/convolve.h"
21 #include "av1/common/filter.h"
22 #include "av1/common/arm/convolve_neon.h"
23 #include "av1/common/arm/mem_neon.h"
24 #include "av1/common/arm/transpose_neon.h"
25 
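// Apply the 8-tap filter to four columns of 16-bit samples. The two largest
// (centre) taps use saturating adds since their partial products can overflow
// int16_t; the caller applies the rounding shifts.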
26 static INLINE int16x4_t convolve8_4x4(const int16x4_t s0, const int16x4_t s1,
27                                       const int16x4_t s2, const int16x4_t s3,
28                                       const int16x4_t s4, const int16x4_t s5,
29                                       const int16x4_t s6, const int16x4_t s7,
30                                       const int16_t *filter) {
31   int16x4_t sum;
32 
33   sum = vmul_n_s16(s0, filter[0]);
34   sum = vmla_n_s16(sum, s1, filter[1]);
35   sum = vmla_n_s16(sum, s2, filter[2]);
36   sum = vmla_n_s16(sum, s5, filter[5]);
37   sum = vmla_n_s16(sum, s6, filter[6]);
38   sum = vmla_n_s16(sum, s7, filter[7]);
39   /* filter[3] can take a max value of 128, so 128 * 255 = 32640 plus the
40    * remaining partial products can exceed the int16_t range; use saturating
41    * adds for the two centre taps. */
42   sum = vqadd_s16(sum, vmul_n_s16(s3, filter[3]));
43   sum = vqadd_s16(sum, vmul_n_s16(s4, filter[4]));
44 
45   return sum;
46 }
47 
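// Filter eight columns, apply the two rounding shifts (round_0 followed by
// FILTER_BITS - round_0) and narrow to u8 with saturation.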
48 static INLINE uint8x8_t convolve8_horiz_8x8(
49     const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
50     const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
51     const int16x8_t s6, const int16x8_t s7, const int16_t *filter,
52     const int16x8_t shift_round_0, const int16x8_t shift_by_bits) {
53   int16x8_t sum;
54 
55   sum = vmulq_n_s16(s0, filter[0]);
56   sum = vmlaq_n_s16(sum, s1, filter[1]);
57   sum = vmlaq_n_s16(sum, s2, filter[2]);
58   sum = vmlaq_n_s16(sum, s5, filter[5]);
59   sum = vmlaq_n_s16(sum, s6, filter[6]);
60   sum = vmlaq_n_s16(sum, s7, filter[7]);
61   /* filter[3] can take a max value of 128, so 128 * 255 = 32640 plus the
62    * remaining partial products can exceed the int16_t range; use saturating
63    * adds for the two centre taps. */
64   sum = vqaddq_s16(sum, vmulq_n_s16(s3, filter[3]));
65   sum = vqaddq_s16(sum, vmulq_n_s16(s4, filter[4]));
66 
67   sum = vqrshlq_s16(sum, shift_round_0);
68   sum = vqrshlq_s16(sum, shift_by_bits);
69 
70   return vqmovun_s16(sum);
71 }
72 
73 #if !defined(__aarch64__)
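// Non-AArch64 path: filter a single row of four pixels; the four results are
// duplicated into both halves of the returned vector.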
74 static INLINE uint8x8_t convolve8_horiz_4x1(
75     const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
76     const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
77     const int16x4_t s6, const int16x4_t s7, const int16_t *filter,
78     const int16x4_t shift_round_0, const int16x4_t shift_by_bits) {
79   int16x4_t sum;
80 
81   sum = vmul_n_s16(s0, filter[0]);
82   sum = vmla_n_s16(sum, s1, filter[1]);
83   sum = vmla_n_s16(sum, s2, filter[2]);
84   sum = vmla_n_s16(sum, s5, filter[5]);
85   sum = vmla_n_s16(sum, s6, filter[6]);
86   sum = vmla_n_s16(sum, s7, filter[7]);
87   /* filter[3] can take a max value of 128, so 128 * 255 = 32640 plus the
88    * remaining partial products can exceed the int16_t range; use saturating
89    * adds for the two centre taps. */
90   sum = vqadd_s16(sum, vmul_n_s16(s3, filter[3]));
91   sum = vqadd_s16(sum, vmul_n_s16(s4, filter[4]));
92 
93   sum = vqrshl_s16(sum, shift_round_0);
94   sum = vqrshl_s16(sum, shift_by_bits);
95 
96   return vqmovun_s16(vcombine_s16(sum, sum));
97 }
98 #endif  // !defined(__aarch64__)
99 
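// Filter eight columns for the vertical 1D convolution; the rounding shift by
// FILTER_BITS is folded into the saturating narrow (vqrshrun_n_s16).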
100 static INLINE uint8x8_t convolve8_vert_8x4(
101     const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
102     const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
103     const int16x8_t s6, const int16x8_t s7, const int16_t *filter) {
104   int16x8_t sum;
105 
106   sum = vmulq_n_s16(s0, filter[0]);
107   sum = vmlaq_n_s16(sum, s1, filter[1]);
108   sum = vmlaq_n_s16(sum, s2, filter[2]);
109   sum = vmlaq_n_s16(sum, s5, filter[5]);
110   sum = vmlaq_n_s16(sum, s6, filter[6]);
111   sum = vmlaq_n_s16(sum, s7, filter[7]);
112   /* filter[3] can take a max value of 128, so 128 * 255 = 32640 plus the
113    * remaining partial products can exceed the int16_t range; use saturating
114    * adds for the two centre taps. */
115   sum = vqaddq_s16(sum, vmulq_n_s16(s3, filter[3]));
116   sum = vqaddq_s16(sum, vmulq_n_s16(s4, filter[4]));
117 
118   return vqrshrun_n_s16(sum, FILTER_BITS);
119 }
120 
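// Vertical pass of the 2D convolution for four columns: the 16-bit
// intermediates from the horizontal pass are accumulated in 32 bits, offset
// by offset_const, shifted by round_shift_vec, reduced by sub_const_vec and
// clamped to be non-negative before narrowing to u16.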
121 static INLINE uint16x4_t convolve8_vert_4x4_s32(
122     const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
123     const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
124     const int16x4_t s6, const int16x4_t s7, const int16_t *y_filter,
125     const int32x4_t round_shift_vec, const int32x4_t offset_const,
126     const int32x4_t sub_const_vec) {
127   int32x4_t sum0;
128   uint16x4_t res;
129   const int32x4_t zero = vdupq_n_s32(0);
130 
131   sum0 = vmull_n_s16(s0, y_filter[0]);
132   sum0 = vmlal_n_s16(sum0, s1, y_filter[1]);
133   sum0 = vmlal_n_s16(sum0, s2, y_filter[2]);
134   sum0 = vmlal_n_s16(sum0, s3, y_filter[3]);
135   sum0 = vmlal_n_s16(sum0, s4, y_filter[4]);
136   sum0 = vmlal_n_s16(sum0, s5, y_filter[5]);
137   sum0 = vmlal_n_s16(sum0, s6, y_filter[6]);
138   sum0 = vmlal_n_s16(sum0, s7, y_filter[7]);
139 
140   sum0 = vaddq_s32(sum0, offset_const);
141   sum0 = vqrshlq_s32(sum0, round_shift_vec);
142   sum0 = vsubq_s32(sum0, sub_const_vec);
143   sum0 = vmaxq_s32(sum0, zero);
144 
145   res = vmovn_u32(vreinterpretq_u32_s32(sum0));
146 
147   return res;
148 }
149 
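// As convolve8_vert_4x4_s32, but for eight columns; additionally applies the
// final round_bits shift and narrows the result to u8.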
150 static INLINE uint8x8_t convolve8_vert_8x4_s32(
151     const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
152     const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
153     const int16x8_t s6, const int16x8_t s7, const int16_t *y_filter,
154     const int32x4_t round_shift_vec, const int32x4_t offset_const,
155     const int32x4_t sub_const_vec, const int16x8_t vec_round_bits) {
156   int32x4_t sum0, sum1;
157   uint16x8_t res;
158   const int32x4_t zero = vdupq_n_s32(0);
159 
160   sum0 = vmull_n_s16(vget_low_s16(s0), y_filter[0]);
161   sum0 = vmlal_n_s16(sum0, vget_low_s16(s1), y_filter[1]);
162   sum0 = vmlal_n_s16(sum0, vget_low_s16(s2), y_filter[2]);
163   sum0 = vmlal_n_s16(sum0, vget_low_s16(s3), y_filter[3]);
164   sum0 = vmlal_n_s16(sum0, vget_low_s16(s4), y_filter[4]);
165   sum0 = vmlal_n_s16(sum0, vget_low_s16(s5), y_filter[5]);
166   sum0 = vmlal_n_s16(sum0, vget_low_s16(s6), y_filter[6]);
167   sum0 = vmlal_n_s16(sum0, vget_low_s16(s7), y_filter[7]);
168 
169   sum1 = vmull_n_s16(vget_high_s16(s0), y_filter[0]);
170   sum1 = vmlal_n_s16(sum1, vget_high_s16(s1), y_filter[1]);
171   sum1 = vmlal_n_s16(sum1, vget_high_s16(s2), y_filter[2]);
172   sum1 = vmlal_n_s16(sum1, vget_high_s16(s3), y_filter[3]);
173   sum1 = vmlal_n_s16(sum1, vget_high_s16(s4), y_filter[4]);
174   sum1 = vmlal_n_s16(sum1, vget_high_s16(s5), y_filter[5]);
175   sum1 = vmlal_n_s16(sum1, vget_high_s16(s6), y_filter[6]);
176   sum1 = vmlal_n_s16(sum1, vget_high_s16(s7), y_filter[7]);
177 
178   sum0 = vaddq_s32(sum0, offset_const);
179   sum1 = vaddq_s32(sum1, offset_const);
180   sum0 = vqrshlq_s32(sum0, round_shift_vec);
181   sum1 = vqrshlq_s32(sum1, round_shift_vec);
182   sum0 = vsubq_s32(sum0, sub_const_vec);
183   sum1 = vsubq_s32(sum1, sub_const_vec);
184   sum0 = vmaxq_s32(sum0, zero);
185   sum1 = vmaxq_s32(sum1, zero);
186   res = vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(sum0)),
187                      vqmovn_u32(vreinterpretq_u32_s32(sum1)));
188 
189   res = vqrshlq_u16(res, vec_round_bits);
190 
191   return vqmovn_u16(res);
192 }
193 
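// Horizontal-only single-reference convolution. A rough scalar sketch of the
// per-pixel arithmetic (assuming an 8-tap kernel; a sketch, not the exact C
// reference implementation):
//
//   int32_t sum = 0;
//   for (int k = 0; k < 8; ++k) sum += x_filter[k] * src[x - 3 + k];
//   sum = ROUND_POWER_OF_TWO(sum, conv_params->round_0);
//   dst[x] = clip_pixel(
//       ROUND_POWER_OF_TWO(sum, FILTER_BITS - conv_params->round_0));
//
// Filters with more than 8 taps fall back to the C implementation.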
194 void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
195                             int dst_stride, int w, int h,
196                             const InterpFilterParams *filter_params_x,
197                             const int subpel_x_qn,
198                             ConvolveParams *conv_params) {
199   if (filter_params_x->taps > 8) {
200     av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
201                         subpel_x_qn, conv_params);
202     return;
203   }
204   const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
205   const int8_t bits = FILTER_BITS - conv_params->round_0;
206 
207   uint8x8_t t0;
208 #if defined(__aarch64__)
209   uint8x8_t t1, t2, t3;
210 #endif
211 
212   assert(bits >= 0);
213   assert((FILTER_BITS - conv_params->round_1) >= 0 ||
214          ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
215 
216   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
217       filter_params_x, subpel_x_qn & SUBPEL_MASK);
218 
219   const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0);
220   const int16x8_t shift_by_bits = vdupq_n_s16(-bits);
221 
222   src -= horiz_offset;
223 #if defined(__aarch64__)
224   if (h == 4) {
225     uint8x8_t d01, d23;
226     int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
227     int16x8_t d01_temp, d23_temp;
228 
229     __builtin_prefetch(src + 0 * src_stride);
230     __builtin_prefetch(src + 1 * src_stride);
231     __builtin_prefetch(src + 2 * src_stride);
232     __builtin_prefetch(src + 3 * src_stride);
233 
234     load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
235     transpose_u8_8x4(&t0, &t1, &t2, &t3);
236 
237     s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
238     s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
239     s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
240     s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
241     s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
242     s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
243     s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
244     __builtin_prefetch(dst + 0 * dst_stride);
245     __builtin_prefetch(dst + 1 * dst_stride);
246     __builtin_prefetch(dst + 2 * dst_stride);
247     __builtin_prefetch(dst + 3 * dst_stride);
248     src += 7;
249 
250     do {
251       load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
252       transpose_u8_8x4(&t0, &t1, &t2, &t3);
253 
254       s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
255       s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
256       s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
257       s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
258 
259       d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, x_filter);
260 
261       d1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, x_filter);
262 
263       d2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, x_filter);
264 
265       d3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, x_filter);
266 
267       d01_temp = vqrshlq_s16(vcombine_s16(d0, d1), shift_round_0);
268       d23_temp = vqrshlq_s16(vcombine_s16(d2, d3), shift_round_0);
269 
270       d01_temp = vqrshlq_s16(d01_temp, shift_by_bits);
271       d23_temp = vqrshlq_s16(d23_temp, shift_by_bits);
272 
273       d01 = vqmovun_s16(d01_temp);
274       d23 = vqmovun_s16(d23_temp);
275 
276       transpose_u8_4x4(&d01, &d23);
277 
278       if (w != 2) {
279         vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride),  // 00 01 02 03
280                       vreinterpret_u32_u8(d01), 0);
281         vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride),  // 10 11 12 13
282                       vreinterpret_u32_u8(d23), 0);
283         vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride),  // 20 21 22 23
284                       vreinterpret_u32_u8(d01), 1);
285         vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride),  // 30 31 32 33
286                       vreinterpret_u32_u8(d23), 1);
287       } else {
288         vst1_lane_u16((uint16_t *)(dst + 0 * dst_stride),  // 00 01
289                       vreinterpret_u16_u8(d01), 0);
290         vst1_lane_u16((uint16_t *)(dst + 1 * dst_stride),  // 10 11
291                       vreinterpret_u16_u8(d23), 0);
292         vst1_lane_u16((uint16_t *)(dst + 2 * dst_stride),  // 20 21
293                       vreinterpret_u16_u8(d01), 2);
294         vst1_lane_u16((uint16_t *)(dst + 3 * dst_stride),  // 30 31
295                       vreinterpret_u16_u8(d23), 2);
296       }
297 
298       s0 = s4;
299       s1 = s5;
300       s2 = s6;
301       s3 = s7;
302       s4 = s8;
303       s5 = s9;
304       s6 = s10;
305       src += 4;
306       dst += 4;
307       w -= 4;
308     } while (w > 0);
309   } else {
310 #endif
311     int width;
312     const uint8_t *s;
313     int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
314 
315 #if defined(__aarch64__)
316     int16x8_t s8, s9, s10;
317     uint8x8_t t4, t5, t6, t7;
318 #endif
319 
320     if (w <= 4) {
321 #if defined(__aarch64__)
322       do {
323         load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
324         transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
325         s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
326         s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
327         s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
328         s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
329         s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
330         s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
331         s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
332 
333         load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
334                     &t7);
335         src += 8 * src_stride;
336         __builtin_prefetch(dst + 0 * dst_stride);
337         __builtin_prefetch(dst + 1 * dst_stride);
338         __builtin_prefetch(dst + 2 * dst_stride);
339         __builtin_prefetch(dst + 3 * dst_stride);
340         __builtin_prefetch(dst + 4 * dst_stride);
341         __builtin_prefetch(dst + 5 * dst_stride);
342         __builtin_prefetch(dst + 6 * dst_stride);
343         __builtin_prefetch(dst + 7 * dst_stride);
344 
345         transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
346 
347         s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
348         s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
349         s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
350         s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
351 
352         __builtin_prefetch(src + 0 * src_stride);
353         __builtin_prefetch(src + 1 * src_stride);
354         __builtin_prefetch(src + 2 * src_stride);
355         __builtin_prefetch(src + 3 * src_stride);
356         __builtin_prefetch(src + 4 * src_stride);
357         __builtin_prefetch(src + 5 * src_stride);
358         __builtin_prefetch(src + 6 * src_stride);
359         __builtin_prefetch(src + 7 * src_stride);
360         t0 = convolve8_horiz_8x8(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
361                                  shift_round_0, shift_by_bits);
362         t1 = convolve8_horiz_8x8(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
363                                  shift_round_0, shift_by_bits);
364         t2 = convolve8_horiz_8x8(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
365                                  shift_round_0, shift_by_bits);
366         t3 = convolve8_horiz_8x8(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
367                                  shift_round_0, shift_by_bits);
368 
369         transpose_u8_8x4(&t0, &t1, &t2, &t3);
370 
371         if ((w == 4) && (h > 4)) {
372           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
373                         0);  // 00 01 02 03
374           dst += dst_stride;
375           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1),
376                         0);  // 10 11 12 13
377           dst += dst_stride;
378           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2),
379                         0);  // 20 21 22 23
380           dst += dst_stride;
381           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3),
382                         0);  // 30 31 32 33
383           dst += dst_stride;
384           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
385                         1);  // 40 41 42 43
386           dst += dst_stride;
387           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1),
388                         1);  // 50 51 52 53
389           dst += dst_stride;
390           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2),
391                         1);  // 60 61 62 63
392           dst += dst_stride;
393           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3),
394                         1);  // 70 71 72 73
395           dst += dst_stride;
396         } else if ((w == 4) && (h == 2)) {
397           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
398                         0);  // 00 01 02 03
399           dst += dst_stride;
400           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1),
401                         0);  // 10 11 12 13
402           dst += dst_stride;
403         } else if ((w == 2) && (h > 4)) {
404           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0),
405                         0);  // 00 01
406           dst += dst_stride;
407           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1),
408                         0);  // 10 11
409           dst += dst_stride;
410           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t2),
411                         0);  // 20 21
412           dst += dst_stride;
413           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t3),
414                         0);  // 30 31
415           dst += dst_stride;
416           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0),
417                         2);  // 40 41
418           dst += dst_stride;
419           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1),
420                         2);  // 50 51
421           dst += dst_stride;
422           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t2),
423                         2);  // 60 61
424           dst += dst_stride;
425           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t3),
426                         2);  // 70 71
427           dst += dst_stride;
428         } else if ((w == 2) && (h == 2)) {
429           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0),
430                         0);  // 00 01
431           dst += dst_stride;
432           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1),
433                         0);  // 10 11
434           dst += dst_stride;
435         }
436         h -= 8;
437       } while (h > 0);
438 #else
439     int16x8_t tt0;
440     int16x4_t x0, x1, x2, x3, x4, x5, x6, x7;
441     const int16x4_t shift_round_0_low = vget_low_s16(shift_round_0);
442     const int16x4_t shift_by_bits_low = vget_low_s16(shift_by_bits);
443     do {
444       t0 = vld1_u8(src);  // a0 a1 a2 a3 a4 a5 a6 a7
445       tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
446       x0 = vget_low_s16(tt0);   // a0 a1 a2 a3
447       x4 = vget_high_s16(tt0);  // a4 a5 a6 a7
448 
449       t0 = vld1_u8(src + 8);  // a8 a9 a10 a11 a12 a13 a14 a15
450       tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
451       x7 = vget_low_s16(tt0);  // a8 a9 a10 a11
452 
453       x1 = vext_s16(x0, x4, 1);  // a1 a2 a3 a4
454       x2 = vext_s16(x0, x4, 2);  // a2 a3 a4 a5
455       x3 = vext_s16(x0, x4, 3);  // a3 a4 a5 a6
456       x5 = vext_s16(x4, x7, 1);  // a5 a6 a7 a8
457       x6 = vext_s16(x4, x7, 2);  // a6 a7 a8 a9
458       x7 = vext_s16(x4, x7, 3);  // a7 a8 a9 a10
459 
460       src += src_stride;
461 
462       t0 = convolve8_horiz_4x1(x0, x1, x2, x3, x4, x5, x6, x7, x_filter,
463                                shift_round_0_low, shift_by_bits_low);
464 
465       if (w == 4) {
466         vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
467                       0);  // 00 01 02 03
468         dst += dst_stride;
469       } else if (w == 2) {
470         vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0);  // 00 01
471         dst += dst_stride;
472       }
473       h -= 1;
474     } while (h > 0);
475 #endif
476     } else {
477       uint8_t *d;
478       int16x8_t s11;
479 #if defined(__aarch64__)
480       int16x8_t s12, s13, s14;
481       do {
482         __builtin_prefetch(src + 0 * src_stride);
483         __builtin_prefetch(src + 1 * src_stride);
484         __builtin_prefetch(src + 2 * src_stride);
485         __builtin_prefetch(src + 3 * src_stride);
486         __builtin_prefetch(src + 4 * src_stride);
487         __builtin_prefetch(src + 5 * src_stride);
488         __builtin_prefetch(src + 6 * src_stride);
489         __builtin_prefetch(src + 7 * src_stride);
490         load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
491         transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
492         s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
493         s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
494         s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
495         s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
496         s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
497         s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
498         s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
499 
500         width = w;
501         s = src + 7;
502         d = dst;
503         __builtin_prefetch(dst + 0 * dst_stride);
504         __builtin_prefetch(dst + 1 * dst_stride);
505         __builtin_prefetch(dst + 2 * dst_stride);
506         __builtin_prefetch(dst + 3 * dst_stride);
507         __builtin_prefetch(dst + 4 * dst_stride);
508         __builtin_prefetch(dst + 5 * dst_stride);
509         __builtin_prefetch(dst + 6 * dst_stride);
510         __builtin_prefetch(dst + 7 * dst_stride);
511 
512         do {
513           load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
514           transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
515           s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
516           s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
517           s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
518           s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
519           s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
520           s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
521           s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
522           s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
523 
524           t0 = convolve8_horiz_8x8(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
525                                    shift_round_0, shift_by_bits);
526 
527           t1 = convolve8_horiz_8x8(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
528                                    shift_round_0, shift_by_bits);
529 
530           t2 = convolve8_horiz_8x8(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
531                                    shift_round_0, shift_by_bits);
532 
533           t3 = convolve8_horiz_8x8(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
534                                    shift_round_0, shift_by_bits);
535 
536           t4 = convolve8_horiz_8x8(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
537                                    shift_round_0, shift_by_bits);
538 
539           t5 = convolve8_horiz_8x8(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
540                                    shift_round_0, shift_by_bits);
541 
542           t6 = convolve8_horiz_8x8(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
543                                    shift_round_0, shift_by_bits);
544 
545           t7 = convolve8_horiz_8x8(s7, s8, s9, s10, s11, s12, s13, s14,
546                                    x_filter, shift_round_0, shift_by_bits);
547 
548           transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
549           if (h != 2) {
550             store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7);
551           } else {
552             store_row2_u8_8x8(d, dst_stride, t0, t1);
553           }
554           s0 = s8;
555           s1 = s9;
556           s2 = s10;
557           s3 = s11;
558           s4 = s12;
559           s5 = s13;
560           s6 = s14;
561           s += 8;
562           d += 8;
563           width -= 8;
564         } while (width > 0);
565         src += 8 * src_stride;
566         dst += 8 * dst_stride;
567         h -= 8;
568       } while (h > 0);
569 #else
570     do {
571       t0 = vld1_u8(src);  // a0 a1 a2 a3 a4 a5 a6 a7
572       s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
573 
574       width = w;
575       s = src + 8;
576       d = dst;
577       __builtin_prefetch(dst);
578 
579       do {
580         t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
581         s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
582         s11 = s0;
583         s0 = s7;
584 
585         s1 = vextq_s16(s11, s7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
586         s2 = vextq_s16(s11, s7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
587         s3 = vextq_s16(s11, s7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
588         s4 = vextq_s16(s11, s7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
589         s5 = vextq_s16(s11, s7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
590         s6 = vextq_s16(s11, s7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
591         s7 = vextq_s16(s11, s7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
592 
593         t0 = convolve8_horiz_8x8(s11, s1, s2, s3, s4, s5, s6, s7, x_filter,
594                                  shift_round_0, shift_by_bits);
595         vst1_u8(d, t0);
596 
597         s += 8;
598         d += 8;
599         width -= 8;
600       } while (width > 0);
601       src += src_stride;
602       dst += dst_stride;
603       h -= 1;
604     } while (h > 0);
605 #endif
606     }
607 #if defined(__aarch64__)
608   }
609 #endif
610 }
611 
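// Vertical-only single-reference convolution: an 8-tap filter applied down
// each column followed by a single rounding shift, roughly
// dst[y] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)) per pixel.
// Filters with more than 8 taps fall back to the C implementation.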
612 void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
613                             int dst_stride, int w, int h,
614                             const InterpFilterParams *filter_params_y,
615                             const int subpel_y_qn) {
616   if (filter_params_y->taps > 8) {
617     av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_y,
618                         subpel_y_qn);
619     return;
620   }
621   const int vert_offset = filter_params_y->taps / 2 - 1;
622 
623   src -= vert_offset * src_stride;
624 
625   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
626       filter_params_y, subpel_y_qn & SUBPEL_MASK);
627 
628   if (w <= 4) {
629     uint8x8_t d01;
630     int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
631 #if defined(__aarch64__)
632     uint8x8_t d23;
633     int16x4_t s8, s9, s10, d1, d2, d3;
634 #endif
635     s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
636     src += src_stride;
637     s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
638     src += src_stride;
639     s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
640     src += src_stride;
641     s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
642     src += src_stride;
643     s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
644     src += src_stride;
645     s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
646     src += src_stride;
647     s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
648     src += src_stride;
649 
650     do {
651       s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
652       src += src_stride;
653 #if defined(__aarch64__)
654       s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
655       src += src_stride;
656       s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
657       src += src_stride;
658       s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
659       src += src_stride;
660 
661       __builtin_prefetch(dst + 0 * dst_stride);
662       __builtin_prefetch(dst + 1 * dst_stride);
663       __builtin_prefetch(dst + 2 * dst_stride);
664       __builtin_prefetch(dst + 3 * dst_stride);
665       __builtin_prefetch(src + 0 * src_stride);
666       __builtin_prefetch(src + 1 * src_stride);
667       __builtin_prefetch(src + 2 * src_stride);
668       __builtin_prefetch(src + 3 * src_stride);
669       d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
670       d1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
671       d2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
672       d3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
673 
674       d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
675       d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
676       if ((w == 4) && (h != 2)) {
677         vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
678                       0);  // 00 01 02 03
679         dst += dst_stride;
680         vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
681                       1);  // 10 11 12 13
682         dst += dst_stride;
683         vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23),
684                       0);  // 20 21 22 23
685         dst += dst_stride;
686         vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23),
687                       1);  // 30 31 32 33
688         dst += dst_stride;
689       } else if ((w == 4) && (h == 2)) {
690         vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
691                       0);  // 00 01 02 03
692         dst += dst_stride;
693         vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
694                       1);  // 10 11 12 13
695         dst += dst_stride;
696       } else if ((w == 2) && (h != 2)) {
697         vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0);  // 00 01
698         dst += dst_stride;
699         vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 2);  // 10 11
700         dst += dst_stride;
701         vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d23), 0);  // 20 21
702         dst += dst_stride;
703         vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d23), 2);  // 30 31
704         dst += dst_stride;
705       } else if ((w == 2) && (h == 2)) {
706         vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0);  // 00 01
707         dst += dst_stride;
708         vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 2);  // 10 11
709         dst += dst_stride;
710       }
711       s0 = s4;
712       s1 = s5;
713       s2 = s6;
714       s3 = s7;
715       s4 = s8;
716       s5 = s9;
717       s6 = s10;
718       h -= 4;
719 #else
720       __builtin_prefetch(dst + 0 * dst_stride);
721       __builtin_prefetch(src + 0 * src_stride);
722 
723       d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
724 
725       d01 = vqrshrun_n_s16(vcombine_s16(d0, d0), FILTER_BITS);
726 
727       if (w == 4) {
728         vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
729         dst += dst_stride;
730       } else if (w == 2) {
731         vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0);
732         dst += dst_stride;
733       }
734       s0 = s1;
735       s1 = s2;
736       s2 = s3;
737       s3 = s4;
738       s4 = s5;
739       s5 = s6;
740       s6 = s7;
741       h -= 1;
742 #endif
743     } while (h > 0);
744   } else {
745     int height;
746     const uint8_t *s;
747     uint8_t *d;
748     uint8x8_t t0;
749     int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
750 #if defined(__aarch64__)
751     uint8x8_t t1, t2, t3;
752     int16x8_t s8, s9, s10;
753 #endif
754     do {
755       __builtin_prefetch(src + 0 * src_stride);
756       __builtin_prefetch(src + 1 * src_stride);
757       __builtin_prefetch(src + 2 * src_stride);
758       __builtin_prefetch(src + 3 * src_stride);
759       __builtin_prefetch(src + 4 * src_stride);
760       __builtin_prefetch(src + 5 * src_stride);
761       __builtin_prefetch(src + 6 * src_stride);
762       s = src;
763       s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
764       s += src_stride;
765       s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
766       s += src_stride;
767       s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
768       s += src_stride;
769       s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
770       s += src_stride;
771       s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
772       s += src_stride;
773       s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
774       s += src_stride;
775       s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
776       s += src_stride;
777       d = dst;
778       height = h;
779 
780       do {
781         s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
782         s += src_stride;
783 #if defined(__aarch64__)
784         s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
785         s += src_stride;
786         s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
787         s += src_stride;
788         s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
789         s += src_stride;
790 
791         __builtin_prefetch(d + 0 * dst_stride);
792         __builtin_prefetch(d + 1 * dst_stride);
793         __builtin_prefetch(d + 2 * dst_stride);
794         __builtin_prefetch(d + 3 * dst_stride);
795         __builtin_prefetch(s + 0 * src_stride);
796         __builtin_prefetch(s + 1 * src_stride);
797         __builtin_prefetch(s + 2 * src_stride);
798         __builtin_prefetch(s + 3 * src_stride);
799         t0 = convolve8_vert_8x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
800         t1 = convolve8_vert_8x4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
801         t2 = convolve8_vert_8x4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
802         t3 = convolve8_vert_8x4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
803         if (h != 2) {
804           vst1_u8(d, t0);
805           d += dst_stride;
806           vst1_u8(d, t1);
807           d += dst_stride;
808           vst1_u8(d, t2);
809           d += dst_stride;
810           vst1_u8(d, t3);
811           d += dst_stride;
812         } else {
813           vst1_u8(d, t0);
814           d += dst_stride;
815           vst1_u8(d, t1);
816           d += dst_stride;
817         }
818         s0 = s4;
819         s1 = s5;
820         s2 = s6;
821         s3 = s7;
822         s4 = s8;
823         s5 = s9;
824         s6 = s10;
825         height -= 4;
826 #else
827         __builtin_prefetch(d);
828         __builtin_prefetch(s);
829 
830         t0 = convolve8_vert_8x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
831 
832         vst1_u8(d, t0);
833         d += dst_stride;
834 
835         s0 = s1;
836         s1 = s2;
837         s2 = s3;
838         s3 = s4;
839         s4 = s5;
840         s5 = s6;
841         s6 = s7;
842         height -= 1;
843 #endif
844       } while (height > 0);
845       src += 8;
846       dst += 8;
847       w -= 8;
848     } while (w > 0);
849   }
850 }
851 
852 // Horizontal filtering for convolve_2d_sr for width multiple of 8
853 // Processes one row at a time
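// Used for the remaining (fewer than four) rows of the AArch64 path and for
// all rows on other targets.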
854 static INLINE void horiz_filter_w8_single_row(
855     const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
856     const int dst_stride, int width, int height, const int16_t *x_filter,
857     const int16x8_t horiz_const, const int16x8_t shift_round_0) {
858   int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
859   do {
860     uint8x8_t t0 = vld1_u8(src_ptr);
861     s0 = vreinterpretq_s16_u16(vmovl_u8(t0));  // a0 a1 a2 a3 a4 a5 a6 a7
862 
863     int width_tmp = width;
864     const uint8_t *s = src_ptr + 8;
865     int16_t *dst_tmp = dst_ptr;
866 
867     __builtin_prefetch(dst_ptr);
868 
869     do {
870       t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
871       s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
872       int16x8_t sum = s0;
873       s0 = s7;
874 
875       s1 = vextq_s16(sum, s7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
876       s2 = vextq_s16(sum, s7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
877       s3 = vextq_s16(sum, s7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
878       s4 = vextq_s16(sum, s7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
879       s5 = vextq_s16(sum, s7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
880       s6 = vextq_s16(sum, s7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
881       s7 = vextq_s16(sum, s7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
882 
883       int16x8_t res0 = convolve8_8x8_s16(sum, s1, s2, s3, s4, s5, s6, s7,
884                                          x_filter, horiz_const, shift_round_0);
885 
886       vst1q_s16(dst_tmp, res0);
887 
888       s += 8;
889       dst_tmp += 8;
890       width_tmp -= 8;
891     } while (width_tmp > 0);
892     src_ptr += src_stride;
893     dst_ptr += dst_stride;
894     height--;
895   } while (height > 0);
896 }
897 
898 // Horizontal filtering for convolve_2d_sr for width <= 4
899 // Processes one row at a time
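// Used for the remaining (fewer than four) rows of the AArch64 path and for
// all rows on other targets.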
900 static INLINE void horiz_filter_w4_single_row(
901     const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
902     const int dst_stride, int width, int height, const int16_t *x_filter,
903     const int16x4_t horiz_const, const int16x4_t shift_round_0) {
904   int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
905   do {
906     const uint8_t *s = src_ptr;
907 
908     __builtin_prefetch(s);
909 
910     uint8x8_t t0 = vld1_u8(s);  // a0 a1 a2 a3 a4 a5 a6 a7
911     int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
912     s0 = vget_low_s16(tt0);
913     s4 = vget_high_s16(tt0);
914 
915     __builtin_prefetch(dst_ptr);
916     s += 8;
917 
918     t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
919     s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
920 
921     s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
922     s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
923     s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6
924     s5 = vext_s16(s4, s7, 1);  // a5 a6 a7 a8
925     s6 = vext_s16(s4, s7, 2);  // a6 a7 a8 a9
926     s7 = vext_s16(s4, s7, 3);  // a7 a8 a9 a10
927 
928     int16x4_t d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
929                                      horiz_const, shift_round_0);
930 
931     if (width == 4) {
932       vst1_s16(dst_ptr, d0);
933       dst_ptr += dst_stride;
934     } else if (width == 2) {
935       vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_s16(d0), 0);
936       dst_ptr += dst_stride;
937     }
938 
939     src_ptr += src_stride;
940     height--;
941   } while (height > 0);
942 }
943 
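// 2D single-reference convolution in two passes: a horizontal pass writes
// 16-bit intermediates into im_block (using halved filter coefficients and a
// round_0 - 1 shift), then a vertical pass with 32-bit accumulation applies
// round_1, removes the offsets and shifts by round_bits down to 8-bit output.
// Filters with more than 8 taps fall back to the C implementation.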
944 void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
945                              int dst_stride, int w, int h,
946                              const InterpFilterParams *filter_params_x,
947                              const InterpFilterParams *filter_params_y,
948                              const int subpel_x_qn, const int subpel_y_qn,
949                              ConvolveParams *conv_params) {
950   if (filter_params_x->taps > 8) {
951     av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
952                          filter_params_x, filter_params_y, subpel_x_qn,
953                          subpel_y_qn, conv_params);
954     return;
955   }
956   int im_dst_stride;
957   int width, height;
958 #if defined(__aarch64__)
959   uint8x8_t t0;
960   uint8x8_t t1, t2, t3, t4, t5, t6, t7;
961   const uint8_t *s;
962 #endif
963 
964   DECLARE_ALIGNED(16, int16_t,
965                   im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
966 
967   const int bd = 8;
968   const int im_h = h + filter_params_y->taps - 1;
969   const int im_stride = MAX_SB_SIZE;
970   const int vert_offset = filter_params_y->taps / 2 - 1;
971   const int horiz_offset = filter_params_x->taps / 2 - 1;
972 
973   const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
974 
975   int16_t *dst_ptr;
976 
977   dst_ptr = im_block;
978   im_dst_stride = im_stride;
979   height = im_h;
980   width = w;
981 
982   const int16_t round_bits =
983       FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
984   const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
985   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
986   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
987       filter_params_x, subpel_x_qn & SUBPEL_MASK);
988 
989   int16_t x_filter_tmp[8];
990   int16x8_t filter_x_coef = vld1q_s16(x_filter);
991 
992   // Filter coefficients are all even, so shift them right by 1 to reduce
993   // the intermediate precision requirements.
994   filter_x_coef = vshrq_n_s16(filter_x_coef, 1);
995   vst1q_s16(&x_filter_tmp[0], filter_x_coef);
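  // Because the coefficients were halved above, the horizontal pass below
  // shifts by one bit less (round_0 - 1) and adds a correspondingly halved
  // offset constant (horiz_const).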
996 
997   assert(conv_params->round_0 > 0);
998 
999   if (w <= 4) {
1000     const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)));
1001     const int16x4_t shift_round_0 = vdup_n_s16(-(conv_params->round_0 - 1));
1002 
1003 #if defined(__aarch64__)
1004     int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
1005     do {
1006       assert(height >= 4);
1007       s = src_ptr;
1008       __builtin_prefetch(s + 0 * src_stride);
1009       __builtin_prefetch(s + 1 * src_stride);
1010       __builtin_prefetch(s + 2 * src_stride);
1011       __builtin_prefetch(s + 3 * src_stride);
1012 
1013       load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
1014       transpose_u8_8x4(&t0, &t1, &t2, &t3);
1015 
1016       s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
1017       s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
1018       s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
1019       s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
1020       s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
1021       s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
1022       s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
1023 
1024       __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
1025       __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
1026       __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
1027       __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
1028       s += 7;
1029 
1030       load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
1031       transpose_u8_8x4(&t0, &t1, &t2, &t3);
1032 
1033       s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
1034       s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
1035       s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
1036       s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
1037 
1038       d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
1039                              horiz_const, shift_round_0);
1040       d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
1041                              horiz_const, shift_round_0);
1042       d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
1043                              horiz_const, shift_round_0);
1044       d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
1045                              horiz_const, shift_round_0);
1046 
1047       transpose_s16_4x4d(&d0, &d1, &d2, &d3);
1048       if (w == 4) {
1049         vst1_s16((dst_ptr + 0 * im_dst_stride), d0);
1050         vst1_s16((dst_ptr + 1 * im_dst_stride), d1);
1051         vst1_s16((dst_ptr + 2 * im_dst_stride), d2);
1052         vst1_s16((dst_ptr + 3 * im_dst_stride), d3);
1053       } else if (w == 2) {
1054         vst1_lane_u32((uint32_t *)(dst_ptr + 0 * im_dst_stride),
1055                       vreinterpret_u32_s16(d0), 0);
1056         vst1_lane_u32((uint32_t *)(dst_ptr + 1 * im_dst_stride),
1057                       vreinterpret_u32_s16(d1), 0);
1058         vst1_lane_u32((uint32_t *)(dst_ptr + 2 * im_dst_stride),
1059                       vreinterpret_u32_s16(d2), 0);
1060         vst1_lane_u32((uint32_t *)(dst_ptr + 3 * im_dst_stride),
1061                       vreinterpret_u32_s16(d3), 0);
1062       }
1063       src_ptr += 4 * src_stride;
1064       dst_ptr += 4 * im_dst_stride;
1065       height -= 4;
1066     } while (height >= 4);
1067 
1068     if (height) {
1069       assert(height < 4);
1070       horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w,
1071                                  height, x_filter_tmp, horiz_const,
1072                                  shift_round_0);
1073     }
1074 #else
1075     horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w,
1076                                height, x_filter_tmp, horiz_const,
1077                                shift_round_0);
1078 #endif
1079 
1080   } else {
1081     const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)));
1082     const int16x8_t shift_round_0 = vdupq_n_s16(-(conv_params->round_0 - 1));
1083 
1084 #if defined(__aarch64__)
1085     int16_t *d_tmp;
1086     int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14;
1087     int16x8_t res0, res1, res2, res3, res4, res5, res6, res7;
1088     do {
1089       assert(height >= 8);
1090       __builtin_prefetch(src_ptr + 0 * src_stride);
1091       __builtin_prefetch(src_ptr + 1 * src_stride);
1092       __builtin_prefetch(src_ptr + 2 * src_stride);
1093       __builtin_prefetch(src_ptr + 3 * src_stride);
1094       __builtin_prefetch(src_ptr + 4 * src_stride);
1095       __builtin_prefetch(src_ptr + 5 * src_stride);
1096       __builtin_prefetch(src_ptr + 6 * src_stride);
1097       __builtin_prefetch(src_ptr + 7 * src_stride);
1098 
1099       load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1100 
1101       transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1102 
1103       s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
1104       s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
1105       s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
1106       s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
1107       s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
1108       s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
1109       s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
1110 
1111       width = w;
1112       s = src_ptr + 7;
1113       d_tmp = dst_ptr;
1114 
1115       __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
1116       __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
1117       __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
1118       __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
1119       __builtin_prefetch(dst_ptr + 4 * im_dst_stride);
1120       __builtin_prefetch(dst_ptr + 5 * im_dst_stride);
1121       __builtin_prefetch(dst_ptr + 6 * im_dst_stride);
1122       __builtin_prefetch(dst_ptr + 7 * im_dst_stride);
1123 
1124       do {
1125         load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1126 
1127         transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1128 
1129         s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
1130         s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
1131         s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
1132         s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
1133         s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
1134         s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
1135         s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
1136         s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
1137 
1138         res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
1139                                  horiz_const, shift_round_0);
1140         res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
1141                                  horiz_const, shift_round_0);
1142         res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
1143                                  horiz_const, shift_round_0);
1144         res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
1145                                  horiz_const, shift_round_0);
1146         res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp,
1147                                  horiz_const, shift_round_0);
1148         res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
1149                                  x_filter_tmp, horiz_const, shift_round_0);
1150         res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
1151                                  x_filter_tmp, horiz_const, shift_round_0);
1152         res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
1153                                  x_filter_tmp, horiz_const, shift_round_0);
1154 
1155         transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
1156                           &res7);
1157 
1158         store_s16_8x8(d_tmp, im_dst_stride, res0, res1, res2, res3, res4, res5,
1159                       res6, res7);
1160 
1161         s0 = s8;
1162         s1 = s9;
1163         s2 = s10;
1164         s3 = s11;
1165         s4 = s12;
1166         s5 = s13;
1167         s6 = s14;
1168         s += 8;
1169         d_tmp += 8;
1170         width -= 8;
1171       } while (width > 0);
1172       src_ptr += 8 * src_stride;
1173       dst_ptr += 8 * im_dst_stride;
1174       height -= 8;
1175     } while (height >= 8);
1176 
1177     if (height >= 4) {
1178       assert(height < 8);
1179       int16x4_t reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9,
1180           reg10, reg11, reg12, reg13, reg14;
1181       int16x4_t d0, d1, d2, d3, d4, d5, d6, d7;
1182       int16x8_t out0, out1, out2, out3;
1183 
1184       __builtin_prefetch(src_ptr + 0 * src_stride);
1185       __builtin_prefetch(src_ptr + 1 * src_stride);
1186       __builtin_prefetch(src_ptr + 2 * src_stride);
1187       __builtin_prefetch(src_ptr + 3 * src_stride);
1188 
1189       load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3);
1190       transpose_u8_8x4(&t0, &t1, &t2, &t3);
1191 
1192       reg0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
1193       reg1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
1194       reg2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
1195       reg3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
1196       reg4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
1197       reg5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
1198       reg6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
1199 
1200       __builtin_prefetch(dst_ptr + 0 * dst_stride);
1201       __builtin_prefetch(dst_ptr + 1 * dst_stride);
1202       __builtin_prefetch(dst_ptr + 2 * dst_stride);
1203       __builtin_prefetch(dst_ptr + 3 * dst_stride);
1204 
1205       s = src_ptr + 7;
1206       d_tmp = dst_ptr;
1207       width = w;
1208 
1209       do {
1210         load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
1211         transpose_u8_8x4(&t0, &t1, &t2, &t3);
1212 
1213         reg7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
1214         reg8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
1215         reg9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
1216         reg10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
1217         reg11 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
1218         reg12 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
1219         reg13 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
1220         reg14 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
1221 
1222         d0 = convolve8_4x4(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7,
1223                            x_filter_tmp);
1224 
1225         d1 = convolve8_4x4(reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8,
1226                            x_filter_tmp);
1227 
1228         d2 = convolve8_4x4(reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9,
1229                            x_filter_tmp);
1230 
1231         d3 = convolve8_4x4(reg3, reg4, reg5, reg6, reg7, reg8, reg9, reg10,
1232                            x_filter_tmp);
1233 
1234         d4 = convolve8_4x4(reg4, reg5, reg6, reg7, reg8, reg9, reg10, reg11,
1235                            x_filter_tmp);
1236 
1237         d5 = convolve8_4x4(reg5, reg6, reg7, reg8, reg9, reg10, reg11, reg12,
1238                            x_filter_tmp);
1239 
1240         d6 = convolve8_4x4(reg6, reg7, reg8, reg9, reg10, reg11, reg12, reg13,
1241                            x_filter_tmp);
1242 
1243         d7 = convolve8_4x4(reg7, reg8, reg9, reg10, reg11, reg12, reg13, reg14,
1244                            x_filter_tmp);
1245 
1246         transpose_s16_4x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &out0, &out1,
1247                           &out2, &out3);
1248 
1249         out0 = vaddq_s16(out0, horiz_const);
1250         out0 = vqrshlq_s16(out0, shift_round_0);
1251 
1252         out1 = vaddq_s16(out1, horiz_const);
1253         out1 = vqrshlq_s16(out1, shift_round_0);
1254 
1255         out2 = vaddq_s16(out2, horiz_const);
1256         out2 = vqrshlq_s16(out2, shift_round_0);
1257 
1258         out3 = vaddq_s16(out3, horiz_const);
1259         out3 = vqrshlq_s16(out3, shift_round_0);
1260 
1261         store_s16_8x4(d_tmp, im_dst_stride, out0, out1, out2, out3);
1262 
1263         reg0 = reg8;
1264         reg1 = reg9;
1265         reg2 = reg10;
1266         reg3 = reg11;
1267         reg4 = reg12;
1268         reg5 = reg13;
1269         reg6 = reg14;
1270         s += 8;
1271         d_tmp += 8;
1272         width -= 8;
1273       } while (width > 0);
1274       src_ptr += 4 * src_stride;
1275       dst_ptr += 4 * im_dst_stride;
1276       height -= 4;
1277     }
1278 
1279     if (height) {
1280       assert(height < 4);
1281       horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w,
1282                                  height, x_filter_tmp, horiz_const,
1283                                  shift_round_0);
1284     }
1285 #else
1286 
1287     horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w,
1288                                height, x_filter_tmp, horiz_const,
1289                                shift_round_0);
1290 #endif
1291   }
1292 
1293   // vertical
1294   {
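    // The vertical pass consumes the 16-bit intermediates in im_block: each
    // column sum is offset by (1 << offset_bits), shifted by round_1, has the
    // combined offsets removed again via sub_const, is clamped to be
    // non-negative and finally shifted by round_bits down to the 8-bit range.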
1295     uint8_t *dst_u8_ptr, *d_u8;
1296     int16_t *v_src_ptr, *v_s;
1297 
1298     const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
1299                               (1 << (offset_bits - conv_params->round_1 - 1));
1300     const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
1301         filter_params_y, subpel_y_qn & SUBPEL_MASK);
1302 
1303     const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1));
1304     const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
1305     const int32x4_t sub_const_vec = vdupq_n_s32(sub_const);
1306 
1307     src_stride = im_stride;
1308     v_src_ptr = im_block;
1309     dst_u8_ptr = dst;
1310 
1311     height = h;
1312     width = w;
1313 
1314     if (width <= 4) {
1315       int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
1316       uint16x4_t d0;
1317       uint16x8_t dd0;
1318       uint8x8_t d01;
1319 
1320 #if defined(__aarch64__)
1321       int16x4_t s8, s9, s10;
1322       uint16x4_t d1, d2, d3;
1323       uint16x8_t dd1;
1324       uint8x8_t d23;
1325 #endif
1326 
1327       d_u8 = dst_u8_ptr;
1328       v_s = v_src_ptr;
1329 
1330       __builtin_prefetch(v_s + 0 * im_stride);
1331       __builtin_prefetch(v_s + 1 * im_stride);
1332       __builtin_prefetch(v_s + 2 * im_stride);
1333       __builtin_prefetch(v_s + 3 * im_stride);
1334       __builtin_prefetch(v_s + 4 * im_stride);
1335       __builtin_prefetch(v_s + 5 * im_stride);
1336       __builtin_prefetch(v_s + 6 * im_stride);
1337       __builtin_prefetch(v_s + 7 * im_stride);
1338 
1339       load_s16_4x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
1340       v_s += (7 * im_stride);
1341 
1342       do {
1343 #if defined(__aarch64__)
1344         load_s16_4x4(v_s, im_stride, &s7, &s8, &s9, &s10);
1345         v_s += (im_stride << 2);
1346 
1347         __builtin_prefetch(d_u8 + 0 * dst_stride);
1348         __builtin_prefetch(d_u8 + 1 * dst_stride);
1349         __builtin_prefetch(d_u8 + 2 * dst_stride);
1350         __builtin_prefetch(d_u8 + 3 * dst_stride);
1351 
1352         d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
1353                                     round_shift_vec, offset_const,
1354                                     sub_const_vec);
1355         d1 = convolve8_vert_4x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
1356                                     round_shift_vec, offset_const,
1357                                     sub_const_vec);
1358         d2 = convolve8_vert_4x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
1359                                     round_shift_vec, offset_const,
1360                                     sub_const_vec);
1361         d3 = convolve8_vert_4x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
1362                                     round_shift_vec, offset_const,
1363                                     sub_const_vec);
1364 
1365         dd0 = vqrshlq_u16(vcombine_u16(d0, d1), vec_round_bits);
1366         dd1 = vqrshlq_u16(vcombine_u16(d2, d3), vec_round_bits);
1367 
1368         d01 = vqmovn_u16(dd0);
1369         d23 = vqmovn_u16(dd1);
1370 
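        // d01 packs output rows 0-1 and d23 packs rows 2-3; store 32-bit
        // lanes for w == 4 or 16-bit lanes for w == 2, writing only two rows
        // when h == 2.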
1371         if ((w == 4) && (h != 2)) {
1372           vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
1373                         0);  // 00 01 02 03
1374           d_u8 += dst_stride;
1375           vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
1376                         1);  // 10 11 12 13
1377           d_u8 += dst_stride;
1378           vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
1379                         0);  // 20 21 22 23
1380           d_u8 += dst_stride;
1381           vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
1382                         1);  // 30 31 32 33
1383           d_u8 += dst_stride;
1384         } else if ((w == 2) && (h != 2)) {
1385           vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
1386                         0);  // 00 01
1387           d_u8 += dst_stride;
1388           vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
1389                         2);  // 10 11
1390           d_u8 += dst_stride;
1391           vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
1392                         0);  // 20 21
1393           d_u8 += dst_stride;
1394           vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
1395                         2);  // 30 31
1396           d_u8 += dst_stride;
1397         } else if ((w == 4) && (h == 2)) {
1398           vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
1399                         0);  // 00 01 02 03
1400           d_u8 += dst_stride;
1401           vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
1402                         1);  // 10 11 12 13
1403           d_u8 += dst_stride;
1404         } else if ((w == 2) && (h == 2)) {
1405           vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
1406                         0);  // 00 01
1407           d_u8 += dst_stride;
1408           vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
1409                         2);  // 10 11
1410           d_u8 += dst_stride;
1411         }
1412 
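        // Slide the 8-tap source window down by the four rows just consumed.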
1413         s0 = s4;
1414         s1 = s5;
1415         s2 = s6;
1416         s3 = s7;
1417         s4 = s8;
1418         s5 = s9;
1419         s6 = s10;
1420         height -= 4;
1421 #else
1422         s7 = vld1_s16(v_s);
1423         v_s += im_stride;
1424 
1425         __builtin_prefetch(d_u8 + 0 * dst_stride);
1426 
1427         d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
1428                                     round_shift_vec, offset_const,
1429                                     sub_const_vec);
1430 
1431         dd0 = vqrshlq_u16(vcombine_u16(d0, d0), vec_round_bits);
1432         d01 = vqmovn_u16(dd0);
1433 
1434         if (w == 4) {
1435           vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
1436                         0);  // 00 01 02 03
1437           d_u8 += dst_stride;
1438 
1439         } else if (w == 2) {
1440           vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
1441                         0);  // 00 01
1442           d_u8 += dst_stride;
1443         }
1444 
1445         s0 = s1;
1446         s1 = s2;
1447         s2 = s3;
1448         s3 = s4;
1449         s4 = s5;
1450         s5 = s6;
1451         s6 = s7;
1452         height -= 1;
1453 #endif
1454       } while (height > 0);
1455     } else {
1456       // Width is a multiple of 8 and height is a multiple of 4.
1457       int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
1458       uint8x8_t res0;
1459 #if defined(__aarch64__)
1460       int16x8_t s8, s9, s10;
1461       uint8x8_t res1, res2, res3;
1462 #endif
1463 
1464       do {
1465         __builtin_prefetch(v_src_ptr + 0 * im_stride);
1466         __builtin_prefetch(v_src_ptr + 1 * im_stride);
1467         __builtin_prefetch(v_src_ptr + 2 * im_stride);
1468         __builtin_prefetch(v_src_ptr + 3 * im_stride);
1469         __builtin_prefetch(v_src_ptr + 4 * im_stride);
1470         __builtin_prefetch(v_src_ptr + 5 * im_stride);
1471         __builtin_prefetch(v_src_ptr + 6 * im_stride);
1472         __builtin_prefetch(v_src_ptr + 7 * im_stride);
1473 
1474         v_s = v_src_ptr;
1475         load_s16_8x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
1476         v_s += (7 * im_stride);
1477 
1478         d_u8 = dst_u8_ptr;
1479         height = h;
1480 
1481         do {
1482 #if defined(__aarch64__)
1483           load_s16_8x4(v_s, im_stride, &s7, &s8, &s9, &s10);
1484           v_s += (im_stride << 2);
1485 
1486           __builtin_prefetch(d_u8 + 4 * dst_stride);
1487           __builtin_prefetch(d_u8 + 5 * dst_stride);
1488           __builtin_prefetch(d_u8 + 6 * dst_stride);
1489           __builtin_prefetch(d_u8 + 7 * dst_stride);
1490 
1491           res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
1492                                         y_filter, round_shift_vec, offset_const,
1493                                         sub_const_vec, vec_round_bits);
1494           res1 = convolve8_vert_8x4_s32(s1, s2, s3, s4, s5, s6, s7, s8,
1495                                         y_filter, round_shift_vec, offset_const,
1496                                         sub_const_vec, vec_round_bits);
1497           res2 = convolve8_vert_8x4_s32(s2, s3, s4, s5, s6, s7, s8, s9,
1498                                         y_filter, round_shift_vec, offset_const,
1499                                         sub_const_vec, vec_round_bits);
1500           res3 = convolve8_vert_8x4_s32(s3, s4, s5, s6, s7, s8, s9, s10,
1501                                         y_filter, round_shift_vec, offset_const,
1502                                         sub_const_vec, vec_round_bits);
1503 
1504           if (h != 2) {
1505             vst1_u8(d_u8, res0);
1506             d_u8 += dst_stride;
1507             vst1_u8(d_u8, res1);
1508             d_u8 += dst_stride;
1509             vst1_u8(d_u8, res2);
1510             d_u8 += dst_stride;
1511             vst1_u8(d_u8, res3);
1512             d_u8 += dst_stride;
1513           } else {
1514             vst1_u8(d_u8, res0);
1515             d_u8 += dst_stride;
1516             vst1_u8(d_u8, res1);
1517             d_u8 += dst_stride;
1518           }
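          // Reuse the last four rows as the top of the next filter window.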
1519           s0 = s4;
1520           s1 = s5;
1521           s2 = s6;
1522           s3 = s7;
1523           s4 = s8;
1524           s5 = s9;
1525           s6 = s10;
1526           height -= 4;
1527 #else
1528           s7 = vld1q_s16(v_s);
1529           v_s += im_stride;
1530 
1531           __builtin_prefetch(d_u8 + 0 * dst_stride);
1532 
1533           res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
1534                                         y_filter, round_shift_vec, offset_const,
1535                                         sub_const_vec, vec_round_bits);
1536 
1537           vst1_u8(d_u8, res0);
1538           d_u8 += dst_stride;
1539 
1540           s0 = s1;
1541           s1 = s2;
1542           s2 = s3;
1543           s3 = s4;
1544           s4 = s5;
1545           s5 = s6;
1546           s6 = s7;
1547           height -= 1;
1548 #endif
1549         } while (height > 0);
1550         v_src_ptr += 8;
1551         dst_u8_ptr += 8;
1552         w -= 8;
1553       } while (w > 0);
1554     }
1555   }
1556 }
1557 
1558 static INLINE void scaledconvolve_horiz_w4(
1559     const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
1560     const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
1561     const int x0_q4, const int x_step_q4, const int w, const int h) {
1562   DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
1563   int x, y, z;
1564 
1565   src -= SUBPEL_TAPS / 2 - 1;
1566 
1567   y = h;
1568   do {
1569     int x_q4 = x0_q4;
1570     x = 0;
1571     do {
1572       // process 4 src_x steps
1573       for (z = 0; z < 4; ++z) {
1574         const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1575         if (x_q4 & SUBPEL_MASK) {
1576           const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
1577           const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
1578           const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
1579           uint8x8_t s[8], d;
1580           int16x8_t ss[4];
1581           int16x4_t t[8], tt;
1582 
1583           load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]);
1584           transpose_u8_8x4(&s[0], &s[1], &s[2], &s[3]);
1585 
1586           ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
1587           ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1]));
1588           ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2]));
1589           ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3]));
1590           t[0] = vget_low_s16(ss[0]);
1591           t[1] = vget_low_s16(ss[1]);
1592           t[2] = vget_low_s16(ss[2]);
1593           t[3] = vget_low_s16(ss[3]);
1594           t[4] = vget_high_s16(ss[0]);
1595           t[5] = vget_high_s16(ss[1]);
1596           t[6] = vget_high_s16(ss[2]);
1597           t[7] = vget_high_s16(ss[3]);
1598 
1599           tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7],
1600                            filters, filter3, filter4);
1601           d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
1602           vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0);
1603         } else {
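          // Subpel phase 0: the 8-tap kernel reduces to a copy of the centre
          // sample at offset SUBPEL_TAPS / 2 - 1 = 3.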
1604           int i;
1605           for (i = 0; i < 4; ++i) {
1606             temp[z * 4 + i] = src_x[i * src_stride + 3];
1607           }
1608         }
1609         x_q4 += x_step_q4;
1610       }
1611 
1612       // Transpose the 4x4 filter values back to dst.
1613       {
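        // temp holds the 4x4 tile column-major (four outputs per src_x step),
        // so the de-interleaving load places output row i in d4.val[i].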
1614         const uint8x8x4_t d4 = vld4_u8(temp);
1615         vst1_lane_u32((uint32_t *)&dst[x + 0 * dst_stride],
1616                       vreinterpret_u32_u8(d4.val[0]), 0);
1617         vst1_lane_u32((uint32_t *)&dst[x + 1 * dst_stride],
1618                       vreinterpret_u32_u8(d4.val[1]), 0);
1619         vst1_lane_u32((uint32_t *)&dst[x + 2 * dst_stride],
1620                       vreinterpret_u32_u8(d4.val[2]), 0);
1621         vst1_lane_u32((uint32_t *)&dst[x + 3 * dst_stride],
1622                       vreinterpret_u32_u8(d4.val[3]), 0);
1623       }
1624       x += 4;
1625     } while (x < w);
1626 
1627     src += src_stride * 4;
1628     dst += dst_stride * 4;
1629     y -= 4;
1630   } while (y > 0);
1631 }
1632 
1633 static INLINE void scaledconvolve_horiz_w8(
1634     const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
1635     const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
1636     const int x0_q4, const int x_step_q4, const int w, const int h) {
1637   DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
1638   int x, y, z;
1639   src -= SUBPEL_TAPS / 2 - 1;
1640 
1641   // This function processes 8x8 blocks. The intermediate height is not always
1642   // a multiple of 8, so round it up to a multiple of 8 here.
1643   y = (h + 7) & ~7;
1644 
1645   do {
1646     int x_q4 = x0_q4;
1647     x = 0;
1648     do {
1649       uint8x8_t d[8];
1650       // process 8 src_x steps
1651       for (z = 0; z < 8; ++z) {
1652         const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1653 
1654         if (x_q4 & SUBPEL_MASK) {
1655           const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
1656           uint8x8_t s[8];
1657           load_u8_8x8(src_x, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4],
1658                       &s[5], &s[6], &s[7]);
1659           transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
1660                            &s[7]);
1661           d[0] = scale_filter_8(s, filters);
1662           vst1_u8(&temp[8 * z], d[0]);
1663         } else {
1664           int i;
1665           for (i = 0; i < 8; ++i) {
1666             temp[z * 8 + i] = src_x[i * src_stride + 3];
1667           }
1668         }
1669         x_q4 += x_step_q4;
1670       }
1671 
1672       // Transpose the 8x8 filter values back to dst.
1673       load_u8_8x8(temp, 8, &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
1674                   &d[7]);
1675       transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
1676       vst1_u8(&dst[x + 0 * dst_stride], d[0]);
1677       vst1_u8(&dst[x + 1 * dst_stride], d[1]);
1678       vst1_u8(&dst[x + 2 * dst_stride], d[2]);
1679       vst1_u8(&dst[x + 3 * dst_stride], d[3]);
1680       vst1_u8(&dst[x + 4 * dst_stride], d[4]);
1681       vst1_u8(&dst[x + 5 * dst_stride], d[5]);
1682       vst1_u8(&dst[x + 6 * dst_stride], d[6]);
1683       vst1_u8(&dst[x + 7 * dst_stride], d[7]);
1684       x += 8;
1685     } while (x < w);
1686 
1687     src += src_stride * 8;
1688     dst += dst_stride * 8;
1689   } while (y -= 8);
1690 }
1691 
1692 static INLINE void scaledconvolve_vert_w4(
1693     const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
1694     const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
1695     const int y0_q4, const int y_step_q4, const int w, const int h) {
1696   int y;
1697   int y_q4 = y0_q4;
1698 
1699   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1700   y = h;
1701   do {
1702     const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1703 
1704     if (y_q4 & SUBPEL_MASK) {
1705       const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
1706       const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
1707       const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
1708       uint8x8_t s[8], d;
1709       int16x4_t t[8], tt;
1710 
1711       load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
1712                   &s[6], &s[7]);
1713       t[0] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[0])));
1714       t[1] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[1])));
1715       t[2] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[2])));
1716       t[3] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[3])));
1717       t[4] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[4])));
1718       t[5] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[5])));
1719       t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6])));
1720       t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7])));
1721 
1722       tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters,
1723                        filter3, filter4);
1724       d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
1725       vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
1726     } else {
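      // Subpel phase 0: copy the centre row (offset SUBPEL_TAPS / 2 - 1 = 3).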
1727       memcpy(dst, &src_y[3 * src_stride], w);
1728     }
1729 
1730     dst += dst_stride;
1731     y_q4 += y_step_q4;
1732   } while (--y);
1733 }
1734 
1735 static INLINE void scaledconvolve_vert_w8(
1736     const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
1737     const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
1738     const int y0_q4, const int y_step_q4, const int w, const int h) {
1739   int y;
1740   int y_q4 = y0_q4;
1741 
1742   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1743   y = h;
1744   do {
1745     const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1746     if (y_q4 & SUBPEL_MASK) {
1747       const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
1748       uint8x8_t s[8], d;
1749       load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
1750                   &s[6], &s[7]);
1751       d = scale_filter_8(s, filters);
1752       vst1_u8(dst, d);
1753     } else {
1754       memcpy(dst, &src_y[3 * src_stride], w);
1755     }
1756     dst += dst_stride;
1757     y_q4 += y_step_q4;
1758   } while (--y);
1759 }
1760 
1761 static INLINE void scaledconvolve_vert_w16(
1762     const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
1763     const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
1764     const int y0_q4, const int y_step_q4, const int w, const int h) {
1765   int x, y;
1766   int y_q4 = y0_q4;
1767 
1768   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1769   y = h;
1770   do {
1771     const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1772     if (y_q4 & SUBPEL_MASK) {
1773       x = 0;
1774       do {
1775         const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
1776         uint8x16_t ss[8];
1777         uint8x8_t s[8], d[2];
1778         load_u8_16x8(src_y, src_stride, &ss[0], &ss[1], &ss[2], &ss[3], &ss[4],
1779                      &ss[5], &ss[6], &ss[7]);
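        // Filter the low and high 8-pixel halves of the 16-wide row
        // separately, then recombine them for a single 16-byte store.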
1780         s[0] = vget_low_u8(ss[0]);
1781         s[1] = vget_low_u8(ss[1]);
1782         s[2] = vget_low_u8(ss[2]);
1783         s[3] = vget_low_u8(ss[3]);
1784         s[4] = vget_low_u8(ss[4]);
1785         s[5] = vget_low_u8(ss[5]);
1786         s[6] = vget_low_u8(ss[6]);
1787         s[7] = vget_low_u8(ss[7]);
1788         d[0] = scale_filter_8(s, filters);
1789 
1790         s[0] = vget_high_u8(ss[0]);
1791         s[1] = vget_high_u8(ss[1]);
1792         s[2] = vget_high_u8(ss[2]);
1793         s[3] = vget_high_u8(ss[3]);
1794         s[4] = vget_high_u8(ss[4]);
1795         s[5] = vget_high_u8(ss[5]);
1796         s[6] = vget_high_u8(ss[6]);
1797         s[7] = vget_high_u8(ss[7]);
1798         d[1] = scale_filter_8(s, filters);
1799         vst1q_u8(&dst[x], vcombine_u8(d[0], d[1]));
1800         src_y += 16;
1801         x += 16;
1802       } while (x < w);
1803     } else {
1804       memcpy(dst, &src_y[3 * src_stride], w);
1805     }
1806     dst += dst_stride;
1807     y_q4 += y_step_q4;
1808   } while (--y);
1809 }
1810 
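// Usage sketch (illustrative values, not taken from a real call site): a 2:1
// downscale steps through the source in 32/16-pel increments, so a 64x64
// block could be produced with
//   aom_scaled_2d_neon(src, src_stride, dst, dst_stride, filter,
//                      x0_q4, 32, y0_q4, 32, 64, 64);
// where filter points to a 16-entry InterpKernel table indexed by subpel
// phase (q4 & SUBPEL_MASK).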
1811 void aom_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
1812                         ptrdiff_t dst_stride, const InterpKernel *filter,
1813                         int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
1814                         int w, int h) {
1815   // Note: Fixed size intermediate buffer, temp, places limits on parameters.
1816   // 2d filtering proceeds in 2 steps:
1817   //   (1) Interpolate horizontally into an intermediate buffer, temp.
1818   //   (2) Interpolate temp vertically to derive the sub-pixel result.
1819   // Deriving the maximum number of rows in the temp buffer (135):
1820   // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
1821   // --Largest block size is 64x64 pixels.
1822   // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
1823   //   original frame (in 1/16th pixel units).
1824   // --Must round-up because block may be located at sub-pixel position.
1825   // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
1826   // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
1827   // --Require an additional 8 rows for the horiz_w8 transpose tail.
1828   // When called from the frame scaling function, the smallest scaling factor
1829   // is x1/4 ==> y_step_q4 = 64. Since w and h are at most 16 in that case, the
1830   // temp buffer is still big enough.
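  // Worked instance of the intermediate_height formula below: h = 64,
  // y_step_q4 = 32, y0_q4 = 15 gives ((63 * 32 + 15) >> 4) + 8 = 134 rows,
  // which fits in the (135 + 8)-row buffer.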
1831   DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
1832   const int intermediate_height =
1833       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
1834 
1835   assert(w <= 64);
1836   assert(h <= 64);
1837   assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
1838   assert(x_step_q4 <= 64);
1839 
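  // Both passes use temp with a fixed stride of 64: the horizontal pass
  // filters intermediate_height rows starting SUBPEL_TAPS / 2 - 1 rows above
  // the block, and the vertical pass is handed the row corresponding to the
  // block's first output row.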
1840   if (w >= 8) {
1841     scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1842                             src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
1843                             intermediate_height);
1844   } else {
1845     scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1846                             src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
1847                             intermediate_height);
1848   }
1849 
1850   if (w >= 16) {
1851     scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
1852                             dst_stride, filter, y0_q4, y_step_q4, w, h);
1853   } else if (w == 8) {
1854     scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
1855                            dst_stride, filter, y0_q4, y_step_q4, w, h);
1856   } else {
1857     scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
1858                            dst_stride, filter, y0_q4, y_step_q4, w, h);
1859   }
1860 }
1861