1 /*
2  *
3  * Copyright (c) 2018, Alliance for Open Media. All rights reserved
4  *
5  * This source code is subject to the terms of the BSD 2 Clause License and
6  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
7  * was not distributed with this source code in the LICENSE file, you can
8  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
9  * Media Patent License 1.0 was not distributed with this source code in the
10  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
11  */
12 
13 #include <assert.h>
14 #include <arm_neon.h>
15 
16 #include "config/av1_rtcd.h"
17 
18 #include "aom_dsp/aom_dsp_common.h"
19 #include "aom_ports/mem.h"
20 #include "av1/common/convolve.h"
21 #include "av1/common/filter.h"
22 #include "av1/common/arm/convolve_neon.h"
23 #include "av1/common/arm/mem_neon.h"
24 #include "av1/common/arm/transpose_neon.h"
25 
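// Apply an 8-tap filter to four columns of 16-bit samples: s0..s7 hold one
// tap's worth of input each. The two centre taps (filter[3], filter[4]) are
// accumulated with saturating adds to guard against 16-bit overflow; the sum
// is returned without rounding or shifting.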
static INLINE int16x4_t convolve8_4x4(const int16x4_t s0, const int16x4_t s1,
                                      const int16x4_t s2, const int16x4_t s3,
                                      const int16x4_t s4, const int16x4_t s5,
                                      const int16x4_t s6, const int16x4_t s7,
                                      const int16_t *filter) {
31   int16x4_t sum;
32 
33   sum = vmul_n_s16(s0, filter[0]);
34   sum = vmla_n_s16(sum, s1, filter[1]);
35   sum = vmla_n_s16(sum, s2, filter[2]);
36   sum = vmla_n_s16(sum, s5, filter[5]);
37   sum = vmla_n_s16(sum, s6, filter[6]);
38   sum = vmla_n_s16(sum, s7, filter[7]);
  /* filter[3] can take a max value of 128, so 128 * 255 added to the
   * accumulated sum can exceed 16 bits. Use saturating adds for the two
   * centre taps, filter[3] and filter[4].
   */
42   sum = vqadd_s16(sum, vmul_n_s16(s3, filter[3]));
43   sum = vqadd_s16(sum, vmul_n_s16(s4, filter[4]));
44 
45   return sum;
46 }
47 
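// Horizontal 8-tap filter producing eight output pixels: accumulate the taps
// (saturating adds for the two centre taps), apply the round_0 and final
// rounding shifts, and narrow to 8 bits with unsigned saturation.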
static INLINE uint8x8_t convolve8_horiz_8x8(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16_t *filter,
    const int16x8_t shift_round_0, const int16x8_t shift_by_bits) {
53   int16x8_t sum;
54 
55   sum = vmulq_n_s16(s0, filter[0]);
56   sum = vmlaq_n_s16(sum, s1, filter[1]);
57   sum = vmlaq_n_s16(sum, s2, filter[2]);
58   sum = vmlaq_n_s16(sum, s5, filter[5]);
59   sum = vmlaq_n_s16(sum, s6, filter[6]);
60   sum = vmlaq_n_s16(sum, s7, filter[7]);
  /* filter[3] can take a max value of 128, so 128 * 255 added to the
   * accumulated sum can exceed 16 bits. Use saturating adds for the two
   * centre taps, filter[3] and filter[4].
   */
64   sum = vqaddq_s16(sum, vmulq_n_s16(s3, filter[3]));
65   sum = vqaddq_s16(sum, vmulq_n_s16(s4, filter[4]));
66 
67   sum = vqrshlq_s16(sum, shift_round_0);
68   sum = vqrshlq_s16(sum, shift_by_bits);
69 
70   return vqmovun_s16(sum);
71 }
72 
73 #if !defined(__aarch64__)
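// Armv7 path: filter four pixels of a single row; the four results are
// returned duplicated in both halves of the uint8x8_t.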
static INLINE uint8x8_t convolve8_horiz_4x1(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16_t *filter,
    const int16x4_t shift_round_0, const int16x4_t shift_by_bits) {
79   int16x4_t sum;
80 
81   sum = vmul_n_s16(s0, filter[0]);
82   sum = vmla_n_s16(sum, s1, filter[1]);
83   sum = vmla_n_s16(sum, s2, filter[2]);
84   sum = vmla_n_s16(sum, s5, filter[5]);
85   sum = vmla_n_s16(sum, s6, filter[6]);
86   sum = vmla_n_s16(sum, s7, filter[7]);
  /* filter[3] can take a max value of 128, so 128 * 255 added to the
   * accumulated sum can exceed 16 bits. Use saturating adds for the two
   * centre taps, filter[3] and filter[4].
   */
90   sum = vqadd_s16(sum, vmul_n_s16(s3, filter[3]));
91   sum = vqadd_s16(sum, vmul_n_s16(s4, filter[4]));
92 
93   sum = vqrshl_s16(sum, shift_round_0);
94   sum = vqrshl_s16(sum, shift_by_bits);
95 
96   return vqmovun_s16(vcombine_s16(sum, sum));
97 }
#endif  // !defined(__aarch64__)
99 
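// Vertical 8-tap filter producing eight output pixels: accumulate in 16 bits
// (saturating adds for the two centre taps) and narrow to 8 bits with rounding
// by FILTER_BITS.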
static INLINE uint8x8_t convolve8_vert_8x4(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16_t *filter) {
104   int16x8_t sum;
105 
106   sum = vmulq_n_s16(s0, filter[0]);
107   sum = vmlaq_n_s16(sum, s1, filter[1]);
108   sum = vmlaq_n_s16(sum, s2, filter[2]);
109   sum = vmlaq_n_s16(sum, s5, filter[5]);
110   sum = vmlaq_n_s16(sum, s6, filter[6]);
111   sum = vmlaq_n_s16(sum, s7, filter[7]);
  /* filter[3] can take a max value of 128, so 128 * 255 added to the
   * accumulated sum can exceed 16 bits. Use saturating adds for the two
   * centre taps, filter[3] and filter[4].
   */
115   sum = vqaddq_s16(sum, vmulq_n_s16(s3, filter[3]));
116   sum = vqaddq_s16(sum, vmulq_n_s16(s4, filter[4]));
117 
118   return vqrshrun_n_s16(sum, FILTER_BITS);
119 }
120 
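// Vertical 8-tap filter for four columns of the 16-bit intermediate block:
// accumulate in 32 bits, add the offset, apply the saturating rounding shift,
// subtract the final-offset constant and clamp at zero before narrowing to
// 16 bits.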
static INLINE uint16x4_t convolve8_vert_4x4_s32(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16_t *y_filter,
    const int32x4_t round_shift_vec, const int32x4_t offset_const,
    const int32x4_t sub_const_vec) {
127   int32x4_t sum0;
128   uint16x4_t res;
129   const int32x4_t zero = vdupq_n_s32(0);
130 
131   sum0 = vmull_n_s16(s0, y_filter[0]);
132   sum0 = vmlal_n_s16(sum0, s1, y_filter[1]);
133   sum0 = vmlal_n_s16(sum0, s2, y_filter[2]);
134   sum0 = vmlal_n_s16(sum0, s3, y_filter[3]);
135   sum0 = vmlal_n_s16(sum0, s4, y_filter[4]);
136   sum0 = vmlal_n_s16(sum0, s5, y_filter[5]);
137   sum0 = vmlal_n_s16(sum0, s6, y_filter[6]);
138   sum0 = vmlal_n_s16(sum0, s7, y_filter[7]);
139 
140   sum0 = vaddq_s32(sum0, offset_const);
141   sum0 = vqrshlq_s32(sum0, round_shift_vec);
142   sum0 = vsubq_s32(sum0, sub_const_vec);
143   sum0 = vmaxq_s32(sum0, zero);
144 
145   res = vmovn_u32(vreinterpretq_u32_s32(sum0));
146 
147   return res;
148 }
149 
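// As convolve8_vert_4x4_s32, but for eight columns: the low and high halves
// are accumulated separately in 32 bits, narrowed, recombined, shifted by the
// final round bits and saturated down to 8 bits.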
static INLINE uint8x8_t convolve8_vert_8x4_s32(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16_t *y_filter,
    const int32x4_t round_shift_vec, const int32x4_t offset_const,
    const int32x4_t sub_const_vec, const int16x8_t vec_round_bits) {
156   int32x4_t sum0, sum1;
157   uint16x8_t res;
158   const int32x4_t zero = vdupq_n_s32(0);
159 
160   sum0 = vmull_n_s16(vget_low_s16(s0), y_filter[0]);
161   sum0 = vmlal_n_s16(sum0, vget_low_s16(s1), y_filter[1]);
162   sum0 = vmlal_n_s16(sum0, vget_low_s16(s2), y_filter[2]);
163   sum0 = vmlal_n_s16(sum0, vget_low_s16(s3), y_filter[3]);
164   sum0 = vmlal_n_s16(sum0, vget_low_s16(s4), y_filter[4]);
165   sum0 = vmlal_n_s16(sum0, vget_low_s16(s5), y_filter[5]);
166   sum0 = vmlal_n_s16(sum0, vget_low_s16(s6), y_filter[6]);
167   sum0 = vmlal_n_s16(sum0, vget_low_s16(s7), y_filter[7]);
168 
169   sum1 = vmull_n_s16(vget_high_s16(s0), y_filter[0]);
170   sum1 = vmlal_n_s16(sum1, vget_high_s16(s1), y_filter[1]);
171   sum1 = vmlal_n_s16(sum1, vget_high_s16(s2), y_filter[2]);
172   sum1 = vmlal_n_s16(sum1, vget_high_s16(s3), y_filter[3]);
173   sum1 = vmlal_n_s16(sum1, vget_high_s16(s4), y_filter[4]);
174   sum1 = vmlal_n_s16(sum1, vget_high_s16(s5), y_filter[5]);
175   sum1 = vmlal_n_s16(sum1, vget_high_s16(s6), y_filter[6]);
176   sum1 = vmlal_n_s16(sum1, vget_high_s16(s7), y_filter[7]);
177 
178   sum0 = vaddq_s32(sum0, offset_const);
179   sum1 = vaddq_s32(sum1, offset_const);
180   sum0 = vqrshlq_s32(sum0, round_shift_vec);
181   sum1 = vqrshlq_s32(sum1, round_shift_vec);
182   sum0 = vsubq_s32(sum0, sub_const_vec);
183   sum1 = vsubq_s32(sum1, sub_const_vec);
184   sum0 = vmaxq_s32(sum0, zero);
185   sum1 = vmaxq_s32(sum1, zero);
186   res = vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(sum0)),
187                      vqmovn_u32(vreinterpretq_u32_s32(sum1)));
188 
189   res = vqrshlq_u16(res, vec_round_bits);
190 
191   return vqmovn_u16(res);
192 }
193 
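// Horizontal (x-only) single-reference convolution. On AArch64, four or eight
// rows are processed at a time using transposes; on Armv7, a single row is
// filtered at a time using vector extracts.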
void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *filter_params_x,
                            const int subpel_x_qn,
                            ConvolveParams *conv_params) {
199   const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
200   const int8_t bits = FILTER_BITS - conv_params->round_0;
201 
202   uint8x8_t t0;
203 #if defined(__aarch64__)
204   uint8x8_t t1, t2, t3;
205 #endif
206 
207   assert(bits >= 0);
208   assert((FILTER_BITS - conv_params->round_1) >= 0 ||
209          ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
210 
211   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
212       filter_params_x, subpel_x_qn & SUBPEL_MASK);
213 
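  // Negative shift amounts: vqrshl performs saturating rounding right shifts
  // by round_0 and then by bits.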
214   const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0);
215   const int16x8_t shift_by_bits = vdupq_n_s16(-bits);
216 
217   src -= horiz_offset;
218 #if defined(__aarch64__)
219   if (h == 4) {
220     uint8x8_t d01, d23;
221     int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
222     int16x8_t d01_temp, d23_temp;
223 
224     __builtin_prefetch(src + 0 * src_stride);
225     __builtin_prefetch(src + 1 * src_stride);
226     __builtin_prefetch(src + 2 * src_stride);
227     __builtin_prefetch(src + 3 * src_stride);
228 
229     load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
230     transpose_u8_8x4(&t0, &t1, &t2, &t3);
231 
232     s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
233     s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
234     s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
235     s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
236     s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
237     s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
238     s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
239     __builtin_prefetch(dst + 0 * dst_stride);
240     __builtin_prefetch(dst + 1 * dst_stride);
241     __builtin_prefetch(dst + 2 * dst_stride);
242     __builtin_prefetch(dst + 3 * dst_stride);
243     src += 7;
244 
245     do {
246       load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
247       transpose_u8_8x4(&t0, &t1, &t2, &t3);
248 
249       s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
250       s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
251       s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
252       s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
253 
254       d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, x_filter);
255 
256       d1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, x_filter);
257 
258       d2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, x_filter);
259 
260       d3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, x_filter);
261 
262       d01_temp = vqrshlq_s16(vcombine_s16(d0, d1), shift_round_0);
263       d23_temp = vqrshlq_s16(vcombine_s16(d2, d3), shift_round_0);
264 
265       d01_temp = vqrshlq_s16(d01_temp, shift_by_bits);
266       d23_temp = vqrshlq_s16(d23_temp, shift_by_bits);
267 
268       d01 = vqmovun_s16(d01_temp);
269       d23 = vqmovun_s16(d23_temp);
270 
271       transpose_u8_4x4(&d01, &d23);
272 
273       if (w != 2) {
274         vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride),  // 00 01 02 03
275                       vreinterpret_u32_u8(d01), 0);
276         vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride),  // 10 11 12 13
277                       vreinterpret_u32_u8(d23), 0);
278         vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride),  // 20 21 22 23
279                       vreinterpret_u32_u8(d01), 1);
280         vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride),  // 30 31 32 33
281                       vreinterpret_u32_u8(d23), 1);
282       } else {
283         vst1_lane_u16((uint16_t *)(dst + 0 * dst_stride),  // 00 01
284                       vreinterpret_u16_u8(d01), 0);
285         vst1_lane_u16((uint16_t *)(dst + 1 * dst_stride),  // 10 11
286                       vreinterpret_u16_u8(d23), 0);
287         vst1_lane_u16((uint16_t *)(dst + 2 * dst_stride),  // 20 21
288                       vreinterpret_u16_u8(d01), 2);
289         vst1_lane_u16((uint16_t *)(dst + 3 * dst_stride),  // 30 31
290                       vreinterpret_u16_u8(d23), 2);
291       }
292 
293       s0 = s4;
294       s1 = s5;
295       s2 = s6;
296       s3 = s7;
297       s4 = s8;
298       s5 = s9;
299       s6 = s10;
300       src += 4;
301       dst += 4;
302       w -= 4;
303     } while (w > 0);
304   } else {
305 #endif
306     int width;
307     const uint8_t *s;
308     int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
309 
310 #if defined(__aarch64__)
311     int16x8_t s8, s9, s10;
312     uint8x8_t t4, t5, t6, t7;
313 #endif
314 
315     if (w <= 4) {
316 #if defined(__aarch64__)
317       do {
318         load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
319         transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
320         s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
321         s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
322         s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
323         s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
324         s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
325         s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
326         s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
327 
328         load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
329                     &t7);
330         src += 8 * src_stride;
331         __builtin_prefetch(dst + 0 * dst_stride);
332         __builtin_prefetch(dst + 1 * dst_stride);
333         __builtin_prefetch(dst + 2 * dst_stride);
334         __builtin_prefetch(dst + 3 * dst_stride);
335         __builtin_prefetch(dst + 4 * dst_stride);
336         __builtin_prefetch(dst + 5 * dst_stride);
337         __builtin_prefetch(dst + 6 * dst_stride);
338         __builtin_prefetch(dst + 7 * dst_stride);
339 
340         transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
341 
342         s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
343         s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
344         s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
345         s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
346 
347         __builtin_prefetch(src + 0 * src_stride);
348         __builtin_prefetch(src + 1 * src_stride);
349         __builtin_prefetch(src + 2 * src_stride);
350         __builtin_prefetch(src + 3 * src_stride);
351         __builtin_prefetch(src + 4 * src_stride);
352         __builtin_prefetch(src + 5 * src_stride);
353         __builtin_prefetch(src + 6 * src_stride);
354         __builtin_prefetch(src + 7 * src_stride);
355         t0 = convolve8_horiz_8x8(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
356                                  shift_round_0, shift_by_bits);
357         t1 = convolve8_horiz_8x8(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
358                                  shift_round_0, shift_by_bits);
359         t2 = convolve8_horiz_8x8(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
360                                  shift_round_0, shift_by_bits);
361         t3 = convolve8_horiz_8x8(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
362                                  shift_round_0, shift_by_bits);
363 
364         transpose_u8_8x4(&t0, &t1, &t2, &t3);
365 
366         if ((w == 4) && (h > 4)) {
367           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
368                         0);  // 00 01 02 03
369           dst += dst_stride;
370           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1),
371                         0);  // 10 11 12 13
372           dst += dst_stride;
373           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2),
374                         0);  // 20 21 22 23
375           dst += dst_stride;
376           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3),
377                         0);  // 30 31 32 33
378           dst += dst_stride;
379           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
380                         1);  // 40 41 42 43
381           dst += dst_stride;
382           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1),
383                         1);  // 50 51 52 53
384           dst += dst_stride;
385           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2),
386                         1);  // 60 61 62 63
387           dst += dst_stride;
388           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3),
389                         1);  // 70 71 72 73
390           dst += dst_stride;
391         } else if ((w == 4) && (h == 2)) {
392           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
393                         0);  // 00 01 02 03
394           dst += dst_stride;
395           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1),
396                         0);  // 10 11 12 13
397           dst += dst_stride;
398         } else if ((w == 2) && (h > 4)) {
399           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0);  // 00 01
400           dst += dst_stride;
401           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 0);  // 10 11
402           dst += dst_stride;
403           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t2), 0);  // 20 21
404           dst += dst_stride;
405           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t3), 0);  // 30 31
406           dst += dst_stride;
407           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 2);  // 40 41
408           dst += dst_stride;
409           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 2);  // 50 51
410           dst += dst_stride;
411           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t2), 2);  // 60 61
412           dst += dst_stride;
413           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t3), 2);  // 70 71
414           dst += dst_stride;
415         } else if ((w == 2) && (h == 2)) {
416           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0);  // 00 01
417           dst += dst_stride;
418           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 0);  // 10 11
419           dst += dst_stride;
420         }
421         h -= 8;
422       } while (h > 0);
423 #else
424     int16x8_t tt0;
425     int16x4_t x0, x1, x2, x3, x4, x5, x6, x7;
426     const int16x4_t shift_round_0_low = vget_low_s16(shift_round_0);
427     const int16x4_t shift_by_bits_low = vget_low_s16(shift_by_bits);
428     do {
429       t0 = vld1_u8(src);  // a0 a1 a2 a3 a4 a5 a6 a7
430       tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
431       x0 = vget_low_s16(tt0);   // a0 a1 a2 a3
432       x4 = vget_high_s16(tt0);  // a4 a5 a6 a7
433 
434       t0 = vld1_u8(src + 8);  // a8 a9 a10 a11 a12 a13 a14 a15
435       tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
436       x7 = vget_low_s16(tt0);  // a8 a9 a10 a11
437 
438       x1 = vext_s16(x0, x4, 1);  // a1 a2 a3 a4
439       x2 = vext_s16(x0, x4, 2);  // a2 a3 a4 a5
440       x3 = vext_s16(x0, x4, 3);  // a3 a4 a5 a6
441       x5 = vext_s16(x4, x7, 1);  // a5 a6 a7 a8
442       x6 = vext_s16(x4, x7, 2);  // a6 a7 a8 a9
443       x7 = vext_s16(x4, x7, 3);  // a7 a8 a9 a10
444 
445       src += src_stride;
446 
447       t0 = convolve8_horiz_4x1(x0, x1, x2, x3, x4, x5, x6, x7, x_filter,
448                                shift_round_0_low, shift_by_bits_low);
449 
450       if (w == 4) {
451         vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
452                       0);  // 00 01 02 03
453         dst += dst_stride;
454       } else if (w == 2) {
455         vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0);  // 00 01
456         dst += dst_stride;
457       }
458       h -= 1;
459     } while (h > 0);
460 #endif
461     } else {
462       uint8_t *d;
463       int16x8_t s11;
464 #if defined(__aarch64__)
465       int16x8_t s12, s13, s14;
466       do {
467         __builtin_prefetch(src + 0 * src_stride);
468         __builtin_prefetch(src + 1 * src_stride);
469         __builtin_prefetch(src + 2 * src_stride);
470         __builtin_prefetch(src + 3 * src_stride);
471         __builtin_prefetch(src + 4 * src_stride);
472         __builtin_prefetch(src + 5 * src_stride);
473         __builtin_prefetch(src + 6 * src_stride);
474         __builtin_prefetch(src + 7 * src_stride);
475         load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
476         transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
477         s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
478         s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
479         s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
480         s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
481         s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
482         s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
483         s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
484 
485         width = w;
486         s = src + 7;
487         d = dst;
488         __builtin_prefetch(dst + 0 * dst_stride);
489         __builtin_prefetch(dst + 1 * dst_stride);
490         __builtin_prefetch(dst + 2 * dst_stride);
491         __builtin_prefetch(dst + 3 * dst_stride);
492         __builtin_prefetch(dst + 4 * dst_stride);
493         __builtin_prefetch(dst + 5 * dst_stride);
494         __builtin_prefetch(dst + 6 * dst_stride);
495         __builtin_prefetch(dst + 7 * dst_stride);
496 
497         do {
498           load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
499           transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
500           s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
501           s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
502           s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
503           s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
504           s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
505           s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
506           s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
507           s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
508 
509           t0 = convolve8_horiz_8x8(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
510                                    shift_round_0, shift_by_bits);
511 
512           t1 = convolve8_horiz_8x8(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
513                                    shift_round_0, shift_by_bits);
514 
515           t2 = convolve8_horiz_8x8(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
516                                    shift_round_0, shift_by_bits);
517 
518           t3 = convolve8_horiz_8x8(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
519                                    shift_round_0, shift_by_bits);
520 
521           t4 = convolve8_horiz_8x8(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
522                                    shift_round_0, shift_by_bits);
523 
524           t5 = convolve8_horiz_8x8(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
525                                    shift_round_0, shift_by_bits);
526 
527           t6 = convolve8_horiz_8x8(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
528                                    shift_round_0, shift_by_bits);
529 
530           t7 = convolve8_horiz_8x8(s7, s8, s9, s10, s11, s12, s13, s14,
531                                    x_filter, shift_round_0, shift_by_bits);
532 
533           transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
534           if (h != 2) {
535             store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7);
536           } else {
537             store_row2_u8_8x8(d, dst_stride, t0, t1);
538           }
539           s0 = s8;
540           s1 = s9;
541           s2 = s10;
542           s3 = s11;
543           s4 = s12;
544           s5 = s13;
545           s6 = s14;
546           s += 8;
547           d += 8;
548           width -= 8;
549         } while (width > 0);
550         src += 8 * src_stride;
551         dst += 8 * dst_stride;
552         h -= 8;
553       } while (h > 0);
554 #else
555     do {
556       t0 = vld1_u8(src);  // a0 a1 a2 a3 a4 a5 a6 a7
557       s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
558 
559       width = w;
560       s = src + 8;
561       d = dst;
562       __builtin_prefetch(dst);
563 
564       do {
565         t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
566         s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
567         s11 = s0;
568         s0 = s7;
569 
570         s1 = vextq_s16(s11, s7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
571         s2 = vextq_s16(s11, s7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
572         s3 = vextq_s16(s11, s7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
573         s4 = vextq_s16(s11, s7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
574         s5 = vextq_s16(s11, s7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
575         s6 = vextq_s16(s11, s7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
576         s7 = vextq_s16(s11, s7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
577 
578         t0 = convolve8_horiz_8x8(s11, s1, s2, s3, s4, s5, s6, s7, x_filter,
579                                  shift_round_0, shift_by_bits);
580         vst1_u8(d, t0);
581 
582         s += 8;
583         d += 8;
584         width -= 8;
585       } while (width > 0);
586       src += src_stride;
587       dst += dst_stride;
588       h -= 1;
589     } while (h > 0);
590 #endif
591     }
592 #if defined(__aarch64__)
593   }
594 #endif
595 }
596 
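// Vertical (y-only) single-reference convolution. A sliding window of seven
// previous rows is kept in registers; each iteration loads the next row(s),
// filters, and narrows the 16-bit accumulation back to 8 bits.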
void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *filter_params_y,
                            const int subpel_y_qn) {
601   const int vert_offset = filter_params_y->taps / 2 - 1;
602 
603   src -= vert_offset * src_stride;
604 
605   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
606       filter_params_y, subpel_y_qn & SUBPEL_MASK);
607 
608   if (w <= 4) {
609     uint8x8_t d01;
610     int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
611 #if defined(__aarch64__)
612     uint8x8_t d23;
613     int16x4_t s8, s9, s10, d1, d2, d3;
614 #endif
615     s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
616     src += src_stride;
617     s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
618     src += src_stride;
619     s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
620     src += src_stride;
621     s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
622     src += src_stride;
623     s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
624     src += src_stride;
625     s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
626     src += src_stride;
627     s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
628     src += src_stride;
629 
630     do {
631       s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
632       src += src_stride;
633 #if defined(__aarch64__)
634       s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
635       src += src_stride;
636       s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
637       src += src_stride;
638       s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
639       src += src_stride;
640 
641       __builtin_prefetch(dst + 0 * dst_stride);
642       __builtin_prefetch(dst + 1 * dst_stride);
643       __builtin_prefetch(dst + 2 * dst_stride);
644       __builtin_prefetch(dst + 3 * dst_stride);
645       __builtin_prefetch(src + 0 * src_stride);
646       __builtin_prefetch(src + 1 * src_stride);
647       __builtin_prefetch(src + 2 * src_stride);
648       __builtin_prefetch(src + 3 * src_stride);
649       d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
650       d1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
651       d2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
652       d3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
653 
654       d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
655       d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
656       if ((w == 4) && (h != 2)) {
657         vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
658                       0);  // 00 01 02 03
659         dst += dst_stride;
660         vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
661                       1);  // 10 11 12 13
662         dst += dst_stride;
663         vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23),
664                       0);  // 20 21 22 23
665         dst += dst_stride;
666         vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23),
667                       1);  // 30 31 32 33
668         dst += dst_stride;
669       } else if ((w == 4) && (h == 2)) {
670         vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
671                       0);  // 00 01 02 03
672         dst += dst_stride;
673         vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
674                       1);  // 10 11 12 13
675         dst += dst_stride;
676       } else if ((w == 2) && (h != 2)) {
677         vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0);  // 00 01
678         dst += dst_stride;
679         vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 2);  // 10 11
680         dst += dst_stride;
681         vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d23), 0);  // 20 21
682         dst += dst_stride;
683         vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d23), 2);  // 30 31
684         dst += dst_stride;
685       } else if ((w == 2) && (h == 2)) {
686         vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0);  // 00 01
687         dst += dst_stride;
688         vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 2);  // 10 11
689         dst += dst_stride;
690       }
691       s0 = s4;
692       s1 = s5;
693       s2 = s6;
694       s3 = s7;
695       s4 = s8;
696       s5 = s9;
697       s6 = s10;
698       h -= 4;
699 #else
700       __builtin_prefetch(dst + 0 * dst_stride);
701       __builtin_prefetch(src + 0 * src_stride);
702 
703       d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
704 
705       d01 = vqrshrun_n_s16(vcombine_s16(d0, d0), FILTER_BITS);
706 
707       if (w == 4) {
708         vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
709         dst += dst_stride;
710       } else if (w == 2) {
711         vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0);
712         dst += dst_stride;
713       }
714       s0 = s1;
715       s1 = s2;
716       s2 = s3;
717       s3 = s4;
718       s4 = s5;
719       s5 = s6;
720       s6 = s7;
721       h -= 1;
722 #endif
723     } while (h > 0);
724   } else {
725     int height;
726     const uint8_t *s;
727     uint8_t *d;
728     uint8x8_t t0;
729     int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
730 #if defined(__aarch64__)
731     uint8x8_t t1, t2, t3;
732     int16x8_t s8, s9, s10;
733 #endif
734     do {
735       __builtin_prefetch(src + 0 * src_stride);
736       __builtin_prefetch(src + 1 * src_stride);
737       __builtin_prefetch(src + 2 * src_stride);
738       __builtin_prefetch(src + 3 * src_stride);
739       __builtin_prefetch(src + 4 * src_stride);
740       __builtin_prefetch(src + 5 * src_stride);
741       __builtin_prefetch(src + 6 * src_stride);
742       s = src;
743       s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
744       s += src_stride;
745       s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
746       s += src_stride;
747       s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
748       s += src_stride;
749       s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
750       s += src_stride;
751       s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
752       s += src_stride;
753       s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
754       s += src_stride;
755       s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
756       s += src_stride;
757       d = dst;
758       height = h;
759 
760       do {
761         s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
762         s += src_stride;
763 #if defined(__aarch64__)
764         s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
765         s += src_stride;
766         s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
767         s += src_stride;
768         s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
769         s += src_stride;
770 
771         __builtin_prefetch(d + 0 * dst_stride);
772         __builtin_prefetch(d + 1 * dst_stride);
773         __builtin_prefetch(d + 2 * dst_stride);
774         __builtin_prefetch(d + 3 * dst_stride);
775         __builtin_prefetch(s + 0 * src_stride);
776         __builtin_prefetch(s + 1 * src_stride);
777         __builtin_prefetch(s + 2 * src_stride);
778         __builtin_prefetch(s + 3 * src_stride);
779         t0 = convolve8_vert_8x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
780         t1 = convolve8_vert_8x4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
781         t2 = convolve8_vert_8x4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
782         t3 = convolve8_vert_8x4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
783         if (h != 2) {
784           vst1_u8(d, t0);
785           d += dst_stride;
786           vst1_u8(d, t1);
787           d += dst_stride;
788           vst1_u8(d, t2);
789           d += dst_stride;
790           vst1_u8(d, t3);
791           d += dst_stride;
792         } else {
793           vst1_u8(d, t0);
794           d += dst_stride;
795           vst1_u8(d, t1);
796           d += dst_stride;
797         }
798         s0 = s4;
799         s1 = s5;
800         s2 = s6;
801         s3 = s7;
802         s4 = s8;
803         s5 = s9;
804         s6 = s10;
805         height -= 4;
806 #else
807         __builtin_prefetch(d);
808         __builtin_prefetch(s);
809 
810         t0 = convolve8_vert_8x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
811 
812         vst1_u8(d, t0);
813         d += dst_stride;
814 
815         s0 = s1;
816         s1 = s2;
817         s2 = s3;
818         s3 = s4;
819         s4 = s5;
820         s5 = s6;
821         s6 = s7;
822         height -= 1;
823 #endif
824       } while (height > 0);
825       src += 8;
826       dst += 8;
827       w -= 8;
828     } while (w > 0);
829   }
830 }
831 
// Horizontal filtering for convolve_2d_sr when the width is a multiple of 8.
// Processes one row at a time.
static INLINE void horiz_filter_w8_single_row(
    const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
    const int dst_stride, int width, int height, const int16_t *x_filter,
    const int16x8_t horiz_const, const int16x8_t shift_round_0) {
838   int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
839   do {
840     uint8x8_t t0 = vld1_u8(src_ptr);
841     s0 = vreinterpretq_s16_u16(vmovl_u8(t0));  // a0 a1 a2 a3 a4 a5 a6 a7
842 
843     int width_tmp = width;
844     const uint8_t *s = src_ptr + 8;
845     int16_t *dst_tmp = dst_ptr;
846 
847     __builtin_prefetch(dst_ptr);
848 
849     do {
850       t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
851       s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
852       int16x8_t sum = s0;
853       s0 = s7;
854 
855       s1 = vextq_s16(sum, s7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
856       s2 = vextq_s16(sum, s7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
857       s3 = vextq_s16(sum, s7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
858       s4 = vextq_s16(sum, s7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
859       s5 = vextq_s16(sum, s7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
860       s6 = vextq_s16(sum, s7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
861       s7 = vextq_s16(sum, s7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
862 
863       int16x8_t res0 = convolve8_8x8_s16(sum, s1, s2, s3, s4, s5, s6, s7,
864                                          x_filter, horiz_const, shift_round_0);
865 
866       vst1q_s16(dst_tmp, res0);
867 
868       s += 8;
869       dst_tmp += 8;
870       width_tmp -= 8;
871     } while (width_tmp > 0);
872     src_ptr += src_stride;
873     dst_ptr += dst_stride;
874     height--;
875   } while (height > 0);
876 }
877 
// Horizontal filtering for convolve_2d_sr when the width is 4 or less.
// Processes one row at a time.
static INLINE void horiz_filter_w4_single_row(
    const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
    const int dst_stride, int width, int height, const int16_t *x_filter,
    const int16x4_t horiz_const, const int16x4_t shift_round_0) {
884   int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
885   do {
886     const uint8_t *s = src_ptr;
887 
888     __builtin_prefetch(s);
889 
890     uint8x8_t t0 = vld1_u8(s);  // a0 a1 a2 a3 a4 a5 a6 a7
891     int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
892     s0 = vget_low_s16(tt0);
893     s4 = vget_high_s16(tt0);
894 
895     __builtin_prefetch(dst_ptr);
896     s += 8;
897 
898     t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
899     s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
900 
901     s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
902     s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
903     s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6
904     s5 = vext_s16(s4, s7, 1);  // a5 a6 a7 a8
905     s6 = vext_s16(s4, s7, 2);  // a6 a7 a8 a9
906     s7 = vext_s16(s4, s7, 3);  // a7 a8 a9 a10
907 
908     int16x4_t d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
909                                      horiz_const, shift_round_0);
910 
911     if (width == 4) {
912       vst1_s16(dst_ptr, d0);
913       dst_ptr += dst_stride;
914     } else if (width == 2) {
915       vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_s16(d0), 0);
916       dst_ptr += dst_stride;
917     }
918 
919     src_ptr += src_stride;
920     height--;
921   } while (height > 0);
922 }
923 
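// 2D single-reference convolution: a horizontal pass filters the padded source
// into the 16-bit intermediate block im_block, then a vertical pass filters
// that block and rounds the result down to 8-bit output pixels.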
void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
                             const InterpFilterParams *filter_params_x,
                             const InterpFilterParams *filter_params_y,
                             const int subpel_x_qn, const int subpel_y_qn,
                             ConvolveParams *conv_params) {
930   int im_dst_stride;
931   int width, height;
932 #if defined(__aarch64__)
933   uint8x8_t t0;
934   uint8x8_t t1, t2, t3, t4, t5, t6, t7;
935   const uint8_t *s;
936 #endif
937 
938   DECLARE_ALIGNED(16, int16_t,
939                   im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
940 
941   const int bd = 8;
942   const int im_h = h + filter_params_y->taps - 1;
943   const int im_stride = MAX_SB_SIZE;
944   const int vert_offset = filter_params_y->taps / 2 - 1;
945   const int horiz_offset = filter_params_x->taps / 2 - 1;
946 
947   const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
948 
949   int16_t *dst_ptr;
950 
951   dst_ptr = im_block;
952   im_dst_stride = im_stride;
953   height = im_h;
954   width = w;
955 
956   const int16_t round_bits =
957       FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
958   const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
959   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
960   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
961       filter_params_x, subpel_x_qn & SUBPEL_MASK);
962 
963   int16_t x_filter_tmp[8];
964   int16x8_t filter_x_coef = vld1q_s16(x_filter);
965 
  // The filter coefficients are all even, so they are downshifted by 1 to
  // reduce the intermediate precision requirements by one bit; the horizontal
  // rounding shifts below use round_0 - 1 instead of round_0 to compensate.
968   filter_x_coef = vshrq_n_s16(filter_x_coef, 1);
969   vst1q_s16(&x_filter_tmp[0], filter_x_coef);
970 
971   assert(conv_params->round_0 > 0);
972 
973   if (w <= 4) {
974     const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)));
975     const int16x4_t shift_round_0 = vdup_n_s16(-(conv_params->round_0 - 1));
976 
977 #if defined(__aarch64__)
978     int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
979     do {
980       assert(height >= 4);
981       s = src_ptr;
982       __builtin_prefetch(s + 0 * src_stride);
983       __builtin_prefetch(s + 1 * src_stride);
984       __builtin_prefetch(s + 2 * src_stride);
985       __builtin_prefetch(s + 3 * src_stride);
986 
987       load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
988       transpose_u8_8x4(&t0, &t1, &t2, &t3);
989 
990       s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
991       s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
992       s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
993       s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
994       s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
995       s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
996       s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
997 
998       __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
999       __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
1000       __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
1001       __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
1002       s += 7;
1003 
1004       load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
1005       transpose_u8_8x4(&t0, &t1, &t2, &t3);
1006 
1007       s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
1008       s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
1009       s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
1010       s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
1011 
1012       d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
1013                              horiz_const, shift_round_0);
1014       d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
1015                              horiz_const, shift_round_0);
1016       d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
1017                              horiz_const, shift_round_0);
1018       d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
1019                              horiz_const, shift_round_0);
1020 
1021       transpose_s16_4x4d(&d0, &d1, &d2, &d3);
1022       if (w == 4) {
1023         vst1_s16((dst_ptr + 0 * im_dst_stride), d0);
1024         vst1_s16((dst_ptr + 1 * im_dst_stride), d1);
1025         vst1_s16((dst_ptr + 2 * im_dst_stride), d2);
1026         vst1_s16((dst_ptr + 3 * im_dst_stride), d3);
1027       } else if (w == 2) {
1028         vst1_lane_u32((uint32_t *)(dst_ptr + 0 * im_dst_stride),
1029                       vreinterpret_u32_s16(d0), 0);
1030         vst1_lane_u32((uint32_t *)(dst_ptr + 1 * im_dst_stride),
1031                       vreinterpret_u32_s16(d1), 0);
1032         vst1_lane_u32((uint32_t *)(dst_ptr + 2 * im_dst_stride),
1033                       vreinterpret_u32_s16(d2), 0);
1034         vst1_lane_u32((uint32_t *)(dst_ptr + 3 * im_dst_stride),
1035                       vreinterpret_u32_s16(d3), 0);
1036       }
1037       src_ptr += 4 * src_stride;
1038       dst_ptr += 4 * im_dst_stride;
1039       height -= 4;
1040     } while (height >= 4);
1041 
1042     if (height) {
1043       assert(height < 4);
1044       horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w,
1045                                  height, x_filter_tmp, horiz_const,
1046                                  shift_round_0);
1047     }
1048 #else
1049     horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w,
1050                                height, x_filter_tmp, horiz_const,
1051                                shift_round_0);
1052 #endif
1053 
1054   } else {
1055     const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)));
1056     const int16x8_t shift_round_0 = vdupq_n_s16(-(conv_params->round_0 - 1));
1057 
1058 #if defined(__aarch64__)
1059     int16_t *d_tmp;
1060     int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14;
1061     int16x8_t res0, res1, res2, res3, res4, res5, res6, res7;
1062     do {
1063       assert(height >= 8);
1064       __builtin_prefetch(src_ptr + 0 * src_stride);
1065       __builtin_prefetch(src_ptr + 1 * src_stride);
1066       __builtin_prefetch(src_ptr + 2 * src_stride);
1067       __builtin_prefetch(src_ptr + 3 * src_stride);
1068       __builtin_prefetch(src_ptr + 4 * src_stride);
1069       __builtin_prefetch(src_ptr + 5 * src_stride);
1070       __builtin_prefetch(src_ptr + 6 * src_stride);
1071       __builtin_prefetch(src_ptr + 7 * src_stride);
1072 
1073       load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1074 
1075       transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1076 
1077       s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
1078       s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
1079       s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
1080       s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
1081       s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
1082       s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
1083       s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
1084 
1085       width = w;
1086       s = src_ptr + 7;
1087       d_tmp = dst_ptr;
1088 
1089       __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
1090       __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
1091       __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
1092       __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
1093       __builtin_prefetch(dst_ptr + 4 * im_dst_stride);
1094       __builtin_prefetch(dst_ptr + 5 * im_dst_stride);
1095       __builtin_prefetch(dst_ptr + 6 * im_dst_stride);
1096       __builtin_prefetch(dst_ptr + 7 * im_dst_stride);
1097 
1098       do {
1099         load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1100 
1101         transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1102 
1103         s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
1104         s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
1105         s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
1106         s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
1107         s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
1108         s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
1109         s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
1110         s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
1111 
1112         res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
1113                                  horiz_const, shift_round_0);
1114         res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
1115                                  horiz_const, shift_round_0);
1116         res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
1117                                  horiz_const, shift_round_0);
1118         res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
1119                                  horiz_const, shift_round_0);
1120         res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp,
1121                                  horiz_const, shift_round_0);
1122         res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
1123                                  x_filter_tmp, horiz_const, shift_round_0);
1124         res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
1125                                  x_filter_tmp, horiz_const, shift_round_0);
1126         res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
1127                                  x_filter_tmp, horiz_const, shift_round_0);
1128 
1129         transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
1130                           &res7);
1131 
1132         store_s16_8x8(d_tmp, im_dst_stride, res0, res1, res2, res3, res4, res5,
1133                       res6, res7);
1134 
1135         s0 = s8;
1136         s1 = s9;
1137         s2 = s10;
1138         s3 = s11;
1139         s4 = s12;
1140         s5 = s13;
1141         s6 = s14;
1142         s += 8;
1143         d_tmp += 8;
1144         width -= 8;
1145       } while (width > 0);
1146       src_ptr += 8 * src_stride;
1147       dst_ptr += 8 * im_dst_stride;
1148       height -= 8;
1149     } while (height >= 8);
1150 
1151     if (height >= 4) {
1152       assert(height < 8);
1153       int16x4_t reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9,
1154           reg10, reg11, reg12, reg13, reg14;
1155       int16x4_t d0, d1, d2, d3, d4, d5, d6, d7;
1156       int16x8_t out0, out1, out2, out3;
1157 
1158       __builtin_prefetch(src_ptr + 0 * src_stride);
1159       __builtin_prefetch(src_ptr + 1 * src_stride);
1160       __builtin_prefetch(src_ptr + 2 * src_stride);
1161       __builtin_prefetch(src_ptr + 3 * src_stride);
1162 
1163       load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3);
1164       transpose_u8_8x4(&t0, &t1, &t2, &t3);
1165 
1166       reg0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
1167       reg1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
1168       reg2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
1169       reg3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
1170       reg4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
1171       reg5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
1172       reg6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
1173 
1174       __builtin_prefetch(dst_ptr + 0 * dst_stride);
1175       __builtin_prefetch(dst_ptr + 1 * dst_stride);
1176       __builtin_prefetch(dst_ptr + 2 * dst_stride);
1177       __builtin_prefetch(dst_ptr + 3 * dst_stride);
1178 
1179       s = src_ptr + 7;
1180       d_tmp = dst_ptr;
1181       width = w;
1182 
1183       do {
1184         load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
1185         transpose_u8_8x4(&t0, &t1, &t2, &t3);
1186 
1187         reg7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
1188         reg8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
1189         reg9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
1190         reg10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
1191         reg11 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
1192         reg12 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
1193         reg13 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
1194         reg14 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
1195 
1196         d0 = convolve8_4x4(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7,
1197                            x_filter_tmp);
1198 
1199         d1 = convolve8_4x4(reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8,
1200                            x_filter_tmp);
1201 
1202         d2 = convolve8_4x4(reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9,
1203                            x_filter_tmp);
1204 
1205         d3 = convolve8_4x4(reg3, reg4, reg5, reg6, reg7, reg8, reg9, reg10,
1206                            x_filter_tmp);
1207 
1208         d4 = convolve8_4x4(reg4, reg5, reg6, reg7, reg8, reg9, reg10, reg11,
1209                            x_filter_tmp);
1210 
1211         d5 = convolve8_4x4(reg5, reg6, reg7, reg8, reg9, reg10, reg11, reg12,
1212                            x_filter_tmp);
1213 
1214         d6 = convolve8_4x4(reg6, reg7, reg8, reg9, reg10, reg11, reg12, reg13,
1215                            x_filter_tmp);
1216 
1217         d7 = convolve8_4x4(reg7, reg8, reg9, reg10, reg11, reg12, reg13, reg14,
1218                            x_filter_tmp);
1219 
1220         transpose_s16_4x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &out0, &out1,
1221                           &out2, &out3);
1222 
1223         out0 = vaddq_s16(out0, horiz_const);
1224         out0 = vqrshlq_s16(out0, shift_round_0);
1225 
1226         out1 = vaddq_s16(out1, horiz_const);
1227         out1 = vqrshlq_s16(out1, shift_round_0);
1228 
1229         out2 = vaddq_s16(out2, horiz_const);
1230         out2 = vqrshlq_s16(out2, shift_round_0);
1231 
1232         out3 = vaddq_s16(out3, horiz_const);
1233         out3 = vqrshlq_s16(out3, shift_round_0);
1234 
1235         store_s16_8x4(d_tmp, im_dst_stride, out0, out1, out2, out3);
1236 
1237         reg0 = reg8;
1238         reg1 = reg9;
1239         reg2 = reg10;
1240         reg3 = reg11;
1241         reg4 = reg12;
1242         reg5 = reg13;
1243         reg6 = reg14;
1244         s += 8;
1245         d_tmp += 8;
1246         width -= 8;
1247       } while (width > 0);
1248       src_ptr += 4 * src_stride;
1249       dst_ptr += 4 * im_dst_stride;
1250       height -= 4;
1251     }
1252 
1253     if (height) {
1254       assert(height < 4);
1255       horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w,
1256                                  height, x_filter_tmp, horiz_const,
1257                                  shift_round_0);
1258     }
1259 #else
1260 
1261     horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w,
1262                                height, x_filter_tmp, horiz_const,
1263                                shift_round_0);
1264 #endif
1265   }
1266 
1267   // vertical
1268   {
1269     uint8_t *dst_u8_ptr, *d_u8;
1270     int16_t *v_src_ptr, *v_s;
1271 
1272     const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
1273                               (1 << (offset_bits - conv_params->round_1 - 1));
1274     const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
1275         filter_params_y, subpel_y_qn & SUBPEL_MASK);
1276 
1277     const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1));
1278     const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
1279     const int32x4_t sub_const_vec = vdupq_n_s32(sub_const);
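    // offset_const keeps the 32-bit accumulation non-negative through the
    // round_1 shift; sub_const_vec removes the accumulated offsets again
    // before the final rounding shift down to 8 bits.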
1280 
1281     src_stride = im_stride;
1282     v_src_ptr = im_block;
1283     dst_u8_ptr = dst;
1284 
1285     height = h;
1286     width = w;
1287 
1288     if (width <= 4) {
1289       int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
1290       uint16x4_t d0;
1291       uint16x8_t dd0;
1292       uint8x8_t d01;
1293 
1294 #if defined(__aarch64__)
1295       int16x4_t s8, s9, s10;
1296       uint16x4_t d1, d2, d3;
1297       uint16x8_t dd1;
1298       uint8x8_t d23;
1299 #endif
1300 
1301       d_u8 = dst_u8_ptr;
1302       v_s = v_src_ptr;
1303 
1304       __builtin_prefetch(v_s + 0 * im_stride);
1305       __builtin_prefetch(v_s + 1 * im_stride);
1306       __builtin_prefetch(v_s + 2 * im_stride);
1307       __builtin_prefetch(v_s + 3 * im_stride);
1308       __builtin_prefetch(v_s + 4 * im_stride);
1309       __builtin_prefetch(v_s + 5 * im_stride);
1310       __builtin_prefetch(v_s + 6 * im_stride);
1311       __builtin_prefetch(v_s + 7 * im_stride);
1312 
1313       load_s16_4x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
1314       v_s += (7 * im_stride);
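      // Rows 0-6 of this column are now primed in s0..s6; the loop below
      // reloads the window from row 7 onwards as it advances.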
1315 
1316       do {
1317 #if defined(__aarch64__)
1318         load_s16_4x4(v_s, im_stride, &s7, &s8, &s9, &s10);
1319         v_s += (im_stride << 2);
1320 
1321         __builtin_prefetch(d_u8 + 0 * dst_stride);
1322         __builtin_prefetch(d_u8 + 1 * dst_stride);
1323         __builtin_prefetch(d_u8 + 2 * dst_stride);
1324         __builtin_prefetch(d_u8 + 3 * dst_stride);
1325 
1326         d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
1327                                     round_shift_vec, offset_const,
1328                                     sub_const_vec);
1329         d1 = convolve8_vert_4x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
1330                                     round_shift_vec, offset_const,
1331                                     sub_const_vec);
1332         d2 = convolve8_vert_4x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
1333                                     round_shift_vec, offset_const,
1334                                     sub_const_vec);
1335         d3 = convolve8_vert_4x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
1336                                     round_shift_vec, offset_const,
1337                                     sub_const_vec);
1338 
1339         dd0 = vqrshlq_u16(vcombine_u16(d0, d1), vec_round_bits);
1340         dd1 = vqrshlq_u16(vcombine_u16(d2, d3), vec_round_bits);
1341 
1342         d01 = vqmovn_u16(dd0);
1343         d23 = vqmovn_u16(dd1);
1344 
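        // d01 packs output rows 0 and 1, d23 packs rows 2 and 3; the stores
        // below pick lanes according to the block width.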
1345         if ((w == 4) && (h != 2)) {
1346           vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
1347                         0);  // 00 01 02 03
1348           d_u8 += dst_stride;
1349           vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
1350                         1);  // 10 11 12 13
1351           d_u8 += dst_stride;
1352           vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
1353                         0);  // 20 21 22 23
1354           d_u8 += dst_stride;
1355           vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
1356                         1);  // 30 31 32 33
1357           d_u8 += dst_stride;
1358         } else if ((w == 2) && (h != 2)) {
1359           vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
1360                         0);  // 00 01
1361           d_u8 += dst_stride;
1362           vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
1363                         2);  // 10 11
1364           d_u8 += dst_stride;
1365           vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
1366                         0);  // 20 21
1367           d_u8 += dst_stride;
1368           vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
1369                         2);  // 30 31
1370           d_u8 += dst_stride;
1371         } else if ((w == 4) && (h == 2)) {
1372           vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
1373                         0);  // 00 01 02 03
1374           d_u8 += dst_stride;
1375           vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
1376                         1);  // 10 11 12 13
1377           d_u8 += dst_stride;
1378         } else if ((w == 2) && (h == 2)) {
1379           vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
1380                         0);  // 00 01
1381           d_u8 += dst_stride;
1382           vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
1383                         2);  // 10 11
1384           d_u8 += dst_stride;
1385         }
1386 
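        // Slide the 8-tap window down by the four rows just produced.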
1387         s0 = s4;
1388         s1 = s5;
1389         s2 = s6;
1390         s3 = s7;
1391         s4 = s8;
1392         s5 = s9;
1393         s6 = s10;
1394         height -= 4;
1395 #else
1396         s7 = vld1_s16(v_s);
1397         v_s += im_stride;
1398 
1399         __builtin_prefetch(d_u8 + 0 * dst_stride);
1400 
1401         d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
1402                                     round_shift_vec, offset_const,
1403                                     sub_const_vec);
1404 
1405         dd0 = vqrshlq_u16(vcombine_u16(d0, d0), vec_round_bits);
1406         d01 = vqmovn_u16(dd0);
1407 
1408         if (w == 4) {
1409           vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
1410                         0);  // 00 01 02 03
1411           d_u8 += dst_stride;
1412 
1413         } else if (w == 2) {
1414           vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
1415                         0);  // 00 01
1416           d_u8 += dst_stride;
1417         }
1418 
1419         s0 = s1;
1420         s1 = s2;
1421         s2 = s3;
1422         s3 = s4;
1423         s4 = s5;
1424         s5 = s6;
1425         s6 = s7;
1426         height -= 1;
1427 #endif
1428       } while (height > 0);
1429     } else {
1430       // if width is a multiple of 8 and height is a multiple of 4
1431       int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
1432       uint8x8_t res0;
1433 #if defined(__aarch64__)
1434       int16x8_t s8, s9, s10;
1435       uint8x8_t res1, res2, res3;
1436 #endif
1437 
1438       do {
1439         __builtin_prefetch(v_src_ptr + 0 * im_stride);
1440         __builtin_prefetch(v_src_ptr + 1 * im_stride);
1441         __builtin_prefetch(v_src_ptr + 2 * im_stride);
1442         __builtin_prefetch(v_src_ptr + 3 * im_stride);
1443         __builtin_prefetch(v_src_ptr + 4 * im_stride);
1444         __builtin_prefetch(v_src_ptr + 5 * im_stride);
1445         __builtin_prefetch(v_src_ptr + 6 * im_stride);
1446         __builtin_prefetch(v_src_ptr + 7 * im_stride);
1447 
1448         v_s = v_src_ptr;
1449         load_s16_8x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
1450         v_s += (7 * im_stride);
1451 
1452         d_u8 = dst_u8_ptr;
1453         height = h;
1454 
1455         do {
1456 #if defined(__aarch64__)
1457           load_s16_8x4(v_s, im_stride, &s7, &s8, &s9, &s10);
1458           v_s += (im_stride << 2);
1459 
1460           __builtin_prefetch(d_u8 + 4 * dst_stride);
1461           __builtin_prefetch(d_u8 + 5 * dst_stride);
1462           __builtin_prefetch(d_u8 + 6 * dst_stride);
1463           __builtin_prefetch(d_u8 + 7 * dst_stride);
1464 
1465           res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
1466                                         y_filter, round_shift_vec, offset_const,
1467                                         sub_const_vec, vec_round_bits);
1468           res1 = convolve8_vert_8x4_s32(s1, s2, s3, s4, s5, s6, s7, s8,
1469                                         y_filter, round_shift_vec, offset_const,
1470                                         sub_const_vec, vec_round_bits);
1471           res2 = convolve8_vert_8x4_s32(s2, s3, s4, s5, s6, s7, s8, s9,
1472                                         y_filter, round_shift_vec, offset_const,
1473                                         sub_const_vec, vec_round_bits);
1474           res3 = convolve8_vert_8x4_s32(s3, s4, s5, s6, s7, s8, s9, s10,
1475                                         y_filter, round_shift_vec, offset_const,
1476                                         sub_const_vec, vec_round_bits);
1477 
1478           if (h != 2) {
1479             vst1_u8(d_u8, res0);
1480             d_u8 += dst_stride;
1481             vst1_u8(d_u8, res1);
1482             d_u8 += dst_stride;
1483             vst1_u8(d_u8, res2);
1484             d_u8 += dst_stride;
1485             vst1_u8(d_u8, res3);
1486             d_u8 += dst_stride;
1487           } else {
1488             vst1_u8(d_u8, res0);
1489             d_u8 += dst_stride;
1490             vst1_u8(d_u8, res1);
1491             d_u8 += dst_stride;
1492           }
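          // Advance the 8-tap window by the four rows just written.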
1493           s0 = s4;
1494           s1 = s5;
1495           s2 = s6;
1496           s3 = s7;
1497           s4 = s8;
1498           s5 = s9;
1499           s6 = s10;
1500           height -= 4;
1501 #else
1502           s7 = vld1q_s16(v_s);
1503           v_s += im_stride;
1504 
1505           __builtin_prefetch(d_u8 + 0 * dst_stride);
1506 
1507           res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
1508                                         y_filter, round_shift_vec, offset_const,
1509                                         sub_const_vec, vec_round_bits);
1510 
1511           vst1_u8(d_u8, res0);
1512           d_u8 += dst_stride;
1513 
1514           s0 = s1;
1515           s1 = s2;
1516           s2 = s3;
1517           s3 = s4;
1518           s4 = s5;
1519           s5 = s6;
1520           s6 = s7;
1521           height -= 1;
1522 #endif
1523         } while (height > 0);
1524         v_src_ptr += 8;
1525         dst_u8_ptr += 8;
1526         w -= 8;
1527       } while (w > 0);
1528     }
1529   }
1530 }
1531 
1532 static INLINE void scaledconvolve_horiz_w4(
1533     const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
1534     const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
1535     const int x0_q4, const int x_step_q4, const int w, const int h) {
1536   DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
1537   int x, y, z;
1538 
1539   src -= SUBPEL_TAPS / 2 - 1;
1540 
1541   y = h;
1542   do {
1543     int x_q4 = x0_q4;
1544     x = 0;
1545     do {
1546       // process 4 src_x steps
1547       for (z = 0; z < 4; ++z) {
1548         const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1549         if (x_q4 & SUBPEL_MASK) {
1550           const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
1551           const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
1552           const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
1553           uint8x8_t s[8], d;
1554           int16x8_t ss[4];
1555           int16x4_t t[8], tt;
1556 
1557           load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]);
1558           transpose_u8_8x4(&s[0], &s[1], &s[2], &s[3]);
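          // After the transpose, t[0]..t[7] below each hold one tap position
          // (source column) across the four rows, so a single convolve8_4
          // call filters the same x position in all four rows.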
1559 
1560           ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
1561           ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1]));
1562           ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2]));
1563           ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3]));
1564           t[0] = vget_low_s16(ss[0]);
1565           t[1] = vget_low_s16(ss[1]);
1566           t[2] = vget_low_s16(ss[2]);
1567           t[3] = vget_low_s16(ss[3]);
1568           t[4] = vget_high_s16(ss[0]);
1569           t[5] = vget_high_s16(ss[1]);
1570           t[6] = vget_high_s16(ss[2]);
1571           t[7] = vget_high_s16(ss[3]);
1572 
1573           tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7],
1574                            filters, filter3, filter4);
1575           d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
1576           vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0);
1577         } else {
1578           int i;
1579           for (i = 0; i < 4; ++i) {
1580             temp[z * 4 + i] = src_x[i * src_stride + 3];
1581           }
1582         }
1583         x_q4 += x_step_q4;
1584       }
1585 
1586       // transpose the 4x4 filtered values back to dst
1587       {
1588         const uint8x8x4_t d4 = vld4_u8(temp);
1589         vst1_lane_u32((uint32_t *)&dst[x + 0 * dst_stride],
1590                       vreinterpret_u32_u8(d4.val[0]), 0);
1591         vst1_lane_u32((uint32_t *)&dst[x + 1 * dst_stride],
1592                       vreinterpret_u32_u8(d4.val[1]), 0);
1593         vst1_lane_u32((uint32_t *)&dst[x + 2 * dst_stride],
1594                       vreinterpret_u32_u8(d4.val[2]), 0);
1595         vst1_lane_u32((uint32_t *)&dst[x + 3 * dst_stride],
1596                       vreinterpret_u32_u8(d4.val[3]), 0);
1597       }
1598       x += 4;
1599     } while (x < w);
1600 
1601     src += src_stride * 4;
1602     dst += dst_stride * 4;
1603     y -= 4;
1604   } while (y > 0);
1605 }
1606 
1607 static INLINE void scaledconvolve_horiz_w8(
1608     const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
1609     const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
1610     const int x0_q4, const int x_step_q4, const int w, const int h) {
1611   DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
1612   int x, y, z;
1613   src -= SUBPEL_TAPS / 2 - 1;
1614 
1615   // This function processes 8x8 areas. The intermediate height is not always
1616   // a multiple of 8, so force it to be a multiple of 8 here.
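  // For example, an intermediate height of 13 is processed as y = 16 (two
  // 8-row passes); the over-run is absorbed by the 8 extra rows reserved in
  // the temp buffer of aom_scaled_2d_neon.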
1617   y = (h + 7) & ~7;
1618 
1619   do {
1620     int x_q4 = x0_q4;
1621     x = 0;
1622     do {
1623       uint8x8_t d[8];
1624       // process 8 src_x steps
1625       for (z = 0; z < 8; ++z) {
1626         const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1627 
1628         if (x_q4 & SUBPEL_MASK) {
1629           const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
1630           uint8x8_t s[8];
1631           load_u8_8x8(src_x, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4],
1632                       &s[5], &s[6], &s[7]);
1633           transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
1634                            &s[7]);
1635           d[0] = scale_filter_8(s, filters);
1636           vst1_u8(&temp[8 * z], d[0]);
1637         } else {
1638           int i;
1639           for (i = 0; i < 8; ++i) {
1640             temp[z * 8 + i] = src_x[i * src_stride + 3];
1641           }
1642         }
1643         x_q4 += x_step_q4;
1644       }
1645 
1646       // transpose the 8x8 filtered values back to dst
1647       load_u8_8x8(temp, 8, &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
1648                   &d[7]);
1649       transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
1650       vst1_u8(&dst[x + 0 * dst_stride], d[0]);
1651       vst1_u8(&dst[x + 1 * dst_stride], d[1]);
1652       vst1_u8(&dst[x + 2 * dst_stride], d[2]);
1653       vst1_u8(&dst[x + 3 * dst_stride], d[3]);
1654       vst1_u8(&dst[x + 4 * dst_stride], d[4]);
1655       vst1_u8(&dst[x + 5 * dst_stride], d[5]);
1656       vst1_u8(&dst[x + 6 * dst_stride], d[6]);
1657       vst1_u8(&dst[x + 7 * dst_stride], d[7]);
1658       x += 8;
1659     } while (x < w);
1660 
1661     src += src_stride * 8;
1662     dst += dst_stride * 8;
1663   } while (y -= 8);
1664 }
1665 
1666 static INLINE void scaledconvolve_vert_w4(
1667     const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
1668     const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
1669     const int y0_q4, const int y_step_q4, const int w, const int h) {
1670   int y;
1671   int y_q4 = y0_q4;
1672 
1673   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1674   y = h;
1675   do {
1676     const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1677 
1678     if (y_q4 & SUBPEL_MASK) {
1679       const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
1680       const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
1681       const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
1682       uint8x8_t s[8], d;
1683       int16x4_t t[8], tt;
1684 
1685       load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
1686                   &s[6], &s[7]);
1687       t[0] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[0])));
1688       t[1] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[1])));
1689       t[2] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[2])));
1690       t[3] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[3])));
1691       t[4] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[4])));
1692       t[5] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[5])));
1693       t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6])));
1694       t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7])));
1695 
1696       tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters,
1697                        filter3, filter4);
1698       d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
1699       vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
1700     } else {
1701       memcpy(dst, &src_y[3 * src_stride], w);
1702     }
1703 
1704     dst += dst_stride;
1705     y_q4 += y_step_q4;
1706   } while (--y);
1707 }
1708 
1709 static INLINE void scaledconvolve_vert_w8(
1710     const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
1711     const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
1712     const int y0_q4, const int y_step_q4, const int w, const int h) {
1713   int y;
1714   int y_q4 = y0_q4;
1715 
1716   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1717   y = h;
1718   do {
1719     const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1720     if (y_q4 & SUBPEL_MASK) {
1721       const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
1722       uint8x8_t s[8], d;
1723       load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
1724                   &s[6], &s[7]);
1725       d = scale_filter_8(s, filters);
1726       vst1_u8(dst, d);
1727     } else {
1728       memcpy(dst, &src_y[3 * src_stride], w);
1729     }
1730     dst += dst_stride;
1731     y_q4 += y_step_q4;
1732   } while (--y);
1733 }
1734 
1735 static INLINE void scaledconvolve_vert_w16(
1736     const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
1737     const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
1738     const int y0_q4, const int y_step_q4, const int w, const int h) {
1739   int x, y;
1740   int y_q4 = y0_q4;
1741 
1742   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1743   y = h;
1744   do {
1745     const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1746     if (y_q4 & SUBPEL_MASK) {
1747       x = 0;
1748       do {
1749         const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
1750         uint8x16_t ss[8];
1751         uint8x8_t s[8], d[2];
1752         load_u8_16x8(src_y, src_stride, &ss[0], &ss[1], &ss[2], &ss[3], &ss[4],
1753                      &ss[5], &ss[6], &ss[7]);
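        // Filter the low and high 8-pixel halves of the 16-wide strip
        // separately, then recombine them into one 16-byte store below.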
1754         s[0] = vget_low_u8(ss[0]);
1755         s[1] = vget_low_u8(ss[1]);
1756         s[2] = vget_low_u8(ss[2]);
1757         s[3] = vget_low_u8(ss[3]);
1758         s[4] = vget_low_u8(ss[4]);
1759         s[5] = vget_low_u8(ss[5]);
1760         s[6] = vget_low_u8(ss[6]);
1761         s[7] = vget_low_u8(ss[7]);
1762         d[0] = scale_filter_8(s, filters);
1763 
1764         s[0] = vget_high_u8(ss[0]);
1765         s[1] = vget_high_u8(ss[1]);
1766         s[2] = vget_high_u8(ss[2]);
1767         s[3] = vget_high_u8(ss[3]);
1768         s[4] = vget_high_u8(ss[4]);
1769         s[5] = vget_high_u8(ss[5]);
1770         s[6] = vget_high_u8(ss[6]);
1771         s[7] = vget_high_u8(ss[7]);
1772         d[1] = scale_filter_8(s, filters);
1773         vst1q_u8(&dst[x], vcombine_u8(d[0], d[1]));
1774         src_y += 16;
1775         x += 16;
1776       } while (x < w);
1777     } else {
1778       memcpy(dst, &src_y[3 * src_stride], w);
1779     }
1780     dst += dst_stride;
1781     y_q4 += y_step_q4;
1782   } while (--y);
1783 }
1784 
1785 void aom_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
1786                         ptrdiff_t dst_stride, const InterpKernel *filter,
1787                         int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
1788                         int w, int h) {
1789   // Note: Fixed size intermediate buffer, temp, places limits on parameters.
1790   // 2d filtering proceeds in 2 steps:
1791   //   (1) Interpolate horizontally into an intermediate buffer, temp.
1792   //   (2) Interpolate temp vertically to derive the sub-pixel result.
1793   // Deriving the maximum number of rows in the temp buffer (135):
1794   // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
1795   // --Largest block size is 64x64 pixels.
1796   // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
1797   //   original frame (in 1/16th pixel units).
1798   // --Must round-up because block may be located at sub-pixel position.
1799   // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
1800   // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
1801   // --Require an additional 8 rows for the horiz_w8 transpose tail.
1802   // When called from the frame scaling function, the smallest scaling factor
1803   // is x1/4 ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer
1804   // is still big enough.
1805   DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
1806   const int intermediate_height =
1807       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
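  // e.g. h = 32, y_step_q4 = 32, y0_q4 = 8 gives
  // ((31 * 32 + 8) >> 4) + 8 = 70 intermediate rows.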
1808 
1809   assert(w <= 64);
1810   assert(h <= 64);
1811   assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
1812   assert(x_step_q4 <= 64);
1813 
1814   if (w >= 8) {
1815     scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1816                             src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
1817                             intermediate_height);
1818   } else {
1819     scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1820                             src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
1821                             intermediate_height);
1822   }
1823 
1824   if (w >= 16) {
1825     scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
1826                             dst_stride, filter, y0_q4, y_step_q4, w, h);
1827   } else if (w == 8) {
1828     scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
1829                            dst_stride, filter, y0_q4, y_step_q4, w, h);
1830   } else {
1831     scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
1832                            dst_stride, filter, y0_q4, y_step_q4, w, h);
1833   }
1834 }
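
// Usage sketch (illustrative only; the argument values below are assumptions,
// not taken from a real call site): a x1/2 downscale in each direction
// corresponds to x_step_q4 = y_step_q4 = 32 in 1/16-pel units, so a 32x32
// destination block could be produced with:
//   aom_scaled_2d_neon(src, src_stride, dst, dst_stride, filter,
//                      /*x0_q4=*/0, /*x_step_q4=*/32,
//                      /*y0_q4=*/0, /*y_step_q4=*/32, /*w=*/32, /*h=*/32);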
1835