1 /*
2  *
3  * Copyright (c) 2018, Alliance for Open Media. All rights reserved
4  *
5  * This source code is subject to the terms of the BSD 2 Clause License and
6  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
7  * was not distributed with this source code in the LICENSE file, you can
8  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
9  * Media Patent License 1.0 was not distributed with this source code in the
10  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
11  */
12 
13 #include <assert.h>
14 #include <arm_neon.h>
15 
16 #include "config/av1_rtcd.h"
17 
18 #include "aom_dsp/aom_dsp_common.h"
19 #include "aom_ports/mem.h"
20 #include "av1/common/convolve.h"
21 #include "av1/common/filter.h"
22 #include "av1/common/arm/convolve_neon.h"
23 #include "av1/common/arm/mem_neon.h"
24 #include "av1/common/arm/transpose_neon.h"
25 
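/* Apply the 8-tap filter to four 16-bit samples. The outer taps are
 * accumulated with plain multiply-accumulates, while the two centre taps
 * (filter[3], filter[4]) use saturating adds since they carry the largest
 * coefficients. Rounding and narrowing are left to the caller. Shared by the
 * 4-wide horizontal and vertical paths below.
 */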
26 static INLINE int16x4_t convolve8_4x4(const int16x4_t s0, const int16x4_t s1,
27                                       const int16x4_t s2, const int16x4_t s3,
28                                       const int16x4_t s4, const int16x4_t s5,
29                                       const int16x4_t s6, const int16x4_t s7,
30                                       const int16_t *filter) {
31   int16x4_t sum;
32 
33   sum = vmul_n_s16(s0, filter[0]);
34   sum = vmla_n_s16(sum, s1, filter[1]);
35   sum = vmla_n_s16(sum, s2, filter[2]);
36   sum = vmla_n_s16(sum, s5, filter[5]);
37   sum = vmla_n_s16(sum, s6, filter[6]);
38   sum = vmla_n_s16(sum, s7, filter[7]);
39   /* filter[3] can take a max value of 128, so 128 * 255 plus the partial
40    * sum may exceed 16 bits; hence the saturating adds below.
41    */
42   sum = vqadd_s16(sum, vmul_n_s16(s3, filter[3]));
43   sum = vqadd_s16(sum, vmul_n_s16(s4, filter[4]));
44 
45   return sum;
46 }
47 
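/* 8-tap horizontal filter for eight samples: accumulate the taps, apply the
 * two rounding right shifts (round_0, then FILTER_BITS - round_0) with
 * saturating shifts, and narrow to 8 bits with unsigned saturation.
 */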
48 static INLINE uint8x8_t convolve8_horiz_8x8(
49     const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
50     const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
51     const int16x8_t s6, const int16x8_t s7, const int16_t *filter,
52     const int16x8_t shift_round_0, const int16x8_t shift_by_bits) {
53   int16x8_t sum;
54 
55   sum = vmulq_n_s16(s0, filter[0]);
56   sum = vmlaq_n_s16(sum, s1, filter[1]);
57   sum = vmlaq_n_s16(sum, s2, filter[2]);
58   sum = vmlaq_n_s16(sum, s5, filter[5]);
59   sum = vmlaq_n_s16(sum, s6, filter[6]);
60   sum = vmlaq_n_s16(sum, s7, filter[7]);
61   /* filter[3] can take a max value of 128, so 128 * 255 plus the partial
62    * sum may exceed 16 bits; hence the saturating adds below.
63    */
64   sum = vqaddq_s16(sum, vmulq_n_s16(s3, filter[3]));
65   sum = vqaddq_s16(sum, vmulq_n_s16(s4, filter[4]));
66 
67   sum = vqrshlq_s16(sum, shift_round_0);
68   sum = vqrshlq_s16(sum, shift_by_bits);
69 
70   return vqmovun_s16(sum);
71 }
72 
73 #if !defined(__aarch64__)
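/* Armv7-only helper: filters a single row of four samples and returns the
 * four 8-bit results duplicated across both halves of the output vector.
 */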
74 static INLINE uint8x8_t convolve8_horiz_4x1(
75     const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
76     const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
77     const int16x4_t s6, const int16x4_t s7, const int16_t *filter,
78     const int16x4_t shift_round_0, const int16x4_t shift_by_bits) {
79   int16x4_t sum;
80 
81   sum = vmul_n_s16(s0, filter[0]);
82   sum = vmla_n_s16(sum, s1, filter[1]);
83   sum = vmla_n_s16(sum, s2, filter[2]);
84   sum = vmla_n_s16(sum, s5, filter[5]);
85   sum = vmla_n_s16(sum, s6, filter[6]);
86   sum = vmla_n_s16(sum, s7, filter[7]);
87   /* filter[3] can take a max value of 128, so 128 * 255 plus the partial
88    * sum may exceed 16 bits; hence the saturating adds below.
89    */
90   sum = vqadd_s16(sum, vmul_n_s16(s3, filter[3]));
91   sum = vqadd_s16(sum, vmul_n_s16(s4, filter[4]));
92 
93   sum = vqrshl_s16(sum, shift_round_0);
94   sum = vqrshl_s16(sum, shift_by_bits);
95 
96   return vqmovun_s16(vcombine_s16(sum, sum));
97 }
98 #endif  // !defined(__aarch64__)
99 
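/* 8-tap vertical filter for eight samples; the sum is rounded by FILTER_BITS
 * and narrowed to 8 bits with unsigned saturation.
 */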
100 static INLINE uint8x8_t convolve8_vert_8x4(
101     const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
102     const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
103     const int16x8_t s6, const int16x8_t s7, const int16_t *filter) {
104   int16x8_t sum;
105 
106   sum = vmulq_n_s16(s0, filter[0]);
107   sum = vmlaq_n_s16(sum, s1, filter[1]);
108   sum = vmlaq_n_s16(sum, s2, filter[2]);
109   sum = vmlaq_n_s16(sum, s5, filter[5]);
110   sum = vmlaq_n_s16(sum, s6, filter[6]);
111   sum = vmlaq_n_s16(sum, s7, filter[7]);
112   /* filter[3] can take a max value of 128, so 128 * 255 plus the partial
113    * sum may exceed 16 bits; hence the saturating adds below.
114    */
115   sum = vqaddq_s16(sum, vmulq_n_s16(s3, filter[3]));
116   sum = vqaddq_s16(sum, vmulq_n_s16(s4, filter[4]));
117 
118   return vqrshrun_n_s16(sum, FILTER_BITS);
119 }
120 
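/* 8-tap vertical filter for four 16-bit intermediate samples (2D path):
 * accumulate in 32 bits, add the offset, round by round_1, subtract the
 * compensating constant, clamp at zero and narrow to unsigned 16 bits.
 * The final round_bits shift is applied by the caller.
 */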
121 static INLINE uint16x4_t convolve8_vert_4x4_s32(
122     const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
123     const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
124     const int16x4_t s6, const int16x4_t s7, const int16_t *y_filter,
125     const int32x4_t round_shift_vec, const int32x4_t offset_const,
126     const int32x4_t sub_const_vec) {
127   int32x4_t sum0;
128   uint16x4_t res;
129   const int32x4_t zero = vdupq_n_s32(0);
130 
131   sum0 = vmull_n_s16(s0, y_filter[0]);
132   sum0 = vmlal_n_s16(sum0, s1, y_filter[1]);
133   sum0 = vmlal_n_s16(sum0, s2, y_filter[2]);
134   sum0 = vmlal_n_s16(sum0, s3, y_filter[3]);
135   sum0 = vmlal_n_s16(sum0, s4, y_filter[4]);
136   sum0 = vmlal_n_s16(sum0, s5, y_filter[5]);
137   sum0 = vmlal_n_s16(sum0, s6, y_filter[6]);
138   sum0 = vmlal_n_s16(sum0, s7, y_filter[7]);
139 
140   sum0 = vaddq_s32(sum0, offset_const);
141   sum0 = vqrshlq_s32(sum0, round_shift_vec);
142   sum0 = vsubq_s32(sum0, sub_const_vec);
143   sum0 = vmaxq_s32(sum0, zero);
144 
145   res = vmovn_u32(vreinterpretq_u32_s32(sum0));
146 
147   return res;
148 }
149 
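/* As convolve8_vert_4x4_s32 but for eight samples: two 32-bit accumulators
 * (low/high halves), the same offset/round_1/subtract/clamp sequence, then
 * the final round_bits shift and a saturating narrow to 8 bits.
 */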
150 static INLINE uint8x8_t convolve8_vert_8x4_s32(
151     const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
152     const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
153     const int16x8_t s6, const int16x8_t s7, const int16_t *y_filter,
154     const int32x4_t round_shift_vec, const int32x4_t offset_const,
155     const int32x4_t sub_const_vec, const int16x8_t vec_round_bits) {
156   int32x4_t sum0, sum1;
157   uint16x8_t res;
158   const int32x4_t zero = vdupq_n_s32(0);
159 
160   sum0 = vmull_n_s16(vget_low_s16(s0), y_filter[0]);
161   sum0 = vmlal_n_s16(sum0, vget_low_s16(s1), y_filter[1]);
162   sum0 = vmlal_n_s16(sum0, vget_low_s16(s2), y_filter[2]);
163   sum0 = vmlal_n_s16(sum0, vget_low_s16(s3), y_filter[3]);
164   sum0 = vmlal_n_s16(sum0, vget_low_s16(s4), y_filter[4]);
165   sum0 = vmlal_n_s16(sum0, vget_low_s16(s5), y_filter[5]);
166   sum0 = vmlal_n_s16(sum0, vget_low_s16(s6), y_filter[6]);
167   sum0 = vmlal_n_s16(sum0, vget_low_s16(s7), y_filter[7]);
168 
169   sum1 = vmull_n_s16(vget_high_s16(s0), y_filter[0]);
170   sum1 = vmlal_n_s16(sum1, vget_high_s16(s1), y_filter[1]);
171   sum1 = vmlal_n_s16(sum1, vget_high_s16(s2), y_filter[2]);
172   sum1 = vmlal_n_s16(sum1, vget_high_s16(s3), y_filter[3]);
173   sum1 = vmlal_n_s16(sum1, vget_high_s16(s4), y_filter[4]);
174   sum1 = vmlal_n_s16(sum1, vget_high_s16(s5), y_filter[5]);
175   sum1 = vmlal_n_s16(sum1, vget_high_s16(s6), y_filter[6]);
176   sum1 = vmlal_n_s16(sum1, vget_high_s16(s7), y_filter[7]);
177 
178   sum0 = vaddq_s32(sum0, offset_const);
179   sum1 = vaddq_s32(sum1, offset_const);
180   sum0 = vqrshlq_s32(sum0, round_shift_vec);
181   sum1 = vqrshlq_s32(sum1, round_shift_vec);
182   sum0 = vsubq_s32(sum0, sub_const_vec);
183   sum1 = vsubq_s32(sum1, sub_const_vec);
184   sum0 = vmaxq_s32(sum0, zero);
185   sum1 = vmaxq_s32(sum1, zero);
186   res = vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(sum0)),
187                      vqmovn_u32(vreinterpretq_u32_s32(sum1)));
188 
189   res = vqrshlq_u16(res, vec_round_bits);
190 
191   return vqmovn_u16(res);
192 }
193 
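/* Horizontal-only single-reference convolution. On AArch64 rows are
 * transposed so the filter can be applied with vector multiply-accumulates
 * and the results are transposed back before storing; the Armv7 paths build
 * shifted sample vectors with vext instead. Widths of 2 and 4 use lane
 * stores.
 */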
194 void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
195                             int dst_stride, int w, int h,
196                             const InterpFilterParams *filter_params_x,
197                             const InterpFilterParams *filter_params_y,
198                             const int subpel_x_q4, const int subpel_y_q4,
199                             ConvolveParams *conv_params) {
200   const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
201   const int8_t bits = FILTER_BITS - conv_params->round_0;
202 
203   (void)subpel_y_q4;
204   (void)conv_params;
205   (void)filter_params_y;
206 
207   uint8x8_t t0;
208 #if defined(__aarch64__)
209   uint8x8_t t1, t2, t3;
210 #endif
211 
212   assert(bits >= 0);
213   assert((FILTER_BITS - conv_params->round_1) >= 0 ||
214          ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
215 
216   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
217       filter_params_x, subpel_x_q4 & SUBPEL_MASK);
218 
219   const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0);
220   const int16x8_t shift_by_bits = vdupq_n_s16(-bits);
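  // shift_round_0 and shift_by_bits are negative so that vqrshlq performs
  // rounding right shifts: first by round_0, then by FILTER_BITS - round_0,
  // for a total downshift of FILTER_BITS on the filter sum.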
221 
222   src -= horiz_offset;
223 #if defined(__aarch64__)
224   if (h == 4) {
225     uint8x8_t d01, d23;
226     int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
227     int16x8_t d01_temp, d23_temp;
228 
229     __builtin_prefetch(src + 0 * src_stride);
230     __builtin_prefetch(src + 1 * src_stride);
231     __builtin_prefetch(src + 2 * src_stride);
232     __builtin_prefetch(src + 3 * src_stride);
233 
234     load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
235     transpose_u8_8x4(&t0, &t1, &t2, &t3);
236 
237     s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
238     s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
239     s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
240     s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
241     s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
242     s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
243     s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
244     __builtin_prefetch(dst + 0 * dst_stride);
245     __builtin_prefetch(dst + 1 * dst_stride);
246     __builtin_prefetch(dst + 2 * dst_stride);
247     __builtin_prefetch(dst + 3 * dst_stride);
248     src += 7;
249 
250     do {
251       load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
252       transpose_u8_8x4(&t0, &t1, &t2, &t3);
253 
254       s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
255       s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
256       s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
257       s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
258 
259       d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, x_filter);
260 
261       d1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, x_filter);
262 
263       d2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, x_filter);
264 
265       d3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, x_filter);
266 
267       d01_temp = vqrshlq_s16(vcombine_s16(d0, d1), shift_round_0);
268       d23_temp = vqrshlq_s16(vcombine_s16(d2, d3), shift_round_0);
269 
270       d01_temp = vqrshlq_s16(d01_temp, shift_by_bits);
271       d23_temp = vqrshlq_s16(d23_temp, shift_by_bits);
272 
273       d01 = vqmovun_s16(d01_temp);
274       d23 = vqmovun_s16(d23_temp);
275 
276       transpose_u8_4x4(&d01, &d23);
277 
278       if (w != 2) {
279         vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride),  // 00 01 02 03
280                       vreinterpret_u32_u8(d01), 0);
281         vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride),  // 10 11 12 13
282                       vreinterpret_u32_u8(d23), 0);
283         vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride),  // 20 21 22 23
284                       vreinterpret_u32_u8(d01), 1);
285         vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride),  // 30 31 32 33
286                       vreinterpret_u32_u8(d23), 1);
287       } else {
288         vst1_lane_u16((uint16_t *)(dst + 0 * dst_stride),  // 00 01
289                       vreinterpret_u16_u8(d01), 0);
290         vst1_lane_u16((uint16_t *)(dst + 1 * dst_stride),  // 10 11
291                       vreinterpret_u16_u8(d23), 0);
292         vst1_lane_u16((uint16_t *)(dst + 2 * dst_stride),  // 20 21
293                       vreinterpret_u16_u8(d01), 2);
294         vst1_lane_u16((uint16_t *)(dst + 3 * dst_stride),  // 30 31
295                       vreinterpret_u16_u8(d23), 2);
296       }
297 
298       s0 = s4;
299       s1 = s5;
300       s2 = s6;
301       s3 = s7;
302       s4 = s8;
303       s5 = s9;
304       s6 = s10;
305       src += 4;
306       dst += 4;
307       w -= 4;
308     } while (w > 0);
309   } else {
310 #endif
311     int width;
312     const uint8_t *s;
313     int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
314 
315 #if defined(__aarch64__)
316     int16x8_t s8, s9, s10;
317     uint8x8_t t4, t5, t6, t7;
318 #endif
319 
320     if (w <= 4) {
321 #if defined(__aarch64__)
322       do {
323         load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
324         transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
325         s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
326         s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
327         s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
328         s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
329         s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
330         s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
331         s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
332 
333         load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
334                     &t7);
335         src += 8 * src_stride;
336         __builtin_prefetch(dst + 0 * dst_stride);
337         __builtin_prefetch(dst + 1 * dst_stride);
338         __builtin_prefetch(dst + 2 * dst_stride);
339         __builtin_prefetch(dst + 3 * dst_stride);
340         __builtin_prefetch(dst + 4 * dst_stride);
341         __builtin_prefetch(dst + 5 * dst_stride);
342         __builtin_prefetch(dst + 6 * dst_stride);
343         __builtin_prefetch(dst + 7 * dst_stride);
344 
345         transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
346 
347         s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
348         s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
349         s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
350         s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
351 
352         __builtin_prefetch(src + 0 * src_stride);
353         __builtin_prefetch(src + 1 * src_stride);
354         __builtin_prefetch(src + 2 * src_stride);
355         __builtin_prefetch(src + 3 * src_stride);
356         __builtin_prefetch(src + 4 * src_stride);
357         __builtin_prefetch(src + 5 * src_stride);
358         __builtin_prefetch(src + 6 * src_stride);
359         __builtin_prefetch(src + 7 * src_stride);
360         t0 = convolve8_horiz_8x8(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
361                                  shift_round_0, shift_by_bits);
362         t1 = convolve8_horiz_8x8(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
363                                  shift_round_0, shift_by_bits);
364         t2 = convolve8_horiz_8x8(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
365                                  shift_round_0, shift_by_bits);
366         t3 = convolve8_horiz_8x8(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
367                                  shift_round_0, shift_by_bits);
368 
369         transpose_u8_8x4(&t0, &t1, &t2, &t3);
370 
371         if ((w == 4) && (h > 4)) {
372           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
373                         0);  // 00 01 02 03
374           dst += dst_stride;
375           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1),
376                         0);  // 10 11 12 13
377           dst += dst_stride;
378           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2),
379                         0);  // 20 21 22 23
380           dst += dst_stride;
381           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3),
382                         0);  // 30 31 32 33
383           dst += dst_stride;
384           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
385                         1);  // 40 41 42 43
386           dst += dst_stride;
387           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1),
388                         1);  // 50 51 52 53
389           dst += dst_stride;
390           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2),
391                         1);  // 60 61 62 63
392           dst += dst_stride;
393           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3),
394                         1);  // 70 71 72 73
395           dst += dst_stride;
396         } else if ((w == 4) && (h == 2)) {
397           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
398                         0);  // 00 01 02 03
399           dst += dst_stride;
400           vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1),
401                         0);  // 10 11 12 13
402           dst += dst_stride;
403         } else if ((w == 2) && (h > 4)) {
404           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0);  // 00 01
405           dst += dst_stride;
406           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 0);  // 10 11
407           dst += dst_stride;
408           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t2), 0);  // 20 21
409           dst += dst_stride;
410           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t3), 0);  // 30 31
411           dst += dst_stride;
412           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 2);  // 40 41
413           dst += dst_stride;
414           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 2);  // 50 51
415           dst += dst_stride;
416           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t2), 2);  // 60 61
417           dst += dst_stride;
418           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t3), 2);  // 70 71
419           dst += dst_stride;
420         } else if ((w == 2) && (h == 2)) {
421           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0);  // 00 01
422           dst += dst_stride;
423           vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 0);  // 10 11
424           dst += dst_stride;
425         }
426         h -= 8;
427       } while (h > 0);
428 #else
429     int16x8_t tt0;
430     int16x4_t x0, x1, x2, x3, x4, x5, x6, x7;
431     const int16x4_t shift_round_0_low = vget_low_s16(shift_round_0);
432     const int16x4_t shift_by_bits_low = vget_low_s16(shift_by_bits);
433     do {
434       t0 = vld1_u8(src);  // a0 a1 a2 a3 a4 a5 a6 a7
435       tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
436       x0 = vget_low_s16(tt0);   // a0 a1 a2 a3
437       x4 = vget_high_s16(tt0);  // a4 a5 a6 a7
438 
439       t0 = vld1_u8(src + 8);  // a8 a9 a10 a11 a12 a13 a14 a15
440       tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
441       x7 = vget_low_s16(tt0);  // a8 a9 a10 a11
442 
443       x1 = vext_s16(x0, x4, 1);  // a1 a2 a3 a4
444       x2 = vext_s16(x0, x4, 2);  // a2 a3 a4 a5
445       x3 = vext_s16(x0, x4, 3);  // a3 a4 a5 a6
446       x5 = vext_s16(x4, x7, 1);  // a5 a6 a7 a8
447       x6 = vext_s16(x4, x7, 2);  // a6 a7 a8 a9
448       x7 = vext_s16(x4, x7, 3);  // a7 a8 a9 a10
449 
450       src += src_stride;
451 
452       t0 = convolve8_horiz_4x1(x0, x1, x2, x3, x4, x5, x6, x7, x_filter,
453                                shift_round_0_low, shift_by_bits_low);
454 
455       if (w == 4) {
456         vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
457                       0);  // 00 01 02 03
458         dst += dst_stride;
459       } else if (w == 2) {
460         vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0);  // 00 01
461         dst += dst_stride;
462       }
463       h -= 1;
464     } while (h > 0);
465 #endif
466     } else {
467       uint8_t *d;
468       int16x8_t s11;
469 #if defined(__aarch64__)
470       int16x8_t s12, s13, s14;
471       do {
472         __builtin_prefetch(src + 0 * src_stride);
473         __builtin_prefetch(src + 1 * src_stride);
474         __builtin_prefetch(src + 2 * src_stride);
475         __builtin_prefetch(src + 3 * src_stride);
476         __builtin_prefetch(src + 4 * src_stride);
477         __builtin_prefetch(src + 5 * src_stride);
478         __builtin_prefetch(src + 6 * src_stride);
479         __builtin_prefetch(src + 7 * src_stride);
480         load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
481         transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
482         s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
483         s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
484         s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
485         s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
486         s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
487         s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
488         s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
489 
490         width = w;
491         s = src + 7;
492         d = dst;
493         __builtin_prefetch(dst + 0 * dst_stride);
494         __builtin_prefetch(dst + 1 * dst_stride);
495         __builtin_prefetch(dst + 2 * dst_stride);
496         __builtin_prefetch(dst + 3 * dst_stride);
497         __builtin_prefetch(dst + 4 * dst_stride);
498         __builtin_prefetch(dst + 5 * dst_stride);
499         __builtin_prefetch(dst + 6 * dst_stride);
500         __builtin_prefetch(dst + 7 * dst_stride);
501 
502         do {
503           load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
504           transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
505           s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
506           s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
507           s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
508           s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
509           s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
510           s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
511           s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
512           s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
513 
514           t0 = convolve8_horiz_8x8(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
515                                    shift_round_0, shift_by_bits);
516 
517           t1 = convolve8_horiz_8x8(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
518                                    shift_round_0, shift_by_bits);
519 
520           t2 = convolve8_horiz_8x8(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
521                                    shift_round_0, shift_by_bits);
522 
523           t3 = convolve8_horiz_8x8(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
524                                    shift_round_0, shift_by_bits);
525 
526           t4 = convolve8_horiz_8x8(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
527                                    shift_round_0, shift_by_bits);
528 
529           t5 = convolve8_horiz_8x8(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
530                                    shift_round_0, shift_by_bits);
531 
532           t6 = convolve8_horiz_8x8(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
533                                    shift_round_0, shift_by_bits);
534 
535           t7 = convolve8_horiz_8x8(s7, s8, s9, s10, s11, s12, s13, s14,
536                                    x_filter, shift_round_0, shift_by_bits);
537 
538           transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
539           if (h != 2) {
540             store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7);
541           } else {
542             store_row2_u8_8x8(d, dst_stride, t0, t1);
543           }
544           s0 = s8;
545           s1 = s9;
546           s2 = s10;
547           s3 = s11;
548           s4 = s12;
549           s5 = s13;
550           s6 = s14;
551           s += 8;
552           d += 8;
553           width -= 8;
554         } while (width > 0);
555         src += 8 * src_stride;
556         dst += 8 * dst_stride;
557         h -= 8;
558       } while (h > 0);
559 #else
560     do {
561       t0 = vld1_u8(src);  // a0 a1 a2 a3 a4 a5 a6 a7
562       s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
563 
564       width = w;
565       s = src + 8;
566       d = dst;
567       __builtin_prefetch(dst);
568 
569       do {
570         t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
571         s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
572         s11 = s0;
573         s0 = s7;
574 
575         s1 = vextq_s16(s11, s7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
576         s2 = vextq_s16(s11, s7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
577         s3 = vextq_s16(s11, s7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
578         s4 = vextq_s16(s11, s7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
579         s5 = vextq_s16(s11, s7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
580         s6 = vextq_s16(s11, s7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
581         s7 = vextq_s16(s11, s7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
582 
583         t0 = convolve8_horiz_8x8(s11, s1, s2, s3, s4, s5, s6, s7, x_filter,
584                                  shift_round_0, shift_by_bits);
585         vst1_u8(d, t0);
586 
587         s += 8;
588         d += 8;
589         width -= 8;
590       } while (width > 0);
591       src += src_stride;
592       dst += dst_stride;
593       h -= 1;
594     } while (h > 0);
595 #endif
596     }
597 #if defined(__aarch64__)
598   }
599 #endif
600 }
601 
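/* Vertical-only single-reference convolution: no transposes are needed, the
 * filter is applied straight down each column. Widths of 2 and 4 accumulate
 * in 16 bits via convolve8_4x4; wider blocks go 8 columns at a time through
 * convolve8_vert_8x4.
 */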
602 void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
603                             int dst_stride, int w, int h,
604                             const InterpFilterParams *filter_params_x,
605                             const InterpFilterParams *filter_params_y,
606                             const int subpel_x_q4, const int subpel_y_q4,
607                             ConvolveParams *conv_params) {
608   const int vert_offset = filter_params_y->taps / 2 - 1;
609 
610   src -= vert_offset * src_stride;
611 
612   (void)filter_params_x;
613   (void)subpel_x_q4;
614   (void)conv_params;
615 
616   assert(conv_params->round_0 <= FILTER_BITS);
617   assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
618          ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
619 
620   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
621       filter_params_y, subpel_y_q4 & SUBPEL_MASK);
622 
623   if (w <= 4) {
624     uint8x8_t d01;
625     int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
626 #if defined(__aarch64__)
627     uint8x8_t d23;
628     int16x4_t s8, s9, s10, d1, d2, d3;
629 #endif
630     s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
631     src += src_stride;
632     s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
633     src += src_stride;
634     s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
635     src += src_stride;
636     s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
637     src += src_stride;
638     s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
639     src += src_stride;
640     s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
641     src += src_stride;
642     s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
643     src += src_stride;
644 
645     do {
646       s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
647       src += src_stride;
648 #if defined(__aarch64__)
649       s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
650       src += src_stride;
651       s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
652       src += src_stride;
653       s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
654       src += src_stride;
655 
656       __builtin_prefetch(dst + 0 * dst_stride);
657       __builtin_prefetch(dst + 1 * dst_stride);
658       __builtin_prefetch(dst + 2 * dst_stride);
659       __builtin_prefetch(dst + 3 * dst_stride);
660       __builtin_prefetch(src + 0 * src_stride);
661       __builtin_prefetch(src + 1 * src_stride);
662       __builtin_prefetch(src + 2 * src_stride);
663       __builtin_prefetch(src + 3 * src_stride);
664       d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
665       d1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
666       d2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
667       d3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
668 
669       d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
670       d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
671       if ((w == 4) && (h != 2)) {
672         vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
673                       0);  // 00 01 02 03
674         dst += dst_stride;
675         vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
676                       1);  // 10 11 12 13
677         dst += dst_stride;
678         vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23),
679                       0);  // 20 21 22 23
680         dst += dst_stride;
681         vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23),
682                       1);  // 30 31 32 33
683         dst += dst_stride;
684       } else if ((w == 4) && (h == 2)) {
685         vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
686                       0);  // 00 01 02 03
687         dst += dst_stride;
688         vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
689                       1);  // 10 11 12 13
690         dst += dst_stride;
691       } else if ((w == 2) && (h != 2)) {
692         vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0);  // 00 01
693         dst += dst_stride;
694         vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 2);  // 10 11
695         dst += dst_stride;
696         vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d23), 0);  // 20 21
697         dst += dst_stride;
698         vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d23), 2);  // 30 31
699         dst += dst_stride;
700       } else if ((w == 2) && (h == 2)) {
701         vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0);  // 00 01
702         dst += dst_stride;
703         vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 2);  // 10 11
704         dst += dst_stride;
705       }
706       s0 = s4;
707       s1 = s5;
708       s2 = s6;
709       s3 = s7;
710       s4 = s8;
711       s5 = s9;
712       s6 = s10;
713       h -= 4;
714 #else
715       __builtin_prefetch(dst + 0 * dst_stride);
716       __builtin_prefetch(src + 0 * src_stride);
717 
718       d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
719 
720       d01 = vqrshrun_n_s16(vcombine_s16(d0, d0), FILTER_BITS);
721 
722       if (w == 4) {
723         vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
724         dst += dst_stride;
725       } else if (w == 2) {
726         vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0);
727         dst += dst_stride;
728       }
729       s0 = s1;
730       s1 = s2;
731       s2 = s3;
732       s3 = s4;
733       s4 = s5;
734       s5 = s6;
735       s6 = s7;
736       h -= 1;
737 #endif
738     } while (h > 0);
739   } else {
740     int height;
741     const uint8_t *s;
742     uint8_t *d;
743     uint8x8_t t0;
744     int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
745 #if defined(__aarch64__)
746     uint8x8_t t1, t2, t3;
747     int16x8_t s8, s9, s10;
748 #endif
749     do {
750       __builtin_prefetch(src + 0 * src_stride);
751       __builtin_prefetch(src + 1 * src_stride);
752       __builtin_prefetch(src + 2 * src_stride);
753       __builtin_prefetch(src + 3 * src_stride);
754       __builtin_prefetch(src + 4 * src_stride);
755       __builtin_prefetch(src + 5 * src_stride);
756       __builtin_prefetch(src + 6 * src_stride);
757       s = src;
758       s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
759       s += src_stride;
760       s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
761       s += src_stride;
762       s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
763       s += src_stride;
764       s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
765       s += src_stride;
766       s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
767       s += src_stride;
768       s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
769       s += src_stride;
770       s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
771       s += src_stride;
772       d = dst;
773       height = h;
774 
775       do {
776         s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
777         s += src_stride;
778 #if defined(__aarch64__)
779         s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
780         s += src_stride;
781         s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
782         s += src_stride;
783         s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
784         s += src_stride;
785 
786         __builtin_prefetch(d + 0 * dst_stride);
787         __builtin_prefetch(d + 1 * dst_stride);
788         __builtin_prefetch(d + 2 * dst_stride);
789         __builtin_prefetch(d + 3 * dst_stride);
790         __builtin_prefetch(s + 0 * src_stride);
791         __builtin_prefetch(s + 1 * src_stride);
792         __builtin_prefetch(s + 2 * src_stride);
793         __builtin_prefetch(s + 3 * src_stride);
794         t0 = convolve8_vert_8x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
795         t1 = convolve8_vert_8x4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
796         t2 = convolve8_vert_8x4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
797         t3 = convolve8_vert_8x4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
798         if (h != 2) {
799           vst1_u8(d, t0);
800           d += dst_stride;
801           vst1_u8(d, t1);
802           d += dst_stride;
803           vst1_u8(d, t2);
804           d += dst_stride;
805           vst1_u8(d, t3);
806           d += dst_stride;
807         } else {
808           vst1_u8(d, t0);
809           d += dst_stride;
810           vst1_u8(d, t1);
811           d += dst_stride;
812         }
813         s0 = s4;
814         s1 = s5;
815         s2 = s6;
816         s3 = s7;
817         s4 = s8;
818         s5 = s9;
819         s6 = s10;
820         height -= 4;
821 #else
822         __builtin_prefetch(d);
823         __builtin_prefetch(s);
824 
825         t0 = convolve8_vert_8x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
826 
827         vst1_u8(d, t0);
828         d += dst_stride;
829 
830         s0 = s1;
831         s1 = s2;
832         s2 = s3;
833         s3 = s4;
834         s4 = s5;
835         s5 = s6;
836         s6 = s7;
837         height -= 1;
838 #endif
839       } while (height > 0);
840       src += 8;
841       dst += 8;
842       w -= 8;
843     } while (w > 0);
844   }
845 }
846 
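/* 2D (horizontal then vertical) single-reference convolution. The horizontal
 * pass writes 16-bit intermediate rows into im_block (stride MAX_SB_SIZE);
 * the vertical pass then filters that block with 32-bit accumulation and
 * narrows to the 8-bit destination.
 */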
847 void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
848                              int dst_stride, int w, int h,
849                              const InterpFilterParams *filter_params_x,
850                              const InterpFilterParams *filter_params_y,
851                              const int subpel_x_q4, const int subpel_y_q4,
852                              ConvolveParams *conv_params) {
853   int im_dst_stride;
854   int width, height;
855   uint8x8_t t0;
856 #if defined(__aarch64__)
857   uint8x8_t t1, t2, t3, t4, t5, t6, t7;
858 #endif
859 
860   DECLARE_ALIGNED(16, int16_t,
861                   im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
862 
863   const int bd = 8;
864   const int im_h = h + filter_params_y->taps - 1;
865   const int im_stride = MAX_SB_SIZE;
866   const int vert_offset = filter_params_y->taps / 2 - 1;
867   const int horiz_offset = filter_params_x->taps / 2 - 1;
868 
869   const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
870   const uint8_t *s;
871   int16_t *dst_ptr;
872 
873   dst_ptr = im_block;
874   im_dst_stride = im_stride;
875   height = im_h;
876   width = w;
877 
878   const int16_t round_bits =
879       FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
880   const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
881   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
882   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
883       filter_params_x, subpel_x_q4 & SUBPEL_MASK);
884 
885   int16_t x_filter_tmp[8];
886   int16x8_t filter_x_coef = vld1q_s16(x_filter);
887 
888   // The filter coefficients are all even, so downshift them by 1 to reduce
889   // the intermediate precision requirements.
890   filter_x_coef = vshrq_n_s16(filter_x_coef, 1);
891   vst1q_s16(&x_filter_tmp[0], filter_x_coef);
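  // With the coefficients halved, the horizontal pass adds an offset of
  // (1 << (bd + FILTER_BITS - 2)) and shifts by (round_0 - 1) instead of
  // round_0, which keeps the intermediate values at the same scale as an
  // unhalved filter would produce.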
892 
893   assert(conv_params->round_0 > 0);
894 
895   if (w <= 4) {
896     int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
897 #if defined(__aarch64__)
898     int16x4_t s8, s9, s10, d1, d2, d3;
899 #endif
900 
901     const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)));
902     const int16x4_t shift_round_0 = vdup_n_s16(-(conv_params->round_0 - 1));
903 
904     do {
905       s = src_ptr;
906 
907 #if defined(__aarch64__)
908       __builtin_prefetch(s + 0 * src_stride);
909       __builtin_prefetch(s + 1 * src_stride);
910       __builtin_prefetch(s + 2 * src_stride);
911       __builtin_prefetch(s + 3 * src_stride);
912 
913       load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
914       transpose_u8_8x4(&t0, &t1, &t2, &t3);
915 
916       s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
917       s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
918       s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
919       s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
920       s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
921       s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
922       s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
923 
924       __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
925       __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
926       __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
927       __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
928       s += 7;
929 
930       load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
931       transpose_u8_8x4(&t0, &t1, &t2, &t3);
932 
933       s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
934       s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
935       s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
936       s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
937 
938       d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
939                              horiz_const, shift_round_0);
940       d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
941                              horiz_const, shift_round_0);
942       d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
943                              horiz_const, shift_round_0);
944       d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
945                              horiz_const, shift_round_0);
946 
947       transpose_s16_4x4d(&d0, &d1, &d2, &d3);
948       if (w == 4) {
949         vst1_s16((dst_ptr + 0 * im_dst_stride), d0);
950         vst1_s16((dst_ptr + 1 * im_dst_stride), d1);
951         vst1_s16((dst_ptr + 2 * im_dst_stride), d2);
952         vst1_s16((dst_ptr + 3 * im_dst_stride), d3);
953       } else if (w == 2) {
954         vst1_lane_u32((uint32_t *)(dst_ptr + 0 * im_dst_stride),
955                       vreinterpret_u32_s16(d0), 0);
956         vst1_lane_u32((uint32_t *)(dst_ptr + 1 * im_dst_stride),
957                       vreinterpret_u32_s16(d1), 0);
958         vst1_lane_u32((uint32_t *)(dst_ptr + 2 * im_dst_stride),
959                       vreinterpret_u32_s16(d2), 0);
960         vst1_lane_u32((uint32_t *)(dst_ptr + 3 * im_dst_stride),
961                       vreinterpret_u32_s16(d3), 0);
962       }
963       src_ptr += 4 * src_stride;
964       dst_ptr += 4 * im_dst_stride;
965       height -= 4;
966 #else
967       int16x8_t tt0;
968 
969       __builtin_prefetch(s);
970 
971       t0 = vld1_u8(s);  // a0 a1 a2 a3 a4 a5 a6 a7
972       tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
973       s0 = vget_low_s16(tt0);
974       s4 = vget_high_s16(tt0);
975 
976       __builtin_prefetch(dst_ptr);
977       s += 8;
978 
979       t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
980       s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
981 
982       s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
983       s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
984       s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6
985       s5 = vext_s16(s4, s7, 1);  // a5 a6 a7 a8
986       s6 = vext_s16(s4, s7, 2);  // a6 a7 a8 a9
987       s7 = vext_s16(s4, s7, 3);  // a7 a8 a9 a10
988 
989       d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
990                              horiz_const, shift_round_0);
991 
992       if (w == 4) {
993         vst1_s16(dst_ptr, d0);
994         dst_ptr += im_dst_stride;
995       } else if (w == 2) {
996         vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_s16(d0), 0);
997         dst_ptr += im_dst_stride;
998       }
999 
1000       src_ptr += src_stride;
1001       height -= 1;
1002 #endif
1003     } while (height > 0);
1004   } else {
1005     int16_t *d_tmp;
1006     int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, res0;
1007 #if defined(__aarch64__)
1008     int16x8_t s8, s9, s10, res1, res2, res3, res4, res5, res6, res7;
1009     int16x8_t s11, s12, s13, s14;
1010 #endif
1011 
1012     const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)));
1013     const int16x8_t shift_round_0 = vdupq_n_s16(-(conv_params->round_0 - 1));
1014 
1015 #if defined(__aarch64__)
1016     do {
1017       __builtin_prefetch(src_ptr + 0 * src_stride);
1018       __builtin_prefetch(src_ptr + 1 * src_stride);
1019       __builtin_prefetch(src_ptr + 2 * src_stride);
1020       __builtin_prefetch(src_ptr + 3 * src_stride);
1021       __builtin_prefetch(src_ptr + 4 * src_stride);
1022       __builtin_prefetch(src_ptr + 5 * src_stride);
1023       __builtin_prefetch(src_ptr + 6 * src_stride);
1024       __builtin_prefetch(src_ptr + 7 * src_stride);
1025 
1026       load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1027 
1028       transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1029 
1030       s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
1031       s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
1032       s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
1033       s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
1034       s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
1035       s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
1036       s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
1037 
1038       width = w;
1039       s = src_ptr + 7;
1040       d_tmp = dst_ptr;
1041 
1042       __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
1043       __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
1044       __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
1045       __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
1046       __builtin_prefetch(dst_ptr + 4 * im_dst_stride);
1047       __builtin_prefetch(dst_ptr + 5 * im_dst_stride);
1048       __builtin_prefetch(dst_ptr + 6 * im_dst_stride);
1049       __builtin_prefetch(dst_ptr + 7 * im_dst_stride);
1050 
1051       do {
1052         load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1053 
1054         transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1055 
1056         s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
1057         s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
1058         s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
1059         s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
1060         s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
1061         s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
1062         s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
1063         s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
1064 
1065         res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
1066                                  horiz_const, shift_round_0);
1067         res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
1068                                  horiz_const, shift_round_0);
1069         res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
1070                                  horiz_const, shift_round_0);
1071         res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
1072                                  horiz_const, shift_round_0);
1073         res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp,
1074                                  horiz_const, shift_round_0);
1075         res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
1076                                  x_filter_tmp, horiz_const, shift_round_0);
1077         res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
1078                                  x_filter_tmp, horiz_const, shift_round_0);
1079         res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
1080                                  x_filter_tmp, horiz_const, shift_round_0);
1081 
1082         transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
1083                           &res7);
1084 
1085         store_s16_8x8(d_tmp, im_dst_stride, res0, res1, res2, res3, res4, res5,
1086                       res6, res7);
1087 
1088         s0 = s8;
1089         s1 = s9;
1090         s2 = s10;
1091         s3 = s11;
1092         s4 = s12;
1093         s5 = s13;
1094         s6 = s14;
1095         s += 8;
1096         d_tmp += 8;
1097         width -= 8;
1098       } while (width > 0);
1099       src_ptr += 8 * src_stride;
1100       dst_ptr += 8 * im_dst_stride;
1101       height -= 8;
1102     } while (height > 0);
1103 #else
1104     do {
1105       t0 = vld1_u8(src_ptr);
1106       s0 = vreinterpretq_s16_u16(vmovl_u8(t0));  // a0 a1 a2 a3 a4 a5 a6 a7
1107 
1108       width = w;
1109       s = src_ptr + 8;
1110       d_tmp = dst_ptr;
1111 
1112       __builtin_prefetch(dst_ptr);
1113 
1114       do {
1115         t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
1116         s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
1117         int16x8_t sum = s0;
1118         s0 = s7;
1119 
1120         s1 = vextq_s16(sum, s7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
1121         s2 = vextq_s16(sum, s7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
1122         s3 = vextq_s16(sum, s7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
1123         s4 = vextq_s16(sum, s7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
1124         s5 = vextq_s16(sum, s7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
1125         s6 = vextq_s16(sum, s7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
1126         s7 = vextq_s16(sum, s7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
1127 
1128         res0 = convolve8_8x8_s16(sum, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
1129                                  horiz_const, shift_round_0);
1130 
1131         vst1q_s16(d_tmp, res0);
1132 
1133         s += 8;
1134         d_tmp += 8;
1135         width -= 8;
1136       } while (width > 0);
1137       src_ptr += src_stride;
1138       dst_ptr += im_dst_stride;
1139       height -= 1;
1140     } while (height > 0);
1141 #endif
1142   }
1143 
1144   // Vertical pass: filter the 16-bit intermediate block into the 8-bit dst.
1145   {
1146     uint8_t *dst_u8_ptr, *d_u8;
1147     int16_t *v_src_ptr, *v_s;
1148 
1149     const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
1150                               (1 << (offset_bits - conv_params->round_1 - 1));
1151     const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
1152         filter_params_y, subpel_y_q4 & SUBPEL_MASK);
1153 
1154     const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1));
1155     const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
1156     const int32x4_t sub_const_vec = vdupq_n_s32(sub_const);
1157 
1158     src_stride = im_stride;
1159     v_src_ptr = im_block;
1160     dst_u8_ptr = dst;
1161 
1162     height = h;
1163     width = w;
1164 
1165     if (width <= 4) {
1166       int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
1167       uint16x4_t d0;
1168       uint16x8_t dd0;
1169       uint8x8_t d01;
1170 
1171 #if defined(__aarch64__)
1172       int16x4_t s8, s9, s10;
1173       uint16x4_t d1, d2, d3;
1174       uint16x8_t dd1;
1175       uint8x8_t d23;
1176 #endif
1177 
1178       d_u8 = dst_u8_ptr;
1179       v_s = v_src_ptr;
1180 
1181       __builtin_prefetch(v_s + 0 * im_stride);
1182       __builtin_prefetch(v_s + 1 * im_stride);
1183       __builtin_prefetch(v_s + 2 * im_stride);
1184       __builtin_prefetch(v_s + 3 * im_stride);
1185       __builtin_prefetch(v_s + 4 * im_stride);
1186       __builtin_prefetch(v_s + 5 * im_stride);
1187       __builtin_prefetch(v_s + 6 * im_stride);
1188       __builtin_prefetch(v_s + 7 * im_stride);
1189 
1190       load_s16_4x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
1191       v_s += (7 * im_stride);
1192 
1193       do {
1194 #if defined(__aarch64__)
1195         load_s16_4x4(v_s, im_stride, &s7, &s8, &s9, &s10);
1196         v_s += (im_stride << 2);
1197 
1198         __builtin_prefetch(d_u8 + 0 * dst_stride);
1199         __builtin_prefetch(d_u8 + 1 * dst_stride);
1200         __builtin_prefetch(d_u8 + 2 * dst_stride);
1201         __builtin_prefetch(d_u8 + 3 * dst_stride);
1202 
1203         d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
1204                                     round_shift_vec, offset_const,
1205                                     sub_const_vec);
1206         d1 = convolve8_vert_4x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
1207                                     round_shift_vec, offset_const,
1208                                     sub_const_vec);
1209         d2 = convolve8_vert_4x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
1210                                     round_shift_vec, offset_const,
1211                                     sub_const_vec);
1212         d3 = convolve8_vert_4x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
1213                                     round_shift_vec, offset_const,
1214                                     sub_const_vec);
1215 
1216         dd0 = vqrshlq_u16(vcombine_u16(d0, d1), vec_round_bits);
1217         dd1 = vqrshlq_u16(vcombine_u16(d2, d3), vec_round_bits);
1218 
1219         d01 = vqmovn_u16(dd0);
1220         d23 = vqmovn_u16(dd1);
1221 
1222         if ((w == 4) && (h != 2)) {
1223           vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
1224                         0);  // 00 01 02 03
1225           d_u8 += dst_stride;
1226           vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
1227                         1);  // 10 11 12 13
1228           d_u8 += dst_stride;
1229           vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
1230                         0);  // 20 21 22 23
1231           d_u8 += dst_stride;
1232           vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
1233                         1);  // 30 31 32 33
1234           d_u8 += dst_stride;
1235         } else if ((w == 2) && (h != 2)) {
1236           vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
1237                         0);  // 00 01
1238           d_u8 += dst_stride;
1239           vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
1240                         2);  // 10 11
1241           d_u8 += dst_stride;
1242           vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
1243                         0);  // 20 21
1244           d_u8 += dst_stride;
1245           vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
1246                         2);  // 30 31
1247           d_u8 += dst_stride;
1248         } else if ((w == 4) && (h == 2)) {
1249           vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
1250                         0);  // 00 01 02 03
1251           d_u8 += dst_stride;
1252           vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
1253                         1);  // 10 11 12 13
1254           d_u8 += dst_stride;
1255         } else if ((w == 2) && (h == 2)) {
1256           vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
1257                         0);  // 00 01
1258           d_u8 += dst_stride;
1259           vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
1260                         2);  // 10 11
1261           d_u8 += dst_stride;
1262         }
1263 
1264         s0 = s4;
1265         s1 = s5;
1266         s2 = s6;
1267         s3 = s7;
1268         s4 = s8;
1269         s5 = s9;
1270         s6 = s10;
1271         height -= 4;
1272 #else
1273         s7 = vld1_s16(v_s);
1274         v_s += im_stride;
1275 
1276         __builtin_prefetch(d_u8 + 0 * dst_stride);
1277 
1278         d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
1279                                     round_shift_vec, offset_const,
1280                                     sub_const_vec);
1281 
1282         dd0 = vqrshlq_u16(vcombine_u16(d0, d0), vec_round_bits);
1283         d01 = vqmovn_u16(dd0);
1284 
1285         if (w == 4) {
1286           vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
1287                         0);  // 00 01 02 03
1288           d_u8 += dst_stride;
1289 
1290         } else if (w == 2) {
1291           vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
1292                         0);  // 00 01
1293           d_u8 += dst_stride;
1294         }
1295 
1296         s0 = s1;
1297         s1 = s2;
1298         s2 = s3;
1299         s3 = s4;
1300         s4 = s5;
1301         s5 = s6;
1302         s6 = s7;
1303         height -= 1;
1304 #endif
1305       } while (height > 0);
1306     } else {
1307       // Width is a multiple of 8 and height is a multiple of 4 here.
1308       int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
1309       uint8x8_t res0;
1310 #if defined(__aarch64__)
1311       int16x8_t s8, s9, s10;
1312       uint8x8_t res1, res2, res3;
1313 #endif
1314 
1315       do {
1316         __builtin_prefetch(v_src_ptr + 0 * im_stride);
1317         __builtin_prefetch(v_src_ptr + 1 * im_stride);
1318         __builtin_prefetch(v_src_ptr + 2 * im_stride);
1319         __builtin_prefetch(v_src_ptr + 3 * im_stride);
1320         __builtin_prefetch(v_src_ptr + 4 * im_stride);
1321         __builtin_prefetch(v_src_ptr + 5 * im_stride);
1322         __builtin_prefetch(v_src_ptr + 6 * im_stride);
1323         __builtin_prefetch(v_src_ptr + 7 * im_stride);
1324 
1325         v_s = v_src_ptr;
1326         load_s16_8x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
1327         v_s += (7 * im_stride);
1328 
1329         d_u8 = dst_u8_ptr;
1330         height = h;
1331 
1332         do {
1333 #if defined(__aarch64__)
1334           load_s16_8x4(v_s, im_stride, &s7, &s8, &s9, &s10);
1335           v_s += (im_stride << 2);
1336 
1337           __builtin_prefetch(d_u8 + 4 * dst_stride);
1338           __builtin_prefetch(d_u8 + 5 * dst_stride);
1339           __builtin_prefetch(d_u8 + 6 * dst_stride);
1340           __builtin_prefetch(d_u8 + 7 * dst_stride);
1341 
1342           res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
1343                                         y_filter, round_shift_vec, offset_const,
1344                                         sub_const_vec, vec_round_bits);
1345           res1 = convolve8_vert_8x4_s32(s1, s2, s3, s4, s5, s6, s7, s8,
1346                                         y_filter, round_shift_vec, offset_const,
1347                                         sub_const_vec, vec_round_bits);
1348           res2 = convolve8_vert_8x4_s32(s2, s3, s4, s5, s6, s7, s8, s9,
1349                                         y_filter, round_shift_vec, offset_const,
1350                                         sub_const_vec, vec_round_bits);
1351           res3 = convolve8_vert_8x4_s32(s3, s4, s5, s6, s7, s8, s9, s10,
1352                                         y_filter, round_shift_vec, offset_const,
1353                                         sub_const_vec, vec_round_bits);
1354 
1355           if (h != 2) {
1356             vst1_u8(d_u8, res0);
1357             d_u8 += dst_stride;
1358             vst1_u8(d_u8, res1);
1359             d_u8 += dst_stride;
1360             vst1_u8(d_u8, res2);
1361             d_u8 += dst_stride;
1362             vst1_u8(d_u8, res3);
1363             d_u8 += dst_stride;
1364           } else {
1365             vst1_u8(d_u8, res0);
1366             d_u8 += dst_stride;
1367             vst1_u8(d_u8, res1);
1368             d_u8 += dst_stride;
1369           }
1370           s0 = s4;
1371           s1 = s5;
1372           s2 = s6;
1373           s3 = s7;
1374           s4 = s8;
1375           s5 = s9;
1376           s6 = s10;
1377           height -= 4;
1378 #else
1379           s7 = vld1q_s16(v_s);
1380           v_s += im_stride;
1381 
1382           __builtin_prefetch(d_u8 + 0 * dst_stride);
1383 
1384           res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
1385                                         y_filter, round_shift_vec, offset_const,
1386                                         sub_const_vec, vec_round_bits);
1387 
1388           vst1_u8(d_u8, res0);
1389           d_u8 += dst_stride;
1390 
1391           s0 = s1;
1392           s1 = s2;
1393           s2 = s3;
1394           s3 = s4;
1395           s4 = s5;
1396           s5 = s6;
1397           s6 = s7;
1398           height -= 1;
1399 #endif
1400         } while (height > 0);
1401         v_src_ptr += 8;
1402         dst_u8_ptr += 8;
1403         w -= 8;
1404       } while (w > 0);
1405     }
1406   }
1407 }
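
/* Copy-only path, used when no subpel filtering is required: plain row
 * copies, with the store width chosen by whether w is a multiple of 16, 8,
 * 4 or 2 bytes.
 */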
1408 void av1_convolve_2d_copy_sr_neon(const uint8_t *src, int src_stride,
1409                                   uint8_t *dst, int dst_stride, int w, int h,
1410                                   const InterpFilterParams *filter_params_x,
1411                                   const InterpFilterParams *filter_params_y,
1412                                   const int subpel_x_q4, const int subpel_y_q4,
1413                                   ConvolveParams *conv_params) {
1414   (void)filter_params_x;
1415   (void)filter_params_y;
1416   (void)subpel_x_q4;
1417   (void)subpel_y_q4;
1418   (void)conv_params;
1419 
1420   const uint8_t *src1;
1421   uint8_t *dst1;
1422   int y;
1423 
1424   if (!(w & 0x0F)) {
1425     for (y = 0; y < h; ++y) {
1426       src1 = src;
1427       dst1 = dst;
1428       for (int x = 0; x < (w >> 4); ++x) {
1429         vst1q_u8(dst1, vld1q_u8(src1));
1430         src1 += 16;
1431         dst1 += 16;
1432       }
1433       src += src_stride;
1434       dst += dst_stride;
1435     }
1436   } else if (!(w & 0x07)) {
1437     for (y = 0; y < h; ++y) {
1438       vst1_u8(dst, vld1_u8(src));
1439       src += src_stride;
1440       dst += dst_stride;
1441     }
1442   } else if (!(w & 0x03)) {
1443     for (y = 0; y < h; ++y) {
1444       vst1_lane_u32((uint32_t *)(dst), vreinterpret_u32_u8(vld1_u8(src)), 0);
1445       src += src_stride;
1446       dst += dst_stride;
1447     }
1448   } else if (!(w & 0x01)) {
1449     for (y = 0; y < h; ++y) {
1450       vst1_lane_u16((uint16_t *)(dst), vreinterpret_u16_u8(vld1_u8(src)), 0);
1451       src += src_stride;
1452       dst += dst_stride;
1453     }
1454   }
1455 }
1456