/*
 *
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <arm_neon.h>

#include "config/av1_rtcd.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_ports/mem.h"
#include "av1/common/convolve.h"
#include "av1/common/filter.h"
#include "av1/common/arm/convolve_neon.h"
#include "av1/common/arm/mem_neon.h"
#include "av1/common/arm/transpose_neon.h"

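// Filter four columns of 16-bit samples with an 8-tap kernel and return the
// unshifted 4-lane sums.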
static INLINE int16x4_t convolve8_4x4(const int16x4_t s0, const int16x4_t s1,
                                      const int16x4_t s2, const int16x4_t s3,
                                      const int16x4_t s4, const int16x4_t s5,
                                      const int16x4_t s6, const int16x4_t s7,
                                      const int16_t *filter) {
  int16x4_t sum;

  sum = vmul_n_s16(s0, filter[0]);
  sum = vmla_n_s16(sum, s1, filter[1]);
  sum = vmla_n_s16(sum, s2, filter[2]);
  sum = vmla_n_s16(sum, s5, filter[5]);
  sum = vmla_n_s16(sum, s6, filter[6]);
  sum = vmla_n_s16(sum, s7, filter[7]);
  /* filter[3] can take a max value of 128, so 128 * 255 plus the running sum
   * can exceed the int16_t range; accumulate the two largest taps with
   * saturating adds.
   */
  sum = vqadd_s16(sum, vmul_n_s16(s3, filter[3]));
  sum = vqadd_s16(sum, vmul_n_s16(s4, filter[4]));

  return sum;
}

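// 8-tap horizontal filter for eight pixels: accumulate, apply the two
// rounding shifts (round_0, then the remaining FILTER_BITS), and saturate
// to 8 bits.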
static INLINE uint8x8_t convolve8_horiz_8x8(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16_t *filter,
    const int16x8_t shift_round_0, const int16x8_t shift_by_bits) {
  int16x8_t sum;

  sum = vmulq_n_s16(s0, filter[0]);
  sum = vmlaq_n_s16(sum, s1, filter[1]);
  sum = vmlaq_n_s16(sum, s2, filter[2]);
  sum = vmlaq_n_s16(sum, s5, filter[5]);
  sum = vmlaq_n_s16(sum, s6, filter[6]);
  sum = vmlaq_n_s16(sum, s7, filter[7]);
  /* filter[3] can take a max value of 128, so 128 * 255 plus the running sum
   * can exceed the int16_t range; accumulate the two largest taps with
   * saturating adds.
   */
  sum = vqaddq_s16(sum, vmulq_n_s16(s3, filter[3]));
  sum = vqaddq_s16(sum, vmulq_n_s16(s4, filter[4]));

  sum = vqrshlq_s16(sum, shift_round_0);
  sum = vqrshlq_s16(sum, shift_by_bits);

  return vqmovun_s16(sum);
}

#if !defined(__aarch64__)
static INLINE uint8x8_t convolve8_horiz_4x1(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16_t *filter,
    const int16x4_t shift_round_0, const int16x4_t shift_by_bits) {
  int16x4_t sum;

  sum = vmul_n_s16(s0, filter[0]);
  sum = vmla_n_s16(sum, s1, filter[1]);
  sum = vmla_n_s16(sum, s2, filter[2]);
  sum = vmla_n_s16(sum, s5, filter[5]);
  sum = vmla_n_s16(sum, s6, filter[6]);
  sum = vmla_n_s16(sum, s7, filter[7]);
  /* filter[3] can take a max value of 128, so 128 * 255 plus the running sum
   * can exceed the int16_t range; accumulate the two largest taps with
   * saturating adds.
   */
  sum = vqadd_s16(sum, vmul_n_s16(s3, filter[3]));
  sum = vqadd_s16(sum, vmul_n_s16(s4, filter[4]));

  sum = vqrshl_s16(sum, shift_round_0);
  sum = vqrshl_s16(sum, shift_by_bits);

  return vqmovun_s16(vcombine_s16(sum, sum));
}
#endif  // !defined(__aarch64__)

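// 8-tap vertical filter for eight pixels; a single rounding shift by
// FILTER_BITS narrows the saturated sums straight to 8 bits.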
static INLINE uint8x8_t convolve8_vert_8x4(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16_t *filter) {
  int16x8_t sum;

  sum = vmulq_n_s16(s0, filter[0]);
  sum = vmlaq_n_s16(sum, s1, filter[1]);
  sum = vmlaq_n_s16(sum, s2, filter[2]);
  sum = vmlaq_n_s16(sum, s5, filter[5]);
  sum = vmlaq_n_s16(sum, s6, filter[6]);
  sum = vmlaq_n_s16(sum, s7, filter[7]);
  /* filter[3] can take a max value of 128, so 128 * 255 plus the running sum
   * can exceed the int16_t range; accumulate the two largest taps with
   * saturating adds.
   */
  sum = vqaddq_s16(sum, vmulq_n_s16(s3, filter[3]));
  sum = vqaddq_s16(sum, vmulq_n_s16(s4, filter[4]));

  return vqrshrun_n_s16(sum, FILTER_BITS);
}

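// Vertical pass of the 2D filter at 32-bit precision: add the rounding
// offset, shift by round_1, remove the offset again, and clamp to
// non-negative before narrowing to 16 bits.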
static INLINE uint16x4_t convolve8_vert_4x4_s32(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16_t *y_filter,
    const int32x4_t round_shift_vec, const int32x4_t offset_const,
    const int32x4_t sub_const_vec) {
  int32x4_t sum0;
  uint16x4_t res;
  const int32x4_t zero = vdupq_n_s32(0);

  sum0 = vmull_n_s16(s0, y_filter[0]);
  sum0 = vmlal_n_s16(sum0, s1, y_filter[1]);
  sum0 = vmlal_n_s16(sum0, s2, y_filter[2]);
  sum0 = vmlal_n_s16(sum0, s3, y_filter[3]);
  sum0 = vmlal_n_s16(sum0, s4, y_filter[4]);
  sum0 = vmlal_n_s16(sum0, s5, y_filter[5]);
  sum0 = vmlal_n_s16(sum0, s6, y_filter[6]);
  sum0 = vmlal_n_s16(sum0, s7, y_filter[7]);

  sum0 = vaddq_s32(sum0, offset_const);
  sum0 = vqrshlq_s32(sum0, round_shift_vec);
  sum0 = vsubq_s32(sum0, sub_const_vec);
  sum0 = vmaxq_s32(sum0, zero);

  res = vmovn_u32(vreinterpretq_u32_s32(sum0));

  return res;
}

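// As above, but for eight pixels: the low and high halves run in separate
// 32-bit accumulators, and a final shift by the remaining round bits
// narrows the result to 8 bits.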
static INLINE uint8x8_t convolve8_vert_8x4_s32(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16_t *y_filter,
    const int32x4_t round_shift_vec, const int32x4_t offset_const,
    const int32x4_t sub_const_vec, const int16x8_t vec_round_bits) {
  int32x4_t sum0, sum1;
  uint16x8_t res;
  const int32x4_t zero = vdupq_n_s32(0);

  sum0 = vmull_n_s16(vget_low_s16(s0), y_filter[0]);
  sum0 = vmlal_n_s16(sum0, vget_low_s16(s1), y_filter[1]);
  sum0 = vmlal_n_s16(sum0, vget_low_s16(s2), y_filter[2]);
  sum0 = vmlal_n_s16(sum0, vget_low_s16(s3), y_filter[3]);
  sum0 = vmlal_n_s16(sum0, vget_low_s16(s4), y_filter[4]);
  sum0 = vmlal_n_s16(sum0, vget_low_s16(s5), y_filter[5]);
  sum0 = vmlal_n_s16(sum0, vget_low_s16(s6), y_filter[6]);
  sum0 = vmlal_n_s16(sum0, vget_low_s16(s7), y_filter[7]);

  sum1 = vmull_n_s16(vget_high_s16(s0), y_filter[0]);
  sum1 = vmlal_n_s16(sum1, vget_high_s16(s1), y_filter[1]);
  sum1 = vmlal_n_s16(sum1, vget_high_s16(s2), y_filter[2]);
  sum1 = vmlal_n_s16(sum1, vget_high_s16(s3), y_filter[3]);
  sum1 = vmlal_n_s16(sum1, vget_high_s16(s4), y_filter[4]);
  sum1 = vmlal_n_s16(sum1, vget_high_s16(s5), y_filter[5]);
  sum1 = vmlal_n_s16(sum1, vget_high_s16(s6), y_filter[6]);
  sum1 = vmlal_n_s16(sum1, vget_high_s16(s7), y_filter[7]);

  sum0 = vaddq_s32(sum0, offset_const);
  sum1 = vaddq_s32(sum1, offset_const);
  sum0 = vqrshlq_s32(sum0, round_shift_vec);
  sum1 = vqrshlq_s32(sum1, round_shift_vec);
  sum0 = vsubq_s32(sum0, sub_const_vec);
  sum1 = vsubq_s32(sum1, sub_const_vec);
  sum0 = vmaxq_s32(sum0, zero);
  sum1 = vmaxq_s32(sum1, zero);
  res = vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(sum0)),
                     vqmovn_u32(vreinterpretq_u32_s32(sum1)));

  res = vqrshlq_u16(res, vec_round_bits);

  return vqmovn_u16(res);
}

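// Horizontal-only single-reference convolution. Filters with more than
// eight taps fall back to the C implementation.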
void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *filter_params_x,
                            const int subpel_x_qn,
                            ConvolveParams *conv_params) {
  if (filter_params_x->taps > 8) {
    av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
                        subpel_x_qn, conv_params);
    return;
  }
  const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
  const int8_t bits = FILTER_BITS - conv_params->round_0;

  uint8x8_t t0;
#if defined(__aarch64__)
  uint8x8_t t1, t2, t3;
#endif

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0);
  const int16x8_t shift_by_bits = vdupq_n_s16(-bits);

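  // Step back (taps / 2 - 1) columns so the 8-tap window is centered on the
  // output pixel.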
  src -= horiz_offset;
#if defined(__aarch64__)
  if (h == 4) {
    uint8x8_t d01, d23;
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
    int16x8_t d01_temp, d23_temp;

    __builtin_prefetch(src + 0 * src_stride);
    __builtin_prefetch(src + 1 * src_stride);
    __builtin_prefetch(src + 2 * src_stride);
    __builtin_prefetch(src + 3 * src_stride);

    load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
    transpose_u8_8x4(&t0, &t1, &t2, &t3);

    s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
    s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
    s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
    __builtin_prefetch(dst + 0 * dst_stride);
    __builtin_prefetch(dst + 1 * dst_stride);
    __builtin_prefetch(dst + 2 * dst_stride);
    __builtin_prefetch(dst + 3 * dst_stride);
    src += 7;

    do {
      load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
      transpose_u8_8x4(&t0, &t1, &t2, &t3);

      s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
      s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
      s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
      s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));

      d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, x_filter);

      d1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, x_filter);

      d2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, x_filter);

      d3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, x_filter);

      d01_temp = vqrshlq_s16(vcombine_s16(d0, d1), shift_round_0);
      d23_temp = vqrshlq_s16(vcombine_s16(d2, d3), shift_round_0);

      d01_temp = vqrshlq_s16(d01_temp, shift_by_bits);
      d23_temp = vqrshlq_s16(d23_temp, shift_by_bits);

      d01 = vqmovun_s16(d01_temp);
      d23 = vqmovun_s16(d23_temp);

      transpose_u8_4x4(&d01, &d23);

      if (w != 2) {
        vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride),  // 00 01 02 03
                      vreinterpret_u32_u8(d01), 0);
        vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride),  // 10 11 12 13
                      vreinterpret_u32_u8(d23), 0);
        vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride),  // 20 21 22 23
                      vreinterpret_u32_u8(d01), 1);
        vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride),  // 30 31 32 33
                      vreinterpret_u32_u8(d23), 1);
      } else {
        vst1_lane_u16((uint16_t *)(dst + 0 * dst_stride),  // 00 01
                      vreinterpret_u16_u8(d01), 0);
        vst1_lane_u16((uint16_t *)(dst + 1 * dst_stride),  // 10 11
                      vreinterpret_u16_u8(d23), 0);
        vst1_lane_u16((uint16_t *)(dst + 2 * dst_stride),  // 20 21
                      vreinterpret_u16_u8(d01), 2);
        vst1_lane_u16((uint16_t *)(dst + 3 * dst_stride),  // 30 31
                      vreinterpret_u16_u8(d23), 2);
      }

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src += 4;
      dst += 4;
      w -= 4;
    } while (w > 0);
  } else {
#endif
    int width;
    const uint8_t *s;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;

#if defined(__aarch64__)
    int16x8_t s8, s9, s10;
    uint8x8_t t4, t5, t6, t7;
#endif

    if (w <= 4) {
#if defined(__aarch64__)
      do {
        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
                    &t7);
        src += 8 * src_stride;
        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(dst + 4 * dst_stride);
        __builtin_prefetch(dst + 5 * dst_stride);
        __builtin_prefetch(dst + 6 * dst_stride);
        __builtin_prefetch(dst + 7 * dst_stride);

        transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);

        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));

        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        __builtin_prefetch(src + 7 * src_stride);
        t0 = convolve8_horiz_8x8(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
                                 shift_round_0, shift_by_bits);
        t1 = convolve8_horiz_8x8(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
                                 shift_round_0, shift_by_bits);
        t2 = convolve8_horiz_8x8(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
                                 shift_round_0, shift_by_bits);
        t3 = convolve8_horiz_8x8(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
                                 shift_round_0, shift_by_bits);

        transpose_u8_8x4(&t0, &t1, &t2, &t3);

        if ((w == 4) && (h > 4)) {
          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
                        0);  // 00 01 02 03
          dst += dst_stride;
          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1),
                        0);  // 10 11 12 13
          dst += dst_stride;
          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2),
                        0);  // 20 21 22 23
          dst += dst_stride;
          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3),
                        0);  // 30 31 32 33
          dst += dst_stride;
          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
                        1);  // 40 41 42 43
          dst += dst_stride;
          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1),
                        1);  // 50 51 52 53
          dst += dst_stride;
          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2),
                        1);  // 60 61 62 63
          dst += dst_stride;
          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3),
                        1);  // 70 71 72 73
          dst += dst_stride;
        } else if ((w == 4) && (h == 2)) {
          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
                        0);  // 00 01 02 03
          dst += dst_stride;
          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1),
                        0);  // 10 11 12 13
          dst += dst_stride;
        } else if ((w == 2) && (h > 4)) {
          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0);  // 00 01
          dst += dst_stride;
          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 0);  // 10 11
          dst += dst_stride;
          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t2), 0);  // 20 21
          dst += dst_stride;
          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t3), 0);  // 30 31
          dst += dst_stride;
          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 2);  // 40 41
          dst += dst_stride;
          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 2);  // 50 51
          dst += dst_stride;
          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t2), 2);  // 60 61
          dst += dst_stride;
          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t3), 2);  // 70 71
          dst += dst_stride;
        } else if ((w == 2) && (h == 2)) {
          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0);  // 00 01
          dst += dst_stride;
          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 0);  // 10 11
          dst += dst_stride;
        }
        h -= 8;
      } while (h > 0);
#else
      int16x8_t tt0;
      int16x4_t x0, x1, x2, x3, x4, x5, x6, x7;
      const int16x4_t shift_round_0_low = vget_low_s16(shift_round_0);
      const int16x4_t shift_by_bits_low = vget_low_s16(shift_by_bits);
      do {
        t0 = vld1_u8(src);  // a0 a1 a2 a3 a4 a5 a6 a7
        tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        x0 = vget_low_s16(tt0);   // a0 a1 a2 a3
        x4 = vget_high_s16(tt0);  // a4 a5 a6 a7

        t0 = vld1_u8(src + 8);  // a8 a9 a10 a11 a12 a13 a14 a15
        tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        x7 = vget_low_s16(tt0);  // a8 a9 a10 a11

        x1 = vext_s16(x0, x4, 1);  // a1 a2 a3 a4
        x2 = vext_s16(x0, x4, 2);  // a2 a3 a4 a5
        x3 = vext_s16(x0, x4, 3);  // a3 a4 a5 a6
        x5 = vext_s16(x4, x7, 1);  // a5 a6 a7 a8
        x6 = vext_s16(x4, x7, 2);  // a6 a7 a8 a9
        x7 = vext_s16(x4, x7, 3);  // a7 a8 a9 a10

        src += src_stride;

        t0 = convolve8_horiz_4x1(x0, x1, x2, x3, x4, x5, x6, x7, x_filter,
                                 shift_round_0_low, shift_by_bits_low);

        if (w == 4) {
          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
                        0);  // 00 01 02 03
          dst += dst_stride;
        } else if (w == 2) {
          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0);  // 00 01
          dst += dst_stride;
        }
        h -= 1;
      } while (h > 0);
#endif
    } else {
      uint8_t *d;
      int16x8_t s11;
#if defined(__aarch64__)
      int16x8_t s12, s13, s14;
      do {
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        __builtin_prefetch(src + 7 * src_stride);
        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        width = w;
        s = src + 7;
        d = dst;
        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(dst + 4 * dst_stride);
        __builtin_prefetch(dst + 5 * dst_stride);
        __builtin_prefetch(dst + 6 * dst_stride);
        __builtin_prefetch(dst + 7 * dst_stride);

        do {
          load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
          s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
          s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
          s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
          s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
          s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
          s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
          s14 = vreinterpretq_s16_u16(vmovl_u8(t7));

          t0 = convolve8_horiz_8x8(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
                                   shift_round_0, shift_by_bits);

          t1 = convolve8_horiz_8x8(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
                                   shift_round_0, shift_by_bits);

          t2 = convolve8_horiz_8x8(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
                                   shift_round_0, shift_by_bits);

          t3 = convolve8_horiz_8x8(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
                                   shift_round_0, shift_by_bits);

          t4 = convolve8_horiz_8x8(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
                                   shift_round_0, shift_by_bits);

          t5 = convolve8_horiz_8x8(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
                                   shift_round_0, shift_by_bits);

          t6 = convolve8_horiz_8x8(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
                                   shift_round_0, shift_by_bits);

          t7 = convolve8_horiz_8x8(s7, s8, s9, s10, s11, s12, s13, s14,
                                   x_filter, shift_round_0, shift_by_bits);

          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          if (h != 2) {
            store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7);
          } else {
            store_row2_u8_8x8(d, dst_stride, t0, t1);
          }
          s0 = s8;
          s1 = s9;
          s2 = s10;
          s3 = s11;
          s4 = s12;
          s5 = s13;
          s6 = s14;
          s += 8;
          d += 8;
          width -= 8;
        } while (width > 0);
        src += 8 * src_stride;
        dst += 8 * dst_stride;
        h -= 8;
      } while (h > 0);
#else
      do {
        t0 = vld1_u8(src);  // a0 a1 a2 a3 a4 a5 a6 a7
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));

        width = w;
        s = src + 8;
        d = dst;
        __builtin_prefetch(dst);

        do {
          t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
          s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
          s11 = s0;
          s0 = s7;

          s1 = vextq_s16(s11, s7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
          s2 = vextq_s16(s11, s7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
          s3 = vextq_s16(s11, s7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
          s4 = vextq_s16(s11, s7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
          s5 = vextq_s16(s11, s7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
          s6 = vextq_s16(s11, s7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
          s7 = vextq_s16(s11, s7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14

          t0 = convolve8_horiz_8x8(s11, s1, s2, s3, s4, s5, s6, s7, x_filter,
                                   shift_round_0, shift_by_bits);
          vst1_u8(d, t0);

          s += 8;
          d += 8;
          width -= 8;
        } while (width > 0);
        src += src_stride;
        dst += dst_stride;
        h -= 1;
      } while (h > 0);
#endif
    }
#if defined(__aarch64__)
  }
#endif
}

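// Vertical-only single-reference convolution; filters with more than eight
// taps fall back to the C implementation.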
void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *filter_params_y,
                            const int subpel_y_qn) {
  if (filter_params_y->taps > 8) {
    av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_y,
                        subpel_y_qn);
    return;
  }
  const int vert_offset = filter_params_y->taps / 2 - 1;

  src -= vert_offset * src_stride;

  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  if (w <= 4) {
    uint8x8_t d01;
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
#if defined(__aarch64__)
    uint8x8_t d23;
    int16x4_t s8, s9, s10, d1, d2, d3;
#endif
    s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;

    do {
      s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
#if defined(__aarch64__)
      s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;

      __builtin_prefetch(dst + 0 * dst_stride);
      __builtin_prefetch(dst + 1 * dst_stride);
      __builtin_prefetch(dst + 2 * dst_stride);
      __builtin_prefetch(dst + 3 * dst_stride);
      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
      d1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
      d2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
      d3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);

      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
      if ((w == 4) && (h != 2)) {
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
                      0);  // 00 01 02 03
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
                      1);  // 10 11 12 13
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23),
                      0);  // 20 21 22 23
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23),
                      1);  // 30 31 32 33
        dst += dst_stride;
      } else if ((w == 4) && (h == 2)) {
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
                      0);  // 00 01 02 03
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
                      1);  // 10 11 12 13
        dst += dst_stride;
      } else if ((w == 2) && (h != 2)) {
        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0);  // 00 01
        dst += dst_stride;
        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 2);  // 10 11
        dst += dst_stride;
        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d23), 0);  // 20 21
        dst += dst_stride;
        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d23), 2);  // 30 31
        dst += dst_stride;
      } else if ((w == 2) && (h == 2)) {
        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0);  // 00 01
        dst += dst_stride;
        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 2);  // 10 11
        dst += dst_stride;
      }
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      h -= 4;
#else
      __builtin_prefetch(dst + 0 * dst_stride);
      __builtin_prefetch(src + 0 * src_stride);

      d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);

      d01 = vqrshrun_n_s16(vcombine_s16(d0, d0), FILTER_BITS);

      if (w == 4) {
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
        dst += dst_stride;
      } else if (w == 2) {
        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0);
        dst += dst_stride;
      }
      s0 = s1;
      s1 = s2;
      s2 = s3;
      s3 = s4;
      s4 = s5;
      s5 = s6;
      s6 = s7;
      h -= 1;
#endif
    } while (h > 0);
  } else {
    int height;
    const uint8_t *s;
    uint8_t *d;
    uint8x8_t t0;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
#if defined(__aarch64__)
    uint8x8_t t1, t2, t3;
    int16x8_t s8, s9, s10;
#endif
    do {
      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      __builtin_prefetch(src + 4 * src_stride);
      __builtin_prefetch(src + 5 * src_stride);
      __builtin_prefetch(src + 6 * src_stride);
      s = src;
      s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      d = dst;
      height = h;

      do {
        s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
#if defined(__aarch64__)
        s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;

        __builtin_prefetch(d + 0 * dst_stride);
        __builtin_prefetch(d + 1 * dst_stride);
        __builtin_prefetch(d + 2 * dst_stride);
        __builtin_prefetch(d + 3 * dst_stride);
        __builtin_prefetch(s + 0 * src_stride);
        __builtin_prefetch(s + 1 * src_stride);
        __builtin_prefetch(s + 2 * src_stride);
        __builtin_prefetch(s + 3 * src_stride);
        t0 = convolve8_vert_8x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
        t1 = convolve8_vert_8x4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
        t2 = convolve8_vert_8x4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
        t3 = convolve8_vert_8x4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
        if (h != 2) {
          vst1_u8(d, t0);
          d += dst_stride;
          vst1_u8(d, t1);
          d += dst_stride;
          vst1_u8(d, t2);
          d += dst_stride;
          vst1_u8(d, t3);
          d += dst_stride;
        } else {
          vst1_u8(d, t0);
          d += dst_stride;
          vst1_u8(d, t1);
          d += dst_stride;
        }
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        height -= 4;
#else
        __builtin_prefetch(d);
        __builtin_prefetch(s);

        t0 = convolve8_vert_8x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);

        vst1_u8(d, t0);
        d += dst_stride;

        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;
        s5 = s6;
        s6 = s7;
        height -= 1;
#endif
      } while (height > 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w > 0);
  }
}

// Horizontal filtering for convolve_2d_sr for widths that are a multiple of
// 8. Processes one row at a time.
static INLINE void horiz_filter_w8_single_row(
    const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
    const int dst_stride, int width, int height, const int16_t *x_filter,
    const int16x8_t horiz_const, const int16x8_t shift_round_0) {
  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
  do {
    uint8x8_t t0 = vld1_u8(src_ptr);
    s0 = vreinterpretq_s16_u16(vmovl_u8(t0));  // a0 a1 a2 a3 a4 a5 a6 a7

    int width_tmp = width;
    const uint8_t *s = src_ptr + 8;
    int16_t *dst_tmp = dst_ptr;

    __builtin_prefetch(dst_ptr);

    do {
      t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
      s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
      int16x8_t sum = s0;  // Window base a0..a7; "sum" is not an accumulator.
      s0 = s7;

      s1 = vextq_s16(sum, s7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
      s2 = vextq_s16(sum, s7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
      s3 = vextq_s16(sum, s7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
      s4 = vextq_s16(sum, s7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
      s5 = vextq_s16(sum, s7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
      s6 = vextq_s16(sum, s7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
      s7 = vextq_s16(sum, s7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14

      int16x8_t res0 = convolve8_8x8_s16(sum, s1, s2, s3, s4, s5, s6, s7,
                                         x_filter, horiz_const, shift_round_0);

      vst1q_s16(dst_tmp, res0);

      s += 8;
      dst_tmp += 8;
      width_tmp -= 8;
    } while (width_tmp > 0);
    src_ptr += src_stride;
    dst_ptr += dst_stride;
    height--;
  } while (height > 0);
}

// Horizontal filtering for convolve_2d_sr for width <= 4. Processes one row
// at a time.
static INLINE void horiz_filter_w4_single_row(
    const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
    const int dst_stride, int width, int height, const int16_t *x_filter,
    const int16x4_t horiz_const, const int16x4_t shift_round_0) {
  int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
  do {
    const uint8_t *s = src_ptr;

    __builtin_prefetch(s);

    uint8x8_t t0 = vld1_u8(s);  // a0 a1 a2 a3 a4 a5 a6 a7
    int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
    s0 = vget_low_s16(tt0);
    s4 = vget_high_s16(tt0);

    __builtin_prefetch(dst_ptr);
    s += 8;

    t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
    s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));

    s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
    s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
    s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6
    s5 = vext_s16(s4, s7, 1);  // a5 a6 a7 a8
    s6 = vext_s16(s4, s7, 2);  // a6 a7 a8 a9
    s7 = vext_s16(s4, s7, 3);  // a7 a8 a9 a10

    int16x4_t d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
                                     horiz_const, shift_round_0);

    if (width == 4) {
      vst1_s16(dst_ptr, d0);
      dst_ptr += dst_stride;
    } else if (width == 2) {
      vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_s16(d0), 0);
      dst_ptr += dst_stride;
    }

    src_ptr += src_stride;
    height--;
  } while (height > 0);
}

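// 2D single-reference convolution: a horizontal 8-tap pass writes 16-bit
// intermediate rows into im_block, then a vertical 8-tap pass filters
// im_block down to the final 8-bit output.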
void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
                             const InterpFilterParams *filter_params_x,
                             const InterpFilterParams *filter_params_y,
                             const int subpel_x_qn, const int subpel_y_qn,
                             ConvolveParams *conv_params) {
  if (filter_params_x->taps > 8) {
    av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
                         filter_params_x, filter_params_y, subpel_x_qn,
                         subpel_y_qn, conv_params);
    return;
  }
  int im_dst_stride;
  int width, height;
#if defined(__aarch64__)
  uint8x8_t t0;
  uint8x8_t t1, t2, t3, t4, t5, t6, t7;
  const uint8_t *s;
#endif

  DECLARE_ALIGNED(16, int16_t,
                  im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);

  const int bd = 8;
  const int im_h = h + filter_params_y->taps - 1;
  const int im_stride = MAX_SB_SIZE;
  const int vert_offset = filter_params_y->taps / 2 - 1;
  const int horiz_offset = filter_params_x->taps / 2 - 1;

  const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;

  int16_t *dst_ptr;

  dst_ptr = im_block;
  im_dst_stride = im_stride;
  height = im_h;
  width = w;

  const int16_t round_bits =
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
  const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  int16_t x_filter_tmp[8];
  int16x8_t filter_x_coef = vld1q_s16(x_filter);

  // The filter coefficients are all even, so downshift by 1 to reduce the
  // intermediate precision requirements.
  filter_x_coef = vshrq_n_s16(filter_x_coef, 1);
  vst1q_s16(&x_filter_tmp[0], filter_x_coef);

  assert(conv_params->round_0 > 0);

  if (w <= 4) {
    const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)));
    const int16x4_t shift_round_0 = vdup_n_s16(-(conv_params->round_0 - 1));

#if defined(__aarch64__)
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
    do {
      assert(height >= 4);
      s = src_ptr;
      __builtin_prefetch(s + 0 * src_stride);
      __builtin_prefetch(s + 1 * src_stride);
      __builtin_prefetch(s + 2 * src_stride);
      __builtin_prefetch(s + 3 * src_stride);

      load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
      transpose_u8_8x4(&t0, &t1, &t2, &t3);

      s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
      s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
      s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
      s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
      s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
      s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
      s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));

      __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
      __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
      __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
      __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
      s += 7;

      load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
      transpose_u8_8x4(&t0, &t1, &t2, &t3);

      s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
      s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
      s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
      s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));

      d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
                             horiz_const, shift_round_0);
      d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
                             horiz_const, shift_round_0);
      d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
                             horiz_const, shift_round_0);
      d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
                             horiz_const, shift_round_0);

      transpose_s16_4x4d(&d0, &d1, &d2, &d3);
      if (w == 4) {
        vst1_s16((dst_ptr + 0 * im_dst_stride), d0);
        vst1_s16((dst_ptr + 1 * im_dst_stride), d1);
        vst1_s16((dst_ptr + 2 * im_dst_stride), d2);
        vst1_s16((dst_ptr + 3 * im_dst_stride), d3);
      } else if (w == 2) {
        vst1_lane_u32((uint32_t *)(dst_ptr + 0 * im_dst_stride),
                      vreinterpret_u32_s16(d0), 0);
        vst1_lane_u32((uint32_t *)(dst_ptr + 1 * im_dst_stride),
                      vreinterpret_u32_s16(d1), 0);
        vst1_lane_u32((uint32_t *)(dst_ptr + 2 * im_dst_stride),
                      vreinterpret_u32_s16(d2), 0);
        vst1_lane_u32((uint32_t *)(dst_ptr + 3 * im_dst_stride),
                      vreinterpret_u32_s16(d3), 0);
      }
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * im_dst_stride;
      height -= 4;
    } while (height >= 4);

    if (height) {
      assert(height < 4);
      horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride,
                                 w, height, x_filter_tmp, horiz_const,
                                 shift_round_0);
    }
#else
    horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w,
                               height, x_filter_tmp, horiz_const,
                               shift_round_0);
#endif

  } else {
    const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)));
    const int16x8_t shift_round_0 = vdupq_n_s16(-(conv_params->round_0 - 1));

#if defined(__aarch64__)
    int16_t *d_tmp;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14;
    int16x8_t res0, res1, res2, res3, res4, res5, res6, res7;
    do {
      assert(height >= 8);
      __builtin_prefetch(src_ptr + 0 * src_stride);
      __builtin_prefetch(src_ptr + 1 * src_stride);
      __builtin_prefetch(src_ptr + 2 * src_stride);
      __builtin_prefetch(src_ptr + 3 * src_stride);
      __builtin_prefetch(src_ptr + 4 * src_stride);
      __builtin_prefetch(src_ptr + 5 * src_stride);
      __builtin_prefetch(src_ptr + 6 * src_stride);
      __builtin_prefetch(src_ptr + 7 * src_stride);

      load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

      transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
      s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
      s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

      width = w;
      s = src_ptr + 7;
      d_tmp = dst_ptr;

      __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
      __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
      __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
      __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
      __builtin_prefetch(dst_ptr + 4 * im_dst_stride);
      __builtin_prefetch(dst_ptr + 5 * im_dst_stride);
      __builtin_prefetch(dst_ptr + 6 * im_dst_stride);
      __builtin_prefetch(dst_ptr + 7 * im_dst_stride);

      do {
        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
        s14 = vreinterpretq_s16_u16(vmovl_u8(t7));

        res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
                                 horiz_const, shift_round_0);
        res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
                                 horiz_const, shift_round_0);
        res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
                                 horiz_const, shift_round_0);
        res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
                                 horiz_const, shift_round_0);
        res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11,
                                 x_filter_tmp, horiz_const, shift_round_0);
        res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
                                 x_filter_tmp, horiz_const, shift_round_0);
        res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
                                 x_filter_tmp, horiz_const, shift_round_0);
        res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
                                 x_filter_tmp, horiz_const, shift_round_0);

        transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
                          &res7);

        store_s16_8x8(d_tmp, im_dst_stride, res0, res1, res2, res3, res4, res5,
                      res6, res7);

        s0 = s8;
        s1 = s9;
        s2 = s10;
        s3 = s11;
        s4 = s12;
        s5 = s13;
        s6 = s14;
        s += 8;
        d_tmp += 8;
        width -= 8;
      } while (width > 0);
      src_ptr += 8 * src_stride;
      dst_ptr += 8 * im_dst_stride;
      height -= 8;
    } while (height >= 8);
1176
1177 if (height >= 4) {
1178 assert(height < 8);
1179 int16x4_t reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9,
1180 reg10, reg11, reg12, reg13, reg14;
1181 int16x4_t d0, d1, d2, d3, d4, d5, d6, d7;
1182 int16x8_t out0, out1, out2, out3;
1183
1184 __builtin_prefetch(src_ptr + 0 * src_stride);
1185 __builtin_prefetch(src_ptr + 1 * src_stride);
1186 __builtin_prefetch(src_ptr + 2 * src_stride);
1187 __builtin_prefetch(src_ptr + 3 * src_stride);
1188
1189 load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3);
1190 transpose_u8_8x4(&t0, &t1, &t2, &t3);
1191
1192 reg0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
1193 reg1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
1194 reg2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
1195 reg3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
1196 reg4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
1197 reg5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
1198 reg6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
1199
1200 __builtin_prefetch(dst_ptr + 0 * dst_stride);
1201 __builtin_prefetch(dst_ptr + 1 * dst_stride);
1202 __builtin_prefetch(dst_ptr + 2 * dst_stride);
1203 __builtin_prefetch(dst_ptr + 3 * dst_stride);
1204
1205 s = src_ptr + 7;
1206 d_tmp = dst_ptr;
1207 width = w;
1208
1209 do {
1210 load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
1211 transpose_u8_8x4(&t0, &t1, &t2, &t3);
1212
1213 reg7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
1214 reg8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
1215 reg9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
1216 reg10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
1217 reg11 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
1218 reg12 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
1219 reg13 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
1220 reg14 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
1221
1222 d0 = convolve8_4x4(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7,
1223 x_filter_tmp);
1224
1225 d1 = convolve8_4x4(reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8,
1226 x_filter_tmp);
1227
1228 d2 = convolve8_4x4(reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9,
1229 x_filter_tmp);
1230
1231 d3 = convolve8_4x4(reg3, reg4, reg5, reg6, reg7, reg8, reg9, reg10,
1232 x_filter_tmp);
1233
1234 d4 = convolve8_4x4(reg4, reg5, reg6, reg7, reg8, reg9, reg10, reg11,
1235 x_filter_tmp);
1236
1237 d5 = convolve8_4x4(reg5, reg6, reg7, reg8, reg9, reg10, reg11, reg12,
1238 x_filter_tmp);
1239
1240 d6 = convolve8_4x4(reg6, reg7, reg8, reg9, reg10, reg11, reg12, reg13,
1241 x_filter_tmp);
1242
1243 d7 = convolve8_4x4(reg7, reg8, reg9, reg10, reg11, reg12, reg13, reg14,
1244 x_filter_tmp);
1245
1246 transpose_s16_4x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &out0, &out1,
1247 &out2, &out3);
1248
1249 out0 = vaddq_s16(out0, horiz_const);
1250 out0 = vqrshlq_s16(out0, shift_round_0);
1251
1252 out1 = vaddq_s16(out1, horiz_const);
1253 out1 = vqrshlq_s16(out1, shift_round_0);
1254
1255 out2 = vaddq_s16(out2, horiz_const);
1256 out2 = vqrshlq_s16(out2, shift_round_0);
1257
1258 out3 = vaddq_s16(out3, horiz_const);
1259 out3 = vqrshlq_s16(out3, shift_round_0);
1260
1261 store_s16_8x4(d_tmp, im_dst_stride, out0, out1, out2, out3);
1262
1263 reg0 = reg8;
1264 reg1 = reg9;
1265 reg2 = reg10;
1266 reg3 = reg11;
1267 reg4 = reg12;
1268 reg5 = reg13;
1269 reg6 = reg14;
1270 s += 8;
1271 d_tmp += 8;
1272 width -= 8;
1273 } while (width > 0);
1274 src_ptr += 4 * src_stride;
1275 dst_ptr += 4 * im_dst_stride;
1276 height -= 4;
1277 }
1278
1279 if (height) {
1280 assert(height < 4);
1281 horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w,
1282 height, x_filter_tmp, horiz_const,
1283 shift_round_0);
1284 }
1285 #else
1286
1287 horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w,
1288 height, x_filter_tmp, horiz_const,
1289 shift_round_0);
1290 #endif
1291 }

  // vertical
  {
    uint8_t *dst_u8_ptr, *d_u8;
    int16_t *v_src_ptr, *v_s;

    const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
                              (1 << (offset_bits - conv_params->round_1 - 1));
    const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
        filter_params_y, subpel_y_qn & SUBPEL_MASK);

    const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1));
    const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
    const int32x4_t sub_const_vec = vdupq_n_s32(sub_const);

    src_stride = im_stride;
    v_src_ptr = im_block;
    dst_u8_ptr = dst;

    height = h;
    width = w;

    if (width <= 4) {
      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
      uint16x4_t d0;
      uint16x8_t dd0;
      uint8x8_t d01;

#if defined(__aarch64__)
      int16x4_t s8, s9, s10;
      uint16x4_t d1, d2, d3;
      uint16x8_t dd1;
      uint8x8_t d23;
#endif

      d_u8 = dst_u8_ptr;
      v_s = v_src_ptr;

      __builtin_prefetch(v_s + 0 * im_stride);
      __builtin_prefetch(v_s + 1 * im_stride);
      __builtin_prefetch(v_s + 2 * im_stride);
      __builtin_prefetch(v_s + 3 * im_stride);
      __builtin_prefetch(v_s + 4 * im_stride);
      __builtin_prefetch(v_s + 5 * im_stride);
      __builtin_prefetch(v_s + 6 * im_stride);
      __builtin_prefetch(v_s + 7 * im_stride);

      load_s16_4x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
      v_s += (7 * im_stride);

      do {
#if defined(__aarch64__)
        load_s16_4x4(v_s, im_stride, &s7, &s8, &s9, &s10);
        v_s += (im_stride << 2);

        __builtin_prefetch(d_u8 + 0 * dst_stride);
        __builtin_prefetch(d_u8 + 1 * dst_stride);
        __builtin_prefetch(d_u8 + 2 * dst_stride);
        __builtin_prefetch(d_u8 + 3 * dst_stride);

        d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
                                    round_shift_vec, offset_const,
                                    sub_const_vec);
        d1 = convolve8_vert_4x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
                                    round_shift_vec, offset_const,
                                    sub_const_vec);
        d2 = convolve8_vert_4x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
                                    round_shift_vec, offset_const,
                                    sub_const_vec);
        d3 = convolve8_vert_4x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
                                    round_shift_vec, offset_const,
                                    sub_const_vec);

        dd0 = vqrshlq_u16(vcombine_u16(d0, d1), vec_round_bits);
        dd1 = vqrshlq_u16(vcombine_u16(d2, d3), vec_round_bits);

        d01 = vqmovn_u16(dd0);
        d23 = vqmovn_u16(dd1);

        if ((w == 4) && (h != 2)) {
          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
                        0);  // 00 01 02 03
          d_u8 += dst_stride;
          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
                        1);  // 10 11 12 13
          d_u8 += dst_stride;
          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
                        0);  // 20 21 22 23
          d_u8 += dst_stride;
          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
                        1);  // 30 31 32 33
          d_u8 += dst_stride;
        } else if ((w == 2) && (h != 2)) {
          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
                        0);  // 00 01
          d_u8 += dst_stride;
          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
                        2);  // 10 11
          d_u8 += dst_stride;
          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
                        0);  // 20 21
          d_u8 += dst_stride;
          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
                        2);  // 30 31
          d_u8 += dst_stride;
        } else if ((w == 4) && (h == 2)) {
          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
                        0);  // 00 01 02 03
          d_u8 += dst_stride;
          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
                        1);  // 10 11 12 13
          d_u8 += dst_stride;
        } else if ((w == 2) && (h == 2)) {
          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
                        0);  // 00 01
          d_u8 += dst_stride;
          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
                        2);  // 10 11
          d_u8 += dst_stride;
        }

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        height -= 4;
#else
        s7 = vld1_s16(v_s);
        v_s += im_stride;

        __builtin_prefetch(d_u8 + 0 * dst_stride);

        d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
                                    round_shift_vec, offset_const,
                                    sub_const_vec);

        dd0 = vqrshlq_u16(vcombine_u16(d0, d0), vec_round_bits);
        d01 = vqmovn_u16(dd0);

        if (w == 4) {
          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
                        0);  // 00 01 02 03
          d_u8 += dst_stride;

        } else if (w == 2) {
          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
                        0);  // 00 01
          d_u8 += dst_stride;
        }

        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;
        s5 = s6;
        s6 = s7;
        height -= 1;
#endif
      } while (height > 0);
    } else {
      // Width is a multiple of 8 and height is a multiple of 4.
      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
      uint8x8_t res0;
#if defined(__aarch64__)
      int16x8_t s8, s9, s10;
      uint8x8_t res1, res2, res3;
#endif

      do {
        __builtin_prefetch(v_src_ptr + 0 * im_stride);
        __builtin_prefetch(v_src_ptr + 1 * im_stride);
        __builtin_prefetch(v_src_ptr + 2 * im_stride);
        __builtin_prefetch(v_src_ptr + 3 * im_stride);
        __builtin_prefetch(v_src_ptr + 4 * im_stride);
        __builtin_prefetch(v_src_ptr + 5 * im_stride);
        __builtin_prefetch(v_src_ptr + 6 * im_stride);
        __builtin_prefetch(v_src_ptr + 7 * im_stride);

        v_s = v_src_ptr;
        load_s16_8x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
        v_s += (7 * im_stride);

        d_u8 = dst_u8_ptr;
        height = h;

        do {
#if defined(__aarch64__)
          load_s16_8x4(v_s, im_stride, &s7, &s8, &s9, &s10);
          v_s += (im_stride << 2);

          __builtin_prefetch(d_u8 + 4 * dst_stride);
          __builtin_prefetch(d_u8 + 5 * dst_stride);
          __builtin_prefetch(d_u8 + 6 * dst_stride);
          __builtin_prefetch(d_u8 + 7 * dst_stride);

          res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
                                        y_filter, round_shift_vec, offset_const,
                                        sub_const_vec, vec_round_bits);
          res1 = convolve8_vert_8x4_s32(s1, s2, s3, s4, s5, s6, s7, s8,
                                        y_filter, round_shift_vec, offset_const,
                                        sub_const_vec, vec_round_bits);
          res2 = convolve8_vert_8x4_s32(s2, s3, s4, s5, s6, s7, s8, s9,
                                        y_filter, round_shift_vec, offset_const,
                                        sub_const_vec, vec_round_bits);
          res3 = convolve8_vert_8x4_s32(s3, s4, s5, s6, s7, s8, s9, s10,
                                        y_filter, round_shift_vec, offset_const,
                                        sub_const_vec, vec_round_bits);

          if (h != 2) {
            vst1_u8(d_u8, res0);
            d_u8 += dst_stride;
            vst1_u8(d_u8, res1);
            d_u8 += dst_stride;
            vst1_u8(d_u8, res2);
            d_u8 += dst_stride;
            vst1_u8(d_u8, res3);
            d_u8 += dst_stride;
          } else {
            vst1_u8(d_u8, res0);
            d_u8 += dst_stride;
            vst1_u8(d_u8, res1);
            d_u8 += dst_stride;
          }
          s0 = s4;
          s1 = s5;
          s2 = s6;
          s3 = s7;
          s4 = s8;
          s5 = s9;
          s6 = s10;
          height -= 4;
#else
          s7 = vld1q_s16(v_s);
          v_s += im_stride;

          __builtin_prefetch(d_u8 + 0 * dst_stride);

          res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
                                        y_filter, round_shift_vec, offset_const,
                                        sub_const_vec, vec_round_bits);

          vst1_u8(d_u8, res0);
          d_u8 += dst_stride;

          s0 = s1;
          s1 = s2;
          s2 = s3;
          s3 = s4;
          s4 = s5;
          s5 = s6;
          s6 = s7;
          height -= 1;
#endif
        } while (height > 0);
        v_src_ptr += 8;
        dst_u8_ptr += 8;
        w -= 8;
      } while (w > 0);
    }
  }
}

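// Horizontal scaled convolution for 4-wide blocks: each group of four output
// columns is filtered into a 4x4 temp block and transposed back out.
// Integer-position steps (no subpel fraction in x_q4) are plain copies.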
scaledconvolve_horiz_w4(const uint8_t * src,const ptrdiff_t src_stride,uint8_t * dst,const ptrdiff_t dst_stride,const InterpKernel * const x_filters,const int x0_q4,const int x_step_q4,const int w,const int h)1558 static INLINE void scaledconvolve_horiz_w4(
1559 const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
1560 const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
1561 const int x0_q4, const int x_step_q4, const int w, const int h) {
1562 DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
1563 int x, y, z;
1564
1565 src -= SUBPEL_TAPS / 2 - 1;
1566
1567 y = h;
1568 do {
1569 int x_q4 = x0_q4;
1570 x = 0;
1571 do {
1572 // process 4 src_x steps
1573 for (z = 0; z < 4; ++z) {
1574 const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1575 if (x_q4 & SUBPEL_MASK) {
1576 const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
1577 const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
1578 const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
1579 uint8x8_t s[8], d;
1580 int16x8_t ss[4];
1581 int16x4_t t[8], tt;
1582
1583 load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]);
1584 transpose_u8_8x4(&s[0], &s[1], &s[2], &s[3]);
          ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
          ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1]));
          ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2]));
          ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3]));
          t[0] = vget_low_s16(ss[0]);
          t[1] = vget_low_s16(ss[1]);
          t[2] = vget_low_s16(ss[2]);
          t[3] = vget_low_s16(ss[3]);
          t[4] = vget_high_s16(ss[0]);
          t[5] = vget_high_s16(ss[1]);
          t[6] = vget_high_s16(ss[2]);
          t[7] = vget_high_s16(ss[3]);

          tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7],
                           filters, filter3, filter4);
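          // Round by FILTER_BITS (7) and narrow to u8 with saturation.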
          d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
          vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0);
        } else {
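          // Zero phase is a pure copy; the + 3 below undoes the
          // SUBPEL_TAPS / 2 - 1 adjustment applied to src above.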
          int i;
          for (i = 0; i < 4; ++i) {
            temp[z * 4 + i] = src_x[i * src_stride + 3];
          }
        }
        x_q4 += x_step_q4;
      }

      // transpose the 4x4 filtered values back to dst
      {
        const uint8x8x4_t d4 = vld4_u8(temp);
        vst1_lane_u32((uint32_t *)&dst[x + 0 * dst_stride],
                      vreinterpret_u32_u8(d4.val[0]), 0);
        vst1_lane_u32((uint32_t *)&dst[x + 1 * dst_stride],
                      vreinterpret_u32_u8(d4.val[1]), 0);
        vst1_lane_u32((uint32_t *)&dst[x + 2 * dst_stride],
                      vreinterpret_u32_u8(d4.val[2]), 0);
        vst1_lane_u32((uint32_t *)&dst[x + 3 * dst_stride],
                      vreinterpret_u32_u8(d4.val[3]), 0);
      }
      x += 4;
    } while (x < w);

    src += src_stride * 4;
    dst += dst_stride * 4;
    y -= 4;
  } while (y > 0);
}

static INLINE void scaledconvolve_horiz_w8(
    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
    const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
    const int x0_q4, const int x_step_q4, const int w, const int h) {
  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
  int x, y, z;
  src -= SUBPEL_TAPS / 2 - 1;

  // This function processes 8x8 areas. The intermediate height is not always
  // a multiple of 8, so force it to be a multiple of 8 here.
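  // e.g. h = 21 is rounded up to y = 24.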
  y = (h + 7) & ~7;

  do {
    int x_q4 = x0_q4;
    x = 0;
    do {
      uint8x8_t d[8];
      // process 8 src_x steps
      for (z = 0; z < 8; ++z) {
        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];

        if (x_q4 & SUBPEL_MASK) {
          const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
          uint8x8_t s[8];
          load_u8_8x8(src_x, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4],
                      &s[5], &s[6], &s[7]);
          transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
                           &s[7]);
          d[0] = scale_filter_8(s, filters);
          vst1_u8(&temp[8 * z], d[0]);
        } else {
          int i;
          for (i = 0; i < 8; ++i) {
            temp[z * 8 + i] = src_x[i * src_stride + 3];
          }
        }
        x_q4 += x_step_q4;
      }

      // transpose the 8x8 filtered values back to dst
      load_u8_8x8(temp, 8, &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
                  &d[7]);
      transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
      vst1_u8(&dst[x + 0 * dst_stride], d[0]);
      vst1_u8(&dst[x + 1 * dst_stride], d[1]);
      vst1_u8(&dst[x + 2 * dst_stride], d[2]);
      vst1_u8(&dst[x + 3 * dst_stride], d[3]);
      vst1_u8(&dst[x + 4 * dst_stride], d[4]);
      vst1_u8(&dst[x + 5 * dst_stride], d[5]);
      vst1_u8(&dst[x + 6 * dst_stride], d[6]);
      vst1_u8(&dst[x + 7 * dst_stride], d[7]);
      x += 8;
    } while (x < w);

    src += src_stride * 8;
    dst += dst_stride * 8;
  } while (y -= 8);
}

static INLINE void scaledconvolve_vert_w4(
    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
    const int y0_q4, const int y_step_q4, const int w, const int h) {
  int y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  y = h;
  do {
    const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];

    if (y_q4 & SUBPEL_MASK) {
      const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
      const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
      const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
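      // Taps 3 and 4 carry the largest coefficients (up to 128), so they are
      // duplicated separately here and applied with saturating adds inside
      // convolve8_4.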
      uint8x8_t s[8], d;
      int16x4_t t[8], tt;

      load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
                  &s[6], &s[7]);
      t[0] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[0])));
      t[1] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[1])));
      t[2] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[2])));
      t[3] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[3])));
      t[4] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[4])));
      t[5] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[5])));
      t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6])));
      t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7])));

      tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters,
                       filter3, filter4);
      d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
    } else {
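      // Zero phase: copy the center row; src_y[3 * src_stride] undoes the
      // vertical SUBPEL_TAPS / 2 - 1 offset applied to src above.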
      memcpy(dst, &src_y[3 * src_stride], w);
    }

    dst += dst_stride;
    y_q4 += y_step_q4;
  } while (--y);
}

static INLINE void scaledconvolve_vert_w8(
    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
    const int y0_q4, const int y_step_q4, const int w, const int h) {
  int y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  y = h;
  do {
    const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    if (y_q4 & SUBPEL_MASK) {
      const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
      uint8x8_t s[8], d;
      load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
                  &s[6], &s[7]);
      d = scale_filter_8(s, filters);
      vst1_u8(dst, d);
    } else {
      memcpy(dst, &src_y[3 * src_stride], w);
    }
    dst += dst_stride;
    y_q4 += y_step_q4;
  } while (--y);
}

static INLINE void scaledconvolve_vert_w16(
    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
    const int y0_q4, const int y_step_q4, const int w, const int h) {
  int x, y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  y = h;
  do {
    const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    if (y_q4 & SUBPEL_MASK) {
      x = 0;
      do {
        const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
        uint8x16_t ss[8];
        uint8x8_t s[8], d[2];
        load_u8_16x8(src_y, src_stride, &ss[0], &ss[1], &ss[2], &ss[3], &ss[4],
                     &ss[5], &ss[6], &ss[7]);
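        // Filter the low and high halves of each 16-wide row separately,
        // then recombine them into one 16-byte store.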
        s[0] = vget_low_u8(ss[0]);
        s[1] = vget_low_u8(ss[1]);
        s[2] = vget_low_u8(ss[2]);
        s[3] = vget_low_u8(ss[3]);
        s[4] = vget_low_u8(ss[4]);
        s[5] = vget_low_u8(ss[5]);
        s[6] = vget_low_u8(ss[6]);
        s[7] = vget_low_u8(ss[7]);
        d[0] = scale_filter_8(s, filters);

        s[0] = vget_high_u8(ss[0]);
        s[1] = vget_high_u8(ss[1]);
        s[2] = vget_high_u8(ss[2]);
        s[3] = vget_high_u8(ss[3]);
        s[4] = vget_high_u8(ss[4]);
        s[5] = vget_high_u8(ss[5]);
        s[6] = vget_high_u8(ss[6]);
        s[7] = vget_high_u8(ss[7]);
        d[1] = scale_filter_8(s, filters);
        vst1q_u8(&dst[x], vcombine_u8(d[0], d[1]));
        src_y += 16;
        x += 16;
      } while (x < w);
    } else {
      memcpy(dst, &src_y[3 * src_stride], w);
    }
    dst += dst_stride;
    y_q4 += y_step_q4;
  } while (--y);
}

void aom_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const InterpKernel *filter,
                        int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                        int w, int h) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  // (1) Interpolate horizontally into an intermediate buffer, temp.
  // (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  // original frame (in 1/16th pixel units).
  // --Must round-up because block may be located at sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
  // --Require an additional 8 rows for the horiz_w8 transpose tail.
  // When called from the frame scaling function, the smallest scaling factor
  // is x1/4 ==> y_step_q4 = 64. Since w and h are at most 16 in that case,
  // the temp buffer is still big enough.
  DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
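  // e.g. h = 64, y_step_q4 = 32, y0_q4 = 15 gives
  // ((63 * 32 + 15) >> 4) + 8 = 134 rows, within the (135 + 8)-row buffer.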

  assert(w <= 64);
  assert(h <= 64);
  assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
  assert(x_step_q4 <= 64);

  if (w >= 8) {
    scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
                            intermediate_height);
  } else {
    scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
                            intermediate_height);
  }

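  // The horizontal pass starts SUBPEL_TAPS / 2 - 1 rows above the block, so
  // the vertical pass reads from temp + 64 * (SUBPEL_TAPS / 2 - 1) and the
  // filter taps above each output row are available in the buffer.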
  if (w >= 16) {
    scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                            dst_stride, filter, y0_q4, y_step_q4, w, h);
  } else if (w == 8) {
    scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                           dst_stride, filter, y0_q4, y_step_q4, w, h);
  } else {
    scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                           dst_stride, filter, y0_q4, y_step_q4, w, h);
  }
}
