/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <arm_neon.h>

#include "config/av1_rtcd.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_ports/mem.h"
#include "av1/common/convolve.h"
#include "av1/common/filter.h"
#include "av1/common/arm/convolve_neon.h"
#include "av1/common/arm/mem_neon.h"
#include "av1/common/arm/transpose_neon.h"

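// Sum the eight filter taps for four adjacent outputs; s0..s7 hold the input
// samples for each tap. The result is returned at full precision: rounding
// and narrowing are left to the caller.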
static INLINE int16x4_t convolve8_4x4(const int16x4_t s0, const int16x4_t s1,
                                      const int16x4_t s2, const int16x4_t s3,
                                      const int16x4_t s4, const int16x4_t s5,
                                      const int16x4_t s6, const int16x4_t s7,
                                      const int16_t *filter) {
  int16x4_t sum;

  sum = vmul_n_s16(s0, filter[0]);
  sum = vmla_n_s16(sum, s1, filter[1]);
  sum = vmla_n_s16(sum, s2, filter[2]);
  sum = vmla_n_s16(sum, s5, filter[5]);
  sum = vmla_n_s16(sum, s6, filter[6]);
  sum = vmla_n_s16(sum, s7, filter[7]);
  /* filter[3] can take a max value of 128, so 128 * 255 + sum can overflow 16
   * bits; accumulate the two centre taps with saturating adds.
   */
  sum = vqadd_s16(sum, vmul_n_s16(s3, filter[3]));
  sum = vqadd_s16(sum, vmul_n_s16(s4, filter[4]));

  return sum;
}

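// Horizontal 8-tap filter producing eight output pixels: convolve, apply the
// two rounding shifts, then saturate-narrow to 8 bits.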
static INLINE uint8x8_t convolve8_horiz_8x8(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16_t *filter,
    const int16x8_t shift_round_0, const int16x8_t shift_by_bits) {
  int16x8_t sum;

  sum = vmulq_n_s16(s0, filter[0]);
  sum = vmlaq_n_s16(sum, s1, filter[1]);
  sum = vmlaq_n_s16(sum, s2, filter[2]);
  sum = vmlaq_n_s16(sum, s5, filter[5]);
  sum = vmlaq_n_s16(sum, s6, filter[6]);
  sum = vmlaq_n_s16(sum, s7, filter[7]);
  /* filter[3] can take a max value of 128, so 128 * 255 + sum can overflow 16
   * bits; accumulate the two centre taps with saturating adds.
   */
  sum = vqaddq_s16(sum, vmulq_n_s16(s3, filter[3]));
  sum = vqaddq_s16(sum, vmulq_n_s16(s4, filter[4]));

  sum = vqrshlq_s16(sum, shift_round_0);
  sum = vqrshlq_s16(sum, shift_by_bits);

  return vqmovun_s16(sum);
}

#if !defined(__aarch64__)
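// Four-pixel variant of the horizontal filter used on the AArch32 path; the
// sum is duplicated into both halves of the vector before the unsigned
// narrow, so only the low half of the result is meaningful.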
static INLINE uint8x8_t convolve8_horiz_4x1(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16_t *filter,
    const int16x4_t shift_round_0, const int16x4_t shift_by_bits) {
  int16x4_t sum;

  sum = vmul_n_s16(s0, filter[0]);
  sum = vmla_n_s16(sum, s1, filter[1]);
  sum = vmla_n_s16(sum, s2, filter[2]);
  sum = vmla_n_s16(sum, s5, filter[5]);
  sum = vmla_n_s16(sum, s6, filter[6]);
  sum = vmla_n_s16(sum, s7, filter[7]);
  /* filter[3] can take a max value of 128, so 128 * 255 + sum can overflow 16
   * bits; accumulate the two centre taps with saturating adds.
   */
  sum = vqadd_s16(sum, vmul_n_s16(s3, filter[3]));
  sum = vqadd_s16(sum, vmul_n_s16(s4, filter[4]));

  sum = vqrshl_s16(sum, shift_round_0);
  sum = vqrshl_s16(sum, shift_by_bits);

  return vqmovun_s16(vcombine_s16(sum, sum));
}
#endif  // !defined(__aarch64__)

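// Vertical 8-tap filter for eight columns; a single vqrshrun rounds by
// FILTER_BITS and saturate-narrows to 8 bits.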
static INLINE uint8x8_t convolve8_vert_8x4(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16_t *filter) {
  int16x8_t sum;

  sum = vmulq_n_s16(s0, filter[0]);
  sum = vmlaq_n_s16(sum, s1, filter[1]);
  sum = vmlaq_n_s16(sum, s2, filter[2]);
  sum = vmlaq_n_s16(sum, s5, filter[5]);
  sum = vmlaq_n_s16(sum, s6, filter[6]);
  sum = vmlaq_n_s16(sum, s7, filter[7]);
  /* filter[3] can take a max value of 128, so 128 * 255 + sum can overflow 16
   * bits; accumulate the two centre taps with saturating adds.
   */
  sum = vqaddq_s16(sum, vmulq_n_s16(s3, filter[3]));
  sum = vqaddq_s16(sum, vmulq_n_s16(s4, filter[4]));

  return vqrshrun_n_s16(sum, FILTER_BITS);
}

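// Vertical 8-tap filter for four columns with 32-bit accumulation, used by
// the second pass of the 2D filter: add the offset, apply the rounding shift,
// subtract the offset correction and clamp the result to be non-negative.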
static INLINE uint16x4_t convolve8_vert_4x4_s32(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16_t *y_filter,
    const int32x4_t round_shift_vec, const int32x4_t offset_const,
    const int32x4_t sub_const_vec) {
  int32x4_t sum0;
  uint16x4_t res;
  const int32x4_t zero = vdupq_n_s32(0);

  sum0 = vmull_n_s16(s0, y_filter[0]);
  sum0 = vmlal_n_s16(sum0, s1, y_filter[1]);
  sum0 = vmlal_n_s16(sum0, s2, y_filter[2]);
  sum0 = vmlal_n_s16(sum0, s3, y_filter[3]);
  sum0 = vmlal_n_s16(sum0, s4, y_filter[4]);
  sum0 = vmlal_n_s16(sum0, s5, y_filter[5]);
  sum0 = vmlal_n_s16(sum0, s6, y_filter[6]);
  sum0 = vmlal_n_s16(sum0, s7, y_filter[7]);

  sum0 = vaddq_s32(sum0, offset_const);
  sum0 = vqrshlq_s32(sum0, round_shift_vec);
  sum0 = vsubq_s32(sum0, sub_const_vec);
  sum0 = vmaxq_s32(sum0, zero);

  res = vmovn_u32(vreinterpretq_u32_s32(sum0));

  return res;
}

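// Eight-column variant of the above: the low and high halves accumulate in
// separate 32-bit vectors before the final rounding shift and narrow to
// 8 bits.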
static INLINE uint8x8_t convolve8_vert_8x4_s32(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16_t *y_filter,
    const int32x4_t round_shift_vec, const int32x4_t offset_const,
    const int32x4_t sub_const_vec, const int16x8_t vec_round_bits) {
  int32x4_t sum0, sum1;
  uint16x8_t res;
  const int32x4_t zero = vdupq_n_s32(0);

  sum0 = vmull_n_s16(vget_low_s16(s0), y_filter[0]);
  sum0 = vmlal_n_s16(sum0, vget_low_s16(s1), y_filter[1]);
  sum0 = vmlal_n_s16(sum0, vget_low_s16(s2), y_filter[2]);
  sum0 = vmlal_n_s16(sum0, vget_low_s16(s3), y_filter[3]);
  sum0 = vmlal_n_s16(sum0, vget_low_s16(s4), y_filter[4]);
  sum0 = vmlal_n_s16(sum0, vget_low_s16(s5), y_filter[5]);
  sum0 = vmlal_n_s16(sum0, vget_low_s16(s6), y_filter[6]);
  sum0 = vmlal_n_s16(sum0, vget_low_s16(s7), y_filter[7]);

  sum1 = vmull_n_s16(vget_high_s16(s0), y_filter[0]);
  sum1 = vmlal_n_s16(sum1, vget_high_s16(s1), y_filter[1]);
  sum1 = vmlal_n_s16(sum1, vget_high_s16(s2), y_filter[2]);
  sum1 = vmlal_n_s16(sum1, vget_high_s16(s3), y_filter[3]);
  sum1 = vmlal_n_s16(sum1, vget_high_s16(s4), y_filter[4]);
  sum1 = vmlal_n_s16(sum1, vget_high_s16(s5), y_filter[5]);
  sum1 = vmlal_n_s16(sum1, vget_high_s16(s6), y_filter[6]);
  sum1 = vmlal_n_s16(sum1, vget_high_s16(s7), y_filter[7]);

  sum0 = vaddq_s32(sum0, offset_const);
  sum1 = vaddq_s32(sum1, offset_const);
  sum0 = vqrshlq_s32(sum0, round_shift_vec);
  sum1 = vqrshlq_s32(sum1, round_shift_vec);
  sum0 = vsubq_s32(sum0, sub_const_vec);
  sum1 = vsubq_s32(sum1, sub_const_vec);
  sum0 = vmaxq_s32(sum0, zero);
  sum1 = vmaxq_s32(sum1, zero);
  res = vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(sum0)),
                     vqmovn_u32(vreinterpretq_u32_s32(sum1)));

  res = vqrshlq_u16(res, vec_round_bits);

  return vqmovn_u16(res);
}

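// Horizontal-only ("x") single-reference convolution. On AArch64 the input
// rows are transposed so the 8-tap filter runs across whole vectors; the
// AArch32 fallbacks build the shifted tap inputs with vext instead.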
void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *filter_params_x,
                            const int subpel_x_qn,
                            ConvolveParams *conv_params) {
  const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
  const int8_t bits = FILTER_BITS - conv_params->round_0;

  uint8x8_t t0;
#if defined(__aarch64__)
  uint8x8_t t1, t2, t3;
#endif

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

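  // vqrshl with a negative shift amount performs a saturating rounding shift
  // right, so the two rounding stages are stored as negated shift vectors.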
  const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0);
  const int16x8_t shift_by_bits = vdupq_n_s16(-bits);

  src -= horiz_offset;
#if defined(__aarch64__)
  if (h == 4) {
    uint8x8_t d01, d23;
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
    int16x8_t d01_temp, d23_temp;

    __builtin_prefetch(src + 0 * src_stride);
    __builtin_prefetch(src + 1 * src_stride);
    __builtin_prefetch(src + 2 * src_stride);
    __builtin_prefetch(src + 3 * src_stride);

    load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
    transpose_u8_8x4(&t0, &t1, &t2, &t3);

    s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
    s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
    s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
    __builtin_prefetch(dst + 0 * dst_stride);
    __builtin_prefetch(dst + 1 * dst_stride);
    __builtin_prefetch(dst + 2 * dst_stride);
    __builtin_prefetch(dst + 3 * dst_stride);
    src += 7;

    do {
      load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
      transpose_u8_8x4(&t0, &t1, &t2, &t3);

      s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
      s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
      s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
      s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));

      d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, x_filter);

      d1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, x_filter);

      d2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, x_filter);

      d3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, x_filter);

      d01_temp = vqrshlq_s16(vcombine_s16(d0, d1), shift_round_0);
      d23_temp = vqrshlq_s16(vcombine_s16(d2, d3), shift_round_0);

      d01_temp = vqrshlq_s16(d01_temp, shift_by_bits);
      d23_temp = vqrshlq_s16(d23_temp, shift_by_bits);

      d01 = vqmovun_s16(d01_temp);
      d23 = vqmovun_s16(d23_temp);

      transpose_u8_4x4(&d01, &d23);

      if (w != 2) {
        vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride),  // 00 01 02 03
                      vreinterpret_u32_u8(d01), 0);
        vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride),  // 10 11 12 13
                      vreinterpret_u32_u8(d23), 0);
        vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride),  // 20 21 22 23
                      vreinterpret_u32_u8(d01), 1);
        vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride),  // 30 31 32 33
                      vreinterpret_u32_u8(d23), 1);
      } else {
        vst1_lane_u16((uint16_t *)(dst + 0 * dst_stride),  // 00 01
                      vreinterpret_u16_u8(d01), 0);
        vst1_lane_u16((uint16_t *)(dst + 1 * dst_stride),  // 10 11
                      vreinterpret_u16_u8(d23), 0);
        vst1_lane_u16((uint16_t *)(dst + 2 * dst_stride),  // 20 21
                      vreinterpret_u16_u8(d01), 2);
        vst1_lane_u16((uint16_t *)(dst + 3 * dst_stride),  // 30 31
                      vreinterpret_u16_u8(d23), 2);
      }

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src += 4;
      dst += 4;
      w -= 4;
    } while (w > 0);
  } else {
#endif
    int width;
    const uint8_t *s;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;

#if defined(__aarch64__)
    int16x8_t s8, s9, s10;
    uint8x8_t t4, t5, t6, t7;
#endif

    if (w <= 4) {
#if defined(__aarch64__)
      do {
        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
                    &t7);
        src += 8 * src_stride;
        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(dst + 4 * dst_stride);
        __builtin_prefetch(dst + 5 * dst_stride);
        __builtin_prefetch(dst + 6 * dst_stride);
        __builtin_prefetch(dst + 7 * dst_stride);

        transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);

        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));

        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        __builtin_prefetch(src + 7 * src_stride);
        t0 = convolve8_horiz_8x8(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
                                 shift_round_0, shift_by_bits);
        t1 = convolve8_horiz_8x8(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
                                 shift_round_0, shift_by_bits);
        t2 = convolve8_horiz_8x8(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
                                 shift_round_0, shift_by_bits);
        t3 = convolve8_horiz_8x8(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
                                 shift_round_0, shift_by_bits);

        transpose_u8_8x4(&t0, &t1, &t2, &t3);

        if ((w == 4) && (h > 4)) {
          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
                        0);  // 00 01 02 03
          dst += dst_stride;
          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1),
                        0);  // 10 11 12 13
          dst += dst_stride;
          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2),
                        0);  // 20 21 22 23
          dst += dst_stride;
          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3),
                        0);  // 30 31 32 33
          dst += dst_stride;
          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
                        1);  // 40 41 42 43
          dst += dst_stride;
          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1),
                        1);  // 50 51 52 53
          dst += dst_stride;
          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2),
                        1);  // 60 61 62 63
          dst += dst_stride;
          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3),
                        1);  // 70 71 72 73
          dst += dst_stride;
        } else if ((w == 4) && (h == 2)) {
          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
                        0);  // 00 01 02 03
          dst += dst_stride;
          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1),
                        0);  // 10 11 12 13
          dst += dst_stride;
        } else if ((w == 2) && (h > 4)) {
          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0);  // 00 01
          dst += dst_stride;
          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 0);  // 10 11
          dst += dst_stride;
          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t2), 0);  // 20 21
          dst += dst_stride;
          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t3), 0);  // 30 31
          dst += dst_stride;
          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 2);  // 40 41
          dst += dst_stride;
          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 2);  // 50 51
          dst += dst_stride;
          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t2), 2);  // 60 61
          dst += dst_stride;
          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t3), 2);  // 70 71
          dst += dst_stride;
        } else if ((w == 2) && (h == 2)) {
          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0);  // 00 01
          dst += dst_stride;
          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 0);  // 10 11
          dst += dst_stride;
        }
        h -= 8;
      } while (h > 0);
#else
      int16x8_t tt0;
      int16x4_t x0, x1, x2, x3, x4, x5, x6, x7;
      const int16x4_t shift_round_0_low = vget_low_s16(shift_round_0);
      const int16x4_t shift_by_bits_low = vget_low_s16(shift_by_bits);
      do {
        t0 = vld1_u8(src);  // a0 a1 a2 a3 a4 a5 a6 a7
        tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        x0 = vget_low_s16(tt0);   // a0 a1 a2 a3
        x4 = vget_high_s16(tt0);  // a4 a5 a6 a7

        t0 = vld1_u8(src + 8);  // a8 a9 a10 a11 a12 a13 a14 a15
        tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        x7 = vget_low_s16(tt0);  // a8 a9 a10 a11

        x1 = vext_s16(x0, x4, 1);  // a1 a2 a3 a4
        x2 = vext_s16(x0, x4, 2);  // a2 a3 a4 a5
        x3 = vext_s16(x0, x4, 3);  // a3 a4 a5 a6
        x5 = vext_s16(x4, x7, 1);  // a5 a6 a7 a8
        x6 = vext_s16(x4, x7, 2);  // a6 a7 a8 a9
        x7 = vext_s16(x4, x7, 3);  // a7 a8 a9 a10

        src += src_stride;

        t0 = convolve8_horiz_4x1(x0, x1, x2, x3, x4, x5, x6, x7, x_filter,
                                 shift_round_0_low, shift_by_bits_low);

        if (w == 4) {
          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
                        0);  // 00 01 02 03
          dst += dst_stride;
        } else if (w == 2) {
          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0);  // 00 01
          dst += dst_stride;
        }
        h -= 1;
      } while (h > 0);
#endif
    } else {
      uint8_t *d;
      int16x8_t s11;
#if defined(__aarch64__)
      int16x8_t s12, s13, s14;
      do {
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        __builtin_prefetch(src + 7 * src_stride);
        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        width = w;
        s = src + 7;
        d = dst;
        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(dst + 4 * dst_stride);
        __builtin_prefetch(dst + 5 * dst_stride);
        __builtin_prefetch(dst + 6 * dst_stride);
        __builtin_prefetch(dst + 7 * dst_stride);

        do {
          load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
          s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
          s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
          s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
          s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
          s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
          s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
          s14 = vreinterpretq_s16_u16(vmovl_u8(t7));

          t0 = convolve8_horiz_8x8(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
                                   shift_round_0, shift_by_bits);

          t1 = convolve8_horiz_8x8(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
                                   shift_round_0, shift_by_bits);

          t2 = convolve8_horiz_8x8(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
                                   shift_round_0, shift_by_bits);

          t3 = convolve8_horiz_8x8(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
                                   shift_round_0, shift_by_bits);

          t4 = convolve8_horiz_8x8(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
                                   shift_round_0, shift_by_bits);

          t5 = convolve8_horiz_8x8(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
                                   shift_round_0, shift_by_bits);

          t6 = convolve8_horiz_8x8(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
                                   shift_round_0, shift_by_bits);

          t7 = convolve8_horiz_8x8(s7, s8, s9, s10, s11, s12, s13, s14,
                                   x_filter, shift_round_0, shift_by_bits);

          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          if (h != 2) {
            store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7);
          } else {
            store_row2_u8_8x8(d, dst_stride, t0, t1);
          }
          s0 = s8;
          s1 = s9;
          s2 = s10;
          s3 = s11;
          s4 = s12;
          s5 = s13;
          s6 = s14;
          s += 8;
          d += 8;
          width -= 8;
        } while (width > 0);
        src += 8 * src_stride;
        dst += 8 * dst_stride;
        h -= 8;
      } while (h > 0);
#else
      do {
        t0 = vld1_u8(src);  // a0 a1 a2 a3 a4 a5 a6 a7
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));

        width = w;
        s = src + 8;
        d = dst;
        __builtin_prefetch(dst);

        do {
          t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
          s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
          s11 = s0;
          s0 = s7;

          s1 = vextq_s16(s11, s7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
          s2 = vextq_s16(s11, s7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
          s3 = vextq_s16(s11, s7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
          s4 = vextq_s16(s11, s7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
          s5 = vextq_s16(s11, s7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
          s6 = vextq_s16(s11, s7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
          s7 = vextq_s16(s11, s7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14

          t0 = convolve8_horiz_8x8(s11, s1, s2, s3, s4, s5, s6, s7, x_filter,
                                   shift_round_0, shift_by_bits);
          vst1_u8(d, t0);

          s += 8;
          d += 8;
          width -= 8;
        } while (width > 0);
        src += src_stride;
        dst += dst_stride;
        h -= 1;
      } while (h > 0);
#endif
    }
#if defined(__aarch64__)
  }
#endif
}

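// Vertical-only ("y") convolution. No intermediate rounding stage is needed,
// so the accumulation stays in 16 bits and a single rounding shift by
// FILTER_BITS produces the output pixels.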
void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *filter_params_y,
                            const int subpel_y_qn) {
  const int vert_offset = filter_params_y->taps / 2 - 1;

  src -= vert_offset * src_stride;

  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  if (w <= 4) {
    uint8x8_t d01;
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
#if defined(__aarch64__)
    uint8x8_t d23;
    int16x4_t s8, s9, s10, d1, d2, d3;
#endif
    s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;

    do {
      s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
#if defined(__aarch64__)
      s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;

      __builtin_prefetch(dst + 0 * dst_stride);
      __builtin_prefetch(dst + 1 * dst_stride);
      __builtin_prefetch(dst + 2 * dst_stride);
      __builtin_prefetch(dst + 3 * dst_stride);
      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
      d1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
      d2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
      d3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);

      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
      if ((w == 4) && (h != 2)) {
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
                      0);  // 00 01 02 03
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
                      1);  // 10 11 12 13
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23),
                      0);  // 20 21 22 23
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23),
                      1);  // 30 31 32 33
        dst += dst_stride;
      } else if ((w == 4) && (h == 2)) {
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
                      0);  // 00 01 02 03
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
                      1);  // 10 11 12 13
        dst += dst_stride;
      } else if ((w == 2) && (h != 2)) {
        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0);  // 00 01
        dst += dst_stride;
        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 2);  // 10 11
        dst += dst_stride;
        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d23), 0);  // 20 21
        dst += dst_stride;
        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d23), 2);  // 30 31
        dst += dst_stride;
      } else if ((w == 2) && (h == 2)) {
        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0);  // 00 01
        dst += dst_stride;
        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 2);  // 10 11
        dst += dst_stride;
      }
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      h -= 4;
#else
      __builtin_prefetch(dst + 0 * dst_stride);
      __builtin_prefetch(src + 0 * src_stride);

      d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);

      d01 = vqrshrun_n_s16(vcombine_s16(d0, d0), FILTER_BITS);

      if (w == 4) {
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
        dst += dst_stride;
      } else if (w == 2) {
        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0);
        dst += dst_stride;
      }
      s0 = s1;
      s1 = s2;
      s2 = s3;
      s3 = s4;
      s4 = s5;
      s5 = s6;
      s6 = s7;
      h -= 1;
#endif
    } while (h > 0);
  } else {
    int height;
    const uint8_t *s;
    uint8_t *d;
    uint8x8_t t0;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
#if defined(__aarch64__)
    uint8x8_t t1, t2, t3;
    int16x8_t s8, s9, s10;
#endif
    do {
      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      __builtin_prefetch(src + 4 * src_stride);
      __builtin_prefetch(src + 5 * src_stride);
      __builtin_prefetch(src + 6 * src_stride);
      s = src;
      s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      d = dst;
      height = h;

      do {
        s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
#if defined(__aarch64__)
        s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;

        __builtin_prefetch(d + 0 * dst_stride);
        __builtin_prefetch(d + 1 * dst_stride);
        __builtin_prefetch(d + 2 * dst_stride);
        __builtin_prefetch(d + 3 * dst_stride);
        __builtin_prefetch(s + 0 * src_stride);
        __builtin_prefetch(s + 1 * src_stride);
        __builtin_prefetch(s + 2 * src_stride);
        __builtin_prefetch(s + 3 * src_stride);
        t0 = convolve8_vert_8x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
        t1 = convolve8_vert_8x4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
        t2 = convolve8_vert_8x4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
        t3 = convolve8_vert_8x4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
        if (h != 2) {
          vst1_u8(d, t0);
          d += dst_stride;
          vst1_u8(d, t1);
          d += dst_stride;
          vst1_u8(d, t2);
          d += dst_stride;
          vst1_u8(d, t3);
          d += dst_stride;
        } else {
          vst1_u8(d, t0);
          d += dst_stride;
          vst1_u8(d, t1);
          d += dst_stride;
        }
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        height -= 4;
#else
        __builtin_prefetch(d);
        __builtin_prefetch(s);

        t0 = convolve8_vert_8x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);

        vst1_u8(d, t0);
        d += dst_stride;

        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;
        s5 = s6;
        s6 = s7;
        height -= 1;
#endif
      } while (height > 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w > 0);
  }
}

// Horizontal filtering for convolve_2d_sr when width is a multiple of 8.
// Processes one row at a time.
static INLINE void horiz_filter_w8_single_row(
    const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
    const int dst_stride, int width, int height, const int16_t *x_filter,
    const int16x8_t horiz_const, const int16x8_t shift_round_0) {
  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
  do {
    uint8x8_t t0 = vld1_u8(src_ptr);
    s0 = vreinterpretq_s16_u16(vmovl_u8(t0));  // a0 a1 a2 a3 a4 a5 a6 a7

    int width_tmp = width;
    const uint8_t *s = src_ptr + 8;
    int16_t *dst_tmp = dst_ptr;

    __builtin_prefetch(dst_ptr);

    do {
      t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
      s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
      int16x8_t sum = s0;
      s0 = s7;

      s1 = vextq_s16(sum, s7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
      s2 = vextq_s16(sum, s7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
      s3 = vextq_s16(sum, s7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
      s4 = vextq_s16(sum, s7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
      s5 = vextq_s16(sum, s7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
      s6 = vextq_s16(sum, s7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
      s7 = vextq_s16(sum, s7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14

      int16x8_t res0 = convolve8_8x8_s16(sum, s1, s2, s3, s4, s5, s6, s7,
                                         x_filter, horiz_const, shift_round_0);

      vst1q_s16(dst_tmp, res0);

      s += 8;
      dst_tmp += 8;
      width_tmp -= 8;
    } while (width_tmp > 0);
    src_ptr += src_stride;
    dst_ptr += dst_stride;
    height--;
  } while (height > 0);
}

// Horizontal filtering for convolve_2d_sr when width <= 4.
// Processes one row at a time.
static INLINE void horiz_filter_w4_single_row(
    const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
    const int dst_stride, int width, int height, const int16_t *x_filter,
    const int16x4_t horiz_const, const int16x4_t shift_round_0) {
  int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
  do {
    const uint8_t *s = src_ptr;

    __builtin_prefetch(s);

    uint8x8_t t0 = vld1_u8(s);  // a0 a1 a2 a3 a4 a5 a6 a7
    int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
    s0 = vget_low_s16(tt0);
    s4 = vget_high_s16(tt0);

    __builtin_prefetch(dst_ptr);
    s += 8;

    t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
    s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));

    s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
    s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
    s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6
    s5 = vext_s16(s4, s7, 1);  // a5 a6 a7 a8
    s6 = vext_s16(s4, s7, 2);  // a6 a7 a8 a9
    s7 = vext_s16(s4, s7, 3);  // a7 a8 a9 a10

    int16x4_t d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
                                     horiz_const, shift_round_0);

    if (width == 4) {
      vst1_s16(dst_ptr, d0);
      dst_ptr += dst_stride;
    } else if (width == 2) {
      vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_s16(d0), 0);
      dst_ptr += dst_stride;
    }

    src_ptr += src_stride;
    height--;
  } while (height > 0);
}

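// 2D ("x then y") convolution: the horizontal pass writes 16-bit
// intermediates into im_block at reduced precision (round_0 - 1, matching the
// halved filter coefficients below), and the vertical pass filters im_block
// down to 8-bit output using 32-bit accumulation.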
void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
                             const InterpFilterParams *filter_params_x,
                             const InterpFilterParams *filter_params_y,
                             const int subpel_x_qn, const int subpel_y_qn,
                             ConvolveParams *conv_params) {
  int im_dst_stride;
  int width, height;
#if defined(__aarch64__)
  uint8x8_t t0;
  uint8x8_t t1, t2, t3, t4, t5, t6, t7;
  const uint8_t *s;
#endif

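  // Intermediate buffer for the horizontal pass, with extra rows to cover the
  // vertical filter taps above and below the block.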
  DECLARE_ALIGNED(16, int16_t,
                  im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);

  const int bd = 8;
  const int im_h = h + filter_params_y->taps - 1;
  const int im_stride = MAX_SB_SIZE;
  const int vert_offset = filter_params_y->taps / 2 - 1;
  const int horiz_offset = filter_params_x->taps / 2 - 1;

  const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;

  int16_t *dst_ptr;

  dst_ptr = im_block;
  im_dst_stride = im_stride;
  height = im_h;
  width = w;

  const int16_t round_bits =
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
  const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  int16_t x_filter_tmp[8];
  int16x8_t filter_x_coef = vld1q_s16(x_filter);

  // The filter coefficients are all even, so halve them to reduce the
  // intermediate precision requirement; shift_round_0 below uses round_0 - 1
  // to compensate.
  filter_x_coef = vshrq_n_s16(filter_x_coef, 1);
  vst1q_s16(&x_filter_tmp[0], filter_x_coef);

  assert(conv_params->round_0 > 0);

  if (w <= 4) {
    const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)));
    const int16x4_t shift_round_0 = vdup_n_s16(-(conv_params->round_0 - 1));

#if defined(__aarch64__)
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
    do {
      assert(height >= 4);
      s = src_ptr;
      __builtin_prefetch(s + 0 * src_stride);
      __builtin_prefetch(s + 1 * src_stride);
      __builtin_prefetch(s + 2 * src_stride);
      __builtin_prefetch(s + 3 * src_stride);

      load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
      transpose_u8_8x4(&t0, &t1, &t2, &t3);

      s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
      s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
      s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
      s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
      s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
      s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
      s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));

      __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
      __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
      __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
      __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
      s += 7;

      load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
      transpose_u8_8x4(&t0, &t1, &t2, &t3);

      s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
      s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
      s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
      s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));

      d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
                             horiz_const, shift_round_0);
      d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
                             horiz_const, shift_round_0);
      d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
                             horiz_const, shift_round_0);
      d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
                             horiz_const, shift_round_0);

      transpose_s16_4x4d(&d0, &d1, &d2, &d3);
      if (w == 4) {
        vst1_s16((dst_ptr + 0 * im_dst_stride), d0);
        vst1_s16((dst_ptr + 1 * im_dst_stride), d1);
        vst1_s16((dst_ptr + 2 * im_dst_stride), d2);
        vst1_s16((dst_ptr + 3 * im_dst_stride), d3);
      } else if (w == 2) {
        vst1_lane_u32((uint32_t *)(dst_ptr + 0 * im_dst_stride),
                      vreinterpret_u32_s16(d0), 0);
        vst1_lane_u32((uint32_t *)(dst_ptr + 1 * im_dst_stride),
                      vreinterpret_u32_s16(d1), 0);
        vst1_lane_u32((uint32_t *)(dst_ptr + 2 * im_dst_stride),
                      vreinterpret_u32_s16(d2), 0);
        vst1_lane_u32((uint32_t *)(dst_ptr + 3 * im_dst_stride),
                      vreinterpret_u32_s16(d3), 0);
      }
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * im_dst_stride;
      height -= 4;
    } while (height >= 4);

    if (height) {
      assert(height < 4);
      horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w,
                                 height, x_filter_tmp, horiz_const,
                                 shift_round_0);
    }
#else
    horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w,
                               height, x_filter_tmp, horiz_const,
                               shift_round_0);
#endif

  } else {
    const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)));
    const int16x8_t shift_round_0 = vdupq_n_s16(-(conv_params->round_0 - 1));

#if defined(__aarch64__)
    int16_t *d_tmp;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14;
    int16x8_t res0, res1, res2, res3, res4, res5, res6, res7;
    do {
      assert(height >= 8);
      __builtin_prefetch(src_ptr + 0 * src_stride);
      __builtin_prefetch(src_ptr + 1 * src_stride);
      __builtin_prefetch(src_ptr + 2 * src_stride);
      __builtin_prefetch(src_ptr + 3 * src_stride);
      __builtin_prefetch(src_ptr + 4 * src_stride);
      __builtin_prefetch(src_ptr + 5 * src_stride);
      __builtin_prefetch(src_ptr + 6 * src_stride);
      __builtin_prefetch(src_ptr + 7 * src_stride);

      load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

      transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
      s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
      s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

      width = w;
      s = src_ptr + 7;
      d_tmp = dst_ptr;

      __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
      __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
      __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
      __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
      __builtin_prefetch(dst_ptr + 4 * im_dst_stride);
      __builtin_prefetch(dst_ptr + 5 * im_dst_stride);
      __builtin_prefetch(dst_ptr + 6 * im_dst_stride);
      __builtin_prefetch(dst_ptr + 7 * im_dst_stride);

      do {
        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
        s14 = vreinterpretq_s16_u16(vmovl_u8(t7));

        res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
                                 horiz_const, shift_round_0);
        res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
                                 horiz_const, shift_round_0);
        res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
                                 horiz_const, shift_round_0);
        res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
                                 horiz_const, shift_round_0);
        res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp,
                                 horiz_const, shift_round_0);
        res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
                                 x_filter_tmp, horiz_const, shift_round_0);
        res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
                                 x_filter_tmp, horiz_const, shift_round_0);
        res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
                                 x_filter_tmp, horiz_const, shift_round_0);

        transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
                          &res7);

        store_s16_8x8(d_tmp, im_dst_stride, res0, res1, res2, res3, res4, res5,
                      res6, res7);

        s0 = s8;
        s1 = s9;
        s2 = s10;
        s3 = s11;
        s4 = s12;
        s5 = s13;
        s6 = s14;
        s += 8;
        d_tmp += 8;
        width -= 8;
      } while (width > 0);
      src_ptr += 8 * src_stride;
      dst_ptr += 8 * im_dst_stride;
      height -= 8;
    } while (height >= 8);

    if (height >= 4) {
      assert(height < 8);
      int16x4_t reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9,
          reg10, reg11, reg12, reg13, reg14;
      int16x4_t d0, d1, d2, d3, d4, d5, d6, d7;
      int16x8_t out0, out1, out2, out3;

      __builtin_prefetch(src_ptr + 0 * src_stride);
      __builtin_prefetch(src_ptr + 1 * src_stride);
      __builtin_prefetch(src_ptr + 2 * src_stride);
      __builtin_prefetch(src_ptr + 3 * src_stride);

      load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3);
      transpose_u8_8x4(&t0, &t1, &t2, &t3);

      reg0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
      reg1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
      reg2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
      reg3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
      reg4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
      reg5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
      reg6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));

      __builtin_prefetch(dst_ptr + 0 * dst_stride);
      __builtin_prefetch(dst_ptr + 1 * dst_stride);
      __builtin_prefetch(dst_ptr + 2 * dst_stride);
      __builtin_prefetch(dst_ptr + 3 * dst_stride);

      s = src_ptr + 7;
      d_tmp = dst_ptr;
      width = w;

      do {
        load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
        transpose_u8_8x4(&t0, &t1, &t2, &t3);

        reg7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
        reg8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
        reg9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
        reg10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
        reg11 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
        reg12 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
        reg13 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
        reg14 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));

        d0 = convolve8_4x4(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7,
                           x_filter_tmp);

        d1 = convolve8_4x4(reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8,
                           x_filter_tmp);

        d2 = convolve8_4x4(reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9,
                           x_filter_tmp);

        d3 = convolve8_4x4(reg3, reg4, reg5, reg6, reg7, reg8, reg9, reg10,
                           x_filter_tmp);

        d4 = convolve8_4x4(reg4, reg5, reg6, reg7, reg8, reg9, reg10, reg11,
                           x_filter_tmp);

        d5 = convolve8_4x4(reg5, reg6, reg7, reg8, reg9, reg10, reg11, reg12,
                           x_filter_tmp);

        d6 = convolve8_4x4(reg6, reg7, reg8, reg9, reg10, reg11, reg12, reg13,
                           x_filter_tmp);

        d7 = convolve8_4x4(reg7, reg8, reg9, reg10, reg11, reg12, reg13, reg14,
                           x_filter_tmp);

        transpose_s16_4x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &out0, &out1,
                          &out2, &out3);

        out0 = vaddq_s16(out0, horiz_const);
        out0 = vqrshlq_s16(out0, shift_round_0);

        out1 = vaddq_s16(out1, horiz_const);
        out1 = vqrshlq_s16(out1, shift_round_0);

        out2 = vaddq_s16(out2, horiz_const);
        out2 = vqrshlq_s16(out2, shift_round_0);

        out3 = vaddq_s16(out3, horiz_const);
        out3 = vqrshlq_s16(out3, shift_round_0);

        store_s16_8x4(d_tmp, im_dst_stride, out0, out1, out2, out3);

        reg0 = reg8;
        reg1 = reg9;
        reg2 = reg10;
        reg3 = reg11;
        reg4 = reg12;
        reg5 = reg13;
        reg6 = reg14;
        s += 8;
        d_tmp += 8;
        width -= 8;
      } while (width > 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * im_dst_stride;
      height -= 4;
    }

    if (height) {
      assert(height < 4);
      horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w,
                                 height, x_filter_tmp, horiz_const,
                                 shift_round_0);
    }
#else

    horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w,
                               height, x_filter_tmp, horiz_const,
                               shift_round_0);
#endif
  }

  // Vertical pass: filter the 16-bit intermediate block down to 8-bit dst.
  {
    uint8_t *dst_u8_ptr, *d_u8;
    int16_t *v_src_ptr, *v_s;

    const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
                              (1 << (offset_bits - conv_params->round_1 - 1));
    const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
        filter_params_y, subpel_y_qn & SUBPEL_MASK);

    const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1));
    const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
    const int32x4_t sub_const_vec = vdupq_n_s32(sub_const);

    src_stride = im_stride;
    v_src_ptr = im_block;
    dst_u8_ptr = dst;

    height = h;
    width = w;

    if (width <= 4) {
      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
      uint16x4_t d0;
      uint16x8_t dd0;
      uint8x8_t d01;

#if defined(__aarch64__)
      int16x4_t s8, s9, s10;
      uint16x4_t d1, d2, d3;
      uint16x8_t dd1;
      uint8x8_t d23;
#endif

      d_u8 = dst_u8_ptr;
      v_s = v_src_ptr;

      __builtin_prefetch(v_s + 0 * im_stride);
      __builtin_prefetch(v_s + 1 * im_stride);
      __builtin_prefetch(v_s + 2 * im_stride);
      __builtin_prefetch(v_s + 3 * im_stride);
      __builtin_prefetch(v_s + 4 * im_stride);
      __builtin_prefetch(v_s + 5 * im_stride);
      __builtin_prefetch(v_s + 6 * im_stride);
      __builtin_prefetch(v_s + 7 * im_stride);

      load_s16_4x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
      v_s += (7 * im_stride);

      do {
#if defined(__aarch64__)
        load_s16_4x4(v_s, im_stride, &s7, &s8, &s9, &s10);
        v_s += (im_stride << 2);

        __builtin_prefetch(d_u8 + 0 * dst_stride);
        __builtin_prefetch(d_u8 + 1 * dst_stride);
        __builtin_prefetch(d_u8 + 2 * dst_stride);
        __builtin_prefetch(d_u8 + 3 * dst_stride);

        d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
                                    round_shift_vec, offset_const,
                                    sub_const_vec);
        d1 = convolve8_vert_4x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
                                    round_shift_vec, offset_const,
                                    sub_const_vec);
        d2 = convolve8_vert_4x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
                                    round_shift_vec, offset_const,
                                    sub_const_vec);
        d3 = convolve8_vert_4x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
                                    round_shift_vec, offset_const,
                                    sub_const_vec);

        dd0 = vqrshlq_u16(vcombine_u16(d0, d1), vec_round_bits);
        dd1 = vqrshlq_u16(vcombine_u16(d2, d3), vec_round_bits);

        d01 = vqmovn_u16(dd0);
        d23 = vqmovn_u16(dd1);

        if ((w == 4) && (h != 2)) {
          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
                        0);  // 00 01 02 03
          d_u8 += dst_stride;
          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
                        1);  // 10 11 12 13
          d_u8 += dst_stride;
          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
                        0);  // 20 21 22 23
          d_u8 += dst_stride;
          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
                        1);  // 30 31 32 33
          d_u8 += dst_stride;
        } else if ((w == 2) && (h != 2)) {
          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
                        0);  // 00 01
          d_u8 += dst_stride;
          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
                        2);  // 10 11
          d_u8 += dst_stride;
          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
                        0);  // 20 21
          d_u8 += dst_stride;
          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
                        2);  // 30 31
          d_u8 += dst_stride;
        } else if ((w == 4) && (h == 2)) {
          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
                        0);  // 00 01 02 03
          d_u8 += dst_stride;
          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
                        1);  // 10 11 12 13
          d_u8 += dst_stride;
        } else if ((w == 2) && (h == 2)) {
          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
                        0);  // 00 01
          d_u8 += dst_stride;
          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
                        2);  // 10 11
          d_u8 += dst_stride;
        }

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        height -= 4;
#else
        s7 = vld1_s16(v_s);
        v_s += im_stride;

        __builtin_prefetch(d_u8 + 0 * dst_stride);

        d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
                                    round_shift_vec, offset_const,
                                    sub_const_vec);

        dd0 = vqrshlq_u16(vcombine_u16(d0, d0), vec_round_bits);
        d01 = vqmovn_u16(dd0);

        if (w == 4) {
          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
                        0);  // 00 01 02 03
          d_u8 += dst_stride;

        } else if (w == 2) {
          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
                        0);  // 00 01
          d_u8 += dst_stride;
        }

        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;
        s5 = s6;
        s6 = s7;
        height -= 1;
#endif
      } while (height > 0);
    } else {
      // Width is a multiple of 8 here; process four rows at a time on AArch64.
      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
      uint8x8_t res0;
#if defined(__aarch64__)
      int16x8_t s8, s9, s10;
      uint8x8_t res1, res2, res3;
#endif

      do {
        __builtin_prefetch(v_src_ptr + 0 * im_stride);
        __builtin_prefetch(v_src_ptr + 1 * im_stride);
        __builtin_prefetch(v_src_ptr + 2 * im_stride);
        __builtin_prefetch(v_src_ptr + 3 * im_stride);
        __builtin_prefetch(v_src_ptr + 4 * im_stride);
        __builtin_prefetch(v_src_ptr + 5 * im_stride);
        __builtin_prefetch(v_src_ptr + 6 * im_stride);
        __builtin_prefetch(v_src_ptr + 7 * im_stride);

        v_s = v_src_ptr;
        load_s16_8x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
        v_s += (7 * im_stride);

        d_u8 = dst_u8_ptr;
        height = h;

        do {
#if defined(__aarch64__)
          load_s16_8x4(v_s, im_stride, &s7, &s8, &s9, &s10);
          v_s += (im_stride << 2);

          __builtin_prefetch(d_u8 + 4 * dst_stride);
          __builtin_prefetch(d_u8 + 5 * dst_stride);
          __builtin_prefetch(d_u8 + 6 * dst_stride);
          __builtin_prefetch(d_u8 + 7 * dst_stride);

          res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
                                        y_filter, round_shift_vec, offset_const,
                                        sub_const_vec, vec_round_bits);
          res1 = convolve8_vert_8x4_s32(s1, s2, s3, s4, s5, s6, s7, s8,
                                        y_filter, round_shift_vec, offset_const,
                                        sub_const_vec, vec_round_bits);
          res2 = convolve8_vert_8x4_s32(s2, s3, s4, s5, s6, s7, s8, s9,
                                        y_filter, round_shift_vec, offset_const,
                                        sub_const_vec, vec_round_bits);
          res3 = convolve8_vert_8x4_s32(s3, s4, s5, s6, s7, s8, s9, s10,
                                        y_filter, round_shift_vec, offset_const,
                                        sub_const_vec, vec_round_bits);

          if (h != 2) {
            vst1_u8(d_u8, res0);
            d_u8 += dst_stride;
            vst1_u8(d_u8, res1);
            d_u8 += dst_stride;
            vst1_u8(d_u8, res2);
            d_u8 += dst_stride;
            vst1_u8(d_u8, res3);
            d_u8 += dst_stride;
          } else {
            vst1_u8(d_u8, res0);
            d_u8 += dst_stride;
            vst1_u8(d_u8, res1);
            d_u8 += dst_stride;
          }
          s0 = s4;
          s1 = s5;
          s2 = s6;
          s3 = s7;
          s4 = s8;
          s5 = s9;
          s6 = s10;
          height -= 4;
#else
          s7 = vld1q_s16(v_s);
          v_s += im_stride;

          __builtin_prefetch(d_u8 + 0 * dst_stride);

          res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
                                        y_filter, round_shift_vec, offset_const,
                                        sub_const_vec, vec_round_bits);

          vst1_u8(d_u8, res0);
          d_u8 += dst_stride;

          s0 = s1;
          s1 = s2;
          s2 = s3;
          s3 = s4;
          s4 = s5;
          s5 = s6;
          s6 = s7;
          height -= 1;
#endif
        } while (height > 0);
        v_src_ptr += 8;
        dst_u8_ptr += 8;
        w -= 8;
      } while (w > 0);
    }
  }
}

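// Scaled convolution (fractional step in x). Each output pixel selects its
// 8-tap kernel from x_filters by the accumulated sub-pel phase; a phase of
// zero reduces to copying the centre sample (offset 3 of the 8-tap window).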
static INLINE void scaledconvolve_horiz_w4(
    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
    const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
    const int x0_q4, const int x_step_q4, const int w, const int h) {
  DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
  int x, y, z;

  src -= SUBPEL_TAPS / 2 - 1;

  y = h;
  do {
    int x_q4 = x0_q4;
    x = 0;
    do {
      // process 4 src_x steps
      for (z = 0; z < 4; ++z) {
        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
        if (x_q4 & SUBPEL_MASK) {
          const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
          const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
          const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
          uint8x8_t s[8], d;
          int16x8_t ss[4];
          int16x4_t t[8], tt;

          load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]);
          transpose_u8_8x4(&s[0], &s[1], &s[2], &s[3]);

          ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
          ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1]));
          ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2]));
          ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3]));
          t[0] = vget_low_s16(ss[0]);
          t[1] = vget_low_s16(ss[1]);
          t[2] = vget_low_s16(ss[2]);
          t[3] = vget_low_s16(ss[3]);
          t[4] = vget_high_s16(ss[0]);
          t[5] = vget_high_s16(ss[1]);
          t[6] = vget_high_s16(ss[2]);
          t[7] = vget_high_s16(ss[3]);

          tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7],
                           filters, filter3, filter4);
          d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
          vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0);
        } else {
          int i;
          for (i = 0; i < 4; ++i) {
            temp[z * 4 + i] = src_x[i * src_stride + 3];
          }
        }
        x_q4 += x_step_q4;
      }

      // transpose the 4x4 filtered values back to dst
      {
        const uint8x8x4_t d4 = vld4_u8(temp);
        vst1_lane_u32((uint32_t *)&dst[x + 0 * dst_stride],
                      vreinterpret_u32_u8(d4.val[0]), 0);
        vst1_lane_u32((uint32_t *)&dst[x + 1 * dst_stride],
                      vreinterpret_u32_u8(d4.val[1]), 0);
        vst1_lane_u32((uint32_t *)&dst[x + 2 * dst_stride],
                      vreinterpret_u32_u8(d4.val[2]), 0);
        vst1_lane_u32((uint32_t *)&dst[x + 3 * dst_stride],
                      vreinterpret_u32_u8(d4.val[3]), 0);
      }
      x += 4;
    } while (x < w);

    src += src_stride * 4;
    dst += dst_stride * 4;
    y -= 4;
  } while (y > 0);
}

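// Horizontal scaled convolution for blocks of width >= 8, processed in 8x8
// tiles: eight x_q4 steps are filtered into temp, then the 8x8 tile is
// transposed back out to dst. Integer-pel phases copy the anchor pixel
// (tap 3) directly.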
static INLINE void scaledconvolve_horiz_w8(
    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
    const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
    const int x0_q4, const int x_step_q4, const int w, const int h) {
  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
  int x, y, z;
  src -= SUBPEL_TAPS / 2 - 1;

  // This function processes 8x8 areas. The intermediate height is not always
  // a multiple of 8, so force it to be a multiple of 8 here.
  y = (h + 7) & ~7;

  do {
    int x_q4 = x0_q4;
    x = 0;
    do {
      uint8x8_t d[8];
      // process 8 src_x steps
      for (z = 0; z < 8; ++z) {
        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];

        if (x_q4 & SUBPEL_MASK) {
          const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
          uint8x8_t s[8];
          load_u8_8x8(src_x, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4],
                      &s[5], &s[6], &s[7]);
          transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
                           &s[7]);
          d[0] = scale_filter_8(s, filters);
          vst1_u8(&temp[8 * z], d[0]);
        } else {
          int i;
          for (i = 0; i < 8; ++i) {
            temp[z * 8 + i] = src_x[i * src_stride + 3];
          }
        }
        x_q4 += x_step_q4;
      }

      // transpose the 8x8 filtered values back to dst
      load_u8_8x8(temp, 8, &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
                  &d[7]);
      transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
      vst1_u8(&dst[x + 0 * dst_stride], d[0]);
      vst1_u8(&dst[x + 1 * dst_stride], d[1]);
      vst1_u8(&dst[x + 2 * dst_stride], d[2]);
      vst1_u8(&dst[x + 3 * dst_stride], d[3]);
      vst1_u8(&dst[x + 4 * dst_stride], d[4]);
      vst1_u8(&dst[x + 5 * dst_stride], d[5]);
      vst1_u8(&dst[x + 6 * dst_stride], d[6]);
      vst1_u8(&dst[x + 7 * dst_stride], d[7]);
      x += 8;
    } while (x < w);

    src += src_stride * 8;
    dst += dst_stride * 8;
  } while (y -= 8);
}

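// Vertical scaled convolution for blocks of width 4: each iteration loads
// eight source rows at the current y_q4 phase, filters them into one output
// row, and narrows with a rounding shift by FILTER_BITS (7). Integer-pel
// phases copy the anchor row (tap 3) directly.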
static INLINE void scaledconvolve_vert_w4(
    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
    const int y0_q4, const int y_step_q4, const int w, const int h) {
  int y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  y = h;
  do {
    const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];

    if (y_q4 & SUBPEL_MASK) {
      const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
      const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
      const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
      uint8x8_t s[8], d;
      int16x4_t t[8], tt;

      load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
                  &s[6], &s[7]);
      t[0] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[0])));
      t[1] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[1])));
      t[2] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[2])));
      t[3] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[3])));
      t[4] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[4])));
      t[5] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[5])));
      t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6])));
      t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7])));

      tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters,
                       filter3, filter4);
      d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
    } else {
      memcpy(dst, &src_y[3 * src_stride], w);
    }

    dst += dst_stride;
    y_q4 += y_step_q4;
  } while (--y);
}

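// Vertical scaled convolution for blocks of width 8: one 8-wide output row is
// produced per iteration via scale_filter_8; integer-pel phases copy the
// anchor row directly.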
static INLINE void scaledconvolve_vert_w8(
    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
    const int y0_q4, const int y_step_q4, const int w, const int h) {
  int y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  y = h;
  do {
    const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    if (y_q4 & SUBPEL_MASK) {
      const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
      uint8x8_t s[8], d;
      load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
                  &s[6], &s[7]);
      d = scale_filter_8(s, filters);
      vst1_u8(dst, d);
    } else {
      memcpy(dst, &src_y[3 * src_stride], w);
    }
    dst += dst_stride;
    y_q4 += y_step_q4;
  } while (--y);
}

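// Vertical scaled convolution for blocks of width >= 16: each output row is
// filtered 16 pixels at a time, splitting every 16-byte load into low and
// high halves so the 8-wide scale_filter_8 kernel can be reused.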
static INLINE void scaledconvolve_vert_w16(
    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
    const int y0_q4, const int y_step_q4, const int w, const int h) {
  int x, y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  y = h;
  do {
    const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    if (y_q4 & SUBPEL_MASK) {
      x = 0;
      do {
        const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
        uint8x16_t ss[8];
        uint8x8_t s[8], d[2];
        load_u8_16x8(src_y, src_stride, &ss[0], &ss[1], &ss[2], &ss[3], &ss[4],
                     &ss[5], &ss[6], &ss[7]);
        s[0] = vget_low_u8(ss[0]);
        s[1] = vget_low_u8(ss[1]);
        s[2] = vget_low_u8(ss[2]);
        s[3] = vget_low_u8(ss[3]);
        s[4] = vget_low_u8(ss[4]);
        s[5] = vget_low_u8(ss[5]);
        s[6] = vget_low_u8(ss[6]);
        s[7] = vget_low_u8(ss[7]);
        d[0] = scale_filter_8(s, filters);

        s[0] = vget_high_u8(ss[0]);
        s[1] = vget_high_u8(ss[1]);
        s[2] = vget_high_u8(ss[2]);
        s[3] = vget_high_u8(ss[3]);
        s[4] = vget_high_u8(ss[4]);
        s[5] = vget_high_u8(ss[5]);
        s[6] = vget_high_u8(ss[6]);
        s[7] = vget_high_u8(ss[7]);
        d[1] = scale_filter_8(s, filters);
        vst1q_u8(&dst[x], vcombine_u8(d[0], d[1]));
        src_y += 16;
        x += 16;
      } while (x < w);
    } else {
      memcpy(dst, &src_y[3 * src_stride], w);
    }
    dst += dst_stride;
    y_q4 += y_step_q4;
  } while (--y);
}

void aom_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const InterpKernel *filter,
                        int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                        int w, int h) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  // (1) Interpolate horizontally into an intermediate buffer, temp.
  // (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round-up because block may be located at sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --(((64 - 1) * 32 + 15) >> 4) + 1 + 8 = 126 + 1 + 8 = 135.
  // --Require an additional 8 rows for the horiz_w8 transpose tail.
  // When called from the frame scaling function, the smallest scaling factor
  // is x1/4 ==> y_step_q4 = 64. Since w and h are at most 16 there, the temp
  // buffer is still big enough.
  DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
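  // For example, in the worst normative case h = 64, y_step_q4 = 32 and
  // y0_q4 = 15: intermediate_height = ((63 * 32 + 15) >> 4) + 8 = 134 rows,
  // within the 135-row budget derived above.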

  assert(w <= 64);
  assert(h <= 64);
  assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
  assert(x_step_q4 <= 64);

  if (w >= 8) {
    scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
                            intermediate_height);
  } else {
    scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
                            intermediate_height);
  }

  if (w >= 16) {
    scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                            dst_stride, filter, y0_q4, y_step_q4, w, h);
  } else if (w == 8) {
    scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                           dst_stride, filter, y0_q4, y_step_q4, w, h);
  } else {
    scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                           dst_stride, filter, y0_q4, y_step_q4, w, h);
  }
}

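// Usage sketch (hypothetical values): a step of 16 in q4 units is unscaled,
// so downscaling by 2 in each dimension corresponds to
// x_step_q4 = y_step_q4 = 32. Assuming 'kernels' names a 16-phase, 8-tap
// InterpKernel table, a call might look like:
//
//   aom_scaled_2d_neon(src, src_stride, dst, dst_stride, kernels,
//                      /*x0_q4=*/0, /*x_step_q4=*/32,
//                      /*y0_q4=*/0, /*y_step_q4=*/32, /*w=*/32, /*h=*/32);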