/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>

#include "common/tools_common.h"

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom/aom_integer.h"
//------------------------------------------------------------------------------
// DC 4x4

// 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
                          const uint8_t *left, int do_above, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_above) {
    const uint8x8_t A = vld1_u8(above);  // top row
    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    sum_top = vcombine_u16(p1, p1);
  }

  if (do_left) {
    const uint8x8_t L = vld1_u8(left);   // left border
    const uint16x4_t p0 = vpaddl_u8(L);  // cascading summation of the left
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    sum_left = vcombine_u16(p1, p1);
  }

  if (do_above && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 3);
  } else if (do_above) {
    dc0 = vrshrn_n_u16(sum_top, 2);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 2);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 4; ++i) {
      vst1_lane_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc), 0);
    }
  }
}
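
// For reference, a plain-C sketch (illustrative only, excluded from the
// build) of the rule dc_4x4 implements: average the available border pixels
// with round-to-nearest, falling back to the mid-grey value 0x80 when
// neither border is available.
#if 0
static uint8_t dc_4x4_scalar(const uint8_t *above, const uint8_t *left,
                             int do_above, int do_left) {
  int sum = 0, count = 0;
  if (do_above) {
    for (int i = 0; i < 4; ++i) sum += above[i];
    count += 4;
  }
  if (do_left) {
    for (int i = 0; i < 4; ++i) sum += left[i];
    count += 4;
  }
  return count ? (uint8_t)((sum + (count >> 1)) / count) : 0x80;
}
#endif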

void aom_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  dc_4x4(dst, stride, above, left, 1, 1);
}

void aom_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  dc_4x4(dst, stride, NULL, left, 0, 1);
}

void aom_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)left;
  dc_4x4(dst, stride, above, NULL, 1, 0);
}

void aom_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  dc_4x4(dst, stride, NULL, NULL, 0, 0);
}

//------------------------------------------------------------------------------
// DC 8x8

// 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
                          const uint8_t *left, int do_above, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_above) {
    const uint8x8_t A = vld1_u8(above);  // top row
    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    sum_top = vcombine_u16(p2, p2);
  }

  if (do_left) {
    const uint8x8_t L = vld1_u8(left);   // left border
    const uint16x4_t p0 = vpaddl_u8(L);  // cascading summation of the left
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    sum_left = vcombine_u16(p2, p2);
  }

  if (do_above && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 4);
  } else if (do_above) {
    dc0 = vrshrn_n_u16(sum_top, 3);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 3);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 8; ++i) {
      vst1_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc));
    }
  }
}

void aom_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  dc_8x8(dst, stride, above, left, 1, 1);
}

void aom_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  dc_8x8(dst, stride, NULL, left, 0, 1);
}

void aom_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)left;
  dc_8x8(dst, stride, above, NULL, 1, 0);
}

void aom_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  dc_8x8(dst, stride, NULL, NULL, 0, 0);
}

//------------------------------------------------------------------------------
// DC 16x16

// 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left,
                            int do_above, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_above) {
    const uint8x16_t A = vld1q_u8(above);  // top row
    const uint16x8_t p0 = vpaddlq_u8(A);   // cascading summation of the top
    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    const uint16x4_t p3 = vpadd_u16(p2, p2);
    sum_top = vcombine_u16(p3, p3);
  }

  if (do_left) {
    const uint8x16_t L = vld1q_u8(left);  // left column
    const uint16x8_t p0 = vpaddlq_u8(L);  // cascading summation of the left
    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    const uint16x4_t p3 = vpadd_u16(p2, p2);
    sum_left = vcombine_u16(p3, p3);
  }

  if (do_above && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 5);
  } else if (do_above) {
    dc0 = vrshrn_n_u16(sum_top, 4);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 4);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 16; ++i) {
      vst1q_u8(dst + i * stride, dc);
    }
  }
}

void aom_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  dc_16x16(dst, stride, above, left, 1, 1);
}

void aom_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  (void)above;
  dc_16x16(dst, stride, NULL, left, 0, 1);
}

void aom_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  dc_16x16(dst, stride, above, NULL, 1, 0);
}

void aom_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  dc_16x16(dst, stride, NULL, NULL, 0, 0);
}

//------------------------------------------------------------------------------
// DC 32x32

// 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left,
                            int do_above, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_above) {
    const uint8x16_t A0 = vld1q_u8(above);  // top row
    const uint8x16_t A1 = vld1q_u8(above + 16);
    const uint16x8_t p0 = vpaddlq_u8(A0);  // cascading summation of the top
    const uint16x8_t p1 = vpaddlq_u8(A1);
    const uint16x8_t p2 = vaddq_u16(p0, p1);
    const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
    const uint16x4_t p4 = vpadd_u16(p3, p3);
    const uint16x4_t p5 = vpadd_u16(p4, p4);
    sum_top = vcombine_u16(p5, p5);
  }

  if (do_left) {
    const uint8x16_t L0 = vld1q_u8(left);  // left column
    const uint8x16_t L1 = vld1q_u8(left + 16);
    const uint16x8_t p0 = vpaddlq_u8(L0);  // cascading summation of the left
    const uint16x8_t p1 = vpaddlq_u8(L1);
    const uint16x8_t p2 = vaddq_u16(p0, p1);
    const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
    const uint16x4_t p4 = vpadd_u16(p3, p3);
    const uint16x4_t p5 = vpadd_u16(p4, p4);
    sum_left = vcombine_u16(p5, p5);
  }

  if (do_above && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 6);
  } else if (do_above) {
    dc0 = vrshrn_n_u16(sum_top, 5);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 5);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 32; ++i) {
      vst1q_u8(dst + i * stride, dc);
      vst1q_u8(dst + i * stride + 16, dc);
    }
  }
}

void aom_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  dc_32x32(dst, stride, above, left, 1, 1);
}

void aom_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  (void)above;
  dc_32x32(dst, stride, NULL, left, 0, 1);
}

void aom_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  dc_32x32(dst, stride, above, NULL, 1, 0);
}

void aom_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  dc_32x32(dst, stride, NULL, NULL, 0, 0);
}

// -----------------------------------------------------------------------------

void aom_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x8_t XABCD_u8 = vld1_u8(above - 1);
  const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
  const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
  const uint32x2_t zero = vdup_n_u32(0);
  const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0);
  const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL);
  const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8));
  const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
  const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
  const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
  const uint8_t D = vget_lane_u8(XABCD_u8, 4);
  const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
  const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
  const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
  const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
  const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
  const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
}
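
// The vhadd/vrhadd pair above computes the exact 3-tap smoothing filter
// (a + 2 * b + c + 2) >> 2. A plain-C sketch (illustrative only, excluded
// from the build) of the same D135 prediction, where p[] is the reversed
// left column followed by the top-left and above row:
#if 0
static void d135_4x4_scalar(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left) {
  const uint8_t p[9] = { left[3],  left[2],  left[1],  left[0], above[-1],
                         above[0], above[1], above[2], above[3] };
  uint8_t f[7];
  for (int i = 0; i < 7; ++i)
    f[i] = (uint8_t)((p[i] + 2 * p[i + 1] + p[i + 2] + 2) >> 2);
  for (int r = 0; r < 4; ++r)
    for (int c = 0; c < 4; ++c) dst[r * stride + c] = f[3 - r + c];
}
#endif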

void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  int i;
  uint32x2_t d0u32 = vdup_n_u32(0);
  (void)left;

  d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0);
  for (i = 0; i < 4; i++, dst += stride)
    vst1_lane_u32((uint32_t *)dst, d0u32, 0);
}

void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  int i;
  uint8x8_t d0u8 = vdup_n_u8(0);
  (void)left;

  d0u8 = vld1_u8(above);
  for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8);
}

void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  int i;
  uint8x16_t q0u8 = vdupq_n_u8(0);
  (void)left;

  q0u8 = vld1q_u8(above);
  for (i = 0; i < 16; i++, dst += stride) vst1q_u8(dst, q0u8);
}

void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  int i;
  uint8x16_t q0u8 = vdupq_n_u8(0);
  uint8x16_t q1u8 = vdupq_n_u8(0);
  (void)left;

  q0u8 = vld1q_u8(above);
  q1u8 = vld1q_u8(above + 16);
  for (i = 0; i < 32; i++, dst += stride) {
    vst1q_u8(dst, q0u8);
    vst1q_u8(dst + 16, q1u8);
  }
}

void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  uint8x8_t d0u8 = vdup_n_u8(0);
  uint32x2_t d1u32 = vdup_n_u32(0);
  (void)above;

  d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);

  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
}

void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  uint8x8_t d0u8 = vdup_n_u8(0);
  uint64x1_t d1u64 = vdup_n_u64(0);
  (void)above;

  d1u64 = vld1_u64((const uint64_t *)left);

  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0);
  vst1_u8(dst, d0u8);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1);
  vst1_u8(dst, d0u8);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2);
  vst1_u8(dst, d0u8);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3);
  vst1_u8(dst, d0u8);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4);
  vst1_u8(dst, d0u8);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5);
  vst1_u8(dst, d0u8);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6);
  vst1_u8(dst, d0u8);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7);
  vst1_u8(dst, d0u8);
}

void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  int j;
  uint8x8_t d2u8 = vdup_n_u8(0);
  uint8x16_t q0u8 = vdupq_n_u8(0);
  uint8x16_t q1u8 = vdupq_n_u8(0);
  (void)above;

  q1u8 = vld1q_u8(left);
  d2u8 = vget_low_u8(q1u8);
  for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
    q0u8 = vdupq_lane_u8(d2u8, 0);
    vst1q_u8(dst, q0u8);
    dst += stride;
    q0u8 = vdupq_lane_u8(d2u8, 1);
    vst1q_u8(dst, q0u8);
    dst += stride;
    q0u8 = vdupq_lane_u8(d2u8, 2);
    vst1q_u8(dst, q0u8);
    dst += stride;
    q0u8 = vdupq_lane_u8(d2u8, 3);
    vst1q_u8(dst, q0u8);
    dst += stride;
    q0u8 = vdupq_lane_u8(d2u8, 4);
    vst1q_u8(dst, q0u8);
    dst += stride;
    q0u8 = vdupq_lane_u8(d2u8, 5);
    vst1q_u8(dst, q0u8);
    dst += stride;
    q0u8 = vdupq_lane_u8(d2u8, 6);
    vst1q_u8(dst, q0u8);
    dst += stride;
    q0u8 = vdupq_lane_u8(d2u8, 7);
    vst1q_u8(dst, q0u8);
    dst += stride;
  }
}

void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  int j, k;
  uint8x8_t d2u8 = vdup_n_u8(0);
  uint8x16_t q0u8 = vdupq_n_u8(0);
  uint8x16_t q1u8 = vdupq_n_u8(0);
  (void)above;

  for (k = 0; k < 2; k++, left += 16) {
    q1u8 = vld1q_u8(left);
    d2u8 = vget_low_u8(q1u8);
    for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
      q0u8 = vdupq_lane_u8(d2u8, 0);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 1);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 2);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 3);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 4);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 5);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 6);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 7);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
    }
  }
}

static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
                                       const uint16_t *above,
                                       const uint16_t *left) {
  assert(bw >= 4);
  assert(IS_POWER_OF_TWO(bw));
  int expected_dc, sum = 0;
  const int count = bw * 2;
  uint32x4_t sum_q = vdupq_n_u32(0);
  uint32x2_t sum_d;
  uint16_t *dst_1;
  if (bw >= 8) {
    for (int i = 0; i < bw; i += 8) {
      sum_q = vpadalq_u16(sum_q, vld1q_u16(above));
      sum_q = vpadalq_u16(sum_q, vld1q_u16(left));
      above += 8;
      left += 8;
    }
    sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q));
    sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0);
    expected_dc = (sum + (count >> 1)) / count;
    const uint16x8_t dc = vdupq_n_u16((uint16_t)expected_dc);
    for (int r = 0; r < bw; r++) {
      dst_1 = dst;
      for (int i = 0; i < bw; i += 8) {
        vst1q_u16(dst_1, dc);
        dst_1 += 8;
      }
      dst += stride;
    }
  } else {  // 4x4
    sum_q = vaddl_u16(vld1_u16(above), vld1_u16(left));
    sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q));
    sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0);
    expected_dc = (sum + (count >> 1)) / count;
    const uint16x4_t dc = vdup_n_u16((uint16_t)expected_dc);
    for (int r = 0; r < bw; r++) {
      vst1_u16(dst, dc);
      dst += stride;
    }
  }
}
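
// A plain-C sketch (illustrative only, excluded from the build) of the
// computation above: sum the bw 'above' and bw 'left' samples and divide by
// their count with rounding.
#if 0
static uint16_t highbd_dc_scalar(int bw, const uint16_t *above,
                                 const uint16_t *left) {
  int sum = 0;
  for (int i = 0; i < bw; ++i) sum += above[i] + left[i];
  const int count = 2 * bw;
  return (uint16_t)((sum + (count >> 1)) / count);
}
#endif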

#define intra_pred_highbd_sized_neon(type, width)               \
  void aom_highbd_##type##_predictor_##width##x##width##_neon(  \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,   \
      const uint16_t *left, int bd) {                           \
    (void)bd;                                                   \
    highbd_##type##_predictor(dst, stride, width, above, left); \
  }

#define intra_pred_square(type)           \
  intra_pred_highbd_sized_neon(type, 4);  \
  intra_pred_highbd_sized_neon(type, 8);  \
  intra_pred_highbd_sized_neon(type, 16); \
  intra_pred_highbd_sized_neon(type, 32); \
  intra_pred_highbd_sized_neon(type, 64);

intra_pred_square(dc);
#undef intra_pred_square
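
// For example, intra_pred_highbd_sized_neon(dc, 4) above expands to:
//   void aom_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
//                                         const uint16_t *above,
//                                         const uint16_t *left, int bd) {
//     (void)bd;
//     highbd_dc_predictor(dst, stride, 4, above, left);
//   }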

/* ---------------------P R E D I C T I O N Z 1--------------------------- */

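// Each row of EvenOddMaskx is a byte-gather index vector for vtbl2_u8: row
// 'shift' pads its first 'shift' lanes with index 0, then picks every second
// byte starting at 'shift' (low half) and 'shift' + 1 (high half). This
// de-interleaves 2x-upsampled 'above' samples into adjacent-pixel operands.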
static DECLARE_ALIGNED(16, uint8_t, EvenOddMaskx[8][16]) = {
  { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },
  { 0, 1, 3, 5, 7, 9, 11, 13, 0, 2, 4, 6, 8, 10, 12, 14 },
  { 0, 0, 2, 4, 6, 8, 10, 12, 0, 0, 3, 5, 7, 9, 11, 13 },
  { 0, 0, 0, 3, 5, 7, 9, 11, 0, 0, 0, 4, 6, 8, 10, 12 },
  { 0, 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 0, 5, 7, 9, 11 },
  { 0, 0, 0, 0, 0, 5, 7, 9, 0, 0, 0, 0, 0, 6, 8, 10 },
  { 0, 0, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 0, 7, 9 },
  { 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 8 }
};

// Low bit depth functions
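// BaseMask[n] consists of n 0xff bytes followed by (32 - n) zero bytes. It is
// used to blend computed pixels with the replicated last reference pixel once
// a prediction position runs past max_base_x.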
/* clang-format off */
static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = {
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
};

/* clang-format on */
static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_neon_64(
    int H, int W, uint8x8_t *dst, const uint8_t *above, int upsample_above,
    int dx) {
  const int frac_bits = 6 - upsample_above;
  const int max_base_x = ((W + H) - 1) << upsample_above;

  assert(dx > 0);
  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5

  uint16x8_t a0, a1;
  uint16x8_t diff, a32;
  uint16x8_t a16;
  uint8x8_t a_mbase_x;

  a16 = vdupq_n_u16(16);
  a_mbase_x = vdup_n_u8(above[max_base_x]);
  uint16x8_t v_32 = vdupq_n_u16(32);
  int16x8_t v_upsample_above = vdupq_n_s16(upsample_above);
  uint16x8_t c3f = vdupq_n_u16(0x3f);

  int x = dx;
  for (int r = 0; r < W; r++) {
    uint16x8_t res;
    uint16x8_t shift;
    uint8x8x2_t v_tmp_a0_128;

    int base = x >> frac_bits;
    int base_max_diff = (max_base_x - base) >> upsample_above;
    if (base_max_diff <= 0) {
      for (int i = r; i < W; ++i) {
        dst[i] = a_mbase_x;  // fill the remaining rows with above[max_base_x]
      }
      return;
    }

    if (base_max_diff > H) base_max_diff = H;

    if (upsample_above) {
      v_tmp_a0_128 = vld2_u8(above + base);
      shift = vshrq_n_u16(
          vandq_u16(vshlq_u16(vdupq_n_u16(x), v_upsample_above), c3f), 1);
    } else {
      v_tmp_a0_128.val[0] = vld1_u8(above + base);
      v_tmp_a0_128.val[1] = vld1_u8(above + base + 1);
      shift = vshrq_n_u16(vandq_u16(vdupq_n_u16(x), c3f), 1);
    }
    a0 = vmovl_u8(v_tmp_a0_128.val[0]);
    a1 = vmovl_u8(v_tmp_a0_128.val[1]);
    diff = vsubq_u16(a1, a0);        // a[x+1] - a[x]
    a32 = vmlaq_u16(a16, a0, v_32);  // a[x] * 32 + 16
    res = vmlaq_u16(a32, diff, shift);

    uint8x8_t mask = vld1_u8(BaseMask[base_max_diff]);
    dst[r] =
        vorr_u8(vand_u8(mask, vshrn_n_u16(res, 5)), vbic_u8(a_mbase_x, mask));

    x += dx;
  }
}
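
// A plain-C sketch (illustrative only, excluded from the build) of the
// per-row interpolation performed above, with the upsampling path omitted:
// 'x' is the row's position in Q6 fixed point and W is the row width.
#if 0
static void z1_row_scalar(uint8_t *dst, int W, const uint8_t *above, int x,
                          int max_base_x) {
  const int base = x >> 6;            // integer part of the position
  const int shift = (x & 0x3f) >> 1;  // 5-bit fractional weight
  for (int c = 0; c < W; ++c) {
    if (base + c >= max_base_x) {
      dst[c] = above[max_base_x];
    } else {
      dst[c] = (uint8_t)((above[base + c] * 32 +
                          (above[base + c + 1] - above[base + c]) * shift +
                          16) >>
                         5);
    }
  }
}
#endif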

static void dr_prediction_z1_4xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above, int upsample_above,
                                      int dx) {
  uint8x8_t dstvec[16];

  dr_prediction_z1_HxW_internal_neon_64(4, N, dstvec, above, upsample_above,
                                        dx);
  for (int i = 0; i < N; i++) {
    vst1_lane_u32((uint32_t *)(dst + stride * i),
                  vreinterpret_u32_u8(dstvec[i]), 0);
  }
}

static void dr_prediction_z1_8xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above, int upsample_above,
                                      int dx) {
  uint8x8_t dstvec[32];

  dr_prediction_z1_HxW_internal_neon_64(8, N, dstvec, above, upsample_above,
                                        dx);
  for (int i = 0; i < N; i++) {
    vst1_u8(dst + stride * i, dstvec[i]);
  }
}

static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_neon(
    int H, int W, uint8x16_t *dst, const uint8_t *above, int upsample_above,
    int dx) {
  const int frac_bits = 6 - upsample_above;
  const int max_base_x = ((W + H) - 1) << upsample_above;

  assert(dx > 0);
  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5

  uint8x16x2_t a0, a1;
  uint16x8x2_t diff, a32;
  uint16x8_t a16, c3f;
  uint8x16_t a_mbase_x;

  a16 = vdupq_n_u16(16);
  a_mbase_x = vdupq_n_u8(above[max_base_x]);
  c3f = vdupq_n_u16(0x3f);
  uint16x8_t v_32 = vdupq_n_u16(32);
  uint8x16_t v_zero = vdupq_n_u8(0);
  int16x8_t v_upsample_above = vdupq_n_s16(upsample_above);

  int x = dx;
  for (int r = 0; r < W; r++) {
    uint16x8x2_t res;
    uint16x8_t shift;
    uint8x16_t a0_128, a1_128;

    int base = x >> frac_bits;
    int base_max_diff = (max_base_x - base) >> upsample_above;
    if (base_max_diff <= 0) {
      for (int i = r; i < W; ++i) {
        dst[i] = a_mbase_x;  // fill the remaining rows with above[max_base_x]
      }
      return;
    }

    if (base_max_diff > H) base_max_diff = H;

    if (upsample_above) {
      uint8x8x2_t v_tmp_a0_128 = vld2_u8(above + base);
      a0_128 = vcombine_u8(v_tmp_a0_128.val[0], v_tmp_a0_128.val[1]);
      a1_128 = vextq_u8(a0_128, v_zero, 8);
      shift = vshrq_n_u16(
          vandq_u16(vshlq_u16(vdupq_n_u16(x), v_upsample_above), c3f), 1);
    } else {
      a0_128 = vld1q_u8(above + base);
      a1_128 = vld1q_u8(above + base + 1);
      shift = vshrq_n_u16(vandq_u16(vdupq_n_u16(x), c3f), 1);
    }
    a0 = vzipq_u8(a0_128, v_zero);
    a1 = vzipq_u8(a1_128, v_zero);
    diff.val[0] = vsubq_u16(vreinterpretq_u16_u8(a1.val[0]),
                            vreinterpretq_u16_u8(a0.val[0]));  // a[x+1] - a[x]
    diff.val[1] = vsubq_u16(vreinterpretq_u16_u8(a1.val[1]),
                            vreinterpretq_u16_u8(a0.val[1]));  // a[x+1] - a[x]
    a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0.val[0]),
                           v_32);  // a[x] * 32 + 16
    a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0.val[1]),
                           v_32);  // a[x] * 32 + 16
    res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift);
    res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift);
    uint8x16_t v_temp =
        vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5));

    uint8x16_t mask = vld1q_u8(BaseMask[base_max_diff]);
    dst[r] = vorrq_u8(vandq_u8(mask, v_temp), vbicq_u8(a_mbase_x, mask));

    x += dx;
  }
}

static void dr_prediction_z1_16xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       int upsample_above, int dx) {
  uint8x16_t dstvec[64];

  dr_prediction_z1_HxW_internal_neon(16, N, dstvec, above, upsample_above, dx);
  for (int i = 0; i < N; i++) {
    vst1q_u8(dst + stride * i, dstvec[i]);
  }
}

static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_neon(
    int N, uint8x16x2_t *dstvec, const uint8_t *above, int upsample_above,
    int dx) {
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
  (void)upsample_above;
  const int frac_bits = 6;
  const int max_base_x = ((32 + N) - 1);

  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5

  uint8x16_t a_mbase_x;
  uint8x16x2_t a0, a1;
  uint16x8x2_t diff, a32;
  uint16x8_t a16, c3f;

  a_mbase_x = vdupq_n_u8(above[max_base_x]);
  a16 = vdupq_n_u16(16);
  c3f = vdupq_n_u16(0x3f);
  uint16x8_t v_32 = vdupq_n_u16(32);
  uint8x16_t v_zero = vdupq_n_u8(0);

  int x = dx;
  for (int r = 0; r < N; r++) {
    uint16x8x2_t res;
    uint8x16_t res16[2];
    uint8x16_t a0_128, a1_128;

    int base = x >> frac_bits;
    int base_max_diff = (max_base_x - base);
    if (base_max_diff <= 0) {
      for (int i = r; i < N; ++i) {
        dstvec[i].val[0] = a_mbase_x;  // save 32 values
        dstvec[i].val[1] = a_mbase_x;
      }
      return;
    }
    if (base_max_diff > 32) base_max_diff = 32;

    uint16x8_t shift = vshrq_n_u16(vandq_u16(vdupq_n_u16(x), c3f), 1);

    for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
      int mdiff = base_max_diff - j;
      if (mdiff <= 0) {
        res16[jj] = a_mbase_x;
      } else {
        a0_128 = vld1q_u8(above + base + j);
        a1_128 = vld1q_u8(above + base + j + 1);
        a0 = vzipq_u8(a0_128, v_zero);
        a1 = vzipq_u8(a1_128, v_zero);
        diff.val[0] =
            vsubq_u16(vreinterpretq_u16_u8(a1.val[0]),
                      vreinterpretq_u16_u8(a0.val[0]));  // a[x+1] - a[x]
        diff.val[1] =
            vsubq_u16(vreinterpretq_u16_u8(a1.val[1]),
                      vreinterpretq_u16_u8(a0.val[1]));  // a[x+1] - a[x]
        a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0.val[0]),
                               v_32);  // a[x] * 32 + 16
        a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0.val[1]),
                               v_32);  // a[x] * 32 + 16
        res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift);
        res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift);

        res16[jj] = vcombine_u8(vshrn_n_u16(res.val[0], 5),
                                vshrn_n_u16(res.val[1], 5));
      }
    }

    uint8x16x2_t mask;

    mask.val[0] = vld1q_u8(BaseMask[base_max_diff]);
    mask.val[1] = vld1q_u8(BaseMask[base_max_diff] + 16);
    dstvec[r].val[0] = vorrq_u8(vandq_u8(mask.val[0], res16[0]),
                                vbicq_u8(a_mbase_x, mask.val[0]));
    dstvec[r].val[1] = vorrq_u8(vandq_u8(mask.val[1], res16[1]),
                                vbicq_u8(a_mbase_x, mask.val[1]));
    x += dx;
  }
}

static void dr_prediction_z1_32xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       int upsample_above, int dx) {
  uint8x16x2_t dstvec[64];

  dr_prediction_z1_32xN_internal_neon(N, dstvec, above, upsample_above, dx);
  for (int i = 0; i < N; i++) {
    vst1q_u8(dst + stride * i, dstvec[i].val[0]);
    vst1q_u8(dst + stride * i + 16, dstvec[i].val[1]);
  }
}

static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       int upsample_above, int dx) {
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
  (void)upsample_above;
  const int frac_bits = 6;
  const int max_base_x = ((64 + N) - 1);

  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5

  uint8x16x2_t a0, a1;
  uint16x8x2_t a32, diff;
  uint16x8_t a16, c3f;
  uint8x16_t a_mbase_x, max_base_x128, mask128;

  a16 = vdupq_n_u16(16);
  a_mbase_x = vdupq_n_u8(above[max_base_x]);
  max_base_x128 = vdupq_n_u8(max_base_x);
  c3f = vdupq_n_u16(0x3f);
  uint16x8_t v_32 = vdupq_n_u16(32);
  uint8x16_t v_zero = vdupq_n_u8(0);
  uint8x16_t step = vdupq_n_u8(16);

  int x = dx;
  for (int r = 0; r < N; r++, dst += stride) {
    uint16x8x2_t res;

    int base = x >> frac_bits;
    if (base >= max_base_x) {
      for (int i = r; i < N; ++i) {
        vst1q_u8(dst, a_mbase_x);
        vst1q_u8(dst + 16, a_mbase_x);
        vst1q_u8(dst + 32, a_mbase_x);
        vst1q_u8(dst + 48, a_mbase_x);
        dst += stride;
      }
      return;
    }

    uint16x8_t shift = vshrq_n_u16(vandq_u16(vdupq_n_u16(x), c3f), 1);
    uint8x16_t a0_128, a1_128, res128;
    uint8x16_t base_inc128 =
        vaddq_u8(vdupq_n_u8(base), vcombine_u8(vcreate_u8(0x0706050403020100),
                                               vcreate_u8(0x0F0E0D0C0B0A0908)));

    for (int j = 0; j < 64; j += 16) {
      int mdif = max_base_x - (base + j);
      if (mdif <= 0) {
        vst1q_u8(dst + j, a_mbase_x);
      } else {
        a0_128 = vld1q_u8(above + base + j);
        a1_128 = vld1q_u8(above + base + 1 + j);
        a0 = vzipq_u8(a0_128, v_zero);
        a1 = vzipq_u8(a1_128, v_zero);
        diff.val[0] =
            vsubq_u16(vreinterpretq_u16_u8(a1.val[0]),
                      vreinterpretq_u16_u8(a0.val[0]));  // a[x+1] - a[x]
        diff.val[1] =
            vsubq_u16(vreinterpretq_u16_u8(a1.val[1]),
                      vreinterpretq_u16_u8(a0.val[1]));  // a[x+1] - a[x]
        a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0.val[0]),
                               v_32);  // a[x] * 32 + 16
        a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0.val[1]),
                               v_32);  // a[x] * 32 + 16
        res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift);
        res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift);
        uint8x16_t v_temp = vcombine_u8(vshrn_n_u16(res.val[0], 5),
                                        vshrn_n_u16(res.val[1], 5));

        mask128 = vcgtq_u8(vqsubq_u8(max_base_x128, base_inc128), v_zero);
        res128 =
            vorrq_u8(vandq_u8(mask128, v_temp), vbicq_u8(a_mbase_x, mask128));
        vst1q_u8(dst + j, res128);

        base_inc128 = vaddq_u8(base_inc128, step);
      }
    }
    x += dx;
  }
}
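
// Note: unlike the narrower variants above, the 64-wide loop derives its
// out-of-range mask arithmetically (vqsubq_u8 of the running base indices
// from max_base_x, then vcgtq_u8 against zero) instead of loading it from
// BaseMask, avoiding a table lookup for each of the four 16-byte chunks.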

// Directional prediction, zone 1: 0 < angle < 90
void av1_dr_prediction_z1_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                               const uint8_t *above, const uint8_t *left,
                               int upsample_above, int dx, int dy) {
  (void)left;
  (void)dy;

  switch (bw) {
    case 4:
      dr_prediction_z1_4xN_neon(bh, dst, stride, above, upsample_above, dx);
      break;
    case 8:
      dr_prediction_z1_8xN_neon(bh, dst, stride, above, upsample_above, dx);
      break;
    case 16:
      dr_prediction_z1_16xN_neon(bh, dst, stride, above, upsample_above, dx);
      break;
    case 32:
      dr_prediction_z1_32xN_neon(bh, dst, stride, above, upsample_above, dx);
      break;
    case 64:
      dr_prediction_z1_64xN_neon(bh, dst, stride, above, upsample_above, dx);
      break;
    default: break;
  }
  return;
}

/* ---------------------P R E D I C T I O N Z 2--------------------------- */

static DECLARE_ALIGNED(16, uint8_t, LoadMaskz2[4][16]) = {
  { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
    0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff }
};

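// vext_u8 requires a compile-time-constant shift amount, so the variable
// byte-shift needed by the z2 code below is dispatched through a switch.
// vector_shuffle further down does the same for 16-byte vectors.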
static AOM_FORCE_INLINE void vector_shift_x4(uint8x8_t *vec, uint8x8_t *v_zero,
                                             int shift_value) {
  switch (shift_value) {
    case 1: *vec = vext_u8(*v_zero, *vec, 7); break;
    case 2: *vec = vext_u8(*v_zero, *vec, 6); break;
    case 3: *vec = vext_u8(*v_zero, *vec, 5); break;
    default: break;
  }
}

static void dr_prediction_z2_Nx4_neon(int N, uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above, const uint8_t *left,
                                      int upsample_above, int upsample_left,
                                      int dx, int dy) {
  const int min_base_x = -(1 << upsample_above);
  const int min_base_y = -(1 << upsample_left);
  const int frac_bits_x = 6 - upsample_above;
  const int frac_bits_y = 6 - upsample_left;

  assert(dx > 0);
  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  uint16x8_t a0_x, a1_x, a32, diff;
  uint16x8_t v_32 = vdupq_n_u16(32);
  uint16x8_t v_zero = vdupq_n_u16(0);
  uint16x8_t a16 = vdupq_n_u16(16);

  uint8x8_t v_zero_u8 = vdup_n_u8(0);
  uint16x4_t v_c3f = vdup_n_u16(0x3f);
  uint16x4_t r6 = vcreate_u16(0x00C0008000400000);
  int16x4_t v_upsample_left = vdup_n_s16(upsample_left);
  int16x4_t v_upsample_above = vdup_n_s16(upsample_above);
  int16x4_t v_1234 = vcreate_s16(0x0004000300020001);
  int16x4_t dy64 = vdup_n_s16(dy);
  int16x4_t v_frac_bits_y = vdup_n_s16(-frac_bits_y);
  int16x4_t min_base_y64 = vdup_n_s16(min_base_y);
  int16x4_t v_one = vdup_lane_s16(v_1234, 0);

  for (int r = 0; r < N; r++) {
    uint16x8_t res, shift;
    uint16x4_t ydx;
    uint8x8_t resx, resy;
    uint16x4x2_t v_shift;

    int y = r + 1;
    int base_x = (-y * dx) >> frac_bits_x;
    int base_shift = 0;
    if (base_x < (min_base_x - 1)) {
      base_shift = (min_base_x - base_x - 1) >> upsample_above;
    }
    int base_min_diff =
        (min_base_x - base_x + upsample_above) >> upsample_above;
    if (base_min_diff > 4) {
      base_min_diff = 4;
    } else {
      if (base_min_diff < 0) base_min_diff = 0;
    }

    if (base_shift > 3) {
      a0_x = v_zero;
      a1_x = v_zero;
      v_shift.val[0] = vreinterpret_u16_u8(v_zero_u8);
      v_shift.val[1] = vreinterpret_u16_u8(v_zero_u8);
    } else {
      ydx = vdup_n_u16(y * dx);

      if (upsample_above) {
        uint8x8x2_t v_tmp;
        v_tmp.val[0] = vld1_u8(above + base_x + base_shift);
        v_tmp.val[1] = vld1_u8(above + base_x + base_shift + 8);
        uint8x8_t v_index_low = vld1_u8(EvenOddMaskx[base_shift]);
        uint8x8_t v_index_high = vld1_u8(EvenOddMaskx[base_shift] + 8);
        a0_x = vmovl_u8(vtbl2_u8(v_tmp, v_index_low));
        a1_x = vmovl_u8(vtbl2_u8(v_tmp, v_index_high));
        v_shift.val[0] = vshr_n_u16(
            vand_u16(vshl_u16(vsub_u16(r6, ydx), v_upsample_above), v_c3f), 1);
      } else {
        uint8x8_t v_a0_x64 = vld1_u8(above + base_x + base_shift);
        vector_shift_x4(&v_a0_x64, &v_zero_u8, base_shift);
        uint8x8_t v_a1_x64 = vext_u8(v_a0_x64, v_zero_u8, 1);
        v_shift.val[0] = vshr_n_u16(vand_u16(vsub_u16(r6, ydx), v_c3f), 1);
        a0_x = vmovl_u8(v_a0_x64);
        a1_x = vmovl_u8(v_a1_x64);
      }
    }

    // y calc
    uint8x8_t a0_y, a1_y;
    if (base_x < min_base_x) {
      DECLARE_ALIGNED(32, int16_t, base_y_c[4]);
      int16x4_t v_r6 = vdup_n_s16(r << 6);
      int16x4_t y_c64 = vmls_s16(v_r6, v_1234, dy64);
      int16x4_t base_y_c64 = vshl_s16(y_c64, v_frac_bits_y);
      uint16x4_t mask64 = vcgt_s16(min_base_y64, base_y_c64);

      base_y_c64 = vbic_s16(base_y_c64, vreinterpret_s16_u16(mask64));
      vst1_s16(base_y_c, base_y_c64);
      a0_y = v_zero_u8;
      a0_y = vld1_lane_u8(left + base_y_c[0], a0_y, 0);
      a0_y = vld1_lane_u8(left + base_y_c[1], a0_y, 2);
      a0_y = vld1_lane_u8(left + base_y_c[2], a0_y, 4);
      a0_y = vld1_lane_u8(left + base_y_c[3], a0_y, 6);

      base_y_c64 = vadd_s16(base_y_c64, v_one);
      vst1_s16(base_y_c, base_y_c64);
      a1_y = v_zero_u8;
      a1_y = vld1_lane_u8(left + base_y_c[0], a1_y, 0);
      a1_y = vld1_lane_u8(left + base_y_c[1], a1_y, 2);
      a1_y = vld1_lane_u8(left + base_y_c[2], a1_y, 4);
      a1_y = vld1_lane_u8(left + base_y_c[3], a1_y, 6);

      if (upsample_left) {
        v_shift.val[1] = vshr_n_u16(
            vand_u16(vshl_u16(vreinterpret_u16_s16(y_c64), v_upsample_left),
                     v_c3f),
            1);
      } else {
        v_shift.val[1] =
            vshr_n_u16(vand_u16(vreinterpret_u16_s16(y_c64), v_c3f), 1);
      }

      a0_x = vcombine_u16(vget_low_u16(a0_x), vreinterpret_u16_u8(a0_y));
      a1_x = vcombine_u16(vget_low_u16(a1_x), vreinterpret_u16_u8(a1_y));
    }
    shift = vcombine_u16(v_shift.val[0], v_shift.val[1]);
    diff = vsubq_u16(a1_x, a0_x);      // a[x+1] - a[x]
    a32 = vmlaq_u16(a16, a0_x, v_32);  // a[x] * 32 + 16
    res = vmlaq_u16(a32, diff, shift);
    resx = vshrn_n_u16(res, 5);
    resy = vext_u8(resx, v_zero_u8, 4);

    uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]);
    uint8x8_t v_resxy = vorr_u8(vand_u8(mask, resy), vbic_u8(resx, mask));
    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(v_resxy), 0);

    dst += stride;
  }
}
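
// A plain-C sketch (illustrative only, excluded from the build; upsampling
// and edge clamping omitted) of the per-pixel x/y selection above: the sample
// position falls on the 'above' row until it passes min_base_x, after which
// it is taken from the 'left' column. Index -1 addresses the top-left pixel.
#if 0
static uint8_t z2_pixel_scalar(const uint8_t *above, const uint8_t *left,
                               int r, int c, int dx, int dy) {
  const int y = r + 1;
  const int x = (c << 6) - y * dx;  // position along 'above', Q6 fixed point
  const int base_x = x >> 6;
  if (base_x >= -1) {  // sample from the above row
    const int shift = (x & 0x3f) >> 1;
    return (uint8_t)((above[base_x] * 32 +
                      (above[base_x + 1] - above[base_x]) * shift + 16) >>
                     5);
  } else {  // sample from the left column
    const int yq6 = (r << 6) - (c + 1) * dy;  // position along 'left', Q6
    const int base_y = yq6 >> 6;
    const int shift = (yq6 & 0x3f) >> 1;
    return (uint8_t)((left[base_y] * 32 +
                      (left[base_y + 1] - left[base_y]) * shift + 16) >>
                     5);
  }
}
#endif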
1241
vector_shuffle(uint8x16_t * vec,uint8x16_t * vzero,int shift_value)1242 static AOM_FORCE_INLINE void vector_shuffle(uint8x16_t *vec, uint8x16_t *vzero,
1243 int shift_value) {
1244 switch (shift_value) {
1245 case 1: *vec = vextq_u8(*vzero, *vec, 15); break;
1246 case 2: *vec = vextq_u8(*vzero, *vec, 14); break;
1247 case 3: *vec = vextq_u8(*vzero, *vec, 13); break;
1248 case 4: *vec = vextq_u8(*vzero, *vec, 12); break;
1249 case 5: *vec = vextq_u8(*vzero, *vec, 11); break;
1250 case 6: *vec = vextq_u8(*vzero, *vec, 10); break;
1251 case 7: *vec = vextq_u8(*vzero, *vec, 9); break;
1252 case 8: *vec = vextq_u8(*vzero, *vec, 8); break;
1253 case 9: *vec = vextq_u8(*vzero, *vec, 7); break;
1254 case 10: *vec = vextq_u8(*vzero, *vec, 6); break;
1255 case 11: *vec = vextq_u8(*vzero, *vec, 5); break;
1256 case 12: *vec = vextq_u8(*vzero, *vec, 4); break;
1257 case 13: *vec = vextq_u8(*vzero, *vec, 3); break;
1258 case 14: *vec = vextq_u8(*vzero, *vec, 2); break;
1259 case 15: *vec = vextq_u8(*vzero, *vec, 1); break;
1260 default: break;
1261 }
1262 }
1263
dr_prediction_z2_Nx8_neon(int N,uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left,int upsample_above,int upsample_left,int dx,int dy)1264 static void dr_prediction_z2_Nx8_neon(int N, uint8_t *dst, ptrdiff_t stride,
1265 const uint8_t *above, const uint8_t *left,
1266 int upsample_above, int upsample_left,
1267 int dx, int dy) {
1268 const int min_base_x = -(1 << upsample_above);
1269 const int min_base_y = -(1 << upsample_left);
1270 const int frac_bits_x = 6 - upsample_above;
1271 const int frac_bits_y = 6 - upsample_left;
1272
1273 // pre-filter above pixels
1274 // store in temp buffers:
1275 // above[x] * 32 + 16
1276 // above[x+1] - above[x]
1277 // final pixels will be calculated as:
1278 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1279 uint8x16x2_t a0_x, a1_x;
1280 uint16x8x2_t diff, a32;
1281 uint16x8_t c1234, a16, c3f;
1282 uint8x16_t a0_x128, a1_x128;
1283 int16x8_t min_base_y128, dy128;
1284 uint16x8_t v_32 = vdupq_n_u16(32);
1285 uint8x16_t v_zero = vdupq_n_u8(0);
1286 int16x8_t v_upsample_left = vdupq_n_s16(upsample_left);
1287 int16x8_t v_upsample_above = vdupq_n_s16(upsample_above);
1288 int16x8_t v_frac_bits_y = vdupq_n_s16(-frac_bits_y);
1289
1290 a16 = vdupq_n_u16(16);
1291 c3f = vdupq_n_u16(0x3f);
1292 min_base_y128 = vdupq_n_s16(min_base_y);
1293 dy128 = vdupq_n_s16(dy);
1294 c1234 = vcombine_u16(vcreate_u16(0x0004000300020001),
1295 vcreate_u16(0x0008000700060005));
1296
1297 for (int r = 0; r < N; r++) {
1298 uint8x8_t resx, resy, resxy;
1299 uint16x8_t r6, ydx;
1300 uint16x8x2_t res, shift;
1301
1302 int y = r + 1;
1303 int base_x = (-y * dx) >> frac_bits_x;
1304 int base_shift = 0;
1305 if (base_x < (min_base_x - 1)) {
1306 base_shift = (min_base_x - base_x - 1) >> upsample_above;
1307 }
1308 int base_min_diff =
1309 (min_base_x - base_x + upsample_above) >> upsample_above;
1310 if (base_min_diff > 8) {
1311 base_min_diff = 8;
1312 } else {
1313 if (base_min_diff < 0) base_min_diff = 0;
1314 }
1315
1316 if (base_shift > 7) {
1317 a0_x.val[0] = v_zero;
1318 a0_x.val[1] = v_zero;
1319 a1_x.val[0] = v_zero;
1320 a1_x.val[1] = v_zero;
1321 shift.val[0] = vreinterpretq_u16_u8(v_zero);
1322 shift.val[1] = vreinterpretq_u16_u8(v_zero);
1323 } else {
1324 ydx = vdupq_n_u16(y * dx);
1325 r6 = vshlq_n_u16(vextq_u16(c1234, vreinterpretq_u16_u8(v_zero), 2), 6);
1326
1327 if (upsample_above) {
1328 uint8x8x2_t v_tmp;
1329 v_tmp.val[0] = vld1_u8(above + base_x + base_shift);
1330 v_tmp.val[1] = vld1_u8(above + base_x + base_shift + 8);
1331 uint8x8_t v_index_low = vld1_u8(EvenOddMaskx[base_shift]);
1332 uint8x8_t v_index_high = vld1_u8(EvenOddMaskx[base_shift] + 8);
1333 shift.val[0] = vshrq_n_u16(
1334 vandq_u16(vshlq_u16(vsubq_u16(r6, ydx), v_upsample_above), c3f), 1);
1335 a0_x.val[0] =
1336 vreinterpretq_u8_u16(vmovl_u8(vtbl2_u8(v_tmp, v_index_low)));
1337 a1_x.val[0] =
1338 vreinterpretq_u8_u16(vmovl_u8(vtbl2_u8(v_tmp, v_index_high)));
1339 } else {
1340 a0_x128 = vld1q_u8(above + base_x + base_shift);
1341 a1_x128 = vextq_u8(a0_x128, v_zero, 1);
1342 vector_shuffle(&a0_x128, &v_zero, base_shift);
1343 vector_shuffle(&a1_x128, &v_zero, base_shift);
1344 shift.val[0] = vshrq_n_u16(vandq_u16(vsubq_u16(r6, ydx), c3f), 1);
1345 a0_x.val[0] = vreinterpretq_u8_u16(vmovl_u8(vget_low_u8(a0_x128)));
1346 a1_x.val[0] = vreinterpretq_u8_u16(vmovl_u8(vget_low_u8(a1_x128)));
1347 }
1348 }
1349
1350 // y calc
1351 if (base_x < min_base_x) {
1352 DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
1353 int16x8_t y_c128, base_y_c128;
1354 uint16x8_t mask128;
1355 int16x8_t v_r6 = vdupq_n_s16(r << 6);
1356
1357 y_c128 = vmlsq_s16(v_r6, vreinterpretq_s16_u16(c1234), dy128);
1358 base_y_c128 = vshlq_s16(y_c128, v_frac_bits_y);
1359 mask128 = vcgtq_s16(min_base_y128, base_y_c128);
1360
1361 base_y_c128 = vbicq_s16(base_y_c128, vreinterpretq_s16_u16(mask128));
1362 vst1q_s16(base_y_c, base_y_c128);
1363 a0_x.val[1] = v_zero;
1364 a0_x.val[1] = vld1q_lane_u8(left + base_y_c[0], a0_x.val[1], 0);
1365 a0_x.val[1] = vld1q_lane_u8(left + base_y_c[1], a0_x.val[1], 2);
1366 a0_x.val[1] = vld1q_lane_u8(left + base_y_c[2], a0_x.val[1], 4);
1367 a0_x.val[1] = vld1q_lane_u8(left + base_y_c[3], a0_x.val[1], 6);
1368 a0_x.val[1] = vld1q_lane_u8(left + base_y_c[4], a0_x.val[1], 8);
1369 a0_x.val[1] = vld1q_lane_u8(left + base_y_c[5], a0_x.val[1], 10);
1370 a0_x.val[1] = vld1q_lane_u8(left + base_y_c[6], a0_x.val[1], 12);
1371 a0_x.val[1] = vld1q_lane_u8(left + base_y_c[7], a0_x.val[1], 14);
1372
1373 base_y_c128 =
1374 vaddq_s16(base_y_c128, vreinterpretq_s16_u16(vshrq_n_u16(a16, 4)));
1375 vst1q_s16(base_y_c, base_y_c128);
1376 a1_x.val[1] = v_zero;
1377 a1_x.val[1] = vld1q_lane_u8(left + base_y_c[0], a1_x.val[1], 0);
1378 a1_x.val[1] = vld1q_lane_u8(left + base_y_c[1], a1_x.val[1], 2);
1379 a1_x.val[1] = vld1q_lane_u8(left + base_y_c[2], a1_x.val[1], 4);
1380 a1_x.val[1] = vld1q_lane_u8(left + base_y_c[3], a1_x.val[1], 6);
1381 a1_x.val[1] = vld1q_lane_u8(left + base_y_c[4], a1_x.val[1], 8);
1382 a1_x.val[1] = vld1q_lane_u8(left + base_y_c[5], a1_x.val[1], 10);
1383 a1_x.val[1] = vld1q_lane_u8(left + base_y_c[6], a1_x.val[1], 12);
1384 a1_x.val[1] = vld1q_lane_u8(left + base_y_c[7], a1_x.val[1], 14);
1385
1386 if (upsample_left) {
1387 shift.val[1] = vshrq_n_u16(
1388 vandq_u16(vshlq_u16(vreinterpretq_u16_s16(y_c128), v_upsample_left),
1389 c3f),
1390 1);
1391 } else {
1392 shift.val[1] =
1393 vshrq_n_u16(vandq_u16(vreinterpretq_u16_s16(y_c128), c3f), 1);
1394 }
1395 }
1396 diff.val[0] =
1397 vsubq_u16(vreinterpretq_u16_u8(a1_x.val[0]),
1398 vreinterpretq_u16_u8(a0_x.val[0])); // a[x+1] - a[x]
1399 diff.val[1] =
1400 vsubq_u16(vreinterpretq_u16_u8(a1_x.val[1]),
1401 vreinterpretq_u16_u8(a0_x.val[1])); // a[x+1] - a[x]
1402 a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_x.val[0]),
1403 v_32); // a[x] * 32 + 16
1404 a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_x.val[1]),
1405 v_32); // a[x] * 32 + 16
1406 res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift.val[0]);
1407 res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift.val[1]);
1408 resx = vshrn_n_u16(res.val[0], 5);
1409 resy = vshrn_n_u16(res.val[1], 5);
1410
1411 uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]);
1412
1413 resxy = vorr_u8(vand_u8(mask, resy), vbic_u8(resx, mask));
1414 vst1_u8(dst, resxy);
1415 dst += stride;
1416 }
1417 }
1418
dr_prediction_z2_HxW_neon(int H,int W,uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left,int upsample_above,int upsample_left,int dx,int dy)1419 static void dr_prediction_z2_HxW_neon(int H, int W, uint8_t *dst,
1420 ptrdiff_t stride, const uint8_t *above,
1421 const uint8_t *left, int upsample_above,
1422 int upsample_left, int dx, int dy) {
1423 // here upsample_above and upsample_left are 0 by design of
1424 // av1_use_intra_edge_upsample
1425 const int min_base_x = -1;
1426 const int min_base_y = -1;
1427 (void)upsample_above;
1428 (void)upsample_left;
1429 const int frac_bits_x = 6;
1430 const int frac_bits_y = 6;
1431
1432 uint16x8_t a16, c1, c3f;
1433 int16x8_t min_base_y256, dy256;
1434 uint16x8x2_t a32, c0123, c1234, diff, shifty;
1435 uint8x16x2_t a0_x, a1_x, a0_y, a1_y;
1436 uint8x16_t a0_x128, a1_x128;
1437 uint16x8_t v_32 = vdupq_n_u16(32);
1438 uint8x16_t v_zero = vdupq_n_u8(0);
1439 int16x8_t v_frac_bits_y = vdupq_n_s16(-frac_bits_y);
1440
1441 DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
1442
1443 a16 = vdupq_n_u16(16);
1444 c1 = vshrq_n_u16(a16, 4);
1445 min_base_y256 = vdupq_n_s16(min_base_y);
1446 c3f = vdupq_n_u16(0x3f);
1447 dy256 = vdupq_n_s16(dy);
1448 c0123.val[0] = vcombine_u16(vcreate_u16(0x0003000200010000),
1449 vcreate_u16(0x0007000600050004));
1450 c0123.val[1] = vcombine_u16(vcreate_u16(0x000B000A00090008),
1451 vcreate_u16(0x000F000E000D000C));
1452 c1234.val[0] = vaddq_u16(c0123.val[0], c1);
1453 c1234.val[1] = vaddq_u16(c0123.val[1], c1);
1454
1455 for (int r = 0; r < H; r++) {
1456 uint16x8x2_t res, r6, shift;
1457 uint16x8_t ydx, j256;
1458 uint8x16_t resx, resy, resxy;
1459 int y = r + 1;
1460 ydx = vdupq_n_u16((uint16_t)(y * dx));
1461
1462 int base_x = (-y * dx) >> frac_bits_x;
1463 for (int j = 0; j < W; j += 16) {
1464 j256 = vdupq_n_u16(j);
1465
1466 int base_shift = 0;
1467 if ((base_x + j) < (min_base_x - 1)) {
1468 base_shift = (min_base_x - (base_x + j) - 1);
1469 }
1470 int base_min_diff = (min_base_x - base_x - j);
1471 if (base_min_diff > 16) {
1472 base_min_diff = 16;
1473 } else {
1474 if (base_min_diff < 0) base_min_diff = 0;
1475 }
1476
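      // base_shift counts the leading lanes whose projected x coordinate
      // falls left of the top edge; base_min_diff clamps that count to
      // [0, 16] and later indexes BaseMask so that exactly those lanes take
      // the left-derived value (resy) instead of the above-derived one.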
1477 if (base_shift < 16) {
1478 a0_x128 = vld1q_u8(above + base_x + base_shift + j);
1479 a1_x128 = vld1q_u8(above + base_x + base_shift + 1 + j);
1480 vector_shuffle(&a0_x128, &v_zero, base_shift);
1481 vector_shuffle(&a1_x128, &v_zero, base_shift);
1482 a0_x = vzipq_u8(a0_x128, v_zero);
1483 a1_x = vzipq_u8(a1_x128, v_zero);
1484 r6.val[0] = vshlq_n_u16(vaddq_u16(c0123.val[0], j256), 6);
1485 r6.val[1] = vshlq_n_u16(vaddq_u16(c0123.val[1], j256), 6);
1486 shift.val[0] =
1487 vshrq_n_u16(vandq_u16(vsubq_u16(r6.val[0], ydx), c3f), 1);
1488 shift.val[1] =
1489 vshrq_n_u16(vandq_u16(vsubq_u16(r6.val[1], ydx), c3f), 1);
1490 diff.val[0] =
1491 vsubq_u16(vreinterpretq_u16_u8(a1_x.val[0]),
1492 vreinterpretq_u16_u8(a0_x.val[0])); // a[x+1] - a[x]
1493 diff.val[1] =
1494 vsubq_u16(vreinterpretq_u16_u8(a1_x.val[1]),
1495 vreinterpretq_u16_u8(a0_x.val[1])); // a[x+1] - a[x]
1496 a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_x.val[0]),
1497 v_32); // a[x] * 32 + 16
1498 a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_x.val[1]),
1499 v_32); // a[x] * 32 + 16
1500 res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift.val[0]);
1501 res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift.val[1]);
1502 resx =
1503 vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5));
1504 } else {
1505 resx = v_zero;
1506 }
1507
1508 // y calc
1509 if (base_x < min_base_x) {
1510 uint16x8x2_t mask256;
1511 int16x8x2_t c256, y_c256, base_y_c256, mul16;
1512 int16x8_t v_r6 = vdupq_n_s16(r << 6);
1513
1514 c256.val[0] = vaddq_s16(vreinterpretq_s16_u16(j256),
1515 vreinterpretq_s16_u16(c1234.val[0]));
1516 c256.val[1] = vaddq_s16(vreinterpretq_s16_u16(j256),
1517 vreinterpretq_s16_u16(c1234.val[1]));
1518 mul16.val[0] = vminq_s16(vmulq_s16(c256.val[0], dy256),
1519 vreinterpretq_s16_u16(vshrq_n_u16(
1520 vreinterpretq_u16_s16(min_base_y256), 1)));
1521 mul16.val[1] = vminq_s16(vmulq_s16(c256.val[1], dy256),
1522 vreinterpretq_s16_u16(vshrq_n_u16(
1523 vreinterpretq_u16_s16(min_base_y256), 1)));
1524 y_c256.val[0] = vsubq_s16(v_r6, mul16.val[0]);
1525 y_c256.val[1] = vsubq_s16(v_r6, mul16.val[1]);
1526
1527 base_y_c256.val[0] = vshlq_s16(y_c256.val[0], v_frac_bits_y);
1528 base_y_c256.val[1] = vshlq_s16(y_c256.val[1], v_frac_bits_y);
1529 mask256.val[0] = vcgtq_s16(min_base_y256, base_y_c256.val[0]);
1530 mask256.val[1] = vcgtq_s16(min_base_y256, base_y_c256.val[1]);
1531
1532 base_y_c256.val[0] = vorrq_s16(
1533 vandq_s16(vreinterpretq_s16_u16(mask256.val[0]), min_base_y256),
1534 vbicq_s16(base_y_c256.val[0],
1535 vreinterpretq_s16_u16(mask256.val[0])));
1536 base_y_c256.val[1] = vorrq_s16(
1537 vandq_s16(vreinterpretq_s16_u16(mask256.val[1]), min_base_y256),
1538 vbicq_s16(base_y_c256.val[1],
1539 vreinterpretq_s16_u16(mask256.val[1])));
1540
1541 int16_t min_y = vgetq_lane_s16(base_y_c256.val[1], 7);
1542 int16_t max_y = vgetq_lane_s16(base_y_c256.val[0], 0);
1543 int16_t offset_diff = max_y - min_y;
1544
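        // If all 16 left-edge sample indices fall inside one 16-byte window,
        // gather them with a single load plus a byte table lookup; otherwise
        // fall back to the per-lane loads in the else branch below.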
1545 if (offset_diff < 16) {
1546 int16x8_t min_y256 =
1547 vdupq_lane_s16(vget_high_s16(base_y_c256.val[1]), 3);
1548
1549 int16x8x2_t base_y_offset;
1550 base_y_offset.val[0] = vsubq_s16(base_y_c256.val[0], min_y256);
1551 base_y_offset.val[1] = vsubq_s16(base_y_c256.val[1], min_y256);
1552
1553 int8x16_t base_y_offset128 =
1554 vcombine_s8(vqmovn_s16(base_y_offset.val[0]),
1555 vqmovn_s16(base_y_offset.val[1]));
1556
1557 uint8x16_t a0_y128, a1_y128;
1558 uint8x16_t v_loadmaskz2 = vld1q_u8(LoadMaskz2[offset_diff / 4]);
1559 a0_y128 = vld1q_u8(left + min_y);
1560 a0_y128 = vandq_u8(a0_y128, v_loadmaskz2);
1561 a1_y128 = vld1q_u8(left + min_y + 1);
1562 a1_y128 = vandq_u8(a1_y128, v_loadmaskz2);
1563 #if defined(__aarch64__)
1564 a0_y128 = vqtbl1q_u8(a0_y128, vreinterpretq_u8_s8(base_y_offset128));
1565 a1_y128 = vqtbl1q_u8(a1_y128, vreinterpretq_u8_s8(base_y_offset128));
1566 #else
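          // AArch32 fallback: vqtbl1q_u8 is AArch64-only, so emulate the
          // 16-byte table lookup with two vtbl2_u8 lookups over the low and
          // high halves of the index vector.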
1567 uint8x8x2_t v_tmp;
1568 uint8x8x2_t v_res;
1569 uint8x8_t v_index_low =
1570 vget_low_u8(vreinterpretq_u8_s8(base_y_offset128));
1571 uint8x8_t v_index_high =
1572 vget_high_u8(vreinterpretq_u8_s8(base_y_offset128));
1573 v_tmp.val[0] = vget_low_u8(a0_y128);
1574 v_tmp.val[1] = vget_high_u8(a0_y128);
1575 v_res.val[0] = vtbl2_u8(v_tmp, v_index_low);
1576 v_res.val[1] = vtbl2_u8(v_tmp, v_index_high);
1577 a0_y128 = vcombine_u8(v_res.val[0], v_res.val[1]);
1578 v_tmp.val[0] = vget_low_u8(a1_y128);
1579 v_tmp.val[1] = vget_high_u8(a1_y128);
1580 v_res.val[0] = vtbl2_u8(v_tmp, v_index_low);
1581 v_res.val[1] = vtbl2_u8(v_tmp, v_index_high);
1582 a1_y128 = vcombine_u8(v_res.val[0], v_res.val[1]);
1583 #endif
1584 a0_y = vzipq_u8(a0_y128, v_zero);
1585 a1_y = vzipq_u8(a1_y128, v_zero);
1586 } else {
1587 base_y_c256.val[0] = vbicq_s16(base_y_c256.val[0],
1588 vreinterpretq_s16_u16(mask256.val[0]));
1589 base_y_c256.val[1] = vbicq_s16(base_y_c256.val[1],
1590 vreinterpretq_s16_u16(mask256.val[1]));
1591 vst1q_s16(base_y_c, base_y_c256.val[0]);
1592 vst1q_s16(base_y_c + 8, base_y_c256.val[1]);
1593 a0_y.val[0] = v_zero;
1594 a0_y.val[1] = v_zero;
1595 a0_y.val[0] = vld1q_lane_u8(left + base_y_c[0], a0_y.val[0], 0);
1596 a0_y.val[0] = vld1q_lane_u8(left + base_y_c[1], a0_y.val[0], 2);
1597 a0_y.val[0] = vld1q_lane_u8(left + base_y_c[2], a0_y.val[0], 4);
1598 a0_y.val[0] = vld1q_lane_u8(left + base_y_c[3], a0_y.val[0], 6);
1599 a0_y.val[0] = vld1q_lane_u8(left + base_y_c[4], a0_y.val[0], 8);
1600 a0_y.val[0] = vld1q_lane_u8(left + base_y_c[5], a0_y.val[0], 10);
1601 a0_y.val[0] = vld1q_lane_u8(left + base_y_c[6], a0_y.val[0], 12);
1602 a0_y.val[0] = vld1q_lane_u8(left + base_y_c[7], a0_y.val[0], 14);
1603 a0_y.val[1] = vld1q_lane_u8(left + base_y_c[8], a0_y.val[1], 0);
1604 a0_y.val[1] = vld1q_lane_u8(left + base_y_c[9], a0_y.val[1], 2);
1605 a0_y.val[1] = vld1q_lane_u8(left + base_y_c[10], a0_y.val[1], 4);
1606 a0_y.val[1] = vld1q_lane_u8(left + base_y_c[11], a0_y.val[1], 6);
1607 a0_y.val[1] = vld1q_lane_u8(left + base_y_c[12], a0_y.val[1], 8);
1608 a0_y.val[1] = vld1q_lane_u8(left + base_y_c[13], a0_y.val[1], 10);
1609 a0_y.val[1] = vld1q_lane_u8(left + base_y_c[14], a0_y.val[1], 12);
1610 a0_y.val[1] = vld1q_lane_u8(left + base_y_c[15], a0_y.val[1], 14);
1611
1612 base_y_c256.val[0] =
1613 vaddq_s16(base_y_c256.val[0], vreinterpretq_s16_u16(c1));
1614 base_y_c256.val[1] =
1615 vaddq_s16(base_y_c256.val[1], vreinterpretq_s16_u16(c1));
1616 vst1q_s16(base_y_c, base_y_c256.val[0]);
1617 vst1q_s16(base_y_c + 8, base_y_c256.val[1]);
1618 a1_y.val[0] = v_zero;
1619 a1_y.val[1] = v_zero;
1620 a1_y.val[0] = vld1q_lane_u8(left + base_y_c[0], a1_y.val[0], 0);
1621 a1_y.val[0] = vld1q_lane_u8(left + base_y_c[1], a1_y.val[0], 2);
1622 a1_y.val[0] = vld1q_lane_u8(left + base_y_c[2], a1_y.val[0], 4);
1623 a1_y.val[0] = vld1q_lane_u8(left + base_y_c[3], a1_y.val[0], 6);
1624 a1_y.val[0] = vld1q_lane_u8(left + base_y_c[4], a1_y.val[0], 8);
1625 a1_y.val[0] = vld1q_lane_u8(left + base_y_c[5], a1_y.val[0], 10);
1626 a1_y.val[0] = vld1q_lane_u8(left + base_y_c[6], a1_y.val[0], 12);
1627 a1_y.val[0] = vld1q_lane_u8(left + base_y_c[7], a1_y.val[0], 14);
1628 a1_y.val[1] = vld1q_lane_u8(left + base_y_c[8], a1_y.val[1], 0);
1629 a1_y.val[1] = vld1q_lane_u8(left + base_y_c[9], a1_y.val[1], 2);
1630 a1_y.val[1] = vld1q_lane_u8(left + base_y_c[10], a1_y.val[1], 4);
1631 a1_y.val[1] = vld1q_lane_u8(left + base_y_c[11], a1_y.val[1], 6);
1632 a1_y.val[1] = vld1q_lane_u8(left + base_y_c[12], a1_y.val[1], 8);
1633 a1_y.val[1] = vld1q_lane_u8(left + base_y_c[13], a1_y.val[1], 10);
1634 a1_y.val[1] = vld1q_lane_u8(left + base_y_c[14], a1_y.val[1], 12);
1635 a1_y.val[1] = vld1q_lane_u8(left + base_y_c[15], a1_y.val[1], 14);
1636 }
1637 shifty.val[0] = vshrq_n_u16(
1638 vandq_u16(vreinterpretq_u16_s16(y_c256.val[0]), c3f), 1);
1639 shifty.val[1] = vshrq_n_u16(
1640 vandq_u16(vreinterpretq_u16_s16(y_c256.val[1]), c3f), 1);
1641 diff.val[0] =
1642 vsubq_u16(vreinterpretq_u16_u8(a1_y.val[0]),
1643 vreinterpretq_u16_u8(a0_y.val[0])); // a[x+1] - a[x]
1644 diff.val[1] =
1645 vsubq_u16(vreinterpretq_u16_u8(a1_y.val[1]),
1646 vreinterpretq_u16_u8(a0_y.val[1])); // a[x+1] - a[x]
1647 a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_y.val[0]),
1648 v_32); // a[x] * 32 + 16
1649 a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_y.val[1]),
1650 v_32); // a[x] * 32 + 16
1651 res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shifty.val[0]);
1652 res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shifty.val[1]);
1653
1654 resy =
1655 vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5));
1656 } else {
1657 resy = v_zero;
1658 }
1659 uint8x16_t mask = vld1q_u8(BaseMask[base_min_diff]);
1660 resxy = vorrq_u8(vandq_u8(mask, resy), vbicq_u8(resx, mask));
1661 vst1q_u8(dst + j, resxy);
1662 } // for j
1663 dst += stride;
1664 }
1665 }
1666
1667 // Directional prediction, zone 2: 90 < angle < 180
1668 void av1_dr_prediction_z2_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
1669 const uint8_t *above, const uint8_t *left,
1670 int upsample_above, int upsample_left, int dx,
1671 int dy) {
1672 assert(dx > 0);
1673 assert(dy > 0);
1674
1675 switch (bw) {
1676 case 4:
1677 dr_prediction_z2_Nx4_neon(bh, dst, stride, above, left, upsample_above,
1678 upsample_left, dx, dy);
1679 break;
1680 case 8:
1681 dr_prediction_z2_Nx8_neon(bh, dst, stride, above, left, upsample_above,
1682 upsample_left, dx, dy);
1683 break;
1684 default:
1685 dr_prediction_z2_HxW_neon(bh, bw, dst, stride, above, left,
1686 upsample_above, upsample_left, dx, dy);
1687 break;
1688 }
1689 return;
1690 }
1691
1692 /* ---------------------P R E D I C T I O N Z 3--------------------------- */
1693
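// Zone 3 (180 < angle < 270) predicts strictly from the left column. Each
// kernel below reuses the zone 1 machinery along 'left' to build the block
// column-wise, then transposes the result into dst: either in registers,
// via the zip-cascade transpose helpers that follow, or through a scratch
// buffer for the largest blocks.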
1694 static AOM_FORCE_INLINE void transpose4x16_neon(uint8x16_t *x,
1695 uint16x8x2_t *d) {
1696 uint8x16x2_t w0, w1;
1697
1698 w0 = vzipq_u8(x[0], x[1]);
1699 w1 = vzipq_u8(x[2], x[3]);
1700
1701 d[0] = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
1702 vreinterpretq_u16_u8(w1.val[0]));
1703 d[1] = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
1704 vreinterpretq_u16_u8(w1.val[1]));
1705 }
1706
1707 static AOM_FORCE_INLINE void transpose4x8_8x4_low_neon(uint8x8_t *x,
1708 uint16x4x2_t *d) {
1709 uint8x8x2_t w0, w1;
1710
1711 w0 = vzip_u8(x[0], x[1]);
1712 w1 = vzip_u8(x[2], x[3]);
1713
1714 *d = vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
1715 }
1716
1717 static AOM_FORCE_INLINE void transpose4x8_8x4_neon(uint8x8_t *x,
1718 uint16x4x2_t *d) {
1719 uint8x8x2_t w0, w1;
1720
1721 w0 = vzip_u8(x[0], x[1]);
1722 w1 = vzip_u8(x[2], x[3]);
1723
1724 d[0] =
1725 vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
1726 d[1] =
1727 vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1]));
1728 }
1729
1730 static AOM_FORCE_INLINE void transpose8x8_low_neon(uint8x8_t *x,
1731 uint32x2x2_t *d) {
1732 uint8x8x2_t w0, w1, w2, w3;
1733 uint16x4x2_t w4, w5;
1734
1735 w0 = vzip_u8(x[0], x[1]);
1736 w1 = vzip_u8(x[2], x[3]);
1737 w2 = vzip_u8(x[4], x[5]);
1738 w3 = vzip_u8(x[6], x[7]);
1739
1740 w4 = vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
1741 w5 = vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0]));
1742
1743 d[0] = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
1744 vreinterpret_u32_u16(w5.val[0]));
1745 d[1] = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
1746 vreinterpret_u32_u16(w5.val[1]));
1747 }
1748
1749 static AOM_FORCE_INLINE void transpose8x8_neon(uint8x8_t *x, uint32x2x2_t *d) {
1750 uint8x8x2_t w0, w1, w2, w3;
1751 uint16x4x2_t w4, w5, w6, w7;
1752
1753 w0 = vzip_u8(x[0], x[1]);
1754 w1 = vzip_u8(x[2], x[3]);
1755 w2 = vzip_u8(x[4], x[5]);
1756 w3 = vzip_u8(x[6], x[7]);
1757
1758 w4 = vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
1759 w5 = vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0]));
1760
1761 d[0] = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
1762 vreinterpret_u32_u16(w5.val[0]));
1763 d[1] = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
1764 vreinterpret_u32_u16(w5.val[1]));
1765
1766 w6 = vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1]));
1767 w7 = vzip_u16(vreinterpret_u16_u8(w2.val[1]), vreinterpret_u16_u8(w3.val[1]));
1768
1769 d[2] = vzip_u32(vreinterpret_u32_u16(w6.val[0]),
1770 vreinterpret_u32_u16(w7.val[0]));
1771 d[3] = vzip_u32(vreinterpret_u32_u16(w6.val[1]),
1772 vreinterpret_u32_u16(w7.val[1]));
1773 }
1774
1775 static AOM_FORCE_INLINE void transpose16x8_8x16_neon(uint8x8_t *x,
1776 uint64x2_t *d) {
1777 uint8x8x2_t w0, w1, w2, w3, w8, w9, w10, w11;
1778 uint16x4x2_t w4, w5, w12, w13;
1779 uint32x2x2_t w6, w7, w14, w15;
1780
1781 w0 = vzip_u8(x[0], x[1]);
1782 w1 = vzip_u8(x[2], x[3]);
1783 w2 = vzip_u8(x[4], x[5]);
1784 w3 = vzip_u8(x[6], x[7]);
1785
1786 w8 = vzip_u8(x[8], x[9]);
1787 w9 = vzip_u8(x[10], x[11]);
1788 w10 = vzip_u8(x[12], x[13]);
1789 w11 = vzip_u8(x[14], x[15]);
1790
1791 w4 = vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
1792 w5 = vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0]));
1793 w12 =
1794 vzip_u16(vreinterpret_u16_u8(w8.val[0]), vreinterpret_u16_u8(w9.val[0]));
1795 w13 = vzip_u16(vreinterpret_u16_u8(w10.val[0]),
1796 vreinterpret_u16_u8(w11.val[0]));
1797
1798 w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
1799 vreinterpret_u32_u16(w5.val[0]));
1800 w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
1801 vreinterpret_u32_u16(w5.val[1]));
1802 w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]),
1803 vreinterpret_u32_u16(w13.val[0]));
1804 w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]),
1805 vreinterpret_u32_u16(w13.val[1]));
1806
1807 // Store first 4-line result
1808 d[0] = vcombine_u64(vreinterpret_u64_u32(w6.val[0]),
1809 vreinterpret_u64_u32(w14.val[0]));
1810 d[1] = vcombine_u64(vreinterpret_u64_u32(w6.val[1]),
1811 vreinterpret_u64_u32(w14.val[1]));
1812 d[2] = vcombine_u64(vreinterpret_u64_u32(w7.val[0]),
1813 vreinterpret_u64_u32(w15.val[0]));
1814 d[3] = vcombine_u64(vreinterpret_u64_u32(w7.val[1]),
1815 vreinterpret_u64_u32(w15.val[1]));
1816
1817 w4 = vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1]));
1818 w5 = vzip_u16(vreinterpret_u16_u8(w2.val[1]), vreinterpret_u16_u8(w3.val[1]));
1819 w12 =
1820 vzip_u16(vreinterpret_u16_u8(w8.val[1]), vreinterpret_u16_u8(w9.val[1]));
1821 w13 = vzip_u16(vreinterpret_u16_u8(w10.val[1]),
1822 vreinterpret_u16_u8(w11.val[1]));
1823
1824 w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
1825 vreinterpret_u32_u16(w5.val[0]));
1826 w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
1827 vreinterpret_u32_u16(w5.val[1]));
1828 w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]),
1829 vreinterpret_u32_u16(w13.val[0]));
1830 w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]),
1831 vreinterpret_u32_u16(w13.val[1]));
1832
1833 // Store second 4-line result
1834 d[4] = vcombine_u64(vreinterpret_u64_u32(w6.val[0]),
1835 vreinterpret_u64_u32(w14.val[0]));
1836 d[5] = vcombine_u64(vreinterpret_u64_u32(w6.val[1]),
1837 vreinterpret_u64_u32(w14.val[1]));
1838 d[6] = vcombine_u64(vreinterpret_u64_u32(w7.val[0]),
1839 vreinterpret_u64_u32(w15.val[0]));
1840 d[7] = vcombine_u64(vreinterpret_u64_u32(w7.val[1]),
1841 vreinterpret_u64_u32(w15.val[1]));
1842 }
1843
1844 static AOM_FORCE_INLINE void transpose8x16_16x8_neon(uint8x16_t *x,
1845 uint64x2_t *d) {
1846 uint8x16x2_t w0, w1, w2, w3;
1847 uint16x8x2_t w4, w5, w6, w7;
1848 uint32x4x2_t w8, w9, w10, w11;
1849
1850 w0 = vzipq_u8(x[0], x[1]);
1851 w1 = vzipq_u8(x[2], x[3]);
1852 w2 = vzipq_u8(x[4], x[5]);
1853 w3 = vzipq_u8(x[6], x[7]);
1854
1855 w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
1856 vreinterpretq_u16_u8(w1.val[0]));
1857 w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
1858 vreinterpretq_u16_u8(w3.val[0]));
1859 w6 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
1860 vreinterpretq_u16_u8(w1.val[1]));
1861 w7 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
1862 vreinterpretq_u16_u8(w3.val[1]));
1863
1864 w8 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
1865 vreinterpretq_u32_u16(w5.val[0]));
1866 w9 = vzipq_u32(vreinterpretq_u32_u16(w6.val[0]),
1867 vreinterpretq_u32_u16(w7.val[0]));
1868 w10 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
1869 vreinterpretq_u32_u16(w5.val[1]));
1870 w11 = vzipq_u32(vreinterpretq_u32_u16(w6.val[1]),
1871 vreinterpretq_u32_u16(w7.val[1]));
1872
1873 #if defined(__aarch64__)
1874 d[0] = vzip1q_u64(vreinterpretq_u64_u32(w8.val[0]),
1875 vreinterpretq_u64_u32(w9.val[0]));
1876 d[1] = vzip2q_u64(vreinterpretq_u64_u32(w8.val[0]),
1877 vreinterpretq_u64_u32(w9.val[0]));
1878 d[2] = vzip1q_u64(vreinterpretq_u64_u32(w8.val[1]),
1879 vreinterpretq_u64_u32(w9.val[1]));
1880 d[3] = vzip2q_u64(vreinterpretq_u64_u32(w8.val[1]),
1881 vreinterpretq_u64_u32(w9.val[1]));
1882 d[4] = vzip1q_u64(vreinterpretq_u64_u32(w10.val[0]),
1883 vreinterpretq_u64_u32(w11.val[0]));
1884 d[5] = vzip2q_u64(vreinterpretq_u64_u32(w10.val[0]),
1885 vreinterpretq_u64_u32(w11.val[0]));
1886 d[6] = vzip1q_u64(vreinterpretq_u64_u32(w10.val[1]),
1887 vreinterpretq_u64_u32(w11.val[1]));
1888 d[7] = vzip2q_u64(vreinterpretq_u64_u32(w10.val[1]),
1889 vreinterpretq_u64_u32(w11.val[1]));
1890 #else
1891 d[0] = vreinterpretq_u64_u32(
1892 vcombine_u32(vget_low_u32(w8.val[0]), vget_low_u32(w9.val[0])));
1893 d[1] = vreinterpretq_u64_u32(
1894 vcombine_u32(vget_high_u32(w8.val[0]), vget_high_u32(w9.val[0])));
1895 d[2] = vreinterpretq_u64_u32(
1896 vcombine_u32(vget_low_u32(w8.val[1]), vget_low_u32(w9.val[1])));
1897 d[3] = vreinterpretq_u64_u32(
1898 vcombine_u32(vget_high_u32(w8.val[1]), vget_high_u32(w9.val[1])));
1899 d[4] = vreinterpretq_u64_u32(
1900 vcombine_u32(vget_low_u32(w10.val[0]), vget_low_u32(w11.val[0])));
1901 d[5] = vreinterpretq_u64_u32(
1902 vcombine_u32(vget_high_u32(w10.val[0]), vget_high_u32(w11.val[0])));
1903 d[6] = vreinterpretq_u64_u32(
1904 vcombine_u32(vget_low_u32(w10.val[1]), vget_low_u32(w11.val[1])));
1905 d[7] = vreinterpretq_u64_u32(
1906 vcombine_u32(vget_high_u32(w10.val[1]), vget_high_u32(w11.val[1])));
1907 #endif
1908 }
1909
1910 static AOM_FORCE_INLINE void transpose16x16_neon(uint8x16_t *x, uint64x2_t *d) {
1911 uint8x16x2_t w0, w1, w2, w3, w4, w5, w6, w7;
1912 uint16x8x2_t w8, w9, w10, w11;
1913 uint32x4x2_t w12, w13, w14, w15;
1914
1915 w0 = vzipq_u8(x[0], x[1]);
1916 w1 = vzipq_u8(x[2], x[3]);
1917 w2 = vzipq_u8(x[4], x[5]);
1918 w3 = vzipq_u8(x[6], x[7]);
1919
1920 w4 = vzipq_u8(x[8], x[9]);
1921 w5 = vzipq_u8(x[10], x[11]);
1922 w6 = vzipq_u8(x[12], x[13]);
1923 w7 = vzipq_u8(x[14], x[15]);
1924
1925 w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
1926 vreinterpretq_u16_u8(w1.val[0]));
1927 w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
1928 vreinterpretq_u16_u8(w3.val[0]));
1929 w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[0]),
1930 vreinterpretq_u16_u8(w5.val[0]));
1931 w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[0]),
1932 vreinterpretq_u16_u8(w7.val[0]));
1933
1934 w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]),
1935 vreinterpretq_u32_u16(w9.val[0]));
1936 w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]),
1937 vreinterpretq_u32_u16(w11.val[0]));
1938 w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]),
1939 vreinterpretq_u32_u16(w9.val[1]));
1940 w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]),
1941 vreinterpretq_u32_u16(w11.val[1]));
1942
1943 #if defined(__aarch64__)
1944 d[0] = vzip1q_u64(vreinterpretq_u64_u32(w12.val[0]),
1945 vreinterpretq_u64_u32(w13.val[0]));
1946 d[1] = vzip2q_u64(vreinterpretq_u64_u32(w12.val[0]),
1947 vreinterpretq_u64_u32(w13.val[0]));
1948 d[2] = vzip1q_u64(vreinterpretq_u64_u32(w12.val[1]),
1949 vreinterpretq_u64_u32(w13.val[1]));
1950 d[3] = vzip2q_u64(vreinterpretq_u64_u32(w12.val[1]),
1951 vreinterpretq_u64_u32(w13.val[1]));
1952 d[4] = vzip1q_u64(vreinterpretq_u64_u32(w14.val[0]),
1953 vreinterpretq_u64_u32(w15.val[0]));
1954 d[5] = vzip2q_u64(vreinterpretq_u64_u32(w14.val[0]),
1955 vreinterpretq_u64_u32(w15.val[0]));
1956 d[6] = vzip1q_u64(vreinterpretq_u64_u32(w14.val[1]),
1957 vreinterpretq_u64_u32(w15.val[1]));
1958 d[7] = vzip2q_u64(vreinterpretq_u64_u32(w14.val[1]),
1959 vreinterpretq_u64_u32(w15.val[1]));
1960 #else
1961 d[0] = vreinterpretq_u64_u32(
1962 vcombine_u32(vget_low_u32(w12.val[0]), vget_low_u32(w13.val[0])));
1963 d[1] = vreinterpretq_u64_u32(
1964 vcombine_u32(vget_high_u32(w12.val[0]), vget_high_u32(w13.val[0])));
1965 d[2] = vreinterpretq_u64_u32(
1966 vcombine_u32(vget_low_u32(w12.val[1]), vget_low_u32(w13.val[1])));
1967 d[3] = vreinterpretq_u64_u32(
1968 vcombine_u32(vget_high_u32(w12.val[1]), vget_high_u32(w13.val[1])));
1969 d[4] = vreinterpretq_u64_u32(
1970 vcombine_u32(vget_low_u32(w14.val[0]), vget_low_u32(w15.val[0])));
1971 d[5] = vreinterpretq_u64_u32(
1972 vcombine_u32(vget_high_u32(w14.val[0]), vget_high_u32(w15.val[0])));
1973 d[6] = vreinterpretq_u64_u32(
1974 vcombine_u32(vget_low_u32(w14.val[1]), vget_low_u32(w15.val[1])));
1975 d[7] = vreinterpretq_u64_u32(
1976 vcombine_u32(vget_high_u32(w14.val[1]), vget_high_u32(w15.val[1])));
1977 #endif
1978
1979 // upper half
1980 w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
1981 vreinterpretq_u16_u8(w1.val[1]));
1982 w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
1983 vreinterpretq_u16_u8(w3.val[1]));
1984 w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[1]),
1985 vreinterpretq_u16_u8(w5.val[1]));
1986 w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[1]),
1987 vreinterpretq_u16_u8(w7.val[1]));
1988
1989 w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]),
1990 vreinterpretq_u32_u16(w9.val[0]));
1991 w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]),
1992 vreinterpretq_u32_u16(w11.val[0]));
1993 w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]),
1994 vreinterpretq_u32_u16(w9.val[1]));
1995 w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]),
1996 vreinterpretq_u32_u16(w11.val[1]));
1997
1998 #if defined(__aarch64__)
1999 d[8] = vzip1q_u64(vreinterpretq_u64_u32(w12.val[0]),
2000 vreinterpretq_u64_u32(w13.val[0]));
2001 d[9] = vzip2q_u64(vreinterpretq_u64_u32(w12.val[0]),
2002 vreinterpretq_u64_u32(w13.val[0]));
2003 d[10] = vzip1q_u64(vreinterpretq_u64_u32(w12.val[1]),
2004 vreinterpretq_u64_u32(w13.val[1]));
2005 d[11] = vzip2q_u64(vreinterpretq_u64_u32(w12.val[1]),
2006 vreinterpretq_u64_u32(w13.val[1]));
2007 d[12] = vzip1q_u64(vreinterpretq_u64_u32(w14.val[0]),
2008 vreinterpretq_u64_u32(w15.val[0]));
2009 d[13] = vzip2q_u64(vreinterpretq_u64_u32(w14.val[0]),
2010 vreinterpretq_u64_u32(w15.val[0]));
2011 d[14] = vzip1q_u64(vreinterpretq_u64_u32(w14.val[1]),
2012 vreinterpretq_u64_u32(w15.val[1]));
2013 d[15] = vzip2q_u64(vreinterpretq_u64_u32(w14.val[1]),
2014 vreinterpretq_u64_u32(w15.val[1]));
2015 #else
2016 d[8] = vreinterpretq_u64_u32(
2017 vcombine_u32(vget_low_u32(w12.val[0]), vget_low_u32(w13.val[0])));
2018 d[9] = vreinterpretq_u64_u32(
2019 vcombine_u32(vget_high_u32(w12.val[0]), vget_high_u32(w13.val[0])));
2020 d[10] = vreinterpretq_u64_u32(
2021 vcombine_u32(vget_low_u32(w12.val[1]), vget_low_u32(w13.val[1])));
2022 d[11] = vreinterpretq_u64_u32(
2023 vcombine_u32(vget_high_u32(w12.val[1]), vget_high_u32(w13.val[1])));
2024 d[12] = vreinterpretq_u64_u32(
2025 vcombine_u32(vget_low_u32(w14.val[0]), vget_low_u32(w15.val[0])));
2026 d[13] = vreinterpretq_u64_u32(
2027 vcombine_u32(vget_high_u32(w14.val[0]), vget_high_u32(w15.val[0])));
2028 d[14] = vreinterpretq_u64_u32(
2029 vcombine_u32(vget_low_u32(w14.val[1]), vget_low_u32(w15.val[1])));
2030 d[15] = vreinterpretq_u64_u32(
2031 vcombine_u32(vget_high_u32(w14.val[1]), vget_high_u32(w15.val[1])));
2032 #endif
2033 }
2034
2035 static AOM_FORCE_INLINE void transpose16x32_neon(uint8x16x2_t *x,
2036 uint64x2x2_t *d) {
2037 uint8x16x2_t w0, w1, w2, w3, w8, w9, w10, w11;
2038 uint16x8x2_t w4, w5, w12, w13;
2039 uint32x4x2_t w6, w7, w14, w15;
2040
2041 w0 = vzipq_u8(x[0].val[0], x[1].val[0]);
2042 w1 = vzipq_u8(x[2].val[0], x[3].val[0]);
2043 w2 = vzipq_u8(x[4].val[0], x[5].val[0]);
2044 w3 = vzipq_u8(x[6].val[0], x[7].val[0]);
2045
2046 w8 = vzipq_u8(x[8].val[0], x[9].val[0]);
2047 w9 = vzipq_u8(x[10].val[0], x[11].val[0]);
2048 w10 = vzipq_u8(x[12].val[0], x[13].val[0]);
2049 w11 = vzipq_u8(x[14].val[0], x[15].val[0]);
2050
2051 w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
2052 vreinterpretq_u16_u8(w1.val[0]));
2053 w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
2054 vreinterpretq_u16_u8(w3.val[0]));
2055 w12 = vzipq_u16(vreinterpretq_u16_u8(w8.val[0]),
2056 vreinterpretq_u16_u8(w9.val[0]));
2057 w13 = vzipq_u16(vreinterpretq_u16_u8(w10.val[0]),
2058 vreinterpretq_u16_u8(w11.val[0]));
2059
2060 w6 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
2061 vreinterpretq_u32_u16(w5.val[0]));
2062 w7 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
2063 vreinterpretq_u32_u16(w5.val[1]));
2064 w14 = vzipq_u32(vreinterpretq_u32_u16(w12.val[0]),
2065 vreinterpretq_u32_u16(w13.val[0]));
2066 w15 = vzipq_u32(vreinterpretq_u32_u16(w12.val[1]),
2067 vreinterpretq_u32_u16(w13.val[1]));
2068
2069 // Store first 4-line result
2070
2071 #if defined(__aarch64__)
2072 d[0].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[0]),
2073 vreinterpretq_u64_u32(w14.val[0]));
2074 d[0].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[0]),
2075 vreinterpretq_u64_u32(w14.val[0]));
2076 d[1].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[1]),
2077 vreinterpretq_u64_u32(w14.val[1]));
2078 d[1].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[1]),
2079 vreinterpretq_u64_u32(w14.val[1]));
2080 d[2].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[0]),
2081 vreinterpretq_u64_u32(w15.val[0]));
2082 d[2].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[0]),
2083 vreinterpretq_u64_u32(w15.val[0]));
2084 d[3].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[1]),
2085 vreinterpretq_u64_u32(w15.val[1]));
2086 d[3].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[1]),
2087 vreinterpretq_u64_u32(w15.val[1]));
2088 #else
2089 d[0].val[0] = vreinterpretq_u64_u32(
2090 vcombine_u32(vget_low_u32(w6.val[0]), vget_low_u32(w14.val[0])));
2091 d[0].val[1] = vreinterpretq_u64_u32(
2092 vcombine_u32(vget_high_u32(w6.val[0]), vget_high_u32(w14.val[0])));
2093 d[1].val[0] = vreinterpretq_u64_u32(
2094 vcombine_u32(vget_low_u32(w6.val[1]), vget_low_u32(w14.val[1])));
2095 d[1].val[1] = vreinterpretq_u64_u32(
2096 vcombine_u32(vget_high_u32(w6.val[1]), vget_high_u32(w14.val[1])));
2097 d[2].val[0] = vreinterpretq_u64_u32(
2098 vcombine_u32(vget_low_u32(w7.val[0]), vget_low_u32(w15.val[0])));
2099 d[2].val[1] = vreinterpretq_u64_u32(
2100 vcombine_u32(vget_high_u32(w7.val[0]), vget_high_u32(w15.val[0])));
2101 d[3].val[0] = vreinterpretq_u64_u32(
2102 vcombine_u32(vget_low_u32(w7.val[1]), vget_low_u32(w15.val[1])));
2103 d[3].val[1] = vreinterpretq_u64_u32(
2104 vcombine_u32(vget_high_u32(w7.val[1]), vget_high_u32(w15.val[1])));
2105 #endif
2106
2107 w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
2108 vreinterpretq_u16_u8(w1.val[1]));
2109 w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
2110 vreinterpretq_u16_u8(w3.val[1]));
2111 w12 = vzipq_u16(vreinterpretq_u16_u8(w8.val[1]),
2112 vreinterpretq_u16_u8(w9.val[1]));
2113 w13 = vzipq_u16(vreinterpretq_u16_u8(w10.val[1]),
2114 vreinterpretq_u16_u8(w11.val[1]));
2115
2116 w6 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
2117 vreinterpretq_u32_u16(w5.val[0]));
2118 w7 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
2119 vreinterpretq_u32_u16(w5.val[1]));
2120 w14 = vzipq_u32(vreinterpretq_u32_u16(w12.val[0]),
2121 vreinterpretq_u32_u16(w13.val[0]));
2122 w15 = vzipq_u32(vreinterpretq_u32_u16(w12.val[1]),
2123 vreinterpretq_u32_u16(w13.val[1]));
2124
2125 // Store second 4-line result
2126
2127 #if defined(__aarch64__)
2128 d[4].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[0]),
2129 vreinterpretq_u64_u32(w14.val[0]));
2130 d[4].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[0]),
2131 vreinterpretq_u64_u32(w14.val[0]));
2132 d[5].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[1]),
2133 vreinterpretq_u64_u32(w14.val[1]));
2134 d[5].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[1]),
2135 vreinterpretq_u64_u32(w14.val[1]));
2136 d[6].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[0]),
2137 vreinterpretq_u64_u32(w15.val[0]));
2138 d[6].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[0]),
2139 vreinterpretq_u64_u32(w15.val[0]));
2140 d[7].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[1]),
2141 vreinterpretq_u64_u32(w15.val[1]));
2142 d[7].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[1]),
2143 vreinterpretq_u64_u32(w15.val[1]));
2144 #else
2145 d[4].val[0] = vreinterpretq_u64_u32(
2146 vcombine_u32(vget_low_u32(w6.val[0]), vget_low_u32(w14.val[0])));
2147 d[4].val[1] = vreinterpretq_u64_u32(
2148 vcombine_u32(vget_high_u32(w6.val[0]), vget_high_u32(w14.val[0])));
2149 d[5].val[0] = vreinterpretq_u64_u32(
2150 vcombine_u32(vget_low_u32(w6.val[1]), vget_low_u32(w14.val[1])));
2151 d[5].val[1] = vreinterpretq_u64_u32(
2152 vcombine_u32(vget_high_u32(w6.val[1]), vget_high_u32(w14.val[1])));
2153 d[6].val[0] = vreinterpretq_u64_u32(
2154 vcombine_u32(vget_low_u32(w7.val[0]), vget_low_u32(w15.val[0])));
2155 d[6].val[1] = vreinterpretq_u64_u32(
2156 vcombine_u32(vget_high_u32(w7.val[0]), vget_high_u32(w15.val[0])));
2157 d[7].val[0] = vreinterpretq_u64_u32(
2158 vcombine_u32(vget_low_u32(w7.val[1]), vget_low_u32(w15.val[1])));
2159 d[7].val[1] = vreinterpretq_u64_u32(
2160 vcombine_u32(vget_high_u32(w7.val[1]), vget_high_u32(w15.val[1])));
2161 #endif
2162
2163 // upper half
2164 w0 = vzipq_u8(x[0].val[1], x[1].val[1]);
2165 w1 = vzipq_u8(x[2].val[1], x[3].val[1]);
2166 w2 = vzipq_u8(x[4].val[1], x[5].val[1]);
2167 w3 = vzipq_u8(x[6].val[1], x[7].val[1]);
2168
2169 w8 = vzipq_u8(x[8].val[1], x[9].val[1]);
2170 w9 = vzipq_u8(x[10].val[1], x[11].val[1]);
2171 w10 = vzipq_u8(x[12].val[1], x[13].val[1]);
2172 w11 = vzipq_u8(x[14].val[1], x[15].val[1]);
2173
2174 w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
2175 vreinterpretq_u16_u8(w1.val[0]));
2176 w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
2177 vreinterpretq_u16_u8(w3.val[0]));
2178 w12 = vzipq_u16(vreinterpretq_u16_u8(w8.val[0]),
2179 vreinterpretq_u16_u8(w9.val[0]));
2180 w13 = vzipq_u16(vreinterpretq_u16_u8(w10.val[0]),
2181 vreinterpretq_u16_u8(w11.val[0]));
2182
2183 w6 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
2184 vreinterpretq_u32_u16(w5.val[0]));
2185 w7 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
2186 vreinterpretq_u32_u16(w5.val[1]));
2187 w14 = vzipq_u32(vreinterpretq_u32_u16(w12.val[0]),
2188 vreinterpretq_u32_u16(w13.val[0]));
2189 w15 = vzipq_u32(vreinterpretq_u32_u16(w12.val[1]),
2190 vreinterpretq_u32_u16(w13.val[1]));
2191
2192   // Store third 4-line result
2193
2194 #if defined(__aarch64__)
2195 d[8].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[0]),
2196 vreinterpretq_u64_u32(w14.val[0]));
2197 d[8].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[0]),
2198 vreinterpretq_u64_u32(w14.val[0]));
2199 d[9].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[1]),
2200 vreinterpretq_u64_u32(w14.val[1]));
2201 d[9].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[1]),
2202 vreinterpretq_u64_u32(w14.val[1]));
2203 d[10].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[0]),
2204 vreinterpretq_u64_u32(w15.val[0]));
2205 d[10].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[0]),
2206 vreinterpretq_u64_u32(w15.val[0]));
2207 d[11].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[1]),
2208 vreinterpretq_u64_u32(w15.val[1]));
2209 d[11].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[1]),
2210 vreinterpretq_u64_u32(w15.val[1]));
2211 #else
2212 d[8].val[0] = vreinterpretq_u64_u32(
2213 vcombine_u32(vget_low_u32(w6.val[0]), vget_low_u32(w14.val[0])));
2214 d[8].val[1] = vreinterpretq_u64_u32(
2215 vcombine_u32(vget_high_u32(w6.val[0]), vget_high_u32(w14.val[0])));
2216 d[9].val[0] = vreinterpretq_u64_u32(
2217 vcombine_u32(vget_low_u32(w6.val[1]), vget_low_u32(w14.val[1])));
2218 d[9].val[1] = vreinterpretq_u64_u32(
2219 vcombine_u32(vget_high_u32(w6.val[1]), vget_high_u32(w14.val[1])));
2220 d[10].val[0] = vreinterpretq_u64_u32(
2221 vcombine_u32(vget_low_u32(w7.val[0]), vget_low_u32(w15.val[0])));
2222 d[10].val[1] = vreinterpretq_u64_u32(
2223 vcombine_u32(vget_high_u32(w7.val[0]), vget_high_u32(w15.val[0])));
2224 d[11].val[0] = vreinterpretq_u64_u32(
2225 vcombine_u32(vget_low_u32(w7.val[1]), vget_low_u32(w15.val[1])));
2226 d[11].val[1] = vreinterpretq_u64_u32(
2227 vcombine_u32(vget_high_u32(w7.val[1]), vget_high_u32(w15.val[1])));
2228 #endif
2229
2230 w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
2231 vreinterpretq_u16_u8(w1.val[1]));
2232 w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
2233 vreinterpretq_u16_u8(w3.val[1]));
2234 w12 = vzipq_u16(vreinterpretq_u16_u8(w8.val[1]),
2235 vreinterpretq_u16_u8(w9.val[1]));
2236 w13 = vzipq_u16(vreinterpretq_u16_u8(w10.val[1]),
2237 vreinterpretq_u16_u8(w11.val[1]));
2238
2239 w6 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
2240 vreinterpretq_u32_u16(w5.val[0]));
2241 w7 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
2242 vreinterpretq_u32_u16(w5.val[1]));
2243 w14 = vzipq_u32(vreinterpretq_u32_u16(w12.val[0]),
2244 vreinterpretq_u32_u16(w13.val[0]));
2245 w15 = vzipq_u32(vreinterpretq_u32_u16(w12.val[1]),
2246 vreinterpretq_u32_u16(w13.val[1]));
2247
2248   // Store fourth 4-line result
2249
2250 #if defined(__aarch64__)
2251 d[12].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[0]),
2252 vreinterpretq_u64_u32(w14.val[0]));
2253 d[12].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[0]),
2254 vreinterpretq_u64_u32(w14.val[0]));
2255 d[13].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[1]),
2256 vreinterpretq_u64_u32(w14.val[1]));
2257 d[13].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[1]),
2258 vreinterpretq_u64_u32(w14.val[1]));
2259 d[14].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[0]),
2260 vreinterpretq_u64_u32(w15.val[0]));
2261 d[14].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[0]),
2262 vreinterpretq_u64_u32(w15.val[0]));
2263 d[15].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[1]),
2264 vreinterpretq_u64_u32(w15.val[1]));
2265 d[15].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[1]),
2266 vreinterpretq_u64_u32(w15.val[1]));
2267 #else
2268 d[12].val[0] = vreinterpretq_u64_u32(
2269 vcombine_u32(vget_low_u32(w6.val[0]), vget_low_u32(w14.val[0])));
2270 d[12].val[1] = vreinterpretq_u64_u32(
2271 vcombine_u32(vget_high_u32(w6.val[0]), vget_high_u32(w14.val[0])));
2272 d[13].val[0] = vreinterpretq_u64_u32(
2273 vcombine_u32(vget_low_u32(w6.val[1]), vget_low_u32(w14.val[1])));
2274 d[13].val[1] = vreinterpretq_u64_u32(
2275 vcombine_u32(vget_high_u32(w6.val[1]), vget_high_u32(w14.val[1])));
2276 d[14].val[0] = vreinterpretq_u64_u32(
2277 vcombine_u32(vget_low_u32(w7.val[0]), vget_low_u32(w15.val[0])));
2278 d[14].val[1] = vreinterpretq_u64_u32(
2279 vcombine_u32(vget_high_u32(w7.val[0]), vget_high_u32(w15.val[0])));
2280 d[15].val[0] = vreinterpretq_u64_u32(
2281 vcombine_u32(vget_low_u32(w7.val[1]), vget_low_u32(w15.val[1])));
2282 d[15].val[1] = vreinterpretq_u64_u32(
2283 vcombine_u32(vget_high_u32(w7.val[1]), vget_high_u32(w15.val[1])));
2284 #endif
2285 }
2286
2287 static void transpose_TX_16X16(const uint8_t *src, ptrdiff_t pitchSrc,
2288 uint8_t *dst, ptrdiff_t pitchDst) {
2289 uint8x16_t r[16];
2290 uint64x2_t d[16];
2291 for (int i = 0; i < 16; i++) {
2292 r[i] = vld1q_u8(src + i * pitchSrc);
2293 }
2294 transpose16x16_neon(r, d);
2295 for (int i = 0; i < 16; i++) {
2296 vst1q_u8(dst + i * pitchDst, vreinterpretq_u8_u64(d[i]));
2297 }
2298 }
2299
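// Generic byte transpose for dimensions that are multiples of 16: walk the
// source in 16x16 tiles and write each tile transposed at the mirrored
// (j, i) position in dst.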
2300 static void transpose(const uint8_t *src, ptrdiff_t pitchSrc, uint8_t *dst,
2301 ptrdiff_t pitchDst, int width, int height) {
2302 for (int j = 0; j < height; j += 16) {
2303 for (int i = 0; i < width; i += 16) {
2304 transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc,
2305 dst + j * pitchDst + i, pitchDst);
2306 }
2307 }
2308 }
2309
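// The fixed-size z3 kernels below share one pattern: run the zone 1
// predictor along the left edge into a register array, transpose in
// registers, then scatter the rows to dst.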
2310 static void dr_prediction_z3_4x4_neon(uint8_t *dst, ptrdiff_t stride,
2311 const uint8_t *left, int upsample_left,
2312 int dy) {
2313 uint8x8_t dstvec[4];
2314 uint16x4x2_t dest;
2315
2316 dr_prediction_z1_HxW_internal_neon_64(4, 4, dstvec, left, upsample_left, dy);
2317 transpose4x8_8x4_low_neon(dstvec, &dest);
2318 vst1_lane_u32((uint32_t *)(dst + stride * 0),
2319 vreinterpret_u32_u16(dest.val[0]), 0);
2320 vst1_lane_u32((uint32_t *)(dst + stride * 1),
2321 vreinterpret_u32_u16(dest.val[0]), 1);
2322 vst1_lane_u32((uint32_t *)(dst + stride * 2),
2323 vreinterpret_u32_u16(dest.val[1]), 0);
2324 vst1_lane_u32((uint32_t *)(dst + stride * 3),
2325 vreinterpret_u32_u16(dest.val[1]), 1);
2326 }
2327
2328 static void dr_prediction_z3_8x8_neon(uint8_t *dst, ptrdiff_t stride,
2329 const uint8_t *left, int upsample_left,
2330 int dy) {
2331 uint8x8_t dstvec[8];
2332 uint32x2x2_t d[4];
2333
2334 dr_prediction_z1_HxW_internal_neon_64(8, 8, dstvec, left, upsample_left, dy);
2335 transpose8x8_neon(dstvec, d);
2336 vst1_u32((uint32_t *)(dst + 0 * stride), d[0].val[0]);
2337 vst1_u32((uint32_t *)(dst + 1 * stride), d[0].val[1]);
2338 vst1_u32((uint32_t *)(dst + 2 * stride), d[1].val[0]);
2339 vst1_u32((uint32_t *)(dst + 3 * stride), d[1].val[1]);
2340 vst1_u32((uint32_t *)(dst + 4 * stride), d[2].val[0]);
2341 vst1_u32((uint32_t *)(dst + 5 * stride), d[2].val[1]);
2342 vst1_u32((uint32_t *)(dst + 6 * stride), d[3].val[0]);
2343 vst1_u32((uint32_t *)(dst + 7 * stride), d[3].val[1]);
2344 }
2345
2346 static void dr_prediction_z3_4x8_neon(uint8_t *dst, ptrdiff_t stride,
2347 const uint8_t *left, int upsample_left,
2348 int dy) {
2349 uint8x8_t dstvec[4];
2350 uint16x4x2_t d[2];
2351
2352 dr_prediction_z1_HxW_internal_neon_64(8, 4, dstvec, left, upsample_left, dy);
2353 transpose4x8_8x4_neon(dstvec, d);
2354 vst1_lane_u32((uint32_t *)(dst + stride * 0),
2355 vreinterpret_u32_u16(d[0].val[0]), 0);
2356 vst1_lane_u32((uint32_t *)(dst + stride * 1),
2357 vreinterpret_u32_u16(d[0].val[0]), 1);
2358 vst1_lane_u32((uint32_t *)(dst + stride * 2),
2359 vreinterpret_u32_u16(d[0].val[1]), 0);
2360 vst1_lane_u32((uint32_t *)(dst + stride * 3),
2361 vreinterpret_u32_u16(d[0].val[1]), 1);
2362 vst1_lane_u32((uint32_t *)(dst + stride * 4),
2363 vreinterpret_u32_u16(d[1].val[0]), 0);
2364 vst1_lane_u32((uint32_t *)(dst + stride * 5),
2365 vreinterpret_u32_u16(d[1].val[0]), 1);
2366 vst1_lane_u32((uint32_t *)(dst + stride * 6),
2367 vreinterpret_u32_u16(d[1].val[1]), 0);
2368 vst1_lane_u32((uint32_t *)(dst + stride * 7),
2369 vreinterpret_u32_u16(d[1].val[1]), 1);
2370 }
2371
2372 static void dr_prediction_z3_8x4_neon(uint8_t *dst, ptrdiff_t stride,
2373 const uint8_t *left, int upsample_left,
2374 int dy) {
2375 uint8x8_t dstvec[8];
2376 uint32x2x2_t d[2];
2377
2378 dr_prediction_z1_HxW_internal_neon_64(4, 8, dstvec, left, upsample_left, dy);
2379 transpose8x8_low_neon(dstvec, d);
2380 vst1_u32((uint32_t *)(dst + 0 * stride), d[0].val[0]);
2381 vst1_u32((uint32_t *)(dst + 1 * stride), d[0].val[1]);
2382 vst1_u32((uint32_t *)(dst + 2 * stride), d[1].val[0]);
2383 vst1_u32((uint32_t *)(dst + 3 * stride), d[1].val[1]);
2384 }
2385
2386 static void dr_prediction_z3_8x16_neon(uint8_t *dst, ptrdiff_t stride,
2387 const uint8_t *left, int upsample_left,
2388 int dy) {
2389 uint8x16_t dstvec[8];
2390 uint64x2_t d[8];
2391
2392 dr_prediction_z1_HxW_internal_neon(16, 8, dstvec, left, upsample_left, dy);
2393 transpose8x16_16x8_neon(dstvec, d);
2394 for (int i = 0; i < 8; i++) {
2395 vst1_u8(dst + i * stride, vreinterpret_u8_u64(vget_low_u64(d[i])));
2396 vst1_u8(dst + (i + 8) * stride, vreinterpret_u8_u64(vget_high_u64(d[i])));
2397 }
2398 }
2399
2400 static void dr_prediction_z3_16x8_neon(uint8_t *dst, ptrdiff_t stride,
2401 const uint8_t *left, int upsample_left,
2402 int dy) {
2403 uint8x8_t dstvec[16];
2404 uint64x2_t d[8];
2405
2406 dr_prediction_z1_HxW_internal_neon_64(8, 16, dstvec, left, upsample_left, dy);
2407 transpose16x8_8x16_neon(dstvec, d);
2408 for (int i = 0; i < 8; i++) {
2409 vst1q_u8(dst + i * stride, vreinterpretq_u8_u64(d[i]));
2410 }
2411 }
2412
2413 static void dr_prediction_z3_4x16_neon(uint8_t *dst, ptrdiff_t stride,
2414 const uint8_t *left, int upsample_left,
2415 int dy) {
2416 uint8x16_t dstvec[4];
2417 uint16x8x2_t d[2];
2418
2419 dr_prediction_z1_HxW_internal_neon(16, 4, dstvec, left, upsample_left, dy);
2420 transpose4x16_neon(dstvec, d);
2421 vst1q_lane_u32((uint32_t *)(dst + stride * 0),
2422 vreinterpretq_u32_u16(d[0].val[0]), 0);
2423 vst1q_lane_u32((uint32_t *)(dst + stride * 1),
2424 vreinterpretq_u32_u16(d[0].val[0]), 1);
2425 vst1q_lane_u32((uint32_t *)(dst + stride * 2),
2426 vreinterpretq_u32_u16(d[0].val[0]), 2);
2427 vst1q_lane_u32((uint32_t *)(dst + stride * 3),
2428 vreinterpretq_u32_u16(d[0].val[0]), 3);
2429
2430 vst1q_lane_u32((uint32_t *)(dst + stride * 4),
2431 vreinterpretq_u32_u16(d[0].val[1]), 0);
2432 vst1q_lane_u32((uint32_t *)(dst + stride * 5),
2433 vreinterpretq_u32_u16(d[0].val[1]), 1);
2434 vst1q_lane_u32((uint32_t *)(dst + stride * 6),
2435 vreinterpretq_u32_u16(d[0].val[1]), 2);
2436 vst1q_lane_u32((uint32_t *)(dst + stride * 7),
2437 vreinterpretq_u32_u16(d[0].val[1]), 3);
2438
2439 vst1q_lane_u32((uint32_t *)(dst + stride * 8),
2440 vreinterpretq_u32_u16(d[1].val[0]), 0);
2441 vst1q_lane_u32((uint32_t *)(dst + stride * 9),
2442 vreinterpretq_u32_u16(d[1].val[0]), 1);
2443 vst1q_lane_u32((uint32_t *)(dst + stride * 10),
2444 vreinterpretq_u32_u16(d[1].val[0]), 2);
2445 vst1q_lane_u32((uint32_t *)(dst + stride * 11),
2446 vreinterpretq_u32_u16(d[1].val[0]), 3);
2447
2448 vst1q_lane_u32((uint32_t *)(dst + stride * 12),
2449 vreinterpretq_u32_u16(d[1].val[1]), 0);
2450 vst1q_lane_u32((uint32_t *)(dst + stride * 13),
2451 vreinterpretq_u32_u16(d[1].val[1]), 1);
2452 vst1q_lane_u32((uint32_t *)(dst + stride * 14),
2453 vreinterpretq_u32_u16(d[1].val[1]), 2);
2454 vst1q_lane_u32((uint32_t *)(dst + stride * 15),
2455 vreinterpretq_u32_u16(d[1].val[1]), 3);
2456 }
2457
2458 static void dr_prediction_z3_16x4_neon(uint8_t *dst, ptrdiff_t stride,
2459 const uint8_t *left, int upsample_left,
2460 int dy) {
2461 uint8x8_t dstvec[16];
2462 uint64x2_t d[8];
2463
2464 dr_prediction_z1_HxW_internal_neon_64(4, 16, dstvec, left, upsample_left, dy);
2465 transpose16x8_8x16_neon(dstvec, d);
2466 for (int i = 0; i < 4; i++) {
2467 vst1q_u8(dst + i * stride, vreinterpretq_u8_u64(d[i]));
2468 }
2469 }
2470
2471 static void dr_prediction_z3_8x32_neon(uint8_t *dst, ptrdiff_t stride,
2472 const uint8_t *left, int upsample_left,
2473 int dy) {
2474 uint8x16x2_t dstvec[16];
2475 uint64x2x2_t d[16];
2476 uint8x16_t v_zero = vdupq_n_u8(0);
2477
2478 dr_prediction_z1_32xN_internal_neon(8, dstvec, left, upsample_left, dy);
2479 for (int i = 8; i < 16; i++) {
2480 dstvec[i].val[0] = v_zero;
2481 dstvec[i].val[1] = v_zero;
2482 }
2483 transpose16x32_neon(dstvec, d);
2484 for (int i = 0; i < 16; i++) {
2485 vst1_u8(dst + 2 * i * stride,
2486 vreinterpret_u8_u64(vget_low_u64(d[i].val[0])));
2487 vst1_u8(dst + (2 * i + 1) * stride,
2488 vreinterpret_u8_u64(vget_low_u64(d[i].val[1])));
2489 }
2490 }
2491
2492 static void dr_prediction_z3_32x8_neon(uint8_t *dst, ptrdiff_t stride,
2493 const uint8_t *left, int upsample_left,
2494 int dy) {
2495 uint8x8_t dstvec[32];
2496 uint64x2_t d[16];
2497
2498 dr_prediction_z1_HxW_internal_neon_64(8, 32, dstvec, left, upsample_left, dy);
2499 transpose16x8_8x16_neon(dstvec, d);
2500 transpose16x8_8x16_neon(dstvec + 16, d + 8);
2501 for (int i = 0; i < 8; i++) {
2502 vst1q_u8(dst + i * stride, vreinterpretq_u8_u64(d[i]));
2503 vst1q_u8(dst + i * stride + 16, vreinterpretq_u8_u64(d[i + 8]));
2504 }
2505 }
2506
2507 static void dr_prediction_z3_16x16_neon(uint8_t *dst, ptrdiff_t stride,
2508 const uint8_t *left, int upsample_left,
2509 int dy) {
2510 uint8x16_t dstvec[16];
2511 uint64x2_t d[16];
2512
2513 dr_prediction_z1_HxW_internal_neon(16, 16, dstvec, left, upsample_left, dy);
2514 transpose16x16_neon(dstvec, d);
2515 for (int i = 0; i < 16; i++) {
2516 vst1q_u8(dst + i * stride, vreinterpretq_u8_u64(d[i]));
2517 }
2518 }
2519
2520 static void dr_prediction_z3_32x32_neon(uint8_t *dst, ptrdiff_t stride,
2521 const uint8_t *left, int upsample_left,
2522 int dy) {
2523 uint8x16x2_t dstvec[32];
2524 uint64x2x2_t d[32];
2525
2526 dr_prediction_z1_32xN_internal_neon(32, dstvec, left, upsample_left, dy);
2527 transpose16x32_neon(dstvec, d);
2528 transpose16x32_neon(dstvec + 16, d + 16);
2529 for (int i = 0; i < 16; i++) {
2530 vst1q_u8(dst + 2 * i * stride, vreinterpretq_u8_u64(d[i].val[0]));
2531 vst1q_u8(dst + 2 * i * stride + 16, vreinterpretq_u8_u64(d[i + 16].val[0]));
2532 vst1q_u8(dst + (2 * i + 1) * stride, vreinterpretq_u8_u64(d[i].val[1]));
2533 vst1q_u8(dst + (2 * i + 1) * stride + 16,
2534 vreinterpretq_u8_u64(d[i + 16].val[1]));
2535 }
2536 }
2537
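// Several of the larger kernels below (64x64, 32x64, 64x32, 16x64) predict
// into a scratch buffer and transpose through memory; an in-register
// transpose would not fit for these shapes.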
2538 static void dr_prediction_z3_64x64_neon(uint8_t *dst, ptrdiff_t stride,
2539 const uint8_t *left, int upsample_left,
2540 int dy) {
2541 DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]);
2542
2543 dr_prediction_z1_64xN_neon(64, dstT, 64, left, upsample_left, dy);
2544 transpose(dstT, 64, dst, stride, 64, 64);
2545 }
2546
2547 static void dr_prediction_z3_16x32_neon(uint8_t *dst, ptrdiff_t stride,
2548 const uint8_t *left, int upsample_left,
2549 int dy) {
2550 uint8x16x2_t dstvec[16];
2551 uint64x2x2_t d[16];
2552
2553 dr_prediction_z1_32xN_internal_neon(16, dstvec, left, upsample_left, dy);
2554 transpose16x32_neon(dstvec, d);
2555 for (int i = 0; i < 16; i++) {
2556 vst1q_u8(dst + 2 * i * stride, vreinterpretq_u8_u64(d[i].val[0]));
2557 vst1q_u8(dst + (2 * i + 1) * stride, vreinterpretq_u8_u64(d[i].val[1]));
2558 }
2559 }
2560
2561 static void dr_prediction_z3_32x16_neon(uint8_t *dst, ptrdiff_t stride,
2562 const uint8_t *left, int upsample_left,
2563 int dy) {
2564 uint8x16_t dstvec[32];
2565 uint64x2_t d[16];
2566
2567 dr_prediction_z1_HxW_internal_neon(16, 32, dstvec, left, upsample_left, dy);
2568 for (int i = 0; i < 32; i += 16) {
2569 transpose16x16_neon((dstvec + i), d);
2570 for (int j = 0; j < 16; j++) {
2571 vst1q_u8(dst + j * stride + i, vreinterpretq_u8_u64(d[j]));
2572 }
2573 }
2574 }
2575
2576 static void dr_prediction_z3_32x64_neon(uint8_t *dst, ptrdiff_t stride,
2577 const uint8_t *left, int upsample_left,
2578 int dy) {
2579 uint8_t dstT[64 * 32];
2580
2581 dr_prediction_z1_64xN_neon(32, dstT, 64, left, upsample_left, dy);
2582 transpose(dstT, 64, dst, stride, 32, 64);
2583 }
2584
2585 static void dr_prediction_z3_64x32_neon(uint8_t *dst, ptrdiff_t stride,
2586 const uint8_t *left, int upsample_left,
2587 int dy) {
2588 uint8_t dstT[32 * 64];
2589
2590 dr_prediction_z1_32xN_neon(64, dstT, 32, left, upsample_left, dy);
2591 transpose(dstT, 32, dst, stride, 64, 32);
2592 }
2593
2594 static void dr_prediction_z3_16x64_neon(uint8_t *dst, ptrdiff_t stride,
2595 const uint8_t *left, int upsample_left,
2596 int dy) {
2597 uint8_t dstT[64 * 16];
2598
2599 dr_prediction_z1_64xN_neon(16, dstT, 64, left, upsample_left, dy);
2600 transpose(dstT, 64, dst, stride, 16, 64);
2601 }
2602
2603 static void dr_prediction_z3_64x16_neon(uint8_t *dst, ptrdiff_t stride,
2604 const uint8_t *left, int upsample_left,
2605 int dy) {
2606 uint8x16_t dstvec[64];
2607 uint64x2_t d[16];
2608
2609 dr_prediction_z1_HxW_internal_neon(16, 64, dstvec, left, upsample_left, dy);
2610 for (int i = 0; i < 64; i += 16) {
2611 transpose16x16_neon((dstvec + i), d);
2612 for (int j = 0; j < 16; j++) {
2613 vst1q_u8(dst + j * stride + i, vreinterpretq_u8_u64(d[j]));
2614 }
2615 }
2616 }
2617
2618 void av1_dr_prediction_z3_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
2619 const uint8_t *above, const uint8_t *left,
2620 int upsample_left, int dx, int dy) {
2621 (void)above;
2622 (void)dx;
2623 assert(dx == 1);
2624 assert(dy > 0);
2625
2626 if (bw == bh) {
2627 switch (bw) {
2628 case 4:
2629 dr_prediction_z3_4x4_neon(dst, stride, left, upsample_left, dy);
2630 break;
2631 case 8:
2632 dr_prediction_z3_8x8_neon(dst, stride, left, upsample_left, dy);
2633 break;
2634 case 16:
2635 dr_prediction_z3_16x16_neon(dst, stride, left, upsample_left, dy);
2636 break;
2637 case 32:
2638 dr_prediction_z3_32x32_neon(dst, stride, left, upsample_left, dy);
2639 break;
2640 case 64:
2641 dr_prediction_z3_64x64_neon(dst, stride, left, upsample_left, dy);
2642 break;
2643 }
2644 } else {
2645 if (bw < bh) {
2646 if (bw + bw == bh) {
2647 switch (bw) {
2648 case 4:
2649 dr_prediction_z3_4x8_neon(dst, stride, left, upsample_left, dy);
2650 break;
2651 case 8:
2652 dr_prediction_z3_8x16_neon(dst, stride, left, upsample_left, dy);
2653 break;
2654 case 16:
2655 dr_prediction_z3_16x32_neon(dst, stride, left, upsample_left, dy);
2656 break;
2657 case 32:
2658 dr_prediction_z3_32x64_neon(dst, stride, left, upsample_left, dy);
2659 break;
2660 }
2661 } else {
2662 switch (bw) {
2663 case 4:
2664 dr_prediction_z3_4x16_neon(dst, stride, left, upsample_left, dy);
2665 break;
2666 case 8:
2667 dr_prediction_z3_8x32_neon(dst, stride, left, upsample_left, dy);
2668 break;
2669 case 16:
2670 dr_prediction_z3_16x64_neon(dst, stride, left, upsample_left, dy);
2671 break;
2672 }
2673 }
2674 } else {
2675 if (bh + bh == bw) {
2676 switch (bh) {
2677 case 4:
2678 dr_prediction_z3_8x4_neon(dst, stride, left, upsample_left, dy);
2679 break;
2680 case 8:
2681 dr_prediction_z3_16x8_neon(dst, stride, left, upsample_left, dy);
2682 break;
2683 case 16:
2684 dr_prediction_z3_32x16_neon(dst, stride, left, upsample_left, dy);
2685 break;
2686 case 32:
2687 dr_prediction_z3_64x32_neon(dst, stride, left, upsample_left, dy);
2688 break;
2689 }
2690 } else {
2691 switch (bh) {
2692 case 4:
2693 dr_prediction_z3_16x4_neon(dst, stride, left, upsample_left, dy);
2694 break;
2695 case 8:
2696 dr_prediction_z3_32x8_neon(dst, stride, left, upsample_left, dy);
2697 break;
2698 case 16:
2699 dr_prediction_z3_64x16_neon(dst, stride, left, upsample_left, dy);
2700 break;
2701 }
2702 }
2703 }
2704 }
2705 }
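
// Smooth prediction weights are stored at 8-bit precision; the working scale
// is 1 << sm_weight_log2_scale == 256.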
2706 static const int sm_weight_log2_scale = 8;
2707
2708 // max(block_size_wide[BLOCK_LARGEST], block_size_high[BLOCK_LARGEST])
2709 #define MAX_BLOCK_DIM 64
2710
2711 /* clang-format off */
2712 static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = {
2713 // Unused, because we always offset by bs, which is at least 2.
2714 0, 0,
2715 // bs = 2
2716 255, 128,
2717 // bs = 4
2718 255, 149, 85, 64,
2719 // bs = 8
2720 255, 197, 146, 105, 73, 50, 37, 32,
2721 // bs = 16
2722 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
2723 // bs = 32
2724 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
2725 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
2726 // bs = 64
2727 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
2728 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73,
2729 69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16,
2730 15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4,
2731 };
2732 /* clang-format on */
2733
2734 // -----------------------------------------------------------------------------
2735 // SMOOTH_PRED
2736
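// For a W x H block the smooth predictor is a four-tap blend of the top row,
// left column, bottom-left and top-right reference pixels:
//   pred(r, c) = (w[r] * above[c] + (256 - w[r]) * left[H - 1] +
//                 w[c] * left[r] + (256 - w[c]) * above[W - 1] + 256) >> 9,
// with w[] read from sm_weight_arrays at offset W (or H). The helpers below
// stage the operands so that each row reduces to a few widening
// multiply-accumulates.
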
2737 // pixels[0]: above and below_pred interleave vector
2738 // pixels[1]: left vector
2739 // pixels[2]: right_pred vector
2740 static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
2741 int height, uint8x16_t *pixels) {
2742 uint32x4_t zero = vdupq_n_u32(0);
2743 const uint8x8_t d = vcreate_u8(((const uint32_t *)above)[0]);
2744 if (height == 4)
2745 pixels[1] =
2746 vreinterpretq_u8_u32(vld1q_lane_u32((const uint32_t *)left, zero, 0));
2747 else if (height == 8) {
2748 pixels[1] = vreinterpretq_u8_u64(vsetq_lane_u64(
2749 ((const uint64_t *)left)[0], vreinterpretq_u64_u32(zero), 0));
2750 } else {
2751 pixels[1] = vld1q_u8(left);
2752 }
2753
2754 pixels[2] = vreinterpretq_u8_u16(vdupq_n_u16(above[3]));
2755
2756 const uint16x8_t bp = vdupq_n_u16(left[height - 1]);
2757 #if defined(__aarch64__)
2758 pixels[0] = vreinterpretq_u8_u16(vzip1q_u16(vmovl_u8(d), bp));
2759 #else
2760 pixels[0] = vreinterpretq_u8_u16(vzipq_u16(vmovl_u8(d), bp).val[0]);
2761 #endif // (__aarch64__)
2762 }
2763
2764 // weight_h[0]: weight_h vector
2765 // weight_h[1]: scale - weight_h vector
2766 // weight_h[2]: same as [0], second half for height = 16 only
2767 // weight_h[3]: same as [1], second half for height = 16 only
2768 // weight_w[0]: weights_w and scale - weights_w interleave vector
2769 static INLINE void load_weight_w4(const uint8_t *weight_array, int height,
2770 uint16x8_t *weight_h, uint16x8_t *weight_w) {
2771 const uint16x8_t d = vdupq_n_u16((uint16_t)(1 << sm_weight_log2_scale));
2772 const uint8x8_t t = vcreate_u8(((const uint32_t *)(weight_array))[1]);
2773 weight_h[0] = vmovl_u8(t);
2774 weight_h[1] = vsubw_u8(d, t);
2775 #if defined(__aarch64__)
2776 weight_w[0] = vzip1q_u16(weight_h[0], weight_h[1]);
2777 #else
2778 weight_w[0] = vzipq_u16(weight_h[0], weight_h[1]).val[0];
2779 #endif // (__aarch64__)
2780
2781 if (height == 8) {
2782 const uint8x8_t weight = vld1_u8(&weight_array[8]);
2783 weight_h[0] = vmovl_u8(weight);
2784 weight_h[1] = vsubw_u8(d, weight);
2785 } else if (height == 16) {
2786 const uint8x16_t zero = vdupq_n_u8(0);
2787 const uint8x16_t weight = vld1q_u8(&weight_array[16]);
2788 const uint8x16x2_t weight_h_02 = vzipq_u8(weight, zero);
2789 weight_h[0] = vreinterpretq_u16_u8(weight_h_02.val[0]);
2790 weight_h[1] = vsubq_u16(d, vreinterpretq_u16_u8(weight_h_02.val[0]));
2791 weight_h[2] = vreinterpretq_u16_u8(weight_h_02.val[1]);
2792 weight_h[3] = vsubq_u16(d, vreinterpretq_u16_u8(weight_h_02.val[1]));
2793 }
2794 }
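
// As a worked example of the interleave above (scale 256): the
// width-direction weights for these 4-wide blocks are { 255, 149, 85, 64 },
// so weight_w[0] holds the (w, 256 - w) pairs
//   { 255, 1, 149, 107, 85, 171, 64, 192 }
// as eight u16 lanes. smooth_pred_4xh below splits the pairs back out: the
// even lanes (w) multiply the left pixel and the odd lanes (256 - w) multiply
// right_pred.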

static INLINE void smooth_pred_4xh(const uint8x16_t *pixel,
                                   const uint16x8_t *wh, const uint16x8_t *ww,
                                   int h, uint8_t *dst, ptrdiff_t stride,
                                   int second_half) {
  const uint16x4_t one = vdup_n_u16(1);
  const uint16x4_t inc = vdup_n_u16(0x202);
  uint16x4_t rep =
      second_half ? vdup_n_u16((uint16_t)0x8008) : vdup_n_u16((uint16_t)0x8000);
  uint16x4_t d = vdup_n_u16(0x100);
  const uint16x4_t v_pixel_0_lo = vmovn_u32(vreinterpretq_u32_u8(pixel[0]));
  const uint16x4_t v_pixel_0_hi =
      vmovn_u32(vreinterpretq_u32_u8(vextq_u8(pixel[0], pixel[0], 2)));
  const uint16x4_t v_pixel_2 = vget_low_u16(vreinterpretq_u16_u8(pixel[2]));
  const uint16x4_t ww_0_lo = vmovn_u32(vreinterpretq_u32_u16(ww[0]));
  const uint16x4_t ww_0_hi =
      vmovn_u32(vreinterpretq_u32_u16(vextq_u16(ww[0], ww[0], 1)));
  const uint8x8_t save_mask = vcreate_u8(0 + (2 << 8) + (4 << 16) + (6 << 24));

#if !defined(__aarch64__)
  const uint8x8x2_t v_split1 = { { vget_low_u8(vreinterpretq_u8_u16(wh[0])),
                                   vget_high_u8(vreinterpretq_u8_u16(wh[0])) } };
  const uint8x8x2_t v_split2 = { { vget_low_u8(vreinterpretq_u8_u16(wh[1])),
                                   vget_high_u8(vreinterpretq_u8_u16(wh[1])) } };
  const uint8x8x2_t v_split3 = { { vget_low_u8(pixel[1]),
                                   vget_high_u8(pixel[1]) } };
#endif  // !(__aarch64__)

  for (int i = 0; i < h; ++i) {
#if defined(__aarch64__)
    const uint8x8_t wg =
        vqtbl1_u8(vreinterpretq_u8_u16(wh[0]), vreinterpret_u8_u16(d));
    const uint8x8_t sc =
        vqtbl1_u8(vreinterpretq_u8_u16(wh[1]), vreinterpret_u8_u16(d));
#else
    const uint8x8_t wg = vtbl2_u8(v_split1, vreinterpret_u8_u16(d));
    const uint8x8_t sc = vtbl2_u8(v_split2, vreinterpret_u8_u16(d));
#endif  // (__aarch64__)

    uint32x4_t sum = vmull_u16(v_pixel_0_lo, vreinterpret_u16_u8(wg));
    sum = vmlal_u16(sum, v_pixel_0_hi, vreinterpret_u16_u8(sc));

#if defined(__aarch64__)
    uint8x8_t b = vqtbl1_u8(pixel[1], vreinterpret_u8_u16(rep));
#else
    uint8x8_t b = vtbl2_u8(v_split3, vreinterpret_u8_u16(rep));
#endif  // (__aarch64__)

    sum = vmlal_u16(sum, vreinterpret_u16_u8(b), ww_0_lo);
    sum = vmlal_u16(sum, v_pixel_2, ww_0_hi);
    uint8x8_t sum_l = vreinterpret_u8_u16(vqrshrn_n_u32(sum, 9));
    uint32x2_t predsh = vreinterpret_u32_u8(vtbl1_u8(sum_l, save_mask));
    vst1_lane_u32((uint32_t *)dst, predsh, 0);

    dst += stride;

    rep = vadd_u16(rep, one);
    d = vadd_u16(d, inc);
  }
}
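
// Note on the shuffle counters in smooth_pred_4xh: d and rep are u16 lane
// selectors encoded as byte-pair indices for vqtbl1/vtbl2 (little-endian lane
// order assumed, as elsewhere in this file). d starts at 0x0100 (bytes 0, 1,
// i.e. u16 lane 0 of wh) and advances by 0x0202 per row, stepping one u16
// lane. rep starts at 0x8000: byte index 0 selects the row's left pixel and
// the out-of-range 0x80 index is zeroed by the table lookup, yielding that
// pixel zero-extended to u16; it advances by 1 per row, and the second_half
// variant starts at 0x8008 to read left[8..15].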

void aom_smooth_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  uint8x16_t pixels[3];
  load_pixel_w4(above, left, 4, pixels);

  uint16x8_t wh[4], ww[2];
  load_weight_w4(sm_weight_arrays, 4, wh, ww);

  smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
}

void aom_smooth_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  uint8x16_t pixels[3];
  load_pixel_w4(above, left, 8, pixels);

  uint16x8_t wh[4], ww[2];
  load_weight_w4(sm_weight_arrays, 8, wh, ww);

  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
}

void aom_smooth_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  uint8x16_t pixels[3];
  load_pixel_w4(above, left, 16, pixels);

  uint16x8_t wh[4], ww[2];
  load_weight_w4(sm_weight_arrays, 16, wh, ww);

  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);
}

// pixels[0]: above vector, widened to sixteen bits
// pixels[1]: below_pred vector (left[height - 1], broadcast)
// pixels[2]: left vector
// pixels[3]: right_pred vector (above[7], broadcast)
// pixels[4]: copy of pixels[0] (height == 32 only)
// pixels[5]: copy of pixels[1] (height == 32 only)
// pixels[6]: left vector + 16 (height == 32 only)
// pixels[7]: copy of pixels[3] (height == 32 only)
static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
                                 int height, uint8x16_t *pixels) {
  pixels[0] = vreinterpretq_u8_u16(vmovl_u8(vld1_u8(above)));
  pixels[1] = vreinterpretq_u8_u16(vdupq_n_u16((uint16_t)left[height - 1]));
  pixels[3] = vreinterpretq_u8_u16(vdupq_n_u16((uint16_t)above[7]));

  if (height == 4) {
    const uint32x4_t zero32 = vdupq_n_u32(0);
    pixels[2] =
        vreinterpretq_u8_u32(vld1q_lane_u32((const uint32_t *)left, zero32, 0));
  } else if (height == 8) {
    const uint64x2_t zero64 = vdupq_n_u64(0);
    pixels[2] = vreinterpretq_u8_u64(
        vsetq_lane_u64(((const uint64_t *)left)[0], zero64, 0));
  } else if (height == 16) {
    pixels[2] = vld1q_u8(left);
  } else {
    pixels[2] = vld1q_u8(left);
    pixels[4] = pixels[0];
    pixels[5] = pixels[1];
    pixels[6] = vld1q_u8(left + 16);
    pixels[7] = pixels[3];
  }
}

// weight_h[0]: weight_h vector
// weight_h[1]: scale - weight_h vector
// weight_h[2]: same as [0], offset 8
// weight_h[3]: same as [1], offset 8
// weight_h[4]: same as [0], offset 16
// weight_h[5]: same as [1], offset 16
// weight_h[6]: same as [0], offset 24
// weight_h[7]: same as [1], offset 24
// weight_w[0]: weights_w vector, widened to sixteen bits
// weight_w[1]: scale - weights_w vector
static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
                                  uint16x8_t *weight_h, uint16x8_t *weight_w) {
  const uint8x16_t zero = vdupq_n_u8(0);
  const int we_offset = height < 8 ? 4 : 8;
  uint8x16_t we = vld1q_u8(&weight_array[we_offset]);
#if defined(__aarch64__)
  weight_h[0] = vreinterpretq_u16_u8(vzip1q_u8(we, zero));
#else
  weight_h[0] = vreinterpretq_u16_u8(vzipq_u8(we, zero).val[0]);
#endif  // (__aarch64__)
  const uint16x8_t d = vdupq_n_u16(256);
  weight_h[1] = vsubq_u16(d, weight_h[0]);

  if (height == 4) {
    we = vextq_u8(we, zero, 4);
#if defined(__aarch64__)
    weight_w[0] = vreinterpretq_u16_u8(vzip1q_u8(we, zero));
#else
    weight_w[0] = vmovl_u8(vget_low_u8(we));
#endif  // (__aarch64__)
    weight_w[1] = vsubq_u16(d, weight_w[0]);
  } else {
    weight_w[0] = weight_h[0];
    weight_w[1] = weight_h[1];
  }

  if (height == 16) {
    we = vld1q_u8(&weight_array[16]);
    const uint8x16x2_t weight_h_02 = vzipq_u8(we, zero);
    weight_h[0] = vreinterpretq_u16_u8(weight_h_02.val[0]);
    weight_h[1] = vsubq_u16(d, weight_h[0]);
    weight_h[2] = vreinterpretq_u16_u8(weight_h_02.val[1]);
    weight_h[3] = vsubq_u16(d, weight_h[2]);
  } else if (height == 32) {
    const uint8x16_t weight_lo = vld1q_u8(&weight_array[32]);
    const uint8x16x2_t weight_h_02 = vzipq_u8(weight_lo, zero);
    weight_h[0] = vreinterpretq_u16_u8(weight_h_02.val[0]);
    weight_h[1] = vsubq_u16(d, weight_h[0]);
    weight_h[2] = vreinterpretq_u16_u8(weight_h_02.val[1]);
    weight_h[3] = vsubq_u16(d, weight_h[2]);
    const uint8x16_t weight_hi = vld1q_u8(&weight_array[32 + 16]);
    const uint8x16x2_t weight_h_46 = vzipq_u8(weight_hi, zero);
    weight_h[4] = vreinterpretq_u16_u8(weight_h_46.val[0]);
    weight_h[5] = vsubq_u16(d, weight_h[4]);
    weight_h[6] = vreinterpretq_u16_u8(weight_h_46.val[1]);
    weight_h[7] = vsubq_u16(d, weight_h[6]);
  }
}

static INLINE void smooth_pred_8xh(const uint8x16_t *pixels,
                                   const uint16x8_t *wh, const uint16x8_t *ww,
                                   int h, uint8_t *dst, ptrdiff_t stride,
                                   int second_half) {
  const uint16x8_t one = vdupq_n_u16(1);
  const uint16x8_t inc = vdupq_n_u16(0x202);
  uint16x8_t rep = second_half ? vdupq_n_u16((uint16_t)0x8008)
                               : vdupq_n_u16((uint16_t)0x8000);
  uint16x8_t d = vdupq_n_u16(0x100);

#if !defined(__aarch64__)
  const uint8x8x2_t v_split1 = { { vget_low_u8(vreinterpretq_u8_u16(wh[0])),
                                   vget_high_u8(vreinterpretq_u8_u16(wh[0])) } };
  const uint8x8x2_t v_split2 = { { vget_low_u8(vreinterpretq_u8_u16(wh[1])),
                                   vget_high_u8(vreinterpretq_u8_u16(wh[1])) } };
  const uint8x8x2_t v_split3 = { { vget_low_u8(pixels[2]),
                                   vget_high_u8(pixels[2]) } };
#endif  // !(__aarch64__)

  for (int i = 0; i < h; ++i) {
#if defined(__aarch64__)
    const uint8x16_t wg_wg =
        vqtbl1q_u8(vreinterpretq_u8_u16(wh[0]), vreinterpretq_u8_u16(d));
    const uint8x16_t sc_sc =
        vqtbl1q_u8(vreinterpretq_u8_u16(wh[1]), vreinterpretq_u8_u16(d));
#else
    const uint8x8_t v_d_lo = vreinterpret_u8_u16(vget_low_u16(d));
    const uint8x8_t v_d_hi = vreinterpret_u8_u16(vget_high_u16(d));
    const uint8x16_t wg_wg =
        vcombine_u8(vtbl2_u8(v_split1, v_d_lo), vtbl2_u8(v_split1, v_d_hi));
    const uint8x16_t sc_sc =
        vcombine_u8(vtbl2_u8(v_split2, v_d_lo), vtbl2_u8(v_split2, v_d_hi));
#endif  // (__aarch64__)
    uint16x8_t s01 =
        vmulq_u16(vreinterpretq_u16_u8(pixels[0]), vreinterpretq_u16_u8(wg_wg));
    s01 = vmlaq_u16(s01, vreinterpretq_u16_u8(pixels[1]),
                    vreinterpretq_u16_u8(sc_sc));
#if defined(__aarch64__)
    const uint8x16_t b = vqtbl1q_u8(pixels[2], vreinterpretq_u8_u16(rep));
#else
    const uint8x16_t b = vcombine_u8(
        vtbl2_u8(v_split3, vget_low_u8(vreinterpretq_u8_u16(rep))),
        vtbl2_u8(v_split3, vget_high_u8(vreinterpretq_u8_u16(rep))));
#endif  // (__aarch64__)
    uint16x8_t sum0 = vmulq_u16(vreinterpretq_u16_u8(b), ww[0]);
    sum0 = vmlaq_u16(sum0, vreinterpretq_u16_u8(pixels[3]), ww[1]);

    uint32x4_t s0 = vaddl_u16(vget_low_u16(s01), vget_low_u16(sum0));
#if defined(__aarch64__)
    uint32x4_t s1 = vaddl_high_u16(s01, sum0);
#else
    uint32x4_t s1 = vaddl_u16(vget_high_u16(s01), vget_high_u16(sum0));
#endif  // (__aarch64__)

    sum0 = vcombine_u16(vqrshrn_n_u32(s0, 9), vqrshrn_n_u32(s1, 9));
    uint8x8_t predsh = vqmovn_u16(sum0);
    vst1_u8(dst, predsh);

    dst += stride;
    rep = vaddq_u16(rep, one);
    d = vaddq_u16(d, inc);
  }
}
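
// Why smooth_pred_8xh widens before narrowing: because each weight pair sums
// to 256, the two 16-bit accumulators are individually safe (each is at most
// 255 * 256 = 65280, which fits in u16), but their sum can reach 130560 and
// needs 17 bits. Hence vaddl/vaddl_high to u32, followed by the rounded
// narrowing shift by 9, i.e. (sum + 256) >> 9 for the combined scale of 512.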

void aom_smooth_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  uint8x16_t pixels[4];
  load_pixel_w8(above, left, 4, pixels);

  uint16x8_t wh[4], ww[2];
  load_weight_w8(sm_weight_arrays, 4, wh, ww);

  smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
}

void aom_smooth_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  uint8x16_t pixels[4];
  load_pixel_w8(above, left, 8, pixels);

  uint16x8_t wh[4], ww[2];
  load_weight_w8(sm_weight_arrays, 8, wh, ww);

  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
}

void aom_smooth_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  uint8x16_t pixels[4];
  load_pixel_w8(above, left, 16, pixels);

  uint16x8_t wh[4], ww[2];
  load_weight_w8(sm_weight_arrays, 16, wh, ww);

  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
}

void aom_smooth_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  uint8x16_t pixels[8];
  load_pixel_w8(above, left, 32, pixels);

  uint16x8_t wh[8], ww[2];
  load_weight_w8(sm_weight_arrays, 32, wh, ww);

  smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
  dst += stride << 3;
  smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
}

static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left, uint32_t bw,
                                        uint32_t bh) {
  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
  const uint16x8_t scale_value = vdupq_n_u16(256);

  for (uint32_t y = 0; y < bh; ++y) {
    const uint8x8_t left_y = vdup_n_u8(left[y]);
    const uint8x8_t weights_y_dup = vdup_n_u8(sm_weights_h[y]);
    const uint32x4_t pred_scaled_bl =
        vdupq_n_u32(256 + (256 - sm_weights_h[y]) * left[bh - 1]);

    for (uint32_t x = 0; x < bw; x += 8) {
      const uint8x8_t weights_x = vld1_u8(sm_weights_w + x);
      const uint8x8_t top_x = vld1_u8(above + x);

      uint16x8_t pred_m1, pred_m2;
      uint32x4_t pred_lo, pred_hi;
      pred_m1 = vmull_u8(top_x, weights_y_dup);
      pred_m2 = vmull_u8(weights_x, left_y);

      pred_lo = vaddl_u16(vget_low_u16(pred_m1), vget_low_u16(pred_m2));
#if defined(__aarch64__)
      pred_hi = vaddl_high_u16(pred_m1, pred_m2);
#else
      pred_hi = vaddl_u16(vget_high_u16(pred_m1), vget_high_u16(pred_m2));
#endif  // (__aarch64__)

      const uint16x8_t scale_m_weights_x = vsubw_u8(scale_value, weights_x);

      const uint16x8_t swxtr = vmulq_n_u16(scale_m_weights_x, above[bw - 1]);

      pred_lo = vaddq_u32(pred_lo, pred_scaled_bl);
      pred_hi = vaddq_u32(pred_hi, pred_scaled_bl);

      pred_lo = vaddw_u16(pred_lo, vget_low_u16(swxtr));
#if defined(__aarch64__)
      pred_hi = vaddw_high_u16(pred_hi, swxtr);
#else
      pred_hi = vaddw_u16(pred_hi, vget_high_u16(swxtr));
#endif  // (__aarch64__)

      uint16x8_t pred =
          vcombine_u16(vshrn_n_u32(pred_lo, 9), vshrn_n_u32(pred_hi, 9));

      uint8x8_t predsh = vqmovn_u16(pred);

      vst1_u8(dst + x, predsh);
    }

    dst += stride;
  }
}
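
// A minimal usage sketch for the wide predictors (illustrative only; the
// buffer values and the helper name are arbitrary and not part of the
// library). The caller supplies the reconstructed border: `above` must hold
// at least bw pixels and `left` at least bh pixels, since above[bw - 1] and
// left[bh - 1] serve as the right and bottom corner predictors.
#if 0
static void smooth_16x16_example(void) {
  uint8_t dst[16 * 16];
  uint8_t above[16], left[16];
  for (int i = 0; i < 16; ++i) {
    above[i] = (uint8_t)(100 + i);  // arbitrary top border
    left[i] = (uint8_t)(50 + i);    // arbitrary left border
  }
  aom_smooth_predictor_16x16_neon(dst, /*stride=*/16, above, left);
}
#endif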

void aom_smooth_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 4);
}

void aom_smooth_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 8);
}

void aom_smooth_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 16);
}

void aom_smooth_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 32);
}

void aom_smooth_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 8);
}

void aom_smooth_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 16);
}

void aom_smooth_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 32);
}

void aom_smooth_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 64);
}

void aom_smooth_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 64, 64);
}

void aom_smooth_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 64, 32);
}

void aom_smooth_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 64, 16);
}

void aom_smooth_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 64);
}