/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>

#include "common/tools_common.h"

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom/aom_integer.h"

21 //------------------------------------------------------------------------------
22 // DC 4x4
23 
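// The DC predictors below fill the block with the rounded average of the
// available border pixels. A scalar sketch of the 4x4 rule (illustrative
// only, mirroring the vrshrn_n_u16 shifts used in dc_4x4):
//
//   if (do_above && do_left) dc = (sum(above[0..3]) + sum(left[0..3]) + 4) >> 3;
//   else if (do_above)       dc = (sum(above[0..3]) + 2) >> 2;
//   else if (do_left)        dc = (sum(left[0..3]) + 2) >> 2;
//   else                     dc = 0x80;  // neither border available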
24 // 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
                          const uint8_t *left, int do_above, int do_left) {
27   uint16x8_t sum_top;
28   uint16x8_t sum_left;
29   uint8x8_t dc0;
30 
31   if (do_above) {
32     const uint8x8_t A = vld1_u8(above);  // top row
33     const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
34     const uint16x4_t p1 = vpadd_u16(p0, p0);
35     sum_top = vcombine_u16(p1, p1);
36   }
37 
38   if (do_left) {
39     const uint8x8_t L = vld1_u8(left);   // left border
40     const uint16x4_t p0 = vpaddl_u8(L);  // cascading summation of the left
41     const uint16x4_t p1 = vpadd_u16(p0, p0);
42     sum_left = vcombine_u16(p1, p1);
43   }
44 
45   if (do_above && do_left) {
46     const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
47     dc0 = vrshrn_n_u16(sum, 3);
48   } else if (do_above) {
49     dc0 = vrshrn_n_u16(sum_top, 2);
50   } else if (do_left) {
51     dc0 = vrshrn_n_u16(sum_left, 2);
52   } else {
53     dc0 = vdup_n_u8(0x80);
54   }
55 
56   {
57     const uint8x8_t dc = vdup_lane_u8(dc0, 0);
58     int i;
59     for (i = 0; i < 4; ++i) {
60       vst1_lane_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc), 0);
61     }
62   }
63 }
64 
void aom_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
67   dc_4x4(dst, stride, above, left, 1, 1);
68 }
69 
void aom_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
72   (void)above;
73   dc_4x4(dst, stride, NULL, left, 0, 1);
74 }
75 
void aom_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
78   (void)left;
79   dc_4x4(dst, stride, above, NULL, 1, 0);
80 }
81 
void aom_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
84   (void)above;
85   (void)left;
86   dc_4x4(dst, stride, NULL, NULL, 0, 0);
87 }
88 
89 //------------------------------------------------------------------------------
90 // DC 8x8
91 
92 // 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
                          const uint8_t *left, int do_above, int do_left) {
95   uint16x8_t sum_top;
96   uint16x8_t sum_left;
97   uint8x8_t dc0;
98 
99   if (do_above) {
100     const uint8x8_t A = vld1_u8(above);  // top row
101     const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
102     const uint16x4_t p1 = vpadd_u16(p0, p0);
103     const uint16x4_t p2 = vpadd_u16(p1, p1);
104     sum_top = vcombine_u16(p2, p2);
105   }
106 
107   if (do_left) {
108     const uint8x8_t L = vld1_u8(left);   // left border
109     const uint16x4_t p0 = vpaddl_u8(L);  // cascading summation of the left
110     const uint16x4_t p1 = vpadd_u16(p0, p0);
111     const uint16x4_t p2 = vpadd_u16(p1, p1);
112     sum_left = vcombine_u16(p2, p2);
113   }
114 
115   if (do_above && do_left) {
116     const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
117     dc0 = vrshrn_n_u16(sum, 4);
118   } else if (do_above) {
119     dc0 = vrshrn_n_u16(sum_top, 3);
120   } else if (do_left) {
121     dc0 = vrshrn_n_u16(sum_left, 3);
122   } else {
123     dc0 = vdup_n_u8(0x80);
124   }
125 
126   {
127     const uint8x8_t dc = vdup_lane_u8(dc0, 0);
128     int i;
129     for (i = 0; i < 8; ++i) {
130       vst1_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc));
131     }
132   }
133 }
134 
void aom_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
137   dc_8x8(dst, stride, above, left, 1, 1);
138 }
139 
void aom_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
142   (void)above;
143   dc_8x8(dst, stride, NULL, left, 0, 1);
144 }
145 
void aom_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
148   (void)left;
149   dc_8x8(dst, stride, above, NULL, 1, 0);
150 }
151 
void aom_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
154   (void)above;
155   (void)left;
156   dc_8x8(dst, stride, NULL, NULL, 0, 0);
157 }
158 
159 //------------------------------------------------------------------------------
160 // DC 16x16
161 
162 // 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left,
                            int do_above, int do_left) {
166   uint16x8_t sum_top;
167   uint16x8_t sum_left;
168   uint8x8_t dc0;
169 
170   if (do_above) {
171     const uint8x16_t A = vld1q_u8(above);  // top row
172     const uint16x8_t p0 = vpaddlq_u8(A);   // cascading summation of the top
173     const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
174     const uint16x4_t p2 = vpadd_u16(p1, p1);
175     const uint16x4_t p3 = vpadd_u16(p2, p2);
176     sum_top = vcombine_u16(p3, p3);
177   }
178 
179   if (do_left) {
180     const uint8x16_t L = vld1q_u8(left);  // left row
181     const uint16x8_t p0 = vpaddlq_u8(L);  // cascading summation of the left
182     const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
183     const uint16x4_t p2 = vpadd_u16(p1, p1);
184     const uint16x4_t p3 = vpadd_u16(p2, p2);
185     sum_left = vcombine_u16(p3, p3);
186   }
187 
188   if (do_above && do_left) {
189     const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
190     dc0 = vrshrn_n_u16(sum, 5);
191   } else if (do_above) {
192     dc0 = vrshrn_n_u16(sum_top, 4);
193   } else if (do_left) {
194     dc0 = vrshrn_n_u16(sum_left, 4);
195   } else {
196     dc0 = vdup_n_u8(0x80);
197   }
198 
199   {
200     const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
201     int i;
202     for (i = 0; i < 16; ++i) {
203       vst1q_u8(dst + i * stride, dc);
204     }
205   }
206 }
207 
void aom_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
210   dc_16x16(dst, stride, above, left, 1, 1);
211 }
212 
void aom_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
216   (void)above;
217   dc_16x16(dst, stride, NULL, left, 0, 1);
218 }
219 
void aom_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
223   (void)left;
224   dc_16x16(dst, stride, above, NULL, 1, 0);
225 }
226 
void aom_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
230   (void)above;
231   (void)left;
232   dc_16x16(dst, stride, NULL, NULL, 0, 0);
233 }
234 
235 //------------------------------------------------------------------------------
236 // DC 32x32
237 
238 // 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left,
                            int do_above, int do_left) {
242   uint16x8_t sum_top;
243   uint16x8_t sum_left;
244   uint8x8_t dc0;
245 
246   if (do_above) {
247     const uint8x16_t A0 = vld1q_u8(above);  // top row
248     const uint8x16_t A1 = vld1q_u8(above + 16);
249     const uint16x8_t p0 = vpaddlq_u8(A0);  // cascading summation of the top
250     const uint16x8_t p1 = vpaddlq_u8(A1);
251     const uint16x8_t p2 = vaddq_u16(p0, p1);
252     const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
253     const uint16x4_t p4 = vpadd_u16(p3, p3);
254     const uint16x4_t p5 = vpadd_u16(p4, p4);
255     sum_top = vcombine_u16(p5, p5);
256   }
257 
258   if (do_left) {
259     const uint8x16_t L0 = vld1q_u8(left);  // left row
260     const uint8x16_t L1 = vld1q_u8(left + 16);
261     const uint16x8_t p0 = vpaddlq_u8(L0);  // cascading summation of the left
262     const uint16x8_t p1 = vpaddlq_u8(L1);
263     const uint16x8_t p2 = vaddq_u16(p0, p1);
264     const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
265     const uint16x4_t p4 = vpadd_u16(p3, p3);
266     const uint16x4_t p5 = vpadd_u16(p4, p4);
267     sum_left = vcombine_u16(p5, p5);
268   }
269 
270   if (do_above && do_left) {
271     const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
272     dc0 = vrshrn_n_u16(sum, 6);
273   } else if (do_above) {
274     dc0 = vrshrn_n_u16(sum_top, 5);
275   } else if (do_left) {
276     dc0 = vrshrn_n_u16(sum_left, 5);
277   } else {
278     dc0 = vdup_n_u8(0x80);
279   }
280 
281   {
282     const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
283     int i;
284     for (i = 0; i < 32; ++i) {
285       vst1q_u8(dst + i * stride, dc);
286       vst1q_u8(dst + i * stride + 16, dc);
287     }
288   }
289 }
290 
void aom_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
293   dc_32x32(dst, stride, above, left, 1, 1);
294 }
295 
void aom_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
299   (void)above;
300   dc_32x32(dst, stride, NULL, left, 0, 1);
301 }
302 
void aom_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
306   (void)left;
307   dc_32x32(dst, stride, above, NULL, 1, 0);
308 }
309 
void aom_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
313   (void)above;
314   (void)left;
315   dc_32x32(dst, stride, NULL, NULL, 0, 0);
316 }
317 
318 // -----------------------------------------------------------------------------
319 
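// In aom_d135_predictor_4x4_neon below, the variable names encode byte order:
// X is the top-left pixel above[-1], A..D are above[0..3], and I..L are
// left[0..3]. avg1/avg2 effectively apply the rounded 3-tap (1, 2, 1) filter
// along the 135-degree diagonal, and each output row is a 1-byte shift of the
// filtered vector.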
void aom_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
322   const uint8x8_t XABCD_u8 = vld1_u8(above - 1);
323   const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
324   const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
325   const uint32x2_t zero = vdup_n_u32(0);
326   const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0);
327   const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL);
328   const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8));
329   const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
330   const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
331   const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
332   const uint8_t D = vget_lane_u8(XABCD_u8, 4);
333   const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
334   const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
335   const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
336   const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
337   const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
338   const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
339   const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
340   const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
341   const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
342   vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
343   vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
344   vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
345   vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
346 }
347 
void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
350   int i;
351   uint32x2_t d0u32 = vdup_n_u32(0);
352   (void)left;
353 
354   d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0);
355   for (i = 0; i < 4; i++, dst += stride)
356     vst1_lane_u32((uint32_t *)dst, d0u32, 0);
357 }
358 
void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
361   int i;
362   uint8x8_t d0u8 = vdup_n_u8(0);
363   (void)left;
364 
365   d0u8 = vld1_u8(above);
366   for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8);
367 }
368 
void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
371   int i;
372   uint8x16_t q0u8 = vdupq_n_u8(0);
373   (void)left;
374 
375   q0u8 = vld1q_u8(above);
376   for (i = 0; i < 16; i++, dst += stride) vst1q_u8(dst, q0u8);
377 }
378 
void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
381   int i;
382   uint8x16_t q0u8 = vdupq_n_u8(0);
383   uint8x16_t q1u8 = vdupq_n_u8(0);
384   (void)left;
385 
386   q0u8 = vld1q_u8(above);
387   q1u8 = vld1q_u8(above + 16);
388   for (i = 0; i < 32; i++, dst += stride) {
389     vst1q_u8(dst, q0u8);
390     vst1q_u8(dst + 16, q1u8);
391   }
392 }
393 
void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
396   uint8x8_t d0u8 = vdup_n_u8(0);
397   uint32x2_t d1u32 = vdup_n_u32(0);
398   (void)above;
399 
400   d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);
401 
402   d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
403   vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
404   dst += stride;
405   d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
406   vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
407   dst += stride;
408   d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
409   vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
410   dst += stride;
411   d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
412   vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
413 }
414 
void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
417   uint8x8_t d0u8 = vdup_n_u8(0);
418   uint64x1_t d1u64 = vdup_n_u64(0);
419   (void)above;
420 
421   d1u64 = vld1_u64((const uint64_t *)left);
422 
423   d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0);
424   vst1_u8(dst, d0u8);
425   dst += stride;
426   d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1);
427   vst1_u8(dst, d0u8);
428   dst += stride;
429   d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2);
430   vst1_u8(dst, d0u8);
431   dst += stride;
432   d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3);
433   vst1_u8(dst, d0u8);
434   dst += stride;
435   d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4);
436   vst1_u8(dst, d0u8);
437   dst += stride;
438   d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5);
439   vst1_u8(dst, d0u8);
440   dst += stride;
441   d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6);
442   vst1_u8(dst, d0u8);
443   dst += stride;
444   d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7);
445   vst1_u8(dst, d0u8);
446 }
447 
void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
450   int j;
451   uint8x8_t d2u8 = vdup_n_u8(0);
452   uint8x16_t q0u8 = vdupq_n_u8(0);
453   uint8x16_t q1u8 = vdupq_n_u8(0);
454   (void)above;
455 
456   q1u8 = vld1q_u8(left);
457   d2u8 = vget_low_u8(q1u8);
458   for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
459     q0u8 = vdupq_lane_u8(d2u8, 0);
460     vst1q_u8(dst, q0u8);
461     dst += stride;
462     q0u8 = vdupq_lane_u8(d2u8, 1);
463     vst1q_u8(dst, q0u8);
464     dst += stride;
465     q0u8 = vdupq_lane_u8(d2u8, 2);
466     vst1q_u8(dst, q0u8);
467     dst += stride;
468     q0u8 = vdupq_lane_u8(d2u8, 3);
469     vst1q_u8(dst, q0u8);
470     dst += stride;
471     q0u8 = vdupq_lane_u8(d2u8, 4);
472     vst1q_u8(dst, q0u8);
473     dst += stride;
474     q0u8 = vdupq_lane_u8(d2u8, 5);
475     vst1q_u8(dst, q0u8);
476     dst += stride;
477     q0u8 = vdupq_lane_u8(d2u8, 6);
478     vst1q_u8(dst, q0u8);
479     dst += stride;
480     q0u8 = vdupq_lane_u8(d2u8, 7);
481     vst1q_u8(dst, q0u8);
482     dst += stride;
483   }
484 }
485 
void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
488   int j, k;
489   uint8x8_t d2u8 = vdup_n_u8(0);
490   uint8x16_t q0u8 = vdupq_n_u8(0);
491   uint8x16_t q1u8 = vdupq_n_u8(0);
492   (void)above;
493 
494   for (k = 0; k < 2; k++, left += 16) {
495     q1u8 = vld1q_u8(left);
496     d2u8 = vget_low_u8(q1u8);
497     for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
498       q0u8 = vdupq_lane_u8(d2u8, 0);
499       vst1q_u8(dst, q0u8);
500       vst1q_u8(dst + 16, q0u8);
501       dst += stride;
502       q0u8 = vdupq_lane_u8(d2u8, 1);
503       vst1q_u8(dst, q0u8);
504       vst1q_u8(dst + 16, q0u8);
505       dst += stride;
506       q0u8 = vdupq_lane_u8(d2u8, 2);
507       vst1q_u8(dst, q0u8);
508       vst1q_u8(dst + 16, q0u8);
509       dst += stride;
510       q0u8 = vdupq_lane_u8(d2u8, 3);
511       vst1q_u8(dst, q0u8);
512       vst1q_u8(dst + 16, q0u8);
513       dst += stride;
514       q0u8 = vdupq_lane_u8(d2u8, 4);
515       vst1q_u8(dst, q0u8);
516       vst1q_u8(dst + 16, q0u8);
517       dst += stride;
518       q0u8 = vdupq_lane_u8(d2u8, 5);
519       vst1q_u8(dst, q0u8);
520       vst1q_u8(dst + 16, q0u8);
521       dst += stride;
522       q0u8 = vdupq_lane_u8(d2u8, 6);
523       vst1q_u8(dst, q0u8);
524       vst1q_u8(dst + 16, q0u8);
525       dst += stride;
526       q0u8 = vdupq_lane_u8(d2u8, 7);
527       vst1q_u8(dst, q0u8);
528       vst1q_u8(dst + 16, q0u8);
529       dst += stride;
530     }
531   }
532 }
533 
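// Computes the high bit depth DC value as the rounded average of the 2 * bw
// border pixels, i.e. expected_dc = (sum + bw) / (2 * bw). For example, with
// bw == 8 this reduces to (sum + 8) / 16 (values here are illustrative).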
static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
                                       const uint16_t *above,
                                       const uint16_t *left) {
537   assert(bw >= 4);
538   assert(IS_POWER_OF_TWO(bw));
539   int expected_dc, sum = 0;
540   const int count = bw * 2;
541   uint32x4_t sum_q = vdupq_n_u32(0);
542   uint32x2_t sum_d;
543   uint16_t *dst_1;
544   if (bw >= 8) {
545     for (int i = 0; i < bw; i += 8) {
546       sum_q = vpadalq_u16(sum_q, vld1q_u16(above));
547       sum_q = vpadalq_u16(sum_q, vld1q_u16(left));
548       above += 8;
549       left += 8;
550     }
551     sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q));
552     sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0);
553     expected_dc = (sum + (count >> 1)) / count;
554     const uint16x8_t dc = vdupq_n_u16((uint16_t)expected_dc);
555     for (int r = 0; r < bw; r++) {
556       dst_1 = dst;
557       for (int i = 0; i < bw; i += 8) {
558         vst1q_u16(dst_1, dc);
559         dst_1 += 8;
560       }
561       dst += stride;
562     }
563   } else {  // 4x4
564     sum_q = vaddl_u16(vld1_u16(above), vld1_u16(left));
565     sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q));
566     sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0);
567     expected_dc = (sum + (count >> 1)) / count;
568     const uint16x4_t dc = vdup_n_u16((uint16_t)expected_dc);
569     for (int r = 0; r < bw; r++) {
570       vst1_u16(dst, dc);
571       dst += stride;
572     }
573   }
574 }
575 
576 #define intra_pred_highbd_sized_neon(type, width)               \
577   void aom_highbd_##type##_predictor_##width##x##width##_neon(  \
578       uint16_t *dst, ptrdiff_t stride, const uint16_t *above,   \
579       const uint16_t *left, int bd) {                           \
580     (void)bd;                                                   \
581     highbd_##type##_predictor(dst, stride, width, above, left); \
582   }
583 
584 #define intra_pred_square(type)           \
585   intra_pred_highbd_sized_neon(type, 4);  \
586   intra_pred_highbd_sized_neon(type, 8);  \
587   intra_pred_highbd_sized_neon(type, 16); \
588   intra_pred_highbd_sized_neon(type, 32); \
589   intra_pred_highbd_sized_neon(type, 64);
590 
591 intra_pred_square(dc);
592 #undef intra_pred_square
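// intra_pred_square(dc) above expands, via intra_pred_highbd_sized_neon, into
// the wrappers aom_highbd_dc_predictor_{4x4,8x8,16x16,32x32,64x64}_neon, each
// of which drops the unused bit-depth argument and forwards to
// highbd_dc_predictor() with the matching width.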
593 
594 /* ---------------------P R E D I C T I O N   Z 1--------------------------- */
595 
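// vtbl2_u8 index tables used by the zone 2 predictors when the above row is
// upsampled: row k pads each 8-byte half with k zeros and then gathers every
// other byte starting at offset k (low half) and offset k + 1 (high half),
// de-interleaving the upsampled samples while compensating for base_shift.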
596 static DECLARE_ALIGNED(16, uint8_t, EvenOddMaskx[8][16]) = {
597   { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },
598   { 0, 1, 3, 5, 7, 9, 11, 13, 0, 2, 4, 6, 8, 10, 12, 14 },
599   { 0, 0, 2, 4, 6, 8, 10, 12, 0, 0, 3, 5, 7, 9, 11, 13 },
600   { 0, 0, 0, 3, 5, 7, 9, 11, 0, 0, 0, 4, 6, 8, 10, 12 },
601   { 0, 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 0, 5, 7, 9, 11 },
602   { 0, 0, 0, 0, 0, 5, 7, 9, 0, 0, 0, 0, 0, 6, 8, 10 },
603   { 0, 0, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 0, 7, 9 },
604   { 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 8 }
605 };
606 
607 // Low bit depth functions
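// BaseMask[n] has its first n bytes set to 0xff and the rest set to 0. It is
// used to blend the first n computed pixels of a row with the replicated
// right-most above pixel (a_mbase_x) once the prediction runs past max_base_x.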
608 static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = {
609   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
610     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
611   { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
612     0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
613   { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
614     0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
615   { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
616     0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
617   { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
618     0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
619   { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
620     0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
621   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
622     0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
623   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
624     0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0 },
625   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
626     0,    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0 },
627   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
628     0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0 },
629   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
630     0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
631     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
632   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
633     0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
634     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
635   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
636     0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
637     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
638   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
639     0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0,
640     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
641   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
642     0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0,
643     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
644   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
645     0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,
646     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
647   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
648     0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,
649     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
650   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
651     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,
652     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
653   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
654     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,
655     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
656   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
657     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,
658     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
659   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
660     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,
661     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
662   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
663     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
664     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
665   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
666     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
667     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
668   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
669     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
670     0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0 },
671   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
672     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
673     0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0 },
674   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
675     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
676     0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0 },
677   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
678     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
679     0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0 },
680   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
681     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
682     0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0 },
683   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
684     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
685     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0 },
686   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
687     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
688     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0 },
689   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
690     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
691     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0 },
692   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
693     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
694     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 },
695   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
696     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
697     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
698 };
699 
700 /* clang-format on */
static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_neon_64(
    int H, int W, uint8x8_t *dst, const uint8_t *above, int upsample_above,
    int dx) {
704   const int frac_bits = 6 - upsample_above;
705   const int max_base_x = ((W + H) - 1) << upsample_above;
706 
707   assert(dx > 0);
708   // pre-filter above pixels
709   // store in temp buffers:
710   //   above[x] * 32 + 16
711   //   above[x+1] - above[x]
712   // final pixels will be calculated as:
713   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
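  // Scalar equivalent for one output pixel in row r, column c, assuming
  // upsample_above == 0 (illustrative only):
  //   base  = x >> 6;
  //   shift = (x & 0x3f) >> 1;
  //   pixel = (above[base + c] * 32 + 16 +
  //            (above[base + c + 1] - above[base + c]) * shift) >> 5;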
714 
715   uint16x8_t a0, a1;
716   uint16x8_t diff, a32;
717   uint16x8_t a16;
718   uint8x8_t a_mbase_x;
719 
720   a16 = vdupq_n_u16(16);
721   a_mbase_x = vdup_n_u8(above[max_base_x]);
722   uint16x8_t v_32 = vdupq_n_u16(32);
723   int16x8_t v_upsample_above = vdupq_n_s16(upsample_above);
724   uint16x8_t c3f = vdupq_n_u16(0x3f);
725 
726   int x = dx;
727   for (int r = 0; r < W; r++) {
728     uint16x8_t res;
729     uint16x8_t shift;
730     uint8x8x2_t v_tmp_a0_128;
731 
732     int base = x >> frac_bits;
733     int base_max_diff = (max_base_x - base) >> upsample_above;
734     if (base_max_diff <= 0) {
735       for (int i = r; i < W; ++i) {
736         dst[i] = a_mbase_x;  // save 4 values
737       }
738       return;
739     }
740 
741     if (base_max_diff > H) base_max_diff = H;
742 
743     if (upsample_above) {
744       v_tmp_a0_128 = vld2_u8(above + base);
745       shift = vshrq_n_u16(
746           vandq_u16(vshlq_u16(vdupq_n_u16(x), v_upsample_above), c3f), 1);
747     } else {
748       v_tmp_a0_128.val[0] = vld1_u8(above + base);
749       v_tmp_a0_128.val[1] = vld1_u8(above + base + 1);
750       shift = vshrq_n_u16(vandq_u16(vdupq_n_u16(x), c3f), 1);
751     }
752     a0 = vmovl_u8(v_tmp_a0_128.val[0]);
753     a1 = vmovl_u8(v_tmp_a0_128.val[1]);
754     diff = vsubq_u16(a1, a0);        // a[x+1] - a[x]
755     a32 = vmlaq_u16(a16, a0, v_32);  // a[x] * 32 + 16
756     res = vmlaq_u16(a32, diff, shift);
757 
758     uint8x8_t mask = vld1_u8(BaseMask[base_max_diff]);
759     dst[r] =
760         vorr_u8(vand_u8(mask, vshrn_n_u16(res, 5)), vbic_u8(a_mbase_x, mask));
761 
762     x += dx;
763   }
764 }
765 
static void dr_prediction_z1_4xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above, int upsample_above,
                                      int dx) {
769   uint8x8_t dstvec[16];
770 
771   dr_prediction_z1_HxW_internal_neon_64(4, N, dstvec, above, upsample_above,
772                                         dx);
773   for (int i = 0; i < N; i++) {
774     vst1_lane_u32((uint32_t *)(dst + stride * i),
775                   vreinterpret_u32_u8(dstvec[i]), 0);
776   }
777 }
778 
static void dr_prediction_z1_8xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above, int upsample_above,
                                      int dx) {
782   uint8x8_t dstvec[32];
783 
784   dr_prediction_z1_HxW_internal_neon_64(8, N, dstvec, above, upsample_above,
785                                         dx);
786   for (int i = 0; i < N; i++) {
787     vst1_u8(dst + stride * i, dstvec[i]);
788   }
789 }
790 
static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_neon(
    int H, int W, uint8x16_t *dst, const uint8_t *above, int upsample_above,
    int dx) {
794   const int frac_bits = 6 - upsample_above;
795   const int max_base_x = ((W + H) - 1) << upsample_above;
796 
797   assert(dx > 0);
798   // pre-filter above pixels
799   // store in temp buffers:
800   //   above[x] * 32 + 16
801   //   above[x+1] - above[x]
802   // final pixels will be calculated as:
803   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
804 
805   uint8x16x2_t a0, a1;
806   uint16x8x2_t diff, a32;
807   uint16x8_t a16, c3f;
808   uint8x16_t a_mbase_x;
809 
810   a16 = vdupq_n_u16(16);
811   a_mbase_x = vdupq_n_u8(above[max_base_x]);
812   c3f = vdupq_n_u16(0x3f);
813   uint16x8_t v_32 = vdupq_n_u16(32);
814   uint8x16_t v_zero = vdupq_n_u8(0);
815   int16x8_t v_upsample_above = vdupq_n_s16(upsample_above);
816 
817   int x = dx;
818   for (int r = 0; r < W; r++) {
819     uint16x8x2_t res;
820     uint16x8_t shift;
821     uint8x16_t a0_128, a1_128;
822 
823     int base = x >> frac_bits;
824     int base_max_diff = (max_base_x - base) >> upsample_above;
825     if (base_max_diff <= 0) {
826       for (int i = r; i < W; ++i) {
827         dst[i] = a_mbase_x;  // save 4 values
828       }
829       return;
830     }
831 
832     if (base_max_diff > H) base_max_diff = H;
833 
834     if (upsample_above) {
835       uint8x8x2_t v_tmp_a0_128 = vld2_u8(above + base);
836       a0_128 = vcombine_u8(v_tmp_a0_128.val[0], v_tmp_a0_128.val[1]);
837       a1_128 = vextq_u8(a0_128, v_zero, 8);
838       shift = vshrq_n_u16(
839           vandq_u16(vshlq_u16(vdupq_n_u16(x), v_upsample_above), c3f), 1);
840     } else {
841       a0_128 = vld1q_u8(above + base);
842       a1_128 = vld1q_u8(above + base + 1);
843       shift = vshrq_n_u16(vandq_u16(vdupq_n_u16(x), c3f), 1);
844     }
845     a0 = vzipq_u8(a0_128, v_zero);
846     a1 = vzipq_u8(a1_128, v_zero);
847     diff.val[0] = vsubq_u16(vreinterpretq_u16_u8(a1.val[0]),
848                             vreinterpretq_u16_u8(a0.val[0]));  // a[x+1] - a[x]
849     diff.val[1] = vsubq_u16(vreinterpretq_u16_u8(a1.val[1]),
850                             vreinterpretq_u16_u8(a0.val[1]));  // a[x+1] - a[x]
851     a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0.val[0]),
852                            v_32);  // a[x] * 32 + 16
853     a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0.val[1]),
854                            v_32);  // a[x] * 32 + 16
855     res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift);
856     res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift);
857     uint8x16_t v_temp =
858         vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5));
859 
860     uint8x16_t mask = vld1q_u8(BaseMask[base_max_diff]);
861     dst[r] = vorrq_u8(vandq_u8(mask, v_temp), vbicq_u8(a_mbase_x, mask));
862 
863     x += dx;
864   }
865 }
866 
static void dr_prediction_z1_16xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above, int upsample_above,
                                       int dx) {
870   uint8x16_t dstvec[64];
871 
872   dr_prediction_z1_HxW_internal_neon(16, N, dstvec, above, upsample_above, dx);
873   for (int i = 0; i < N; i++) {
874     vst1q_u8(dst + stride * i, dstvec[i]);
875   }
876 }
877 
static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_neon(
    int N, uint8x16x2_t *dstvec, const uint8_t *above, int upsample_above,
    int dx) {
881   // here upsample_above is 0 by design of av1_use_intra_edge_upsample
882   (void)upsample_above;
883   const int frac_bits = 6;
884   const int max_base_x = ((32 + N) - 1);
885 
886   // pre-filter above pixels
887   // store in temp buffers:
888   //   above[x] * 32 + 16
889   //   above[x+1] - above[x]
890   // final pixels will be calculated as:
891   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
892 
893   uint8x16_t a_mbase_x;
894   uint8x16x2_t a0, a1;
895   uint16x8x2_t diff, a32;
896   uint16x8_t a16, c3f;
897 
898   a_mbase_x = vdupq_n_u8(above[max_base_x]);
899   a16 = vdupq_n_u16(16);
900   c3f = vdupq_n_u16(0x3f);
901   uint16x8_t v_32 = vdupq_n_u16(32);
902   uint8x16_t v_zero = vdupq_n_u8(0);
903 
904   int x = dx;
905   for (int r = 0; r < N; r++) {
906     uint16x8x2_t res;
907     uint8x16_t res16[2];
908     uint8x16_t a0_128, a1_128;
909 
910     int base = x >> frac_bits;
911     int base_max_diff = (max_base_x - base);
912     if (base_max_diff <= 0) {
913       for (int i = r; i < N; ++i) {
914         dstvec[i].val[0] = a_mbase_x;  // save 32 values
915         dstvec[i].val[1] = a_mbase_x;
916       }
917       return;
918     }
919     if (base_max_diff > 32) base_max_diff = 32;
920 
921     uint16x8_t shift = vshrq_n_u16(vandq_u16(vdupq_n_u16(x), c3f), 1);
922 
923     for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
924       int mdiff = base_max_diff - j;
925       if (mdiff <= 0) {
926         res16[jj] = a_mbase_x;
927       } else {
928         a0_128 = vld1q_u8(above + base + j);
929         a1_128 = vld1q_u8(above + base + j + 1);
930         a0 = vzipq_u8(a0_128, v_zero);
931         a1 = vzipq_u8(a1_128, v_zero);
932         diff.val[0] =
933             vsubq_u16(vreinterpretq_u16_u8(a1.val[0]),
934                       vreinterpretq_u16_u8(a0.val[0]));  // a[x+1] - a[x]
935         diff.val[1] =
936             vsubq_u16(vreinterpretq_u16_u8(a1.val[1]),
937                       vreinterpretq_u16_u8(a0.val[1]));  // a[x+1] - a[x]
938         a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0.val[0]),
939                                v_32);  // a[x] * 32 + 16
940         a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0.val[1]),
941                                v_32);  // a[x] * 32 + 16
942         res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift);
943         res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift);
944 
945         res16[jj] =
946             vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5));
947       }
948     }
949 
950     uint8x16x2_t mask;
951 
952     mask.val[0] = vld1q_u8(BaseMask[base_max_diff]);
953     mask.val[1] = vld1q_u8(BaseMask[base_max_diff] + 16);
954     dstvec[r].val[0] = vorrq_u8(vandq_u8(mask.val[0], res16[0]),
955                                 vbicq_u8(a_mbase_x, mask.val[0]));
956     dstvec[r].val[1] = vorrq_u8(vandq_u8(mask.val[1], res16[1]),
957                                 vbicq_u8(a_mbase_x, mask.val[1]));
958     x += dx;
959   }
960 }
961 
static void dr_prediction_z1_32xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above, int upsample_above,
                                       int dx) {
965   uint8x16x2_t dstvec[64];
966 
967   dr_prediction_z1_32xN_internal_neon(N, dstvec, above, upsample_above, dx);
968   for (int i = 0; i < N; i++) {
969     vst1q_u8(dst + stride * i, dstvec[i].val[0]);
970     vst1q_u8(dst + stride * i + 16, dstvec[i].val[1]);
971   }
972 }
973 
static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above, int upsample_above,
                                       int dx) {
977   // here upsample_above is 0 by design of av1_use_intra_edge_upsample
978   (void)upsample_above;
979   const int frac_bits = 6;
980   const int max_base_x = ((64 + N) - 1);
981 
982   // pre-filter above pixels
983   // store in temp buffers:
984   //   above[x] * 32 + 16
985   //   above[x+1] - above[x]
986   // final pixels will be calculated as:
987   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
988 
989   uint8x16x2_t a0, a1;
990   uint16x8x2_t a32, diff;
991   uint16x8_t a16, c3f;
992   uint8x16_t a_mbase_x, max_base_x128, mask128;
993 
994   a16 = vdupq_n_u16(16);
995   a_mbase_x = vdupq_n_u8(above[max_base_x]);
996   max_base_x128 = vdupq_n_u8(max_base_x);
997   c3f = vdupq_n_u16(0x3f);
998   uint16x8_t v_32 = vdupq_n_u16(32);
999   uint8x16_t v_zero = vdupq_n_u8(0);
1000   uint8x16_t step = vdupq_n_u8(16);
1001 
1002   int x = dx;
1003   for (int r = 0; r < N; r++, dst += stride) {
1004     uint16x8x2_t res;
1005 
1006     int base = x >> frac_bits;
1007     if (base >= max_base_x) {
1008       for (int i = r; i < N; ++i) {
1009         vst1q_u8(dst, a_mbase_x);
1010         vst1q_u8(dst + 16, a_mbase_x);
1011         vst1q_u8(dst + 32, a_mbase_x);
1012         vst1q_u8(dst + 48, a_mbase_x);
1013         dst += stride;
1014       }
1015       return;
1016     }
1017 
1018     uint16x8_t shift = vshrq_n_u16(vandq_u16(vdupq_n_u16(x), c3f), 1);
1019     uint8x16_t a0_128, a1_128, res128;
1020     uint8x16_t base_inc128 =
1021         vaddq_u8(vdupq_n_u8(base), vcombine_u8(vcreate_u8(0x0706050403020100),
1022                                                vcreate_u8(0x0F0E0D0C0B0A0908)));
1023 
1024     for (int j = 0; j < 64; j += 16) {
1025       int mdif = max_base_x - (base + j);
1026       if (mdif <= 0) {
1027         vst1q_u8(dst + j, a_mbase_x);
1028       } else {
1029         a0_128 = vld1q_u8(above + base + j);
1030         a1_128 = vld1q_u8(above + base + 1 + j);
1031         a0 = vzipq_u8(a0_128, v_zero);
1032         a1 = vzipq_u8(a1_128, v_zero);
1033         diff.val[0] =
1034             vsubq_u16(vreinterpretq_u16_u8(a1.val[0]),
1035                       vreinterpretq_u16_u8(a0.val[0]));  // a[x+1] - a[x]
1036         diff.val[1] =
1037             vsubq_u16(vreinterpretq_u16_u8(a1.val[1]),
1038                       vreinterpretq_u16_u8(a0.val[1]));  // a[x+1] - a[x]
1039         a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0.val[0]),
1040                                v_32);  // a[x] * 32 + 16
1041         a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0.val[1]),
1042                                v_32);  // a[x] * 32 + 16
1043         res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift);
1044         res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift);
1045         uint8x16_t v_temp =
1046             vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5));
1047 
1048         mask128 = vcgtq_u8(vqsubq_u8(max_base_x128, base_inc128), v_zero);
1049         res128 =
1050             vorrq_u8(vandq_u8(mask128, v_temp), vbicq_u8(a_mbase_x, mask128));
1051         vst1q_u8(dst + j, res128);
1052 
1053         base_inc128 = vaddq_u8(base_inc128, step);
1054       }
1055     }
1056     x += dx;
1057   }
1058 }
1059 
1060 // Directional prediction, zone 1: 0 < angle < 90
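// dx is the horizontal step per row in 1/64th-pel units; row r samples the
// above[] array around base = ((r + 1) * dx) >> 6. For example (values
// illustrative), a 45-degree prediction angle corresponds to dx == 64, so
// every row advances the base position by exactly one pixel:
//   av1_dr_prediction_z1_neon(dst, stride, 8, 8, above, left,
//                             /*upsample_above=*/0, /*dx=*/64, /*dy=*/0);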
void av1_dr_prediction_z1_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                               const uint8_t *above, const uint8_t *left,
                               int upsample_above, int dx, int dy) {
1064   (void)left;
1065   (void)dy;
1066 
1067   switch (bw) {
1068     case 4:
1069       dr_prediction_z1_4xN_neon(bh, dst, stride, above, upsample_above, dx);
1070       break;
1071     case 8:
1072       dr_prediction_z1_8xN_neon(bh, dst, stride, above, upsample_above, dx);
1073       break;
1074     case 16:
1075       dr_prediction_z1_16xN_neon(bh, dst, stride, above, upsample_above, dx);
1076       break;
1077     case 32:
1078       dr_prediction_z1_32xN_neon(bh, dst, stride, above, upsample_above, dx);
1079       break;
1080     case 64:
1081       dr_prediction_z1_64xN_neon(bh, dst, stride, above, upsample_above, dx);
1082       break;
1083     default: break;
1084   }
1085   return;
1086 }
1087 
1088 /* ---------------------P R E D I C T I O N   Z 2--------------------------- */
1089 
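// LoadMaskz2[i] has its first 4 * (i + 1) bytes set to 0xff and the rest set
// to 0: a 4-byte-granular counterpart of BaseMask used by the zone 2
// predictors to mask partially valid 16-byte loads.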
1090 static DECLARE_ALIGNED(16, uint8_t, LoadMaskz2[4][16]) = {
1091   { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1092   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
1093   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
1094     0, 0, 0 },
1095   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1096     0xff, 0xff, 0xff, 0xff }
1097 };
1098 
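// Shifts an 8-byte vector towards its high lanes by shift_value bytes,
// filling the low lanes with zeros (vext against a zero vector). This aligns
// the above[] samples when base_shift > 0.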
static AOM_FORCE_INLINE void vector_shift_x4(uint8x8_t *vec, uint8x8_t *v_zero,
                                             int shift_value) {
1101   switch (shift_value) {
1102     case 1: *vec = vext_u8(*v_zero, *vec, 7); break;
1103     case 2: *vec = vext_u8(*v_zero, *vec, 6); break;
1104     case 3: *vec = vext_u8(*v_zero, *vec, 5); break;
1105     default: break;
1106   }
1107 }
1108 
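// Zone 2 (90 < angle < 180): each output row mixes pixels interpolated from
// the above row (resx) with pixels interpolated from the left column (resy).
// base_min_diff counts how many leading pixels must come from the left
// column, and BaseMask[base_min_diff] selects resy for exactly those lanes.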
static void dr_prediction_z2_Nx4_neon(int N, uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above, const uint8_t *left,
                                      int upsample_above, int upsample_left,
                                      int dx, int dy) {
1113   const int min_base_x = -(1 << upsample_above);
1114   const int min_base_y = -(1 << upsample_left);
1115   const int frac_bits_x = 6 - upsample_above;
1116   const int frac_bits_y = 6 - upsample_left;
1117 
1118   assert(dx > 0);
1119   // pre-filter above pixels
1120   // store in temp buffers:
1121   //   above[x] * 32 + 16
1122   //   above[x+1] - above[x]
1123   // final pixels will be calculated as:
1124   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1125   uint16x8_t a0_x, a1_x, a32, diff;
1126   uint16x8_t v_32 = vdupq_n_u16(32);
1127   uint16x8_t v_zero = vdupq_n_u16(0);
1128   uint16x8_t a16 = vdupq_n_u16(16);
1129 
1130   uint8x8_t v_zero_u8 = vdup_n_u8(0);
1131   uint16x4_t v_c3f = vdup_n_u16(0x3f);
1132   uint16x4_t r6 = vcreate_u16(0x00C0008000400000);
1133   int16x4_t v_upsample_left = vdup_n_s16(upsample_left);
1134   int16x4_t v_upsample_above = vdup_n_s16(upsample_above);
1135   int16x4_t v_1234 = vcreate_s16(0x0004000300020001);
1136   int16x4_t dy64 = vdup_n_s16(dy);
1137   int16x4_t v_frac_bits_y = vdup_n_s16(-frac_bits_y);
1138   int16x4_t min_base_y64 = vdup_n_s16(min_base_y);
1139   int16x4_t v_one = vdup_lane_s16(v_1234, 0);
1140 
1141   for (int r = 0; r < N; r++) {
1142     uint16x8_t res, shift;
1143     uint16x4_t ydx;
1144     uint8x8_t resx, resy;
1145     uint16x4x2_t v_shift;
1146 
1147     int y = r + 1;
1148     int base_x = (-y * dx) >> frac_bits_x;
1149     int base_shift = 0;
1150     if (base_x < (min_base_x - 1)) {
1151       base_shift = (min_base_x - base_x - 1) >> upsample_above;
1152     }
1153     int base_min_diff =
1154         (min_base_x - base_x + upsample_above) >> upsample_above;
1155     if (base_min_diff > 4) {
1156       base_min_diff = 4;
1157     } else {
1158       if (base_min_diff < 0) base_min_diff = 0;
1159     }
1160 
1161     if (base_shift > 3) {
1162       a0_x = v_zero;
1163       a1_x = v_zero;
1164       v_shift.val[0] = vreinterpret_u16_u8(v_zero_u8);
1165       v_shift.val[1] = vreinterpret_u16_u8(v_zero_u8);
1166     } else {
1167       ydx = vdup_n_u16(y * dx);
1168 
1169       if (upsample_above) {
1170         uint8x8x2_t v_tmp;
1171         v_tmp.val[0] = vld1_u8(above + base_x + base_shift);
1172         v_tmp.val[1] = vld1_u8(above + base_x + base_shift + 8);
1173         uint8x8_t v_index_low = vld1_u8(EvenOddMaskx[base_shift]);
1174         uint8x8_t v_index_high = vld1_u8(EvenOddMaskx[base_shift] + 8);
1175         a0_x = vmovl_u8(vtbl2_u8(v_tmp, v_index_low));
1176         a1_x = vmovl_u8(vtbl2_u8(v_tmp, v_index_high));
1177         v_shift.val[0] = vshr_n_u16(
1178             vand_u16(vshl_u16(vsub_u16(r6, ydx), v_upsample_above), v_c3f), 1);
1179       } else {
1180         uint8x8_t v_a0_x64 = vld1_u8(above + base_x + base_shift);
1181         vector_shift_x4(&v_a0_x64, &v_zero_u8, base_shift);
1182         uint8x8_t v_a1_x64 = vext_u8(v_a0_x64, v_zero_u8, 1);
1183         v_shift.val[0] = vshr_n_u16(vand_u16(vsub_u16(r6, ydx), v_c3f), 1);
1184         a0_x = vmovl_u8(v_a0_x64);
1185         a1_x = vmovl_u8(v_a1_x64);
1186       }
1187     }
1188 
1189     // y calc
1190     uint8x8_t a0_y, a1_y;
1191     if (base_x < min_base_x) {
1192       DECLARE_ALIGNED(32, int16_t, base_y_c[4]);
1193       int16x4_t v_r6 = vdup_n_s16(r << 6);
1194       int16x4_t y_c64 = vmls_s16(v_r6, v_1234, dy64);
1195       int16x4_t base_y_c64 = vshl_s16(y_c64, v_frac_bits_y);
1196       uint16x4_t mask64 = vcgt_s16(min_base_y64, base_y_c64);
1197 
1198       base_y_c64 = vbic_s16(base_y_c64, vreinterpret_s16_u16(mask64));
1199       vst1_s16(base_y_c, base_y_c64);
1200       a0_y = v_zero_u8;
1201       a0_y = vld1_lane_u8(left + base_y_c[0], a0_y, 0);
1202       a0_y = vld1_lane_u8(left + base_y_c[1], a0_y, 2);
1203       a0_y = vld1_lane_u8(left + base_y_c[2], a0_y, 4);
1204       a0_y = vld1_lane_u8(left + base_y_c[3], a0_y, 6);
1205 
1206       base_y_c64 = vadd_s16(base_y_c64, v_one);
1207       vst1_s16(base_y_c, base_y_c64);
1208       a1_y = v_zero_u8;
1209       a1_y = vld1_lane_u8(left + base_y_c[0], a1_y, 0);
1210       a1_y = vld1_lane_u8(left + base_y_c[1], a1_y, 2);
1211       a1_y = vld1_lane_u8(left + base_y_c[2], a1_y, 4);
1212       a1_y = vld1_lane_u8(left + base_y_c[3], a1_y, 6);
1213 
1214       if (upsample_left) {
1215         v_shift.val[1] = vshr_n_u16(
1216             vand_u16(vshl_u16(vreinterpret_u16_s16(y_c64), v_upsample_left),
1217                      v_c3f),
1218             1);
1219       } else {
1220         v_shift.val[1] =
1221             vshr_n_u16(vand_u16(vreinterpret_u16_s16(y_c64), v_c3f), 1);
1222       }
1223 
1224       a0_x = vcombine_u16(vget_low_u16(a0_x), vreinterpret_u16_u8(a0_y));
1225       a1_x = vcombine_u16(vget_low_u16(a1_x), vreinterpret_u16_u8(a1_y));
1226     }
1227     shift = vcombine_u16(v_shift.val[0], v_shift.val[1]);
1228     diff = vsubq_u16(a1_x, a0_x);      // a[x+1] - a[x]
1229     a32 = vmlaq_u16(a16, a0_x, v_32);  // a[x] * 32 + 16
1230     res = vmlaq_u16(a32, diff, shift);
1231     resx = vshrn_n_u16(res, 5);
1232     resy = vext_u8(resx, v_zero_u8, 4);
1233 
1234     uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]);
1235     uint8x8_t v_resxy = vorr_u8(vand_u8(mask, resy), vbic_u8(resx, mask));
1236     vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(v_resxy), 0);
1237 
1238     dst += stride;
1239   }
1240 }
1241 
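// 16-byte variant of vector_shift_x4: moves the vector towards its high lanes
// by shift_value bytes, filling the low lanes with zeros.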
static AOM_FORCE_INLINE void vector_shuffle(uint8x16_t *vec, uint8x16_t *vzero,
                                            int shift_value) {
1244   switch (shift_value) {
1245     case 1: *vec = vextq_u8(*vzero, *vec, 15); break;
1246     case 2: *vec = vextq_u8(*vzero, *vec, 14); break;
1247     case 3: *vec = vextq_u8(*vzero, *vec, 13); break;
1248     case 4: *vec = vextq_u8(*vzero, *vec, 12); break;
1249     case 5: *vec = vextq_u8(*vzero, *vec, 11); break;
1250     case 6: *vec = vextq_u8(*vzero, *vec, 10); break;
1251     case 7: *vec = vextq_u8(*vzero, *vec, 9); break;
1252     case 8: *vec = vextq_u8(*vzero, *vec, 8); break;
1253     case 9: *vec = vextq_u8(*vzero, *vec, 7); break;
1254     case 10: *vec = vextq_u8(*vzero, *vec, 6); break;
1255     case 11: *vec = vextq_u8(*vzero, *vec, 5); break;
1256     case 12: *vec = vextq_u8(*vzero, *vec, 4); break;
1257     case 13: *vec = vextq_u8(*vzero, *vec, 3); break;
1258     case 14: *vec = vextq_u8(*vzero, *vec, 2); break;
1259     case 15: *vec = vextq_u8(*vzero, *vec, 1); break;
1260     default: break;
1261   }
1262 }
1263 
static void dr_prediction_z2_Nx8_neon(int N, uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above, const uint8_t *left,
                                      int upsample_above, int upsample_left,
                                      int dx, int dy) {
1268   const int min_base_x = -(1 << upsample_above);
1269   const int min_base_y = -(1 << upsample_left);
1270   const int frac_bits_x = 6 - upsample_above;
1271   const int frac_bits_y = 6 - upsample_left;
1272 
1273   // pre-filter above pixels
1274   // store in temp buffers:
1275   //   above[x] * 32 + 16
1276   //   above[x+1] - above[x]
1277   // final pixels will be calculated as:
1278   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
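  // As a scalar sketch (illustrative only, not compiled), each output pixel is
  // the two-tap interpolation
  //   val = a[x] * (32 - shift) + a[x + 1] * shift + 16;
  //   out = (uint8_t)(val >> 5);
  // with shift in [0, 32), which is algebraically identical to the
  // a[x] * 32 + 16 + (a[x + 1] - a[x]) * shift form used below.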
1279   uint8x16x2_t a0_x, a1_x;
1280   uint16x8x2_t diff, a32;
1281   uint16x8_t c1234, a16, c3f;
1282   uint8x16_t a0_x128, a1_x128;
1283   int16x8_t min_base_y128, dy128;
1284   uint16x8_t v_32 = vdupq_n_u16(32);
1285   uint8x16_t v_zero = vdupq_n_u8(0);
1286   int16x8_t v_upsample_left = vdupq_n_s16(upsample_left);
1287   int16x8_t v_upsample_above = vdupq_n_s16(upsample_above);
1288   int16x8_t v_frac_bits_y = vdupq_n_s16(-frac_bits_y);
1289 
1290   a16 = vdupq_n_u16(16);
1291   c3f = vdupq_n_u16(0x3f);
1292   min_base_y128 = vdupq_n_s16(min_base_y);
1293   dy128 = vdupq_n_s16(dy);
1294   c1234 = vcombine_u16(vcreate_u16(0x0004000300020001),
1295                        vcreate_u16(0x0008000700060005));
1296 
1297   for (int r = 0; r < N; r++) {
1298     uint8x8_t resx, resy, resxy;
1299     uint16x8_t r6, ydx;
1300     uint16x8x2_t res, shift;
1301 
1302     int y = r + 1;
1303     int base_x = (-y * dx) >> frac_bits_x;
1304     int base_shift = 0;
1305     if (base_x < (min_base_x - 1)) {
1306       base_shift = (min_base_x - base_x - 1) >> upsample_above;
1307     }
1308     int base_min_diff =
1309         (min_base_x - base_x + upsample_above) >> upsample_above;
1310     if (base_min_diff > 8) {
1311       base_min_diff = 8;
1312     } else {
1313       if (base_min_diff < 0) base_min_diff = 0;
1314     }
1315 
1316     if (base_shift > 7) {
1317       a0_x.val[0] = v_zero;
1318       a0_x.val[1] = v_zero;
1319       a1_x.val[0] = v_zero;
1320       a1_x.val[1] = v_zero;
1321       shift.val[0] = vreinterpretq_u16_u8(v_zero);
1322       shift.val[1] = vreinterpretq_u16_u8(v_zero);
1323     } else {
1324       ydx = vdupq_n_u16(y * dx);
1325       r6 = vshlq_n_u16(vextq_u16(c1234, vreinterpretq_u16_u8(v_zero), 2), 6);
1326 
1327       if (upsample_above) {
1328         uint8x8x2_t v_tmp;
1329         v_tmp.val[0] = vld1_u8(above + base_x + base_shift);
1330         v_tmp.val[1] = vld1_u8(above + base_x + base_shift + 8);
1331         uint8x8_t v_index_low = vld1_u8(EvenOddMaskx[base_shift]);
1332         uint8x8_t v_index_high = vld1_u8(EvenOddMaskx[base_shift] + 8);
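        // Assumption based on the surrounding code: EvenOddMaskx[base_shift]
        // holds byte indices that de-interleave the upsampled 'above' samples
        // into even positions (for a0_x) and odd positions (for a1_x), with
        // the base_shift offset folded into the indices.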
1333         shift.val[0] = vshrq_n_u16(
1334             vandq_u16(vshlq_u16(vsubq_u16(r6, ydx), v_upsample_above), c3f), 1);
1335         a0_x.val[0] =
1336             vreinterpretq_u8_u16(vmovl_u8(vtbl2_u8(v_tmp, v_index_low)));
1337         a1_x.val[0] =
1338             vreinterpretq_u8_u16(vmovl_u8(vtbl2_u8(v_tmp, v_index_high)));
1339       } else {
1340         a0_x128 = vld1q_u8(above + base_x + base_shift);
1341         a1_x128 = vextq_u8(a0_x128, v_zero, 1);
1342         vector_shuffle(&a0_x128, &v_zero, base_shift);
1343         vector_shuffle(&a1_x128, &v_zero, base_shift);
1344         shift.val[0] = vshrq_n_u16(vandq_u16(vsubq_u16(r6, ydx), c3f), 1);
1345         a0_x.val[0] = vreinterpretq_u8_u16(vmovl_u8(vget_low_u8(a0_x128)));
1346         a1_x.val[0] = vreinterpretq_u8_u16(vmovl_u8(vget_low_u8(a1_x128)));
1347       }
1348     }
1349 
1350     // y calc
1351     if (base_x < min_base_x) {
1352       DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
1353       int16x8_t y_c128, base_y_c128;
1354       uint16x8_t mask128;
1355       int16x8_t v_r6 = vdupq_n_s16(r << 6);
1356 
1357       y_c128 = vmlsq_s16(v_r6, vreinterpretq_s16_u16(c1234), dy128);
1358       base_y_c128 = vshlq_s16(y_c128, v_frac_bits_y);
1359       mask128 = vcgtq_s16(min_base_y128, base_y_c128);
1360 
1361       base_y_c128 = vbicq_s16(base_y_c128, vreinterpretq_s16_u16(mask128));
1362       vst1q_s16(base_y_c, base_y_c128);
1363       a0_x.val[1] = v_zero;
1364       a0_x.val[1] = vld1q_lane_u8(left + base_y_c[0], a0_x.val[1], 0);
1365       a0_x.val[1] = vld1q_lane_u8(left + base_y_c[1], a0_x.val[1], 2);
1366       a0_x.val[1] = vld1q_lane_u8(left + base_y_c[2], a0_x.val[1], 4);
1367       a0_x.val[1] = vld1q_lane_u8(left + base_y_c[3], a0_x.val[1], 6);
1368       a0_x.val[1] = vld1q_lane_u8(left + base_y_c[4], a0_x.val[1], 8);
1369       a0_x.val[1] = vld1q_lane_u8(left + base_y_c[5], a0_x.val[1], 10);
1370       a0_x.val[1] = vld1q_lane_u8(left + base_y_c[6], a0_x.val[1], 12);
1371       a0_x.val[1] = vld1q_lane_u8(left + base_y_c[7], a0_x.val[1], 14);
1372 
1373       base_y_c128 =
1374           vaddq_s16(base_y_c128, vreinterpretq_s16_u16(vshrq_n_u16(a16, 4)));
1375       vst1q_s16(base_y_c, base_y_c128);
1376       a1_x.val[1] = v_zero;
1377       a1_x.val[1] = vld1q_lane_u8(left + base_y_c[0], a1_x.val[1], 0);
1378       a1_x.val[1] = vld1q_lane_u8(left + base_y_c[1], a1_x.val[1], 2);
1379       a1_x.val[1] = vld1q_lane_u8(left + base_y_c[2], a1_x.val[1], 4);
1380       a1_x.val[1] = vld1q_lane_u8(left + base_y_c[3], a1_x.val[1], 6);
1381       a1_x.val[1] = vld1q_lane_u8(left + base_y_c[4], a1_x.val[1], 8);
1382       a1_x.val[1] = vld1q_lane_u8(left + base_y_c[5], a1_x.val[1], 10);
1383       a1_x.val[1] = vld1q_lane_u8(left + base_y_c[6], a1_x.val[1], 12);
1384       a1_x.val[1] = vld1q_lane_u8(left + base_y_c[7], a1_x.val[1], 14);
1385 
1386       if (upsample_left) {
1387         shift.val[1] = vshrq_n_u16(
1388             vandq_u16(vshlq_u16(vreinterpretq_u16_s16(y_c128), v_upsample_left),
1389                       c3f),
1390             1);
1391       } else {
1392         shift.val[1] =
1393             vshrq_n_u16(vandq_u16(vreinterpretq_u16_s16(y_c128), c3f), 1);
1394       }
1395     }
1396     diff.val[0] =
1397         vsubq_u16(vreinterpretq_u16_u8(a1_x.val[0]),
1398                   vreinterpretq_u16_u8(a0_x.val[0]));  // a[x+1] - a[x]
1399     diff.val[1] =
1400         vsubq_u16(vreinterpretq_u16_u8(a1_x.val[1]),
1401                   vreinterpretq_u16_u8(a0_x.val[1]));  // a[x+1] - a[x]
1402     a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_x.val[0]),
1403                            v_32);  // a[x] * 32 + 16
1404     a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_x.val[1]),
1405                            v_32);  // a[x] * 32 + 16
1406     res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift.val[0]);
1407     res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift.val[1]);
1408     resx = vshrn_n_u16(res.val[0], 5);
1409     resy = vshrn_n_u16(res.val[1], 5);
1410 
1411     uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]);
1412 
1413     resxy = vorr_u8(vand_u8(mask, resy), vbic_u8(resx, mask));
1414     vst1_u8(dst, resxy);
1415     dst += stride;
1416   }
1417 }
1418 
1419 static void dr_prediction_z2_HxW_neon(int H, int W, uint8_t *dst,
1420                                       ptrdiff_t stride, const uint8_t *above,
1421                                       const uint8_t *left, int upsample_above,
1422                                       int upsample_left, int dx, int dy) {
1423   // Here, upsample_above and upsample_left are always 0 by design of
1424   // av1_use_intra_edge_upsample().
1425   const int min_base_x = -1;
1426   const int min_base_y = -1;
1427   (void)upsample_above;
1428   (void)upsample_left;
1429   const int frac_bits_x = 6;
1430   const int frac_bits_y = 6;
1431 
1432   uint16x8_t a16, c1, c3f;
1433   int16x8_t min_base_y256, dy256;
1434   uint16x8x2_t a32, c0123, c1234, diff, shifty;
1435   uint8x16x2_t a0_x, a1_x, a0_y, a1_y;
1436   uint8x16_t a0_x128, a1_x128;
1437   uint16x8_t v_32 = vdupq_n_u16(32);
1438   uint8x16_t v_zero = vdupq_n_u8(0);
1439   int16x8_t v_frac_bits_y = vdupq_n_s16(-frac_bits_y);
1440 
1441   DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
1442 
1443   a16 = vdupq_n_u16(16);
1444   c1 = vshrq_n_u16(a16, 4);
1445   min_base_y256 = vdupq_n_s16(min_base_y);
1446   c3f = vdupq_n_u16(0x3f);
1447   dy256 = vdupq_n_s16(dy);
1448   c0123.val[0] = vcombine_u16(vcreate_u16(0x0003000200010000),
1449                               vcreate_u16(0x0007000600050004));
1450   c0123.val[1] = vcombine_u16(vcreate_u16(0x000B000A00090008),
1451                               vcreate_u16(0x000F000E000D000C));
1452   c1234.val[0] = vaddq_u16(c0123.val[0], c1);
1453   c1234.val[1] = vaddq_u16(c0123.val[1], c1);
1454 
1455   for (int r = 0; r < H; r++) {
1456     uint16x8x2_t res, r6, shift;
1457     uint16x8_t ydx, j256;
1458     uint8x16_t resx, resy, resxy;
1459     int y = r + 1;
1460     ydx = vdupq_n_u16((uint16_t)(y * dx));
1461 
1462     int base_x = (-y * dx) >> frac_bits_x;
1463     for (int j = 0; j < W; j += 16) {
1464       j256 = vdupq_n_u16(j);
1465 
1466       int base_shift = 0;
1467       if ((base_x + j) < (min_base_x - 1)) {
1468         base_shift = (min_base_x - (base_x + j) - 1);
1469       }
1470       int base_min_diff = (min_base_x - base_x - j);
1471       if (base_min_diff > 16) {
1472         base_min_diff = 16;
1473       } else {
1474         if (base_min_diff < 0) base_min_diff = 0;
1475       }
1476 
1477       if (base_shift < 16) {
1478         a0_x128 = vld1q_u8(above + base_x + base_shift + j);
1479         a1_x128 = vld1q_u8(above + base_x + base_shift + 1 + j);
1480         vector_shuffle(&a0_x128, &v_zero, base_shift);
1481         vector_shuffle(&a1_x128, &v_zero, base_shift);
1482         a0_x = vzipq_u8(a0_x128, v_zero);
1483         a1_x = vzipq_u8(a1_x128, v_zero);
1484         r6.val[0] = vshlq_n_u16(vaddq_u16(c0123.val[0], j256), 6);
1485         r6.val[1] = vshlq_n_u16(vaddq_u16(c0123.val[1], j256), 6);
1486         shift.val[0] =
1487             vshrq_n_u16(vandq_u16(vsubq_u16(r6.val[0], ydx), c3f), 1);
1488         shift.val[1] =
1489             vshrq_n_u16(vandq_u16(vsubq_u16(r6.val[1], ydx), c3f), 1);
1490         diff.val[0] =
1491             vsubq_u16(vreinterpretq_u16_u8(a1_x.val[0]),
1492                       vreinterpretq_u16_u8(a0_x.val[0]));  // a[x+1] - a[x]
1493         diff.val[1] =
1494             vsubq_u16(vreinterpretq_u16_u8(a1_x.val[1]),
1495                       vreinterpretq_u16_u8(a0_x.val[1]));  // a[x+1] - a[x]
1496         a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_x.val[0]),
1497                                v_32);  // a[x] * 32 + 16
1498         a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_x.val[1]),
1499                                v_32);  // a[x] * 32 + 16
1500         res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift.val[0]);
1501         res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift.val[1]);
1502         resx =
1503             vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5));
1504       } else {
1505         resx = v_zero;
1506       }
1507 
1508       // y calc
1509       if (base_x < min_base_x) {
1510         uint16x8x2_t mask256;
1511         int16x8x2_t c256, y_c256, base_y_c256, mul16;
1512         int16x8_t v_r6 = vdupq_n_s16(r << 6);
1513 
1514         c256.val[0] = vaddq_s16(vreinterpretq_s16_u16(j256),
1515                                 vreinterpretq_s16_u16(c1234.val[0]));
1516         c256.val[1] = vaddq_s16(vreinterpretq_s16_u16(j256),
1517                                 vreinterpretq_s16_u16(c1234.val[1]));
1518         mul16.val[0] = vminq_s16(vmulq_s16(c256.val[0], dy256),
1519                                  vreinterpretq_s16_u16(vshrq_n_u16(
1520                                      vreinterpretq_u16_s16(min_base_y256), 1)));
1521         mul16.val[1] = vminq_s16(vmulq_s16(c256.val[1], dy256),
1522                                  vreinterpretq_s16_u16(vshrq_n_u16(
1523                                      vreinterpretq_u16_s16(min_base_y256), 1)));
1524         y_c256.val[0] = vsubq_s16(v_r6, mul16.val[0]);
1525         y_c256.val[1] = vsubq_s16(v_r6, mul16.val[1]);
1526 
1527         base_y_c256.val[0] = vshlq_s16(y_c256.val[0], v_frac_bits_y);
1528         base_y_c256.val[1] = vshlq_s16(y_c256.val[1], v_frac_bits_y);
1529         mask256.val[0] = vcgtq_s16(min_base_y256, base_y_c256.val[0]);
1530         mask256.val[1] = vcgtq_s16(min_base_y256, base_y_c256.val[1]);
1531 
1532         base_y_c256.val[0] = vorrq_s16(
1533             vandq_s16(vreinterpretq_s16_u16(mask256.val[0]), min_base_y256),
1534             vbicq_s16(base_y_c256.val[0],
1535                       vreinterpretq_s16_u16(mask256.val[0])));
1536         base_y_c256.val[1] = vorrq_s16(
1537             vandq_s16(vreinterpretq_s16_u16(mask256.val[1]), min_base_y256),
1538             vbicq_s16(base_y_c256.val[1],
1539                       vreinterpretq_s16_u16(mask256.val[1])));
1540 
1541         int16_t min_y = vgetq_lane_s16(base_y_c256.val[1], 7);
1542         int16_t max_y = vgetq_lane_s16(base_y_c256.val[0], 0);
1543         int16_t offset_diff = max_y - min_y;
1544 
1545         if (offset_diff < 16) {
1546           int16x8_t min_y256 =
1547               vdupq_lane_s16(vget_high_s16(base_y_c256.val[1]), 3);
1548 
1549           int16x8x2_t base_y_offset;
1550           base_y_offset.val[0] = vsubq_s16(base_y_c256.val[0], min_y256);
1551           base_y_offset.val[1] = vsubq_s16(base_y_c256.val[1], min_y256);
1552 
1553           int8x16_t base_y_offset128 =
1554               vcombine_s8(vqmovn_s16(base_y_offset.val[0]),
1555                           vqmovn_s16(base_y_offset.val[1]));
1556 
1557           uint8x16_t a0_y128, a1_y128;
1558           uint8x16_t v_loadmaskz2 = vld1q_u8(LoadMaskz2[offset_diff / 4]);
1559           a0_y128 = vld1q_u8(left + min_y);
1560           a0_y128 = vandq_u8(a0_y128, v_loadmaskz2);
1561           a1_y128 = vld1q_u8(left + min_y + 1);
1562           a1_y128 = vandq_u8(a1_y128, v_loadmaskz2);
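          // Gather left[base_y] for all 16 lanes with a byte table lookup:
          // AArch64 can index a full 16-byte table with vqtbl1q_u8, while the
          // ARMv7 path below splits the table into two 8-byte halves and uses
          // vtbl2_u8 twice per vector.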
1563 #if defined(__aarch64__)
1564           a0_y128 = vqtbl1q_u8(a0_y128, vreinterpretq_u8_s8(base_y_offset128));
1565           a1_y128 = vqtbl1q_u8(a1_y128, vreinterpretq_u8_s8(base_y_offset128));
1566 #else
1567           uint8x8x2_t v_tmp;
1568           uint8x8x2_t v_res;
1569           uint8x8_t v_index_low =
1570               vget_low_u8(vreinterpretq_u8_s8(base_y_offset128));
1571           uint8x8_t v_index_high =
1572               vget_high_u8(vreinterpretq_u8_s8(base_y_offset128));
1573           v_tmp.val[0] = vget_low_u8(a0_y128);
1574           v_tmp.val[1] = vget_high_u8(a0_y128);
1575           v_res.val[0] = vtbl2_u8(v_tmp, v_index_low);
1576           v_res.val[1] = vtbl2_u8(v_tmp, v_index_high);
1577           a0_y128 = vcombine_u8(v_res.val[0], v_res.val[1]);
1578           v_tmp.val[0] = vget_low_u8(a1_y128);
1579           v_tmp.val[1] = vget_high_u8(a1_y128);
1580           v_res.val[0] = vtbl2_u8(v_tmp, v_index_low);
1581           v_res.val[1] = vtbl2_u8(v_tmp, v_index_high);
1582           a1_y128 = vcombine_u8(v_res.val[0], v_res.val[1]);
1583 #endif
1584           a0_y = vzipq_u8(a0_y128, v_zero);
1585           a1_y = vzipq_u8(a1_y128, v_zero);
1586         } else {
1587           base_y_c256.val[0] = vbicq_s16(base_y_c256.val[0],
1588                                          vreinterpretq_s16_u16(mask256.val[0]));
1589           base_y_c256.val[1] = vbicq_s16(base_y_c256.val[1],
1590                                          vreinterpretq_s16_u16(mask256.val[1]));
1591           vst1q_s16(base_y_c, base_y_c256.val[0]);
1592           vst1q_s16(base_y_c + 8, base_y_c256.val[1]);
1593           a0_y.val[0] = v_zero;
1594           a0_y.val[1] = v_zero;
1595           a0_y.val[0] = vld1q_lane_u8(left + base_y_c[0], a0_y.val[0], 0);
1596           a0_y.val[0] = vld1q_lane_u8(left + base_y_c[1], a0_y.val[0], 2);
1597           a0_y.val[0] = vld1q_lane_u8(left + base_y_c[2], a0_y.val[0], 4);
1598           a0_y.val[0] = vld1q_lane_u8(left + base_y_c[3], a0_y.val[0], 6);
1599           a0_y.val[0] = vld1q_lane_u8(left + base_y_c[4], a0_y.val[0], 8);
1600           a0_y.val[0] = vld1q_lane_u8(left + base_y_c[5], a0_y.val[0], 10);
1601           a0_y.val[0] = vld1q_lane_u8(left + base_y_c[6], a0_y.val[0], 12);
1602           a0_y.val[0] = vld1q_lane_u8(left + base_y_c[7], a0_y.val[0], 14);
1603           a0_y.val[1] = vld1q_lane_u8(left + base_y_c[8], a0_y.val[1], 0);
1604           a0_y.val[1] = vld1q_lane_u8(left + base_y_c[9], a0_y.val[1], 2);
1605           a0_y.val[1] = vld1q_lane_u8(left + base_y_c[10], a0_y.val[1], 4);
1606           a0_y.val[1] = vld1q_lane_u8(left + base_y_c[11], a0_y.val[1], 6);
1607           a0_y.val[1] = vld1q_lane_u8(left + base_y_c[12], a0_y.val[1], 8);
1608           a0_y.val[1] = vld1q_lane_u8(left + base_y_c[13], a0_y.val[1], 10);
1609           a0_y.val[1] = vld1q_lane_u8(left + base_y_c[14], a0_y.val[1], 12);
1610           a0_y.val[1] = vld1q_lane_u8(left + base_y_c[15], a0_y.val[1], 14);
1611 
1612           base_y_c256.val[0] =
1613               vaddq_s16(base_y_c256.val[0], vreinterpretq_s16_u16(c1));
1614           base_y_c256.val[1] =
1615               vaddq_s16(base_y_c256.val[1], vreinterpretq_s16_u16(c1));
1616           vst1q_s16(base_y_c, base_y_c256.val[0]);
1617           vst1q_s16(base_y_c + 8, base_y_c256.val[1]);
1618           a1_y.val[0] = v_zero;
1619           a1_y.val[1] = v_zero;
1620           a1_y.val[0] = vld1q_lane_u8(left + base_y_c[0], a1_y.val[0], 0);
1621           a1_y.val[0] = vld1q_lane_u8(left + base_y_c[1], a1_y.val[0], 2);
1622           a1_y.val[0] = vld1q_lane_u8(left + base_y_c[2], a1_y.val[0], 4);
1623           a1_y.val[0] = vld1q_lane_u8(left + base_y_c[3], a1_y.val[0], 6);
1624           a1_y.val[0] = vld1q_lane_u8(left + base_y_c[4], a1_y.val[0], 8);
1625           a1_y.val[0] = vld1q_lane_u8(left + base_y_c[5], a1_y.val[0], 10);
1626           a1_y.val[0] = vld1q_lane_u8(left + base_y_c[6], a1_y.val[0], 12);
1627           a1_y.val[0] = vld1q_lane_u8(left + base_y_c[7], a1_y.val[0], 14);
1628           a1_y.val[1] = vld1q_lane_u8(left + base_y_c[8], a1_y.val[1], 0);
1629           a1_y.val[1] = vld1q_lane_u8(left + base_y_c[9], a1_y.val[1], 2);
1630           a1_y.val[1] = vld1q_lane_u8(left + base_y_c[10], a1_y.val[1], 4);
1631           a1_y.val[1] = vld1q_lane_u8(left + base_y_c[11], a1_y.val[1], 6);
1632           a1_y.val[1] = vld1q_lane_u8(left + base_y_c[12], a1_y.val[1], 8);
1633           a1_y.val[1] = vld1q_lane_u8(left + base_y_c[13], a1_y.val[1], 10);
1634           a1_y.val[1] = vld1q_lane_u8(left + base_y_c[14], a1_y.val[1], 12);
1635           a1_y.val[1] = vld1q_lane_u8(left + base_y_c[15], a1_y.val[1], 14);
1636         }
1637         shifty.val[0] = vshrq_n_u16(
1638             vandq_u16(vreinterpretq_u16_s16(y_c256.val[0]), c3f), 1);
1639         shifty.val[1] = vshrq_n_u16(
1640             vandq_u16(vreinterpretq_u16_s16(y_c256.val[1]), c3f), 1);
1641         diff.val[0] =
1642             vsubq_u16(vreinterpretq_u16_u8(a1_y.val[0]),
1643                       vreinterpretq_u16_u8(a0_y.val[0]));  // a[x+1] - a[x]
1644         diff.val[1] =
1645             vsubq_u16(vreinterpretq_u16_u8(a1_y.val[1]),
1646                       vreinterpretq_u16_u8(a0_y.val[1]));  // a[x+1] - a[x]
1647         a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_y.val[0]),
1648                                v_32);  // a[x] * 32 + 16
1649         a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_y.val[1]),
1650                                v_32);  // a[x] * 32 + 16
1651         res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shifty.val[0]);
1652         res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shifty.val[1]);
1653 
1654         resy =
1655             vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5));
1656       } else {
1657         resy = v_zero;
1658       }
1659       uint8x16_t mask = vld1q_u8(BaseMask[base_min_diff]);
1660       resxy = vorrq_u8(vandq_u8(mask, resy), vbicq_u8(resx, mask));
1661       vst1q_u8(dst + j, resxy);
1662     }  // for j
1663     dst += stride;
1664   }
1665 }
1666 
1667 // Directional prediction, zone 2: 90 < angle < 180
1668 void av1_dr_prediction_z2_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
1669                                const uint8_t *above, const uint8_t *left,
1670                                int upsample_above, int upsample_left, int dx,
1671                                int dy) {
1672   assert(dx > 0);
1673   assert(dy > 0);
1674 
1675   switch (bw) {
1676     case 4:
1677       dr_prediction_z2_Nx4_neon(bh, dst, stride, above, left, upsample_above,
1678                                 upsample_left, dx, dy);
1679       break;
1680     case 8:
1681       dr_prediction_z2_Nx8_neon(bh, dst, stride, above, left, upsample_above,
1682                                 upsample_left, dx, dy);
1683       break;
1684     default:
1685       dr_prediction_z2_HxW_neon(bh, bw, dst, stride, above, left,
1686                                 upsample_above, upsample_left, dx, dy);
1687       break;
1688   }
1689   return;
1690 }
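
// Usage sketch (illustrative; the parameter values are hypothetical):
// predicting an 8x8 zone-2 block with no edge upsampling, with dx/dy already
// derived from the prediction angle by the caller:
//   av1_dr_prediction_z2_neon(dst, stride, /*bw=*/8, /*bh=*/8, above, left,
//                             /*upsample_above=*/0, /*upsample_left=*/0,
//                             dx, dy);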
1691 
1692 /* ---------------------P R E D I C T I O N   Z 3--------------------------- */
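
// Zone 3 (180 < angle < 270) references only the left edge. The functions
// below reuse the zone-1 row predictor on the 'left' array and then transpose
// the result into the destination block, which is why this section consists
// largely of transpose helpers.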
1693 
1694 static AOM_FORCE_INLINE void transpose4x16_neon(uint8x16_t *x,
1695                                                 uint16x8x2_t *d) {
1696   uint8x16x2_t w0, w1;
1697 
1698   w0 = vzipq_u8(x[0], x[1]);
1699   w1 = vzipq_u8(x[2], x[3]);
1700 
1701   d[0] = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
1702                    vreinterpretq_u16_u8(w1.val[0]));
1703   d[1] = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
1704                    vreinterpretq_u16_u8(w1.val[1]));
1705 }
1706 
1707 static AOM_FORCE_INLINE void transpose4x8_8x4_low_neon(uint8x8_t *x,
1708                                                        uint16x4x2_t *d) {
1709   uint8x8x2_t w0, w1;
1710 
1711   w0 = vzip_u8(x[0], x[1]);
1712   w1 = vzip_u8(x[2], x[3]);
1713 
1714   *d = vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
1715 }
1716 
1717 static AOM_FORCE_INLINE void transpose4x8_8x4_neon(uint8x8_t *x,
1718                                                    uint16x4x2_t *d) {
1719   uint8x8x2_t w0, w1;
1720 
1721   w0 = vzip_u8(x[0], x[1]);
1722   w1 = vzip_u8(x[2], x[3]);
1723 
1724   d[0] =
1725       vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
1726   d[1] =
1727       vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1]));
1728 }
1729 
1730 static AOM_FORCE_INLINE void transpose8x8_low_neon(uint8x8_t *x,
1731                                                    uint32x2x2_t *d) {
1732   uint8x8x2_t w0, w1, w2, w3;
1733   uint16x4x2_t w4, w5;
1734 
1735   w0 = vzip_u8(x[0], x[1]);
1736   w1 = vzip_u8(x[2], x[3]);
1737   w2 = vzip_u8(x[4], x[5]);
1738   w3 = vzip_u8(x[6], x[7]);
1739 
1740   w4 = vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
1741   w5 = vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0]));
1742 
1743   d[0] = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
1744                   vreinterpret_u32_u16(w5.val[0]));
1745   d[1] = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
1746                   vreinterpret_u32_u16(w5.val[1]));
1747 }
1748 
1749 static AOM_FORCE_INLINE void transpose8x8_neon(uint8x8_t *x, uint32x2x2_t *d) {
1750   uint8x8x2_t w0, w1, w2, w3;
1751   uint16x4x2_t w4, w5, w6, w7;
1752 
1753   w0 = vzip_u8(x[0], x[1]);
1754   w1 = vzip_u8(x[2], x[3]);
1755   w2 = vzip_u8(x[4], x[5]);
1756   w3 = vzip_u8(x[6], x[7]);
1757 
1758   w4 = vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
1759   w5 = vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0]));
1760 
1761   d[0] = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
1762                   vreinterpret_u32_u16(w5.val[0]));
1763   d[1] = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
1764                   vreinterpret_u32_u16(w5.val[1]));
1765 
1766   w6 = vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1]));
1767   w7 = vzip_u16(vreinterpret_u16_u8(w2.val[1]), vreinterpret_u16_u8(w3.val[1]));
1768 
1769   d[2] = vzip_u32(vreinterpret_u32_u16(w6.val[0]),
1770                   vreinterpret_u32_u16(w7.val[0]));
1771   d[3] = vzip_u32(vreinterpret_u32_u16(w6.val[1]),
1772                   vreinterpret_u32_u16(w7.val[1]));
1773 }
1774 
1775 static AOM_FORCE_INLINE void transpose16x8_8x16_neon(uint8x8_t *x,
1776                                                      uint64x2_t *d) {
1777   uint8x8x2_t w0, w1, w2, w3, w8, w9, w10, w11;
1778   uint16x4x2_t w4, w5, w12, w13;
1779   uint32x2x2_t w6, w7, w14, w15;
1780 
1781   w0 = vzip_u8(x[0], x[1]);
1782   w1 = vzip_u8(x[2], x[3]);
1783   w2 = vzip_u8(x[4], x[5]);
1784   w3 = vzip_u8(x[6], x[7]);
1785 
1786   w8 = vzip_u8(x[8], x[9]);
1787   w9 = vzip_u8(x[10], x[11]);
1788   w10 = vzip_u8(x[12], x[13]);
1789   w11 = vzip_u8(x[14], x[15]);
1790 
1791   w4 = vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
1792   w5 = vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0]));
1793   w12 =
1794       vzip_u16(vreinterpret_u16_u8(w8.val[0]), vreinterpret_u16_u8(w9.val[0]));
1795   w13 = vzip_u16(vreinterpret_u16_u8(w10.val[0]),
1796                  vreinterpret_u16_u8(w11.val[0]));
1797 
1798   w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
1799                 vreinterpret_u32_u16(w5.val[0]));
1800   w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
1801                 vreinterpret_u32_u16(w5.val[1]));
1802   w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]),
1803                  vreinterpret_u32_u16(w13.val[0]));
1804   w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]),
1805                  vreinterpret_u32_u16(w13.val[1]));
1806 
1807   // Store first 4-line result
1808   d[0] = vcombine_u64(vreinterpret_u64_u32(w6.val[0]),
1809                       vreinterpret_u64_u32(w14.val[0]));
1810   d[1] = vcombine_u64(vreinterpret_u64_u32(w6.val[1]),
1811                       vreinterpret_u64_u32(w14.val[1]));
1812   d[2] = vcombine_u64(vreinterpret_u64_u32(w7.val[0]),
1813                       vreinterpret_u64_u32(w15.val[0]));
1814   d[3] = vcombine_u64(vreinterpret_u64_u32(w7.val[1]),
1815                       vreinterpret_u64_u32(w15.val[1]));
1816 
1817   w4 = vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1]));
1818   w5 = vzip_u16(vreinterpret_u16_u8(w2.val[1]), vreinterpret_u16_u8(w3.val[1]));
1819   w12 =
1820       vzip_u16(vreinterpret_u16_u8(w8.val[1]), vreinterpret_u16_u8(w9.val[1]));
1821   w13 = vzip_u16(vreinterpret_u16_u8(w10.val[1]),
1822                  vreinterpret_u16_u8(w11.val[1]));
1823 
1824   w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
1825                 vreinterpret_u32_u16(w5.val[0]));
1826   w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
1827                 vreinterpret_u32_u16(w5.val[1]));
1828   w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]),
1829                  vreinterpret_u32_u16(w13.val[0]));
1830   w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]),
1831                  vreinterpret_u32_u16(w13.val[1]));
1832 
1833   // Store second 4-line result
1834   d[4] = vcombine_u64(vreinterpret_u64_u32(w6.val[0]),
1835                       vreinterpret_u64_u32(w14.val[0]));
1836   d[5] = vcombine_u64(vreinterpret_u64_u32(w6.val[1]),
1837                       vreinterpret_u64_u32(w14.val[1]));
1838   d[6] = vcombine_u64(vreinterpret_u64_u32(w7.val[0]),
1839                       vreinterpret_u64_u32(w15.val[0]));
1840   d[7] = vcombine_u64(vreinterpret_u64_u32(w7.val[1]),
1841                       vreinterpret_u64_u32(w15.val[1]));
1842 }
1843 
1844 static AOM_FORCE_INLINE void transpose8x16_16x8_neon(uint8x16_t *x,
1845                                                      uint64x2_t *d) {
1846   uint8x16x2_t w0, w1, w2, w3;
1847   uint16x8x2_t w4, w5, w6, w7;
1848   uint32x4x2_t w8, w9, w10, w11;
1849 
1850   w0 = vzipq_u8(x[0], x[1]);
1851   w1 = vzipq_u8(x[2], x[3]);
1852   w2 = vzipq_u8(x[4], x[5]);
1853   w3 = vzipq_u8(x[6], x[7]);
1854 
1855   w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
1856                  vreinterpretq_u16_u8(w1.val[0]));
1857   w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
1858                  vreinterpretq_u16_u8(w3.val[0]));
1859   w6 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
1860                  vreinterpretq_u16_u8(w1.val[1]));
1861   w7 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
1862                  vreinterpretq_u16_u8(w3.val[1]));
1863 
1864   w8 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
1865                  vreinterpretq_u32_u16(w5.val[0]));
1866   w9 = vzipq_u32(vreinterpretq_u32_u16(w6.val[0]),
1867                  vreinterpretq_u32_u16(w7.val[0]));
1868   w10 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
1869                   vreinterpretq_u32_u16(w5.val[1]));
1870   w11 = vzipq_u32(vreinterpretq_u32_u16(w6.val[1]),
1871                   vreinterpretq_u32_u16(w7.val[1]));
1872 
1873 #if defined(__aarch64__)
1874   d[0] = vzip1q_u64(vreinterpretq_u64_u32(w8.val[0]),
1875                     vreinterpretq_u64_u32(w9.val[0]));
1876   d[1] = vzip2q_u64(vreinterpretq_u64_u32(w8.val[0]),
1877                     vreinterpretq_u64_u32(w9.val[0]));
1878   d[2] = vzip1q_u64(vreinterpretq_u64_u32(w8.val[1]),
1879                     vreinterpretq_u64_u32(w9.val[1]));
1880   d[3] = vzip2q_u64(vreinterpretq_u64_u32(w8.val[1]),
1881                     vreinterpretq_u64_u32(w9.val[1]));
1882   d[4] = vzip1q_u64(vreinterpretq_u64_u32(w10.val[0]),
1883                     vreinterpretq_u64_u32(w11.val[0]));
1884   d[5] = vzip2q_u64(vreinterpretq_u64_u32(w10.val[0]),
1885                     vreinterpretq_u64_u32(w11.val[0]));
1886   d[6] = vzip1q_u64(vreinterpretq_u64_u32(w10.val[1]),
1887                     vreinterpretq_u64_u32(w11.val[1]));
1888   d[7] = vzip2q_u64(vreinterpretq_u64_u32(w10.val[1]),
1889                     vreinterpretq_u64_u32(w11.val[1]));
1890 #else
1891   d[0] = vreinterpretq_u64_u32(
1892       vcombine_u32(vget_low_u32(w8.val[0]), vget_low_u32(w9.val[0])));
1893   d[1] = vreinterpretq_u64_u32(
1894       vcombine_u32(vget_high_u32(w8.val[0]), vget_high_u32(w9.val[0])));
1895   d[2] = vreinterpretq_u64_u32(
1896       vcombine_u32(vget_low_u32(w8.val[1]), vget_low_u32(w9.val[1])));
1897   d[3] = vreinterpretq_u64_u32(
1898       vcombine_u32(vget_high_u32(w8.val[1]), vget_high_u32(w9.val[1])));
1899   d[4] = vreinterpretq_u64_u32(
1900       vcombine_u32(vget_low_u32(w10.val[0]), vget_low_u32(w11.val[0])));
1901   d[5] = vreinterpretq_u64_u32(
1902       vcombine_u32(vget_high_u32(w10.val[0]), vget_high_u32(w11.val[0])));
1903   d[6] = vreinterpretq_u64_u32(
1904       vcombine_u32(vget_low_u32(w10.val[1]), vget_low_u32(w11.val[1])));
1905   d[7] = vreinterpretq_u64_u32(
1906       vcombine_u32(vget_high_u32(w10.val[1]), vget_high_u32(w11.val[1])));
1907 #endif
1908 }
1909 
1910 static AOM_FORCE_INLINE void transpose16x16_neon(uint8x16_t *x, uint64x2_t *d) {
1911   uint8x16x2_t w0, w1, w2, w3, w4, w5, w6, w7;
1912   uint16x8x2_t w8, w9, w10, w11;
1913   uint32x4x2_t w12, w13, w14, w15;
1914 
1915   w0 = vzipq_u8(x[0], x[1]);
1916   w1 = vzipq_u8(x[2], x[3]);
1917   w2 = vzipq_u8(x[4], x[5]);
1918   w3 = vzipq_u8(x[6], x[7]);
1919 
1920   w4 = vzipq_u8(x[8], x[9]);
1921   w5 = vzipq_u8(x[10], x[11]);
1922   w6 = vzipq_u8(x[12], x[13]);
1923   w7 = vzipq_u8(x[14], x[15]);
1924 
1925   w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
1926                  vreinterpretq_u16_u8(w1.val[0]));
1927   w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
1928                  vreinterpretq_u16_u8(w3.val[0]));
1929   w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[0]),
1930                   vreinterpretq_u16_u8(w5.val[0]));
1931   w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[0]),
1932                   vreinterpretq_u16_u8(w7.val[0]));
1933 
1934   w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]),
1935                   vreinterpretq_u32_u16(w9.val[0]));
1936   w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]),
1937                   vreinterpretq_u32_u16(w11.val[0]));
1938   w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]),
1939                   vreinterpretq_u32_u16(w9.val[1]));
1940   w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]),
1941                   vreinterpretq_u32_u16(w11.val[1]));
1942 
1943 #if defined(__aarch64__)
1944   d[0] = vzip1q_u64(vreinterpretq_u64_u32(w12.val[0]),
1945                     vreinterpretq_u64_u32(w13.val[0]));
1946   d[1] = vzip2q_u64(vreinterpretq_u64_u32(w12.val[0]),
1947                     vreinterpretq_u64_u32(w13.val[0]));
1948   d[2] = vzip1q_u64(vreinterpretq_u64_u32(w12.val[1]),
1949                     vreinterpretq_u64_u32(w13.val[1]));
1950   d[3] = vzip2q_u64(vreinterpretq_u64_u32(w12.val[1]),
1951                     vreinterpretq_u64_u32(w13.val[1]));
1952   d[4] = vzip1q_u64(vreinterpretq_u64_u32(w14.val[0]),
1953                     vreinterpretq_u64_u32(w15.val[0]));
1954   d[5] = vzip2q_u64(vreinterpretq_u64_u32(w14.val[0]),
1955                     vreinterpretq_u64_u32(w15.val[0]));
1956   d[6] = vzip1q_u64(vreinterpretq_u64_u32(w14.val[1]),
1957                     vreinterpretq_u64_u32(w15.val[1]));
1958   d[7] = vzip2q_u64(vreinterpretq_u64_u32(w14.val[1]),
1959                     vreinterpretq_u64_u32(w15.val[1]));
1960 #else
1961   d[0] = vreinterpretq_u64_u32(
1962       vcombine_u32(vget_low_u32(w12.val[0]), vget_low_u32(w13.val[0])));
1963   d[1] = vreinterpretq_u64_u32(
1964       vcombine_u32(vget_high_u32(w12.val[0]), vget_high_u32(w13.val[0])));
1965   d[2] = vreinterpretq_u64_u32(
1966       vcombine_u32(vget_low_u32(w12.val[1]), vget_low_u32(w13.val[1])));
1967   d[3] = vreinterpretq_u64_u32(
1968       vcombine_u32(vget_high_u32(w12.val[1]), vget_high_u32(w13.val[1])));
1969   d[4] = vreinterpretq_u64_u32(
1970       vcombine_u32(vget_low_u32(w14.val[0]), vget_low_u32(w15.val[0])));
1971   d[5] = vreinterpretq_u64_u32(
1972       vcombine_u32(vget_high_u32(w14.val[0]), vget_high_u32(w15.val[0])));
1973   d[6] = vreinterpretq_u64_u32(
1974       vcombine_u32(vget_low_u32(w14.val[1]), vget_low_u32(w15.val[1])));
1975   d[7] = vreinterpretq_u64_u32(
1976       vcombine_u32(vget_high_u32(w14.val[1]), vget_high_u32(w15.val[1])));
1977 #endif
1978 
1979   // upper half
1980   w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
1981                  vreinterpretq_u16_u8(w1.val[1]));
1982   w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
1983                  vreinterpretq_u16_u8(w3.val[1]));
1984   w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[1]),
1985                   vreinterpretq_u16_u8(w5.val[1]));
1986   w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[1]),
1987                   vreinterpretq_u16_u8(w7.val[1]));
1988 
1989   w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]),
1990                   vreinterpretq_u32_u16(w9.val[0]));
1991   w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]),
1992                   vreinterpretq_u32_u16(w11.val[0]));
1993   w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]),
1994                   vreinterpretq_u32_u16(w9.val[1]));
1995   w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]),
1996                   vreinterpretq_u32_u16(w11.val[1]));
1997 
1998 #if defined(__aarch64__)
1999   d[8] = vzip1q_u64(vreinterpretq_u64_u32(w12.val[0]),
2000                     vreinterpretq_u64_u32(w13.val[0]));
2001   d[9] = vzip2q_u64(vreinterpretq_u64_u32(w12.val[0]),
2002                     vreinterpretq_u64_u32(w13.val[0]));
2003   d[10] = vzip1q_u64(vreinterpretq_u64_u32(w12.val[1]),
2004                      vreinterpretq_u64_u32(w13.val[1]));
2005   d[11] = vzip2q_u64(vreinterpretq_u64_u32(w12.val[1]),
2006                      vreinterpretq_u64_u32(w13.val[1]));
2007   d[12] = vzip1q_u64(vreinterpretq_u64_u32(w14.val[0]),
2008                      vreinterpretq_u64_u32(w15.val[0]));
2009   d[13] = vzip2q_u64(vreinterpretq_u64_u32(w14.val[0]),
2010                      vreinterpretq_u64_u32(w15.val[0]));
2011   d[14] = vzip1q_u64(vreinterpretq_u64_u32(w14.val[1]),
2012                      vreinterpretq_u64_u32(w15.val[1]));
2013   d[15] = vzip2q_u64(vreinterpretq_u64_u32(w14.val[1]),
2014                      vreinterpretq_u64_u32(w15.val[1]));
2015 #else
2016   d[8] = vreinterpretq_u64_u32(
2017       vcombine_u32(vget_low_u32(w12.val[0]), vget_low_u32(w13.val[0])));
2018   d[9] = vreinterpretq_u64_u32(
2019       vcombine_u32(vget_high_u32(w12.val[0]), vget_high_u32(w13.val[0])));
2020   d[10] = vreinterpretq_u64_u32(
2021       vcombine_u32(vget_low_u32(w12.val[1]), vget_low_u32(w13.val[1])));
2022   d[11] = vreinterpretq_u64_u32(
2023       vcombine_u32(vget_high_u32(w12.val[1]), vget_high_u32(w13.val[1])));
2024   d[12] = vreinterpretq_u64_u32(
2025       vcombine_u32(vget_low_u32(w14.val[0]), vget_low_u32(w15.val[0])));
2026   d[13] = vreinterpretq_u64_u32(
2027       vcombine_u32(vget_high_u32(w14.val[0]), vget_high_u32(w15.val[0])));
2028   d[14] = vreinterpretq_u64_u32(
2029       vcombine_u32(vget_low_u32(w14.val[1]), vget_low_u32(w15.val[1])));
2030   d[15] = vreinterpretq_u64_u32(
2031       vcombine_u32(vget_high_u32(w14.val[1]), vget_high_u32(w15.val[1])));
2032 #endif
2033 }
2034 
2035 static AOM_FORCE_INLINE void transpose16x32_neon(uint8x16x2_t *x,
2036                                                  uint64x2x2_t *d) {
2037   uint8x16x2_t w0, w1, w2, w3, w8, w9, w10, w11;
2038   uint16x8x2_t w4, w5, w12, w13;
2039   uint32x4x2_t w6, w7, w14, w15;
2040 
2041   w0 = vzipq_u8(x[0].val[0], x[1].val[0]);
2042   w1 = vzipq_u8(x[2].val[0], x[3].val[0]);
2043   w2 = vzipq_u8(x[4].val[0], x[5].val[0]);
2044   w3 = vzipq_u8(x[6].val[0], x[7].val[0]);
2045 
2046   w8 = vzipq_u8(x[8].val[0], x[9].val[0]);
2047   w9 = vzipq_u8(x[10].val[0], x[11].val[0]);
2048   w10 = vzipq_u8(x[12].val[0], x[13].val[0]);
2049   w11 = vzipq_u8(x[14].val[0], x[15].val[0]);
2050 
2051   w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
2052                  vreinterpretq_u16_u8(w1.val[0]));
2053   w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
2054                  vreinterpretq_u16_u8(w3.val[0]));
2055   w12 = vzipq_u16(vreinterpretq_u16_u8(w8.val[0]),
2056                   vreinterpretq_u16_u8(w9.val[0]));
2057   w13 = vzipq_u16(vreinterpretq_u16_u8(w10.val[0]),
2058                   vreinterpretq_u16_u8(w11.val[0]));
2059 
2060   w6 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
2061                  vreinterpretq_u32_u16(w5.val[0]));
2062   w7 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
2063                  vreinterpretq_u32_u16(w5.val[1]));
2064   w14 = vzipq_u32(vreinterpretq_u32_u16(w12.val[0]),
2065                   vreinterpretq_u32_u16(w13.val[0]));
2066   w15 = vzipq_u32(vreinterpretq_u32_u16(w12.val[1]),
2067                   vreinterpretq_u32_u16(w13.val[1]));
2068 
2069   // Store first 4-line result
2070 
2071 #if defined(__aarch64__)
2072   d[0].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[0]),
2073                            vreinterpretq_u64_u32(w14.val[0]));
2074   d[0].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[0]),
2075                            vreinterpretq_u64_u32(w14.val[0]));
2076   d[1].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[1]),
2077                            vreinterpretq_u64_u32(w14.val[1]));
2078   d[1].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[1]),
2079                            vreinterpretq_u64_u32(w14.val[1]));
2080   d[2].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[0]),
2081                            vreinterpretq_u64_u32(w15.val[0]));
2082   d[2].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[0]),
2083                            vreinterpretq_u64_u32(w15.val[0]));
2084   d[3].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[1]),
2085                            vreinterpretq_u64_u32(w15.val[1]));
2086   d[3].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[1]),
2087                            vreinterpretq_u64_u32(w15.val[1]));
2088 #else
2089   d[0].val[0] = vreinterpretq_u64_u32(
2090       vcombine_u32(vget_low_u32(w6.val[0]), vget_low_u32(w14.val[0])));
2091   d[0].val[1] = vreinterpretq_u64_u32(
2092       vcombine_u32(vget_high_u32(w6.val[0]), vget_high_u32(w14.val[0])));
2093   d[1].val[0] = vreinterpretq_u64_u32(
2094       vcombine_u32(vget_low_u32(w6.val[1]), vget_low_u32(w14.val[1])));
2095   d[1].val[1] = vreinterpretq_u64_u32(
2096       vcombine_u32(vget_high_u32(w6.val[1]), vget_high_u32(w14.val[1])));
2097   d[2].val[0] = vreinterpretq_u64_u32(
2098       vcombine_u32(vget_low_u32(w7.val[0]), vget_low_u32(w15.val[0])));
2099   d[2].val[1] = vreinterpretq_u64_u32(
2100       vcombine_u32(vget_high_u32(w7.val[0]), vget_high_u32(w15.val[0])));
2101   d[3].val[0] = vreinterpretq_u64_u32(
2102       vcombine_u32(vget_low_u32(w7.val[1]), vget_low_u32(w15.val[1])));
2103   d[3].val[1] = vreinterpretq_u64_u32(
2104       vcombine_u32(vget_high_u32(w7.val[1]), vget_high_u32(w15.val[1])));
2105 #endif
2106 
2107   w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
2108                  vreinterpretq_u16_u8(w1.val[1]));
2109   w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
2110                  vreinterpretq_u16_u8(w3.val[1]));
2111   w12 = vzipq_u16(vreinterpretq_u16_u8(w8.val[1]),
2112                   vreinterpretq_u16_u8(w9.val[1]));
2113   w13 = vzipq_u16(vreinterpretq_u16_u8(w10.val[1]),
2114                   vreinterpretq_u16_u8(w11.val[1]));
2115 
2116   w6 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
2117                  vreinterpretq_u32_u16(w5.val[0]));
2118   w7 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
2119                  vreinterpretq_u32_u16(w5.val[1]));
2120   w14 = vzipq_u32(vreinterpretq_u32_u16(w12.val[0]),
2121                   vreinterpretq_u32_u16(w13.val[0]));
2122   w15 = vzipq_u32(vreinterpretq_u32_u16(w12.val[1]),
2123                   vreinterpretq_u32_u16(w13.val[1]));
2124 
2125   // Store second 4-line result
2126 
2127 #if defined(__aarch64__)
2128   d[4].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[0]),
2129                            vreinterpretq_u64_u32(w14.val[0]));
2130   d[4].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[0]),
2131                            vreinterpretq_u64_u32(w14.val[0]));
2132   d[5].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[1]),
2133                            vreinterpretq_u64_u32(w14.val[1]));
2134   d[5].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[1]),
2135                            vreinterpretq_u64_u32(w14.val[1]));
2136   d[6].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[0]),
2137                            vreinterpretq_u64_u32(w15.val[0]));
2138   d[6].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[0]),
2139                            vreinterpretq_u64_u32(w15.val[0]));
2140   d[7].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[1]),
2141                            vreinterpretq_u64_u32(w15.val[1]));
2142   d[7].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[1]),
2143                            vreinterpretq_u64_u32(w15.val[1]));
2144 #else
2145   d[4].val[0] = vreinterpretq_u64_u32(
2146       vcombine_u32(vget_low_u32(w6.val[0]), vget_low_u32(w14.val[0])));
2147   d[4].val[1] = vreinterpretq_u64_u32(
2148       vcombine_u32(vget_high_u32(w6.val[0]), vget_high_u32(w14.val[0])));
2149   d[5].val[0] = vreinterpretq_u64_u32(
2150       vcombine_u32(vget_low_u32(w6.val[1]), vget_low_u32(w14.val[1])));
2151   d[5].val[1] = vreinterpretq_u64_u32(
2152       vcombine_u32(vget_high_u32(w6.val[1]), vget_high_u32(w14.val[1])));
2153   d[6].val[0] = vreinterpretq_u64_u32(
2154       vcombine_u32(vget_low_u32(w7.val[0]), vget_low_u32(w15.val[0])));
2155   d[6].val[1] = vreinterpretq_u64_u32(
2156       vcombine_u32(vget_high_u32(w7.val[0]), vget_high_u32(w15.val[0])));
2157   d[7].val[0] = vreinterpretq_u64_u32(
2158       vcombine_u32(vget_low_u32(w7.val[1]), vget_low_u32(w15.val[1])));
2159   d[7].val[1] = vreinterpretq_u64_u32(
2160       vcombine_u32(vget_high_u32(w7.val[1]), vget_high_u32(w15.val[1])));
2161 #endif
2162 
2163   // upper half
2164   w0 = vzipq_u8(x[0].val[1], x[1].val[1]);
2165   w1 = vzipq_u8(x[2].val[1], x[3].val[1]);
2166   w2 = vzipq_u8(x[4].val[1], x[5].val[1]);
2167   w3 = vzipq_u8(x[6].val[1], x[7].val[1]);
2168 
2169   w8 = vzipq_u8(x[8].val[1], x[9].val[1]);
2170   w9 = vzipq_u8(x[10].val[1], x[11].val[1]);
2171   w10 = vzipq_u8(x[12].val[1], x[13].val[1]);
2172   w11 = vzipq_u8(x[14].val[1], x[15].val[1]);
2173 
2174   w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
2175                  vreinterpretq_u16_u8(w1.val[0]));
2176   w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
2177                  vreinterpretq_u16_u8(w3.val[0]));
2178   w12 = vzipq_u16(vreinterpretq_u16_u8(w8.val[0]),
2179                   vreinterpretq_u16_u8(w9.val[0]));
2180   w13 = vzipq_u16(vreinterpretq_u16_u8(w10.val[0]),
2181                   vreinterpretq_u16_u8(w11.val[0]));
2182 
2183   w6 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
2184                  vreinterpretq_u32_u16(w5.val[0]));
2185   w7 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
2186                  vreinterpretq_u32_u16(w5.val[1]));
2187   w14 = vzipq_u32(vreinterpretq_u32_u16(w12.val[0]),
2188                   vreinterpretq_u32_u16(w13.val[0]));
2189   w15 = vzipq_u32(vreinterpretq_u32_u16(w12.val[1]),
2190                   vreinterpretq_u32_u16(w13.val[1]));
2191 
2192   // Store first 4-line result
2193 
2194 #if defined(__aarch64__)
2195   d[8].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[0]),
2196                            vreinterpretq_u64_u32(w14.val[0]));
2197   d[8].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[0]),
2198                            vreinterpretq_u64_u32(w14.val[0]));
2199   d[9].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[1]),
2200                            vreinterpretq_u64_u32(w14.val[1]));
2201   d[9].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[1]),
2202                            vreinterpretq_u64_u32(w14.val[1]));
2203   d[10].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[0]),
2204                             vreinterpretq_u64_u32(w15.val[0]));
2205   d[10].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[0]),
2206                             vreinterpretq_u64_u32(w15.val[0]));
2207   d[11].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[1]),
2208                             vreinterpretq_u64_u32(w15.val[1]));
2209   d[11].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[1]),
2210                             vreinterpretq_u64_u32(w15.val[1]));
2211 #else
2212   d[8].val[0] = vreinterpretq_u64_u32(
2213       vcombine_u32(vget_low_u32(w6.val[0]), vget_low_u32(w14.val[0])));
2214   d[8].val[1] = vreinterpretq_u64_u32(
2215       vcombine_u32(vget_high_u32(w6.val[0]), vget_high_u32(w14.val[0])));
2216   d[9].val[0] = vreinterpretq_u64_u32(
2217       vcombine_u32(vget_low_u32(w6.val[1]), vget_low_u32(w14.val[1])));
2218   d[9].val[1] = vreinterpretq_u64_u32(
2219       vcombine_u32(vget_high_u32(w6.val[1]), vget_high_u32(w14.val[1])));
2220   d[10].val[0] = vreinterpretq_u64_u32(
2221       vcombine_u32(vget_low_u32(w7.val[0]), vget_low_u32(w15.val[0])));
2222   d[10].val[1] = vreinterpretq_u64_u32(
2223       vcombine_u32(vget_high_u32(w7.val[0]), vget_high_u32(w15.val[0])));
2224   d[11].val[0] = vreinterpretq_u64_u32(
2225       vcombine_u32(vget_low_u32(w7.val[1]), vget_low_u32(w15.val[1])));
2226   d[11].val[1] = vreinterpretq_u64_u32(
2227       vcombine_u32(vget_high_u32(w7.val[1]), vget_high_u32(w15.val[1])));
2228 #endif
2229 
2230   w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
2231                  vreinterpretq_u16_u8(w1.val[1]));
2232   w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
2233                  vreinterpretq_u16_u8(w3.val[1]));
2234   w12 = vzipq_u16(vreinterpretq_u16_u8(w8.val[1]),
2235                   vreinterpretq_u16_u8(w9.val[1]));
2236   w13 = vzipq_u16(vreinterpretq_u16_u8(w10.val[1]),
2237                   vreinterpretq_u16_u8(w11.val[1]));
2238 
2239   w6 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
2240                  vreinterpretq_u32_u16(w5.val[0]));
2241   w7 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
2242                  vreinterpretq_u32_u16(w5.val[1]));
2243   w14 = vzipq_u32(vreinterpretq_u32_u16(w12.val[0]),
2244                   vreinterpretq_u32_u16(w13.val[0]));
2245   w15 = vzipq_u32(vreinterpretq_u32_u16(w12.val[1]),
2246                   vreinterpretq_u32_u16(w13.val[1]));
2247 
2248   // Store second 4-line result
2249 
2250 #if defined(__aarch64__)
2251   d[12].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[0]),
2252                             vreinterpretq_u64_u32(w14.val[0]));
2253   d[12].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[0]),
2254                             vreinterpretq_u64_u32(w14.val[0]));
2255   d[13].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[1]),
2256                             vreinterpretq_u64_u32(w14.val[1]));
2257   d[13].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[1]),
2258                             vreinterpretq_u64_u32(w14.val[1]));
2259   d[14].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[0]),
2260                             vreinterpretq_u64_u32(w15.val[0]));
2261   d[14].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[0]),
2262                             vreinterpretq_u64_u32(w15.val[0]));
2263   d[15].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[1]),
2264                             vreinterpretq_u64_u32(w15.val[1]));
2265   d[15].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[1]),
2266                             vreinterpretq_u64_u32(w15.val[1]));
2267 #else
2268   d[12].val[0] = vreinterpretq_u64_u32(
2269       vcombine_u32(vget_low_u32(w6.val[0]), vget_low_u32(w14.val[0])));
2270   d[12].val[1] = vreinterpretq_u64_u32(
2271       vcombine_u32(vget_high_u32(w6.val[0]), vget_high_u32(w14.val[0])));
2272   d[13].val[0] = vreinterpretq_u64_u32(
2273       vcombine_u32(vget_low_u32(w6.val[1]), vget_low_u32(w14.val[1])));
2274   d[13].val[1] = vreinterpretq_u64_u32(
2275       vcombine_u32(vget_high_u32(w6.val[1]), vget_high_u32(w14.val[1])));
2276   d[14].val[0] = vreinterpretq_u64_u32(
2277       vcombine_u32(vget_low_u32(w7.val[0]), vget_low_u32(w15.val[0])));
2278   d[14].val[1] = vreinterpretq_u64_u32(
2279       vcombine_u32(vget_high_u32(w7.val[0]), vget_high_u32(w15.val[0])));
2280   d[15].val[0] = vreinterpretq_u64_u32(
2281       vcombine_u32(vget_low_u32(w7.val[1]), vget_low_u32(w15.val[1])));
2282   d[15].val[1] = vreinterpretq_u64_u32(
2283       vcombine_u32(vget_high_u32(w7.val[1]), vget_high_u32(w15.val[1])));
2284 #endif
2285 }
2286 
2287 static void transpose_TX_16X16(const uint8_t *src, ptrdiff_t pitchSrc,
2288                                uint8_t *dst, ptrdiff_t pitchDst) {
2289   uint8x16_t r[16];
2290   uint64x2_t d[16];
2291   for (int i = 0; i < 16; i++) {
2292     r[i] = vld1q_u8(src + i * pitchSrc);
2293   }
2294   transpose16x16_neon(r, d);
2295   for (int i = 0; i < 16; i++) {
2296     vst1q_u8(dst + i * pitchDst, vreinterpretq_u8_u64(d[i]));
2297   }
2298 }
2299 
2300 static void transpose(const uint8_t *src, ptrdiff_t pitchSrc, uint8_t *dst,
2301                       ptrdiff_t pitchDst, int width, int height) {
2302   for (int j = 0; j < height; j += 16) {
2303     for (int i = 0; i < width; i += 16) {
2304       transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc,
2305                          dst + j * pitchDst + i, pitchDst);
2306     }
2307   }
2308 }
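
// transpose() processes the block in 16x16 tiles, so 'width' and 'height' are
// assumed to be multiples of 16. A minimal call sketch with hypothetical
// 32x32 buffers:
//   transpose(src, /*pitchSrc=*/32, dst, /*pitchDst=*/32, /*width=*/32,
//             /*height=*/32);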
2309 
2310 static void dr_prediction_z3_4x4_neon(uint8_t *dst, ptrdiff_t stride,
2311                                       const uint8_t *left, int upsample_left,
2312                                       int dy) {
2313   uint8x8_t dstvec[4];
2314   uint16x4x2_t dest;
2315 
2316   dr_prediction_z1_HxW_internal_neon_64(4, 4, dstvec, left, upsample_left, dy);
2317   transpose4x8_8x4_low_neon(dstvec, &dest);
2318   vst1_lane_u32((uint32_t *)(dst + stride * 0),
2319                 vreinterpret_u32_u16(dest.val[0]), 0);
2320   vst1_lane_u32((uint32_t *)(dst + stride * 1),
2321                 vreinterpret_u32_u16(dest.val[0]), 1);
2322   vst1_lane_u32((uint32_t *)(dst + stride * 2),
2323                 vreinterpret_u32_u16(dest.val[1]), 0);
2324   vst1_lane_u32((uint32_t *)(dst + stride * 3),
2325                 vreinterpret_u32_u16(dest.val[1]), 1);
2326 }
2327 
2328 static void dr_prediction_z3_8x8_neon(uint8_t *dst, ptrdiff_t stride,
2329                                       const uint8_t *left, int upsample_left,
2330                                       int dy) {
2331   uint8x8_t dstvec[8];
2332   uint32x2x2_t d[4];
2333 
2334   dr_prediction_z1_HxW_internal_neon_64(8, 8, dstvec, left, upsample_left, dy);
2335   transpose8x8_neon(dstvec, d);
2336   vst1_u32((uint32_t *)(dst + 0 * stride), d[0].val[0]);
2337   vst1_u32((uint32_t *)(dst + 1 * stride), d[0].val[1]);
2338   vst1_u32((uint32_t *)(dst + 2 * stride), d[1].val[0]);
2339   vst1_u32((uint32_t *)(dst + 3 * stride), d[1].val[1]);
2340   vst1_u32((uint32_t *)(dst + 4 * stride), d[2].val[0]);
2341   vst1_u32((uint32_t *)(dst + 5 * stride), d[2].val[1]);
2342   vst1_u32((uint32_t *)(dst + 6 * stride), d[3].val[0]);
2343   vst1_u32((uint32_t *)(dst + 7 * stride), d[3].val[1]);
2344 }
2345 
2346 static void dr_prediction_z3_4x8_neon(uint8_t *dst, ptrdiff_t stride,
2347                                       const uint8_t *left, int upsample_left,
2348                                       int dy) {
2349   uint8x8_t dstvec[4];
2350   uint16x4x2_t d[2];
2351 
2352   dr_prediction_z1_HxW_internal_neon_64(8, 4, dstvec, left, upsample_left, dy);
2353   transpose4x8_8x4_neon(dstvec, d);
2354   vst1_lane_u32((uint32_t *)(dst + stride * 0),
2355                 vreinterpret_u32_u16(d[0].val[0]), 0);
2356   vst1_lane_u32((uint32_t *)(dst + stride * 1),
2357                 vreinterpret_u32_u16(d[0].val[0]), 1);
2358   vst1_lane_u32((uint32_t *)(dst + stride * 2),
2359                 vreinterpret_u32_u16(d[0].val[1]), 0);
2360   vst1_lane_u32((uint32_t *)(dst + stride * 3),
2361                 vreinterpret_u32_u16(d[0].val[1]), 1);
2362   vst1_lane_u32((uint32_t *)(dst + stride * 4),
2363                 vreinterpret_u32_u16(d[1].val[0]), 0);
2364   vst1_lane_u32((uint32_t *)(dst + stride * 5),
2365                 vreinterpret_u32_u16(d[1].val[0]), 1);
2366   vst1_lane_u32((uint32_t *)(dst + stride * 6),
2367                 vreinterpret_u32_u16(d[1].val[1]), 0);
2368   vst1_lane_u32((uint32_t *)(dst + stride * 7),
2369                 vreinterpret_u32_u16(d[1].val[1]), 1);
2370 }
2371 
2372 static void dr_prediction_z3_8x4_neon(uint8_t *dst, ptrdiff_t stride,
2373                                       const uint8_t *left, int upsample_left,
2374                                       int dy) {
2375   uint8x8_t dstvec[8];
2376   uint32x2x2_t d[2];
2377 
2378   dr_prediction_z1_HxW_internal_neon_64(4, 8, dstvec, left, upsample_left, dy);
2379   transpose8x8_low_neon(dstvec, d);
2380   vst1_u32((uint32_t *)(dst + 0 * stride), d[0].val[0]);
2381   vst1_u32((uint32_t *)(dst + 1 * stride), d[0].val[1]);
2382   vst1_u32((uint32_t *)(dst + 2 * stride), d[1].val[0]);
2383   vst1_u32((uint32_t *)(dst + 3 * stride), d[1].val[1]);
2384 }
2385 
2386 static void dr_prediction_z3_8x16_neon(uint8_t *dst, ptrdiff_t stride,
2387                                        const uint8_t *left, int upsample_left,
2388                                        int dy) {
2389   uint8x16_t dstvec[8];
2390   uint64x2_t d[8];
2391 
2392   dr_prediction_z1_HxW_internal_neon(16, 8, dstvec, left, upsample_left, dy);
2393   transpose8x16_16x8_neon(dstvec, d);
2394   for (int i = 0; i < 8; i++) {
2395     vst1_u8(dst + i * stride, vreinterpret_u8_u64(vget_low_u64(d[i])));
2396     vst1_u8(dst + (i + 8) * stride, vreinterpret_u8_u64(vget_high_u64(d[i])));
2397   }
2398 }
2399 
2400 static void dr_prediction_z3_16x8_neon(uint8_t *dst, ptrdiff_t stride,
2401                                        const uint8_t *left, int upsample_left,
2402                                        int dy) {
2403   uint8x8_t dstvec[16];
2404   uint64x2_t d[8];
2405 
2406   dr_prediction_z1_HxW_internal_neon_64(8, 16, dstvec, left, upsample_left, dy);
2407   transpose16x8_8x16_neon(dstvec, d);
2408   for (int i = 0; i < 8; i++) {
2409     vst1q_u8(dst + i * stride, vreinterpretq_u8_u64(d[i]));
2410   }
2411 }
2412 
2413 static void dr_prediction_z3_4x16_neon(uint8_t *dst, ptrdiff_t stride,
2414                                        const uint8_t *left, int upsample_left,
2415                                        int dy) {
2416   uint8x16_t dstvec[4];
2417   uint16x8x2_t d[2];
2418 
2419   dr_prediction_z1_HxW_internal_neon(16, 4, dstvec, left, upsample_left, dy);
2420   transpose4x16_neon(dstvec, d);
2421   vst1q_lane_u32((uint32_t *)(dst + stride * 0),
2422                  vreinterpretq_u32_u16(d[0].val[0]), 0);
2423   vst1q_lane_u32((uint32_t *)(dst + stride * 1),
2424                  vreinterpretq_u32_u16(d[0].val[0]), 1);
2425   vst1q_lane_u32((uint32_t *)(dst + stride * 2),
2426                  vreinterpretq_u32_u16(d[0].val[0]), 2);
2427   vst1q_lane_u32((uint32_t *)(dst + stride * 3),
2428                  vreinterpretq_u32_u16(d[0].val[0]), 3);
2429 
2430   vst1q_lane_u32((uint32_t *)(dst + stride * 4),
2431                  vreinterpretq_u32_u16(d[0].val[1]), 0);
2432   vst1q_lane_u32((uint32_t *)(dst + stride * 5),
2433                  vreinterpretq_u32_u16(d[0].val[1]), 1);
2434   vst1q_lane_u32((uint32_t *)(dst + stride * 6),
2435                  vreinterpretq_u32_u16(d[0].val[1]), 2);
2436   vst1q_lane_u32((uint32_t *)(dst + stride * 7),
2437                  vreinterpretq_u32_u16(d[0].val[1]), 3);
2438 
2439   vst1q_lane_u32((uint32_t *)(dst + stride * 8),
2440                  vreinterpretq_u32_u16(d[1].val[0]), 0);
2441   vst1q_lane_u32((uint32_t *)(dst + stride * 9),
2442                  vreinterpretq_u32_u16(d[1].val[0]), 1);
2443   vst1q_lane_u32((uint32_t *)(dst + stride * 10),
2444                  vreinterpretq_u32_u16(d[1].val[0]), 2);
2445   vst1q_lane_u32((uint32_t *)(dst + stride * 11),
2446                  vreinterpretq_u32_u16(d[1].val[0]), 3);
2447 
2448   vst1q_lane_u32((uint32_t *)(dst + stride * 12),
2449                  vreinterpretq_u32_u16(d[1].val[1]), 0);
2450   vst1q_lane_u32((uint32_t *)(dst + stride * 13),
2451                  vreinterpretq_u32_u16(d[1].val[1]), 1);
2452   vst1q_lane_u32((uint32_t *)(dst + stride * 14),
2453                  vreinterpretq_u32_u16(d[1].val[1]), 2);
2454   vst1q_lane_u32((uint32_t *)(dst + stride * 15),
2455                  vreinterpretq_u32_u16(d[1].val[1]), 3);
2456 }
2457 
2458 static void dr_prediction_z3_16x4_neon(uint8_t *dst, ptrdiff_t stride,
2459                                        const uint8_t *left, int upsample_left,
2460                                        int dy) {
2461   uint8x8_t dstvec[16];
2462   uint64x2_t d[8];
2463 
2464   dr_prediction_z1_HxW_internal_neon_64(4, 16, dstvec, left, upsample_left, dy);
2465   transpose16x8_8x16_neon(dstvec, d);
2466   for (int i = 0; i < 4; i++) {
2467     vst1q_u8(dst + i * stride, vreinterpretq_u8_u64(d[i]));
2468   }
2469 }
2470 
2471 static void dr_prediction_z3_8x32_neon(uint8_t *dst, ptrdiff_t stride,
2472                                        const uint8_t *left, int upsample_left,
2473                                        int dy) {
2474   uint8x16x2_t dstvec[16];
2475   uint64x2x2_t d[16];
2476   uint8x16_t v_zero = vdupq_n_u8(0);
2477 
2478   dr_prediction_z1_32xN_internal_neon(8, dstvec, left, upsample_left, dy);
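  // transpose16x32_neon consumes 16 source rows but only 8 are produced for
  // this block size, so zero the unused rows and store just the low 8 bytes
  // of each transposed row.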
2479   for (int i = 8; i < 16; i++) {
2480     dstvec[i].val[0] = v_zero;
2481     dstvec[i].val[1] = v_zero;
2482   }
2483   transpose16x32_neon(dstvec, d);
2484   for (int i = 0; i < 16; i++) {
2485     vst1_u8(dst + 2 * i * stride,
2486             vreinterpret_u8_u64(vget_low_u64(d[i].val[0])));
2487     vst1_u8(dst + (2 * i + 1) * stride,
2488             vreinterpret_u8_u64(vget_low_u64(d[i].val[1])));
2489   }
2490 }
2491 
2492 static void dr_prediction_z3_32x8_neon(uint8_t *dst, ptrdiff_t stride,
2493                                        const uint8_t *left, int upsample_left,
2494                                        int dy) {
2495   uint8x8_t dstvec[32];
2496   uint64x2_t d[16];
2497 
2498   dr_prediction_z1_HxW_internal_neon_64(8, 32, dstvec, left, upsample_left, dy);
2499   transpose16x8_8x16_neon(dstvec, d);
2500   transpose16x8_8x16_neon(dstvec + 16, d + 8);
2501   for (int i = 0; i < 8; i++) {
2502     vst1q_u8(dst + i * stride, vreinterpretq_u8_u64(d[i]));
2503     vst1q_u8(dst + i * stride + 16, vreinterpretq_u8_u64(d[i + 8]));
2504   }
2505 }
2506 
2507 static void dr_prediction_z3_16x16_neon(uint8_t *dst, ptrdiff_t stride,
2508                                         const uint8_t *left, int upsample_left,
2509                                         int dy) {
2510   uint8x16_t dstvec[16];
2511   uint64x2_t d[16];
2512 
2513   dr_prediction_z1_HxW_internal_neon(16, 16, dstvec, left, upsample_left, dy);
2514   transpose16x16_neon(dstvec, d);
2515   for (int i = 0; i < 16; i++) {
2516     vst1q_u8(dst + i * stride, vreinterpretq_u8_u64(d[i]));
2517   }
2518 }
2519 
2520 static void dr_prediction_z3_32x32_neon(uint8_t *dst, ptrdiff_t stride,
2521                                         const uint8_t *left, int upsample_left,
2522                                         int dy) {
2523   uint8x16x2_t dstvec[32];
2524   uint64x2x2_t d[32];
2525 
2526   dr_prediction_z1_32xN_internal_neon(32, dstvec, left, upsample_left, dy);
2527   transpose16x32_neon(dstvec, d);
2528   transpose16x32_neon(dstvec + 16, d + 16);
2529   for (int i = 0; i < 16; i++) {
2530     vst1q_u8(dst + 2 * i * stride, vreinterpretq_u8_u64(d[i].val[0]));
2531     vst1q_u8(dst + 2 * i * stride + 16, vreinterpretq_u8_u64(d[i + 16].val[0]));
2532     vst1q_u8(dst + (2 * i + 1) * stride, vreinterpretq_u8_u64(d[i].val[1]));
2533     vst1q_u8(dst + (2 * i + 1) * stride + 16,
2534              vreinterpretq_u8_u64(d[i + 16].val[1]));
2535   }
2536 }
2537 
2538 static void dr_prediction_z3_64x64_neon(uint8_t *dst, ptrdiff_t stride,
2539                                         const uint8_t *left, int upsample_left,
2540                                         int dy) {
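  // For the 64-pixel blocks the prediction is written to a temporary buffer
  // and transposed into dst through memory rather than in registers.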
2541   DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]);
2542 
2543   dr_prediction_z1_64xN_neon(64, dstT, 64, left, upsample_left, dy);
2544   transpose(dstT, 64, dst, stride, 64, 64);
2545 }
2546 
2547 static void dr_prediction_z3_16x32_neon(uint8_t *dst, ptrdiff_t stride,
2548                                         const uint8_t *left, int upsample_left,
2549                                         int dy) {
2550   uint8x16x2_t dstvec[16];
2551   uint64x2x2_t d[16];
2552 
2553   dr_prediction_z1_32xN_internal_neon(16, dstvec, left, upsample_left, dy);
2554   transpose16x32_neon(dstvec, d);
2555   for (int i = 0; i < 16; i++) {
2556     vst1q_u8(dst + 2 * i * stride, vreinterpretq_u8_u64(d[i].val[0]));
2557     vst1q_u8(dst + (2 * i + 1) * stride, vreinterpretq_u8_u64(d[i].val[1]));
2558   }
2559 }
2560 
2561 static void dr_prediction_z3_32x16_neon(uint8_t *dst, ptrdiff_t stride,
2562                                         const uint8_t *left, int upsample_left,
2563                                         int dy) {
2564   uint8x16_t dstvec[32];
2565   uint64x2_t d[16];
2566 
2567   dr_prediction_z1_HxW_internal_neon(16, 32, dstvec, left, upsample_left, dy);
2568   for (int i = 0; i < 32; i += 16) {
2569     transpose16x16_neon((dstvec + i), d);
2570     for (int j = 0; j < 16; j++) {
2571       vst1q_u8(dst + j * stride + i, vreinterpretq_u8_u64(d[j]));
2572     }
2573   }
2574 }
2575 
2576 static void dr_prediction_z3_32x64_neon(uint8_t *dst, ptrdiff_t stride,
2577                                         const uint8_t *left, int upsample_left,
2578                                         int dy) {
2579   uint8_t dstT[64 * 32];
2580 
2581   dr_prediction_z1_64xN_neon(32, dstT, 64, left, upsample_left, dy);
2582   transpose(dstT, 64, dst, stride, 32, 64);
2583 }
2584 
2585 static void dr_prediction_z3_64x32_neon(uint8_t *dst, ptrdiff_t stride,
2586                                         const uint8_t *left, int upsample_left,
2587                                         int dy) {
2588   uint8_t dstT[32 * 64];
2589 
2590   dr_prediction_z1_32xN_neon(64, dstT, 32, left, upsample_left, dy);
2591   transpose(dstT, 32, dst, stride, 64, 32);
2592 }
2593 
2594 static void dr_prediction_z3_16x64_neon(uint8_t *dst, ptrdiff_t stride,
2595                                         const uint8_t *left, int upsample_left,
2596                                         int dy) {
2597   uint8_t dstT[64 * 16];
2598 
2599   dr_prediction_z1_64xN_neon(16, dstT, 64, left, upsample_left, dy);
2600   transpose(dstT, 64, dst, stride, 16, 64);
2601 }
2602 
2603 static void dr_prediction_z3_64x16_neon(uint8_t *dst, ptrdiff_t stride,
2604                                         const uint8_t *left, int upsample_left,
2605                                         int dy) {
2606   uint8x16_t dstvec[64];
2607   uint64x2_t d[16];
2608 
2609   dr_prediction_z1_HxW_internal_neon(16, 64, dstvec, left, upsample_left, dy);
2610   for (int i = 0; i < 64; i += 16) {
2611     transpose16x16_neon((dstvec + i), d);
2612     for (int j = 0; j < 16; j++) {
2613       vst1q_u8(dst + j * stride + i, vreinterpretq_u8_u64(d[j]));
2614     }
2615   }
2616 }
2617 
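// Directional Z3 prediction only uses the 'left' column, so each kernel above
// computes the equivalent Z1 prediction along that column and then transposes
// the result into the destination: small and medium blocks transpose in
// registers, while the 64-pixel paths go through a temporary buffer.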
2618 void av1_dr_prediction_z3_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
2619                                const uint8_t *above, const uint8_t *left,
2620                                int upsample_left, int dx, int dy) {
2621   (void)above;
2622   (void)dx;
2623   assert(dx == 1);
2624   assert(dy > 0);
2625 
2626   if (bw == bh) {
2627     switch (bw) {
2628       case 4:
2629         dr_prediction_z3_4x4_neon(dst, stride, left, upsample_left, dy);
2630         break;
2631       case 8:
2632         dr_prediction_z3_8x8_neon(dst, stride, left, upsample_left, dy);
2633         break;
2634       case 16:
2635         dr_prediction_z3_16x16_neon(dst, stride, left, upsample_left, dy);
2636         break;
2637       case 32:
2638         dr_prediction_z3_32x32_neon(dst, stride, left, upsample_left, dy);
2639         break;
2640       case 64:
2641         dr_prediction_z3_64x64_neon(dst, stride, left, upsample_left, dy);
2642         break;
2643     }
2644   } else {
2645     if (bw < bh) {
2646       if (bw + bw == bh) {
2647         switch (bw) {
2648           case 4:
2649             dr_prediction_z3_4x8_neon(dst, stride, left, upsample_left, dy);
2650             break;
2651           case 8:
2652             dr_prediction_z3_8x16_neon(dst, stride, left, upsample_left, dy);
2653             break;
2654           case 16:
2655             dr_prediction_z3_16x32_neon(dst, stride, left, upsample_left, dy);
2656             break;
2657           case 32:
2658             dr_prediction_z3_32x64_neon(dst, stride, left, upsample_left, dy);
2659             break;
2660         }
2661       } else {
2662         switch (bw) {
2663           case 4:
2664             dr_prediction_z3_4x16_neon(dst, stride, left, upsample_left, dy);
2665             break;
2666           case 8:
2667             dr_prediction_z3_8x32_neon(dst, stride, left, upsample_left, dy);
2668             break;
2669           case 16:
2670             dr_prediction_z3_16x64_neon(dst, stride, left, upsample_left, dy);
2671             break;
2672         }
2673       }
2674     } else {
2675       if (bh + bh == bw) {
2676         switch (bh) {
2677           case 4:
2678             dr_prediction_z3_8x4_neon(dst, stride, left, upsample_left, dy);
2679             break;
2680           case 8:
2681             dr_prediction_z3_16x8_neon(dst, stride, left, upsample_left, dy);
2682             break;
2683           case 16:
2684             dr_prediction_z3_32x16_neon(dst, stride, left, upsample_left, dy);
2685             break;
2686           case 32:
2687             dr_prediction_z3_64x32_neon(dst, stride, left, upsample_left, dy);
2688             break;
2689         }
2690       } else {
2691         switch (bh) {
2692           case 4:
2693             dr_prediction_z3_16x4_neon(dst, stride, left, upsample_left, dy);
2694             break;
2695           case 8:
2696             dr_prediction_z3_32x8_neon(dst, stride, left, upsample_left, dy);
2697             break;
2698           case 16:
2699             dr_prediction_z3_64x16_neon(dst, stride, left, upsample_left, dy);
2700             break;
2701         }
2702       }
2703     }
2704   }
2705 }
2706 static const int sm_weight_log2_scale = 8;
2707 
2708 // max(block_size_wide[BLOCK_LARGEST], block_size_high[BLOCK_LARGEST])
2709 #define MAX_BLOCK_DIM 64
2710 
2711 /* clang-format off */
2712 static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = {
2713     // Unused, because we always offset by bs, which is at least 2.
2714     0, 0,
2715     // bs = 2
2716     255, 128,
2717     // bs = 4
2718     255, 149, 85, 64,
2719     // bs = 8
2720     255, 197, 146, 105, 73, 50, 37, 32,
2721     // bs = 16
2722     255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
2723     // bs = 32
2724     255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
2725     66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
2726     // bs = 64
2727     255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
2728     150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73,
2729     69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16,
2730     15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4,
2731 };
2732 /* clang-format on */
2733 
2734 // -----------------------------------------------------------------------------
2735 // SMOOTH_PRED
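//
// SMOOTH_PRED blends a vertical and a horizontal interpolation of the block
// borders. With scale = 1 << sm_weight_log2_scale = 256, each output pixel is
//
//   pred(x, y) = (w_h[y] * above[x] + (256 - w_h[y]) * left[bh - 1] +
//                 w_w[x] * left[y]  + (256 - w_w[x]) * above[bw - 1] +
//                 256) >> 9
//
// where w_h/w_w are the sm_weight_arrays entries for the block height/width.
// The kernels below keep pixels and weights widened to 16 bits and accumulate
// in 32 bits before the final narrowing shift.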
2736 
2737 // pixels[0]: above and below_pred interleave vector
2738 // pixels[1]: left vector
2739 // pixels[2]: right_pred vector
2740 static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
2741                                  int height, uint8x16_t *pixels) {
2742   uint32x4_t zero = vdupq_n_u32(0);
2743   const uint8x8_t d = vcreate_u8(((const uint32_t *)above)[0]);
2744   if (height == 4)
2745     pixels[1] =
2746         vreinterpretq_u8_u32(vld1q_lane_u32((const uint32_t *)left, zero, 0));
2747   else if (height == 8) {
2748     pixels[1] = vreinterpretq_u8_u64(vsetq_lane_u64(
2749         ((const uint64_t *)left)[0], vreinterpretq_u64_u32(zero), 0));
2750   } else {
2751     pixels[1] = vld1q_u8(left);
2752   }
2753 
2754   pixels[2] = vreinterpretq_u8_u16(vdupq_n_u16(above[3]));
2755 
2756   const uint16x8_t bp = vdupq_n_u16(left[height - 1]);
2757 #if defined(__aarch64__)
2758   pixels[0] = vreinterpretq_u8_u16(vzip1q_u16(vmovl_u8(d), bp));
2759 #else
2760   pixels[0] = vreinterpretq_u8_u16(vzipq_u16(vmovl_u8(d), bp).val[0]);
2761 #endif  // (__aarch64__)
2762 }
2763 
2764 // weight_h[0]: weight_h vector
2765 // weight_h[1]: scale - weight_h vector
2766 // weight_h[2]: same as [0], second half for height = 16 only
2767 // weight_h[3]: same as [1], second half for height = 16 only
2768 // weight_w[0]: weights_w and scale - weights_w interleave vector
2769 static INLINE void load_weight_w4(const uint8_t *weight_array, int height,
2770                                   uint16x8_t *weight_h, uint16x8_t *weight_w) {
2771   const uint16x8_t d = vdupq_n_u16((uint16_t)(1 << sm_weight_log2_scale));
2772   const uint8x8_t t = vcreate_u8(((const uint32_t *)(weight_array))[1]);
2773   weight_h[0] = vmovl_u8(t);
2774   weight_h[1] = vsubw_u8(d, t);
2775 #if defined(__aarch64__)
2776   weight_w[0] = vzip1q_u16(weight_h[0], weight_h[1]);
2777 #else
2778   weight_w[0] = vzipq_u16(weight_h[0], weight_h[1]).val[0];
2779 #endif  // (__aarch64__)
2780 
2781   if (height == 8) {
2782     const uint8x8_t weight = vld1_u8(&weight_array[8]);
2783     weight_h[0] = vmovl_u8(weight);
2784     weight_h[1] = vsubw_u8(d, weight);
2785   } else if (height == 16) {
2786     const uint8x16_t zero = vdupq_n_u8(0);
2787     const uint8x16_t weight = vld1q_u8(&weight_array[16]);
2788     const uint8x16x2_t weight_h_02 = vzipq_u8(weight, zero);
2789     weight_h[0] = vreinterpretq_u16_u8(weight_h_02.val[0]);
2790     weight_h[1] = vsubq_u16(d, vreinterpretq_u16_u8(weight_h_02.val[0]));
2791     weight_h[2] = vreinterpretq_u16_u8(weight_h_02.val[1]);
2792     weight_h[3] = vsubq_u16(d, vreinterpretq_u16_u8(weight_h_02.val[1]));
2793   }
2794 }
2795 
2796 static INLINE void smooth_pred_4xh(const uint8x16_t *pixel,
2797                                    const uint16x8_t *wh, const uint16x8_t *ww,
2798                                    int h, uint8_t *dst, ptrdiff_t stride,
2799                                    int second_half) {
2800   const uint16x4_t one = vdup_n_u16(1);
2801   const uint16x4_t inc = vdup_n_u16(0x202);
2802   uint16x4_t rep =
2803       second_half ? vdup_n_u16((uint16_t)0x8008) : vdup_n_u16((uint16_t)0x8000);
2804   uint16x4_t d = vdup_n_u16(0x100);
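  // Note on the table-lookup indexing (an interpretation of the constants
  // above): each u16 lane of 'rep' holds a byte index into pixel[1] (the left
  // column) in its low byte and 0x80 in its high byte; the out-of-range 0x80
  // makes the tbl lookup return 0, so 'b' ends up holding left[row]
  // zero-extended to 16 bits, and adding 1 per row steps to the next left
  // pixel. 'd' plays the same role for the row weights in wh[0]/wh[1],
  // selecting one u16 weight and stepping by two bytes (0x202) per row.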
2805   const uint16x4_t v_pixel_0_lo = vmovn_u32(vreinterpretq_u32_u8(pixel[0]));
2806   const uint16x4_t v_pixel_0_hi =
2807       vmovn_u32(vreinterpretq_u32_u8(vextq_u8(pixel[0], pixel[0], 2)));
2808   const uint16x4_t v_pixel_2 = vget_low_u16(vreinterpretq_u16_u8(pixel[2]));
2809   const uint16x4_t ww_0_lo = vmovn_u32(vreinterpretq_u32_u16(ww[0]));
2810   const uint16x4_t ww_0_hi =
2811       vmovn_u32(vreinterpretq_u32_u16(vextq_u16(ww[0], ww[0], 1)));
2812   const uint8x8_t save_mask = vcreate_u8(0 + (2 << 8) + (4 << 16) + (6 << 24));
2813 
2814 #if !defined(__aarch64__)
2815   const uint8x8x2_t v_split1 = { { vget_low_u8(vreinterpretq_u8_u16(wh[0])),
2816                                    vget_high_u8(
2817                                        vreinterpretq_u8_u16(wh[0])) } };
2818   const uint8x8x2_t v_split2 = { { vget_low_u8(vreinterpretq_u8_u16(wh[1])),
2819                                    vget_high_u8(
2820                                        vreinterpretq_u8_u16(wh[1])) } };
2821   const uint8x8x2_t v_split3 = { { vget_low_u8(pixel[1]),
2822                                    vget_high_u8(pixel[1]) } };
2823 #endif  // (__aarch64__)
2824 
2825   for (int i = 0; i < h; ++i) {
2826 #if defined(__aarch64__)
2827     const uint8x8_t wg =
2828         vqtbl1_u8(vreinterpretq_u8_u16(wh[0]), vreinterpret_u8_u16(d));
2829     const uint8x8_t sc =
2830         vqtbl1_u8(vreinterpretq_u8_u16(wh[1]), vreinterpret_u8_u16(d));
2831 #else
2832     const uint8x8_t wg = vtbl2_u8(v_split1, vreinterpret_u8_u16(d));
2833     const uint8x8_t sc = vtbl2_u8(v_split2, vreinterpret_u8_u16(d));
2834 #endif  // (__aarch64__)
2835 
2836     uint32x4_t sum = vmull_u16(v_pixel_0_lo, vreinterpret_u16_u8(wg));
2837     sum = vmlal_u16(sum, v_pixel_0_hi, vreinterpret_u16_u8(sc));
2838 
2839 #if defined(__aarch64__)
2840     uint8x8_t b = vqtbl1_u8(pixel[1], vreinterpret_u8_u16(rep));
2841 #else
2842     uint8x8_t b = vtbl2_u8(v_split3, vreinterpret_u8_u16(rep));
2843 #endif  // (__aarch64__)
2844 
2845     sum = vmlal_u16(sum, vreinterpret_u16_u8(b), ww_0_lo);
2846     sum = vmlal_u16(sum, v_pixel_2, ww_0_hi);
2847     uint8x8_t sum_l = vreinterpret_u8_u16(vqrshrn_n_u32(sum, 9));
2848     uint32x2_t predsh = vreinterpret_u32_u8(vtbl1_u8(sum_l, save_mask));
2849     vst1_lane_u32((uint32_t *)dst, predsh, 0);
2850 
2851     dst += stride;
2852 
2853     rep = vadd_u16(rep, one);
2854     d = vadd_u16(d, inc);
2855   }
2856 }
2857 
2858 void aom_smooth_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
2859                                    const uint8_t *above, const uint8_t *left) {
2860   uint8x16_t pixels[3];
2861   load_pixel_w4(above, left, 4, pixels);
2862 
2863   uint16x8_t wh[4], ww[2];
2864   load_weight_w4(sm_weight_arrays, 4, wh, ww);
2865 
2866   smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
2867 }
2868 
2869 void aom_smooth_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
2870                                    const uint8_t *above, const uint8_t *left) {
2871   uint8x16_t pixels[3];
2872   load_pixel_w4(above, left, 8, pixels);
2873 
2874   uint16x8_t wh[4], ww[2];
2875   load_weight_w4(sm_weight_arrays, 8, wh, ww);
2876 
2877   smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
2878 }
2879 
2880 void aom_smooth_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
2881                                     const uint8_t *above, const uint8_t *left) {
2882   uint8x16_t pixels[3];
2883   load_pixel_w4(above, left, 16, pixels);
2884 
2885   uint16x8_t wh[4], ww[2];
2886   load_weight_w4(sm_weight_arrays, 16, wh, ww);
2887 
2888   smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
2889   dst += stride << 3;
2890   smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);
2891 }
2892 
2893 // pixels[0]: above vector, widened to 16 bits
2894 // pixels[1]: below_pred (left[height - 1]) vector
2895 // pixels[2]: left vector
2896 // pixels[3]: right_pred (above[7]) vector
2897 // pixels[4]: copy of pixels[0] (height == 32 only)
2898 // pixels[5]: copy of pixels[1] (height == 32 only)
2899 // pixels[6]: left vector + 16 (height == 32 only)
2900 // pixels[7]: copy of pixels[3] (height == 32 only)
2901 static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
2902                                  int height, uint8x16_t *pixels) {
2903   pixels[0] = vreinterpretq_u8_u16(vmovl_u8(vld1_u8(above)));
2904   pixels[1] = vreinterpretq_u8_u16(vdupq_n_u16((uint16_t)left[height - 1]));
2905   pixels[3] = vreinterpretq_u8_u16(vdupq_n_u16((uint16_t)above[7]));
2906 
2907   if (height == 4) {
2908     const uint32x4_t zero32 = vdupq_n_u32(0);
2909     pixels[2] =
2910         vreinterpretq_u8_u32(vld1q_lane_u32((const uint32_t *)left, zero32, 0));
2911   } else if (height == 8) {
2912     const uint64x2_t zero64 = vdupq_n_u64(0);
2913     pixels[2] = vreinterpretq_u8_u64(
2914         vsetq_lane_u64(((const uint64_t *)left)[0], zero64, 0));
2915   } else if (height == 16) {
2916     pixels[2] = vld1q_u8(left);
2917   } else {
2918     pixels[2] = vld1q_u8(left);
2919     pixels[4] = pixels[0];
2920     pixels[5] = pixels[1];
2921     pixels[6] = vld1q_u8(left + 16);
2922     pixels[7] = pixels[3];
2923   }
2924 }
2925 
2926 // weight_h[0]: weight_h vector
2927 // weight_h[1]: scale - weight_h vector
2928 // weight_h[2]: same as [0], offset 8
2929 // weight_h[3]: same as [1], offset 8
2930 // weight_h[4]: same as [0], offset 16
2931 // weight_h[5]: same as [1], offset 16
2932 // weight_h[6]: same as [0], offset 24
2933 // weight_h[7]: same as [1], offset 24
2934 // weight_w[0]: weights_w and scale - weights_w interleave vector, first half
2935 // weight_w[1]: weights_w and scale - weights_w interleave vector, second half
2936 static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
2937                                   uint16x8_t *weight_h, uint16x8_t *weight_w) {
2938   const uint8x16_t zero = vdupq_n_u8(0);
2939   const int we_offset = height < 8 ? 4 : 8;
2940   uint8x16_t we = vld1q_u8(&weight_array[we_offset]);
2941 #if defined(__aarch64__)
2942   weight_h[0] = vreinterpretq_u16_u8(vzip1q_u8(we, zero));
2943 #else
2944   weight_h[0] = vreinterpretq_u16_u8(vzipq_u8(we, zero).val[0]);
2945 #endif  // (__aarch64__)
2946   const uint16x8_t d = vdupq_n_u16(256);
2947   weight_h[1] = vsubq_u16(d, weight_h[0]);
2948 
2949   if (height == 4) {
2950     we = vextq_u8(we, zero, 4);
2951 #if defined(__aarch64__)
2952     weight_w[0] = vreinterpretq_u16_u8(vzip1q_u8(we, zero));
2953 #else
2954     weight_w[0] = vmovl_u8(vget_low_u8(we));
2955 #endif  // (__aarch64__)
2956     weight_w[1] = vsubq_u16(d, weight_w[0]);
2957   } else {
2958     weight_w[0] = weight_h[0];
2959     weight_w[1] = weight_h[1];
2960   }
2961 
2962   if (height == 16) {
2963     we = vld1q_u8(&weight_array[16]);
2964     const uint8x16x2_t weight_h_02 = vzipq_u8(we, zero);
2965     weight_h[0] = vreinterpretq_u16_u8(weight_h_02.val[0]);
2966     weight_h[1] = vsubq_u16(d, weight_h[0]);
2967     weight_h[2] = vreinterpretq_u16_u8(weight_h_02.val[1]);
2968     weight_h[3] = vsubq_u16(d, weight_h[2]);
2969   } else if (height == 32) {
2970     const uint8x16_t weight_lo = vld1q_u8(&weight_array[32]);
2971     const uint8x16x2_t weight_h_02 = vzipq_u8(weight_lo, zero);
2972     weight_h[0] = vreinterpretq_u16_u8(weight_h_02.val[0]);
2973     weight_h[1] = vsubq_u16(d, weight_h[0]);
2974     weight_h[2] = vreinterpretq_u16_u8(weight_h_02.val[1]);
2975     weight_h[3] = vsubq_u16(d, weight_h[2]);
2976     const uint8x16_t weight_hi = vld1q_u8(&weight_array[32 + 16]);
2977     const uint8x16x2_t weight_h_46 = vzipq_u8(weight_hi, zero);
2978     weight_h[4] = vreinterpretq_u16_u8(weight_h_46.val[0]);
2979     weight_h[5] = vsubq_u16(d, weight_h[4]);
2980     weight_h[6] = vreinterpretq_u16_u8(weight_h_46.val[1]);
2981     weight_h[7] = vsubq_u16(d, weight_h[6]);
2982   }
2983 }
2984 
2985 static INLINE void smooth_pred_8xh(const uint8x16_t *pixels,
2986                                    const uint16x8_t *wh, const uint16x8_t *ww,
2987                                    int h, uint8_t *dst, ptrdiff_t stride,
2988                                    int second_half) {
2989   const uint16x8_t one = vdupq_n_u16(1);
2990   const uint16x8_t inc = vdupq_n_u16(0x202);
2991   uint16x8_t rep = second_half ? vdupq_n_u16((uint16_t)0x8008)
2992                                : vdupq_n_u16((uint16_t)0x8000);
2993   uint16x8_t d = vdupq_n_u16(0x100);
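  // 'rep' and 'd' are per-row byte indices for the tbl lookups, used in the
  // same way as in smooth_pred_4xh: 'rep' selects the current left pixel from
  // pixels[2] and 'd' selects the current row weight from wh[0]/wh[1].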
2994 
2995 #if !defined(__aarch64__)
2996   const uint8x8x2_t v_split1 = { { vget_low_u8(vreinterpretq_u8_u16(wh[0])),
2997                                    vget_high_u8(
2998                                        vreinterpretq_u8_u16(wh[0])) } };
2999   const uint8x8x2_t v_split2 = { { vget_low_u8(vreinterpretq_u8_u16(wh[1])),
3000                                    vget_high_u8(
3001                                        vreinterpretq_u8_u16(wh[1])) } };
3002   const uint8x8x2_t v_split3 = { { vget_low_u8(pixels[2]),
3003                                    vget_high_u8(pixels[2]) } };
3004 #endif
3005 
3006   for (int i = 0; i < h; ++i) {
3007 #if defined(__aarch64__)
3008     const uint8x16_t wg_wg =
3009         vqtbl1q_u8(vreinterpretq_u8_u16(wh[0]), vreinterpretq_u8_u16(d));
3010     const uint8x16_t sc_sc =
3011         vqtbl1q_u8(vreinterpretq_u8_u16(wh[1]), vreinterpretq_u8_u16(d));
3012 #else
3013     const uint8x8_t v_d_lo = vreinterpret_u8_u16(vget_low_u16(d));
3014     const uint8x8_t v_d_hi = vreinterpret_u8_u16(vget_high_u16(d));
3015     const uint8x16_t wg_wg =
3016         vcombine_u8(vtbl2_u8(v_split1, v_d_lo), vtbl2_u8(v_split1, v_d_hi));
3017     const uint8x16_t sc_sc =
3018         vcombine_u8(vtbl2_u8(v_split2, v_d_lo), vtbl2_u8(v_split2, v_d_hi));
3019 #endif  // (__aarch64__)
3020     uint16x8_t s01 =
3021         vmulq_u16(vreinterpretq_u16_u8(pixels[0]), vreinterpretq_u16_u8(wg_wg));
3022     s01 = vmlaq_u16(s01, vreinterpretq_u16_u8(pixels[1]),
3023                     vreinterpretq_u16_u8(sc_sc));
3024 #if defined(__aarch64__)
3025     const uint8x16_t b = vqtbl1q_u8(pixels[2], vreinterpretq_u8_u16(rep));
3026 #else
3027     const uint8x16_t b = vcombine_u8(
3028         vtbl2_u8(v_split3, vget_low_u8(vreinterpretq_u8_u16(rep))),
3029         vtbl2_u8(v_split3, vget_high_u8(vreinterpretq_u8_u16(rep))));
3030 #endif  // (__aarch64__)
3031     uint16x8_t sum0 = vmulq_u16(vreinterpretq_u16_u8(b), ww[0]);
3032     sum0 = vmlaq_u16(sum0, vreinterpretq_u16_u8(pixels[3]), ww[1]);
3033 
3034     uint32x4_t s0 = vaddl_u16(vget_low_u16(s01), vget_low_u16(sum0));
3035 #if defined(__aarch64__)
3036     uint32x4_t s1 = vaddl_high_u16(s01, sum0);
3037 #else
3038     uint32x4_t s1 = vaddl_u16(vget_high_u16(s01), vget_high_u16(sum0));
3039 #endif  // (__aarch64__)
3040 
3041     sum0 = vcombine_u16(vqrshrn_n_u32(s0, 9), vqrshrn_n_u32(s1, 9));
3042     uint8x8_t predsh = vqmovn_u16(sum0);
3043     vst1_u8(dst, predsh);
3044 
3045     dst += stride;
3046     rep = vaddq_u16(rep, one);
3047     d = vaddq_u16(d, inc);
3048   }
3049 }
3050 
3051 void aom_smooth_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
3052                                    const uint8_t *above, const uint8_t *left) {
3053   uint8x16_t pixels[4];
3054   load_pixel_w8(above, left, 4, pixels);
3055 
3056   uint16x8_t wh[4], ww[2];
3057   load_weight_w8(sm_weight_arrays, 4, wh, ww);
3058 
3059   smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
3060 }
3061 
3062 void aom_smooth_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
3063                                    const uint8_t *above, const uint8_t *left) {
3064   uint8x16_t pixels[4];
3065   load_pixel_w8(above, left, 8, pixels);
3066 
3067   uint16x8_t wh[4], ww[2];
3068   load_weight_w8(sm_weight_arrays, 8, wh, ww);
3069 
3070   smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
3071 }
3072 
3073 void aom_smooth_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
3074                                     const uint8_t *above, const uint8_t *left) {
3075   uint8x16_t pixels[4];
3076   load_pixel_w8(above, left, 16, pixels);
3077 
3078   uint16x8_t wh[4], ww[2];
3079   load_weight_w8(sm_weight_arrays, 16, wh, ww);
3080 
3081   smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
3082   dst += stride << 3;
3083   smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
3084 }
3085 
3086 void aom_smooth_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
3087                                     const uint8_t *above, const uint8_t *left) {
3088   uint8x16_t pixels[8];
3089   load_pixel_w8(above, left, 32, pixels);
3090 
3091   uint16x8_t wh[8], ww[2];
3092   load_weight_w8(sm_weight_arrays, 32, wh, ww);
3093 
3094   smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
3095   dst += stride << 3;
3096   smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
3097   dst += stride << 3;
3098   smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
3099   dst += stride << 3;
3100   smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
3101 }
3102 
3103 static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
3104                                         const uint8_t *above,
3105                                         const uint8_t *left, uint32_t bw,
3106                                         uint32_t bh) {
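  // Generic path for the wider blocks (called with bw of 16, 32 or 64 below):
  // process 8 output pixels per iteration, widening the weighted sums to
  // 32 bits before the +256 and >> 9 rounding described in the SMOOTH_PRED
  // comment above.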
3107   const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
3108   const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
3109   const uint16x8_t scale_value = vdupq_n_u16(256);
3110 
3111   for (uint32_t y = 0; y < bh; ++y) {
3112     const uint8x8_t left_y = vdup_n_u8(left[y]);
3113     const uint8x8_t weights_y_dup = vdup_n_u8(sm_weights_h[y]);
3114     const uint32x4_t pred_scaled_bl =
3115         vdupq_n_u32(256 + (256 - sm_weights_h[y]) * left[bh - 1]);
3116 
3117     for (uint32_t x = 0; x < bw; x += 8) {
3118       const uint8x8_t weights_x = vld1_u8(sm_weights_w + x);
3119       const uint8x8_t top_x = vld1_u8(above + x);
3120 
3121       uint16x8_t pred_m1, pred_m2;
3122       uint32x4_t pred_lo, pred_hi;
3123       pred_m1 = vmull_u8(top_x, weights_y_dup);
3124       pred_m2 = vmull_u8(weights_x, left_y);
3125 
3126       pred_lo = vaddl_u16(vget_low_u16(pred_m1), vget_low_u16(pred_m2));
3127 #if defined(__aarch64__)
3128       pred_hi = vaddl_high_u16(pred_m1, pred_m2);
3129 #else
3130       pred_hi = vaddl_u16(vget_high_u16(pred_m1), vget_high_u16(pred_m2));
3131 #endif  // (__aarch64__)
3132 
3133       const uint16x8_t scale_m_weights_x = vsubw_u8(scale_value, weights_x);
3134 
3135       const uint16x8_t swxtr = vmulq_n_u16(scale_m_weights_x, above[bw - 1]);
3136 
3137       pred_lo = vaddq_u32(pred_lo, pred_scaled_bl);
3138       pred_hi = vaddq_u32(pred_hi, pred_scaled_bl);
3139 
3140       pred_lo = vaddw_u16(pred_lo, vget_low_u16(swxtr));
3141 #if defined(__aarch64__)
3142       pred_hi = vaddw_high_u16(pred_hi, swxtr);
3143 #else
3144       pred_hi = vaddw_u16(pred_hi, vget_high_u16(swxtr));
3145 #endif  // (__aarch64__)
3146 
3147       uint16x8_t pred =
3148           vcombine_u16(vshrn_n_u32(pred_lo, 9), vshrn_n_u32(pred_hi, 9));
3149 
3150       uint8x8_t predsh = vqmovn_u16(pred);
3151 
3152       vst1_u8(dst + x, predsh);
3153     }
3154 
3155     dst += stride;
3156   }
3157 }
3158 
3159 void aom_smooth_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
3160                                     const uint8_t *above, const uint8_t *left) {
3161   smooth_predictor_wxh(dst, stride, above, left, 16, 4);
3162 }
3163 
3164 void aom_smooth_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
3165                                     const uint8_t *above, const uint8_t *left) {
3166   smooth_predictor_wxh(dst, stride, above, left, 16, 8);
3167 }
3168 
3169 void aom_smooth_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
3170                                      const uint8_t *above,
3171                                      const uint8_t *left) {
3172   smooth_predictor_wxh(dst, stride, above, left, 16, 16);
3173 }
3174 
3175 void aom_smooth_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
3176                                      const uint8_t *above,
3177                                      const uint8_t *left) {
3178   smooth_predictor_wxh(dst, stride, above, left, 16, 32);
3179 }
3180 
3181 void aom_smooth_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
3182                                     const uint8_t *above, const uint8_t *left) {
3183   smooth_predictor_wxh(dst, stride, above, left, 32, 8);
3184 }
3185 
3186 void aom_smooth_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
3187                                      const uint8_t *above,
3188                                      const uint8_t *left) {
3189   smooth_predictor_wxh(dst, stride, above, left, 32, 16);
3190 }
3191 
3192 void aom_smooth_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
3193                                      const uint8_t *above,
3194                                      const uint8_t *left) {
3195   smooth_predictor_wxh(dst, stride, above, left, 32, 32);
3196 }
3197 
3198 void aom_smooth_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
3199                                      const uint8_t *above,
3200                                      const uint8_t *left) {
3201   smooth_predictor_wxh(dst, stride, above, left, 32, 64);
3202 }
3203 
3204 void aom_smooth_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
3205                                      const uint8_t *above,
3206                                      const uint8_t *left) {
3207   smooth_predictor_wxh(dst, stride, above, left, 64, 64);
3208 }
3209 
3210 void aom_smooth_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
3211                                      const uint8_t *above,
3212                                      const uint8_t *left) {
3213   smooth_predictor_wxh(dst, stride, above, left, 64, 32);
3214 }
3215 
3216 void aom_smooth_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
3217                                      const uint8_t *above,
3218                                      const uint8_t *left) {
3219   smooth_predictor_wxh(dst, stride, above, left, 64, 16);
3220 }
3221 
3222 void aom_smooth_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
3223                                      const uint8_t *above,
3224                                      const uint8_t *left) {
3225   smooth_predictor_wxh(dst, stride, above, left, 16, 64);
3226 }
3227