1 /*
2  *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <arm_neon.h>
12 #include "./vpx_config.h"
13 #include "./vpx_dsp_rtcd.h"
14 #include "vpx_dsp/arm/transpose_neon.h"
15 
// Naming convention for the static inline helpers below: functions ending in
// '_8' process 8 samples at a time, and functions ending in '_16' process 16
// samples at a time.
19 
20 #define FUN_LOAD_THRESH(w, r)                                             \
21   static INLINE void load_thresh_##w(                                     \
22       const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, \
23       uint8x##w##_t *blimit_vec, uint8x##w##_t *limit_vec,                \
24       uint8x##w##_t *thresh_vec) {                                        \
25     *blimit_vec = vld1##r##dup_u8(blimit);                                \
26     *limit_vec = vld1##r##dup_u8(limit);                                  \
27     *thresh_vec = vld1##r##dup_u8(thresh);                                \
28   }
29 
30 FUN_LOAD_THRESH(8, _)    // load_thresh_8
31 FUN_LOAD_THRESH(16, q_)  // load_thresh_16
32 #undef FUN_LOAD_THRESH
33 
load_thresh_8_dual(const uint8_t * blimit0,const uint8_t * limit0,const uint8_t * thresh0,const uint8_t * blimit1,const uint8_t * limit1,const uint8_t * thresh1,uint8x16_t * blimit_vec,uint8x16_t * limit_vec,uint8x16_t * thresh_vec)34 static INLINE void load_thresh_8_dual(
35     const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0,
36     const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1,
37     uint8x16_t *blimit_vec, uint8x16_t *limit_vec, uint8x16_t *thresh_vec) {
38   *blimit_vec = vcombine_u8(vld1_dup_u8(blimit0), vld1_dup_u8(blimit1));
39   *limit_vec = vcombine_u8(vld1_dup_u8(limit0), vld1_dup_u8(limit1));
40   *thresh_vec = vcombine_u8(vld1_dup_u8(thresh0), vld1_dup_u8(thresh1));
41 }
42 
43 // Here flat is 64-bit long, with each 8-bit (or 4-bit) chunk being a mask of a
44 // pixel. When used to control filter branches, we only detect whether it is all
45 // 0s or all 1s. We pairwise add flat to a 32-bit long number flat_status.
46 // flat equals 0 if and only if flat_status equals 0.
47 // flat equals -1 (all 1s) if and only if flat_status equals -2. (This is true
48 // because each mask occupies more than 1 bit.)
calc_flat_status_8(uint8x8_t flat)49 static INLINE uint32_t calc_flat_status_8(uint8x8_t flat) {
50   return vget_lane_u32(
51       vreinterpret_u32_u64(vpaddl_u32(vreinterpret_u32_u8(flat))), 0);
52 }
53 
54 // Here flat is 128-bit long, with each 8-bit chunk being a mask of a pixel.
55 // When used to control filter branches, we only detect whether it is all 0s or
56 // all 1s. We narrowing shift right each 16-bit chunk by 4 arithmetically, so
57 // we get a 64-bit long number, with each 4-bit chunk being a mask of a pixel.
58 // Then we pairwise add flat to a 32-bit long number flat_status.
59 // flat equals 0 if and only if flat_status equals 0.
60 // flat equals -1 (all 1s) if and only if flat_status equals -2. (This is true
61 // because each mask occupies more than 1 bit.)
calc_flat_status_16(uint8x16_t flat)62 static INLINE uint32_t calc_flat_status_16(uint8x16_t flat) {
63   const uint8x8_t flat_4bit =
64       vreinterpret_u8_s8(vshrn_n_s16(vreinterpretq_s16_u8(flat), 4));
65   return calc_flat_status_8(flat_4bit);
66 }
67 
68 #define FUN_FILTER_HEV_MASK4(w, r)                                            \
69   static INLINE uint8x##w##_t filter_hev_mask4_##w(                           \
70       const uint8x##w##_t limit, const uint8x##w##_t blimit,                  \
71       const uint8x##w##_t thresh, const uint8x##w##_t p3,                     \
72       const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \
73       const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \
74       const uint8x##w##_t q3, uint8x##w##_t *hev, uint8x##w##_t *mask) {      \
75     uint8x##w##_t max, t0, t1;                                                \
76                                                                               \
77     max = vabd##r##u8(p1, p0);                                                \
78     max = vmax##r##u8(max, vabd##r##u8(q1, q0));                              \
79     *hev = vcgt##r##u8(max, thresh);                                          \
80     *mask = vmax##r##u8(max, vabd##r##u8(p3, p2));                            \
81     *mask = vmax##r##u8(*mask, vabd##r##u8(p2, p1));                          \
82     *mask = vmax##r##u8(*mask, vabd##r##u8(q2, q1));                          \
83     *mask = vmax##r##u8(*mask, vabd##r##u8(q3, q2));                          \
84     t0 = vabd##r##u8(p0, q0);                                                 \
85     t1 = vabd##r##u8(p1, q1);                                                 \
86     t0 = vqadd##r##u8(t0, t0);                                                \
87     t1 = vshr##r##n_u8(t1, 1);                                                \
88     t0 = vqadd##r##u8(t0, t1);                                                \
89     *mask = vcle##r##u8(*mask, limit);                                        \
90     t0 = vcle##r##u8(t0, blimit);                                             \
91     *mask = vand##r##u8(*mask, t0);                                           \
92                                                                               \
93     return max;                                                               \
94   }
95 
96 FUN_FILTER_HEV_MASK4(8, _)    // filter_hev_mask4_8
97 FUN_FILTER_HEV_MASK4(16, q_)  // filter_hev_mask4_16
98 #undef FUN_FILTER_HEV_MASK4
99 
100 #define FUN_FILTER_FLAT_HEV_MASK(w, r)                                        \
101   static INLINE uint8x##w##_t filter_flat_hev_mask_##w(                       \
102       const uint8x##w##_t limit, const uint8x##w##_t blimit,                  \
103       const uint8x##w##_t thresh, const uint8x##w##_t p3,                     \
104       const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \
105       const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \
106       const uint8x##w##_t q3, uint8x##w##_t *flat, uint32_t *flat_status,     \
107       uint8x##w##_t *hev) {                                                   \
108     uint8x##w##_t max, mask;                                                  \
109                                                                               \
110     max = filter_hev_mask4_##w(limit, blimit, thresh, p3, p2, p1, p0, q0, q1, \
111                                q2, q3, hev, &mask);                           \
112     *flat = vmax##r##u8(max, vabd##r##u8(p2, p0));                            \
113     *flat = vmax##r##u8(*flat, vabd##r##u8(q2, q0));                          \
114     *flat = vmax##r##u8(*flat, vabd##r##u8(p3, p0));                          \
115     *flat = vmax##r##u8(*flat, vabd##r##u8(q3, q0));                          \
116     *flat = vcle##r##u8(*flat, vdup##r##n_u8(1)); /* flat_mask4() */          \
117     *flat = vand##r##u8(*flat, mask);                                         \
118     *flat_status = calc_flat_status_##w(*flat);                               \
119                                                                               \
120     return mask;                                                              \
121   }
122 
123 FUN_FILTER_FLAT_HEV_MASK(8, _)    // filter_flat_hev_mask_8
124 FUN_FILTER_FLAT_HEV_MASK(16, q_)  // filter_flat_hev_mask_16
125 #undef FUN_FILTER_FLAT_HEV_MASK
126 
127 #define FUN_FLAT_MASK5(w, r)                                                  \
128   static INLINE uint8x##w##_t flat_mask5_##w(                                 \
129       const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \
130       const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \
131       const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \
132       const uint8x##w##_t q4, const uint8x##w##_t flat,                       \
133       uint32_t *flat2_status) {                                               \
134     uint8x##w##_t flat2 = vabd##r##u8(p4, p0);                                \
135     flat2 = vmax##r##u8(flat2, vabd##r##u8(p3, p0));                          \
136     flat2 = vmax##r##u8(flat2, vabd##r##u8(p2, p0));                          \
137     flat2 = vmax##r##u8(flat2, vabd##r##u8(p1, p0));                          \
138     flat2 = vmax##r##u8(flat2, vabd##r##u8(q1, q0));                          \
139     flat2 = vmax##r##u8(flat2, vabd##r##u8(q2, q0));                          \
140     flat2 = vmax##r##u8(flat2, vabd##r##u8(q3, q0));                          \
141     flat2 = vmax##r##u8(flat2, vabd##r##u8(q4, q0));                          \
142     flat2 = vcle##r##u8(flat2, vdup##r##n_u8(1));                             \
143     flat2 = vand##r##u8(flat2, flat);                                         \
144     *flat2_status = calc_flat_status_##w(flat2);                              \
145                                                                               \
146     return flat2;                                                             \
147   }
148 
149 FUN_FLAT_MASK5(8, _)    // flat_mask5_8
150 FUN_FLAT_MASK5(16, q_)  // flat_mask5_16
151 #undef FUN_FLAT_MASK5
152 
153 #define FUN_FLIP_SIGN(w, r)                                         \
154   static INLINE int8x##w##_t flip_sign_##w(const uint8x##w##_t v) { \
155     const uint8x##w##_t sign_bit = vdup##r##n_u8(0x80);             \
156     return vreinterpret##r##s8_u8(veor##r##u8(v, sign_bit));        \
157   }
158 
159 FUN_FLIP_SIGN(8, _)    // flip_sign_8
160 FUN_FLIP_SIGN(16, q_)  // flip_sign_16
161 #undef FUN_FLIP_SIGN
162 
163 #define FUN_FLIP_SIGN_BACK(w, r)                                         \
164   static INLINE uint8x##w##_t flip_sign_back_##w(const int8x##w##_t v) { \
165     const int8x##w##_t sign_bit = vdup##r##n_s8(0x80);                   \
166     return vreinterpret##r##u8_s8(veor##r##s8(v, sign_bit));             \
167   }
168 
169 FUN_FLIP_SIGN_BACK(8, _)    // flip_sign_back_8
170 FUN_FLIP_SIGN_BACK(16, q_)  // flip_sign_back_16
171 #undef FUN_FLIP_SIGN_BACK
172 
filter_update_8(const uint8x8_t sub0,const uint8x8_t sub1,const uint8x8_t add0,const uint8x8_t add1,uint16x8_t * sum)173 static INLINE void filter_update_8(const uint8x8_t sub0, const uint8x8_t sub1,
174                                    const uint8x8_t add0, const uint8x8_t add1,
175                                    uint16x8_t *sum) {
176   *sum = vsubw_u8(*sum, sub0);
177   *sum = vsubw_u8(*sum, sub1);
178   *sum = vaddw_u8(*sum, add0);
179   *sum = vaddw_u8(*sum, add1);
180 }
181 
filter_update_16(const uint8x16_t sub0,const uint8x16_t sub1,const uint8x16_t add0,const uint8x16_t add1,uint16x8_t * sum0,uint16x8_t * sum1)182 static INLINE void filter_update_16(const uint8x16_t sub0,
183                                     const uint8x16_t sub1,
184                                     const uint8x16_t add0,
185                                     const uint8x16_t add1, uint16x8_t *sum0,
186                                     uint16x8_t *sum1) {
187   *sum0 = vsubw_u8(*sum0, vget_low_u8(sub0));
188   *sum1 = vsubw_u8(*sum1, vget_high_u8(sub0));
189   *sum0 = vsubw_u8(*sum0, vget_low_u8(sub1));
190   *sum1 = vsubw_u8(*sum1, vget_high_u8(sub1));
191   *sum0 = vaddw_u8(*sum0, vget_low_u8(add0));
192   *sum1 = vaddw_u8(*sum1, vget_high_u8(add0));
193   *sum0 = vaddw_u8(*sum0, vget_low_u8(add1));
194   *sum1 = vaddw_u8(*sum1, vget_high_u8(add1));
195 }
196 
calc_7_tap_filter_8_kernel(const uint8x8_t sub0,const uint8x8_t sub1,const uint8x8_t add0,const uint8x8_t add1,uint16x8_t * sum)197 static INLINE uint8x8_t calc_7_tap_filter_8_kernel(const uint8x8_t sub0,
198                                                    const uint8x8_t sub1,
199                                                    const uint8x8_t add0,
200                                                    const uint8x8_t add1,
201                                                    uint16x8_t *sum) {
202   filter_update_8(sub0, sub1, add0, add1, sum);
203   return vrshrn_n_u16(*sum, 3);
204 }
205 
calc_7_tap_filter_16_kernel(const uint8x16_t sub0,const uint8x16_t sub1,const uint8x16_t add0,const uint8x16_t add1,uint16x8_t * sum0,uint16x8_t * sum1)206 static INLINE uint8x16_t calc_7_tap_filter_16_kernel(
207     const uint8x16_t sub0, const uint8x16_t sub1, const uint8x16_t add0,
208     const uint8x16_t add1, uint16x8_t *sum0, uint16x8_t *sum1) {
209   filter_update_16(sub0, sub1, add0, add1, sum0, sum1);
210   return vcombine_u8(vrshrn_n_u16(*sum0, 3), vrshrn_n_u16(*sum1, 3));
211 }
212 
apply_15_tap_filter_8_kernel(const uint8x8_t flat,const uint8x8_t sub0,const uint8x8_t sub1,const uint8x8_t add0,const uint8x8_t add1,const uint8x8_t in,uint16x8_t * sum)213 static INLINE uint8x8_t apply_15_tap_filter_8_kernel(
214     const uint8x8_t flat, const uint8x8_t sub0, const uint8x8_t sub1,
215     const uint8x8_t add0, const uint8x8_t add1, const uint8x8_t in,
216     uint16x8_t *sum) {
217   filter_update_8(sub0, sub1, add0, add1, sum);
218   return vbsl_u8(flat, vrshrn_n_u16(*sum, 4), in);
219 }
220 
apply_15_tap_filter_16_kernel(const uint8x16_t flat,const uint8x16_t sub0,const uint8x16_t sub1,const uint8x16_t add0,const uint8x16_t add1,const uint8x16_t in,uint16x8_t * sum0,uint16x8_t * sum1)221 static INLINE uint8x16_t apply_15_tap_filter_16_kernel(
222     const uint8x16_t flat, const uint8x16_t sub0, const uint8x16_t sub1,
223     const uint8x16_t add0, const uint8x16_t add1, const uint8x16_t in,
224     uint16x8_t *sum0, uint16x8_t *sum1) {
225   uint8x16_t t;
226   filter_update_16(sub0, sub1, add0, add1, sum0, sum1);
227   t = vcombine_u8(vrshrn_n_u16(*sum0, 4), vrshrn_n_u16(*sum1, 4));
228   return vbslq_u8(flat, t, in);
229 }
230 
231 // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
calc_7_tap_filter_8(const uint8x8_t p3,const uint8x8_t p2,const uint8x8_t p1,const uint8x8_t p0,const uint8x8_t q0,const uint8x8_t q1,const uint8x8_t q2,const uint8x8_t q3,uint8x8_t * op2,uint8x8_t * op1,uint8x8_t * op0,uint8x8_t * oq0,uint8x8_t * oq1,uint8x8_t * oq2)232 static INLINE void calc_7_tap_filter_8(const uint8x8_t p3, const uint8x8_t p2,
233                                        const uint8x8_t p1, const uint8x8_t p0,
234                                        const uint8x8_t q0, const uint8x8_t q1,
235                                        const uint8x8_t q2, const uint8x8_t q3,
236                                        uint8x8_t *op2, uint8x8_t *op1,
237                                        uint8x8_t *op0, uint8x8_t *oq0,
238                                        uint8x8_t *oq1, uint8x8_t *oq2) {
239   uint16x8_t sum;
240   sum = vaddl_u8(p3, p3);   // 2*p3
241   sum = vaddw_u8(sum, p3);  // 3*p3
242   sum = vaddw_u8(sum, p2);  // 3*p3+p2
243   sum = vaddw_u8(sum, p2);  // 3*p3+2*p2
244   sum = vaddw_u8(sum, p1);  // 3*p3+2*p2+p1
245   sum = vaddw_u8(sum, p0);  // 3*p3+2*p2+p1+p0
246   sum = vaddw_u8(sum, q0);  // 3*p3+2*p2+p1+p0+q0
247   *op2 = vrshrn_n_u16(sum, 3);
248   *op1 = calc_7_tap_filter_8_kernel(p3, p2, p1, q1, &sum);
249   *op0 = calc_7_tap_filter_8_kernel(p3, p1, p0, q2, &sum);
250   *oq0 = calc_7_tap_filter_8_kernel(p3, p0, q0, q3, &sum);
251   *oq1 = calc_7_tap_filter_8_kernel(p2, q0, q1, q3, &sum);
252   *oq2 = calc_7_tap_filter_8_kernel(p1, q1, q2, q3, &sum);
253 }
254 
calc_7_tap_filter_16(const uint8x16_t p3,const uint8x16_t p2,const uint8x16_t p1,const uint8x16_t p0,const uint8x16_t q0,const uint8x16_t q1,const uint8x16_t q2,const uint8x16_t q3,uint8x16_t * op2,uint8x16_t * op1,uint8x16_t * op0,uint8x16_t * oq0,uint8x16_t * oq1,uint8x16_t * oq2)255 static INLINE void calc_7_tap_filter_16(
256     const uint8x16_t p3, const uint8x16_t p2, const uint8x16_t p1,
257     const uint8x16_t p0, const uint8x16_t q0, const uint8x16_t q1,
258     const uint8x16_t q2, const uint8x16_t q3, uint8x16_t *op2, uint8x16_t *op1,
259     uint8x16_t *op0, uint8x16_t *oq0, uint8x16_t *oq1, uint8x16_t *oq2) {
260   uint16x8_t sum0, sum1;
261   sum0 = vaddl_u8(vget_low_u8(p3), vget_low_u8(p3));    // 2*p3
262   sum1 = vaddl_u8(vget_high_u8(p3), vget_high_u8(p3));  // 2*p3
263   sum0 = vaddw_u8(sum0, vget_low_u8(p3));               // 3*p3
264   sum1 = vaddw_u8(sum1, vget_high_u8(p3));              // 3*p3
265   sum0 = vaddw_u8(sum0, vget_low_u8(p2));               // 3*p3+p2
266   sum1 = vaddw_u8(sum1, vget_high_u8(p2));              // 3*p3+p2
267   sum0 = vaddw_u8(sum0, vget_low_u8(p2));               // 3*p3+2*p2
268   sum1 = vaddw_u8(sum1, vget_high_u8(p2));              // 3*p3+2*p2
269   sum0 = vaddw_u8(sum0, vget_low_u8(p1));               // 3*p3+2*p2+p1
270   sum1 = vaddw_u8(sum1, vget_high_u8(p1));              // 3*p3+2*p2+p1
271   sum0 = vaddw_u8(sum0, vget_low_u8(p0));               // 3*p3+2*p2+p1+p0
272   sum1 = vaddw_u8(sum1, vget_high_u8(p0));              // 3*p3+2*p2+p1+p0
273   sum0 = vaddw_u8(sum0, vget_low_u8(q0));               // 3*p3+2*p2+p1+p0+q0
274   sum1 = vaddw_u8(sum1, vget_high_u8(q0));              // 3*p3+2*p2+p1+p0+q0
275   *op2 = vcombine_u8(vrshrn_n_u16(sum0, 3), vrshrn_n_u16(sum1, 3));
276   *op1 = calc_7_tap_filter_16_kernel(p3, p2, p1, q1, &sum0, &sum1);
277   *op0 = calc_7_tap_filter_16_kernel(p3, p1, p0, q2, &sum0, &sum1);
278   *oq0 = calc_7_tap_filter_16_kernel(p3, p0, q0, q3, &sum0, &sum1);
279   *oq1 = calc_7_tap_filter_16_kernel(p2, q0, q1, q3, &sum0, &sum1);
280   *oq2 = calc_7_tap_filter_16_kernel(p1, q1, q2, q3, &sum0, &sum1);
281 }
282 
283 #define FUN_APPLY_7_TAP_FILTER(w, r)                                          \
284   static INLINE void apply_7_tap_filter_##w(                                  \
285       const uint8x##w##_t flat, const uint8x##w##_t p3,                       \
286       const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \
287       const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \
288       const uint8x##w##_t q3, uint8x##w##_t *op2, uint8x##w##_t *op1,         \
289       uint8x##w##_t *op0, uint8x##w##_t *oq0, uint8x##w##_t *oq1,             \
290       uint8x##w##_t *oq2) {                                                   \
291     uint8x##w##_t tp1, tp0, tq0, tq1;                                         \
292     calc_7_tap_filter_##w(p3, p2, p1, p0, q0, q1, q2, q3, op2, &tp1, &tp0,    \
293                           &tq0, &tq1, oq2);                                   \
294     *op2 = vbsl##r##u8(flat, *op2, p2);                                       \
295     *op1 = vbsl##r##u8(flat, tp1, *op1);                                      \
296     *op0 = vbsl##r##u8(flat, tp0, *op0);                                      \
297     *oq0 = vbsl##r##u8(flat, tq0, *oq0);                                      \
298     *oq1 = vbsl##r##u8(flat, tq1, *oq1);                                      \
299     *oq2 = vbsl##r##u8(flat, *oq2, q2);                                       \
300   }
301 
302 FUN_APPLY_7_TAP_FILTER(8, _)    // apply_7_tap_filter_8
303 FUN_APPLY_7_TAP_FILTER(16, q_)  // apply_7_tap_filter_16
304 #undef FUN_APPLY_7_TAP_FILTER
305 
306 // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
apply_15_tap_filter_8(const uint8x8_t flat2,const uint8x8_t p7,const uint8x8_t p6,const uint8x8_t p5,const uint8x8_t p4,const uint8x8_t p3,const uint8x8_t p2,const uint8x8_t p1,const uint8x8_t p0,const uint8x8_t q0,const uint8x8_t q1,const uint8x8_t q2,const uint8x8_t q3,const uint8x8_t q4,const uint8x8_t q5,const uint8x8_t q6,const uint8x8_t q7,uint8x8_t * op6,uint8x8_t * op5,uint8x8_t * op4,uint8x8_t * op3,uint8x8_t * op2,uint8x8_t * op1,uint8x8_t * op0,uint8x8_t * oq0,uint8x8_t * oq1,uint8x8_t * oq2,uint8x8_t * oq3,uint8x8_t * oq4,uint8x8_t * oq5,uint8x8_t * oq6)307 static INLINE void apply_15_tap_filter_8(
308     const uint8x8_t flat2, const uint8x8_t p7, const uint8x8_t p6,
309     const uint8x8_t p5, const uint8x8_t p4, const uint8x8_t p3,
310     const uint8x8_t p2, const uint8x8_t p1, const uint8x8_t p0,
311     const uint8x8_t q0, const uint8x8_t q1, const uint8x8_t q2,
312     const uint8x8_t q3, const uint8x8_t q4, const uint8x8_t q5,
313     const uint8x8_t q6, const uint8x8_t q7, uint8x8_t *op6, uint8x8_t *op5,
314     uint8x8_t *op4, uint8x8_t *op3, uint8x8_t *op2, uint8x8_t *op1,
315     uint8x8_t *op0, uint8x8_t *oq0, uint8x8_t *oq1, uint8x8_t *oq2,
316     uint8x8_t *oq3, uint8x8_t *oq4, uint8x8_t *oq5, uint8x8_t *oq6) {
317   uint16x8_t sum;
318   sum = vshll_n_u8(p7, 3);  // 8*p7
319   sum = vsubw_u8(sum, p7);  // 7*p7
320   sum = vaddw_u8(sum, p6);  // 7*p7+p6
321   sum = vaddw_u8(sum, p6);  // 7*p7+2*p6
322   sum = vaddw_u8(sum, p5);  // 7*p7+2*p6+p5
323   sum = vaddw_u8(sum, p4);  // 7*p7+2*p6+p5+p4
324   sum = vaddw_u8(sum, p3);  // 7*p7+2*p6+p5+p4+p3
325   sum = vaddw_u8(sum, p2);  // 7*p7+2*p6+p5+p4+p3+p2
326   sum = vaddw_u8(sum, p1);  // 7*p7+2*p6+p5+p4+p3+p2+p1
327   sum = vaddw_u8(sum, p0);  // 7*p7+2*p6+p5+p4+p3+p2+p1+p0
328   sum = vaddw_u8(sum, q0);  // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0
329   *op6 = vbsl_u8(flat2, vrshrn_n_u16(sum, 4), p6);
330   *op5 = apply_15_tap_filter_8_kernel(flat2, p7, p6, p5, q1, p5, &sum);
331   *op4 = apply_15_tap_filter_8_kernel(flat2, p7, p5, p4, q2, p4, &sum);
332   *op3 = apply_15_tap_filter_8_kernel(flat2, p7, p4, p3, q3, p3, &sum);
333   *op2 = apply_15_tap_filter_8_kernel(flat2, p7, p3, p2, q4, *op2, &sum);
334   *op1 = apply_15_tap_filter_8_kernel(flat2, p7, p2, p1, q5, *op1, &sum);
335   *op0 = apply_15_tap_filter_8_kernel(flat2, p7, p1, p0, q6, *op0, &sum);
336   *oq0 = apply_15_tap_filter_8_kernel(flat2, p7, p0, q0, q7, *oq0, &sum);
337   *oq1 = apply_15_tap_filter_8_kernel(flat2, p6, q0, q1, q7, *oq1, &sum);
338   *oq2 = apply_15_tap_filter_8_kernel(flat2, p5, q1, q2, q7, *oq2, &sum);
339   *oq3 = apply_15_tap_filter_8_kernel(flat2, p4, q2, q3, q7, q3, &sum);
340   *oq4 = apply_15_tap_filter_8_kernel(flat2, p3, q3, q4, q7, q4, &sum);
341   *oq5 = apply_15_tap_filter_8_kernel(flat2, p2, q4, q5, q7, q5, &sum);
342   *oq6 = apply_15_tap_filter_8_kernel(flat2, p1, q5, q6, q7, q6, &sum);
343 }
344 
apply_15_tap_filter_16(const uint8x16_t flat2,const uint8x16_t p7,const uint8x16_t p6,const uint8x16_t p5,const uint8x16_t p4,const uint8x16_t p3,const uint8x16_t p2,const uint8x16_t p1,const uint8x16_t p0,const uint8x16_t q0,const uint8x16_t q1,const uint8x16_t q2,const uint8x16_t q3,const uint8x16_t q4,const uint8x16_t q5,const uint8x16_t q6,const uint8x16_t q7,uint8x16_t * op6,uint8x16_t * op5,uint8x16_t * op4,uint8x16_t * op3,uint8x16_t * op2,uint8x16_t * op1,uint8x16_t * op0,uint8x16_t * oq0,uint8x16_t * oq1,uint8x16_t * oq2,uint8x16_t * oq3,uint8x16_t * oq4,uint8x16_t * oq5,uint8x16_t * oq6)345 static INLINE void apply_15_tap_filter_16(
346     const uint8x16_t flat2, const uint8x16_t p7, const uint8x16_t p6,
347     const uint8x16_t p5, const uint8x16_t p4, const uint8x16_t p3,
348     const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
349     const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
350     const uint8x16_t q3, const uint8x16_t q4, const uint8x16_t q5,
351     const uint8x16_t q6, const uint8x16_t q7, uint8x16_t *op6, uint8x16_t *op5,
352     uint8x16_t *op4, uint8x16_t *op3, uint8x16_t *op2, uint8x16_t *op1,
353     uint8x16_t *op0, uint8x16_t *oq0, uint8x16_t *oq1, uint8x16_t *oq2,
354     uint8x16_t *oq3, uint8x16_t *oq4, uint8x16_t *oq5, uint8x16_t *oq6) {
355   uint16x8_t sum0, sum1;
356   uint8x16_t t;
357   sum0 = vshll_n_u8(vget_low_u8(p7), 3);    // 8*p7
358   sum1 = vshll_n_u8(vget_high_u8(p7), 3);   // 8*p7
359   sum0 = vsubw_u8(sum0, vget_low_u8(p7));   // 7*p7
360   sum1 = vsubw_u8(sum1, vget_high_u8(p7));  // 7*p7
361   sum0 = vaddw_u8(sum0, vget_low_u8(p6));   // 7*p7+p6
362   sum1 = vaddw_u8(sum1, vget_high_u8(p6));  // 7*p7+p6
363   sum0 = vaddw_u8(sum0, vget_low_u8(p6));   // 7*p7+2*p6
364   sum1 = vaddw_u8(sum1, vget_high_u8(p6));  // 7*p7+2*p6
365   sum0 = vaddw_u8(sum0, vget_low_u8(p5));   // 7*p7+2*p6+p5
366   sum1 = vaddw_u8(sum1, vget_high_u8(p5));  // 7*p7+2*p6+p5
367   sum0 = vaddw_u8(sum0, vget_low_u8(p4));   // 7*p7+2*p6+p5+p4
368   sum1 = vaddw_u8(sum1, vget_high_u8(p4));  // 7*p7+2*p6+p5+p4
369   sum0 = vaddw_u8(sum0, vget_low_u8(p3));   // 7*p7+2*p6+p5+p4+p3
370   sum1 = vaddw_u8(sum1, vget_high_u8(p3));  // 7*p7+2*p6+p5+p4+p3
371   sum0 = vaddw_u8(sum0, vget_low_u8(p2));   // 7*p7+2*p6+p5+p4+p3+p2
372   sum1 = vaddw_u8(sum1, vget_high_u8(p2));  // 7*p7+2*p6+p5+p4+p3+p2
373   sum0 = vaddw_u8(sum0, vget_low_u8(p1));   // 7*p7+2*p6+p5+p4+p3+p2+p1
374   sum1 = vaddw_u8(sum1, vget_high_u8(p1));  // 7*p7+2*p6+p5+p4+p3+p2+p1
375   sum0 = vaddw_u8(sum0, vget_low_u8(p0));   // 7*p7+2*p6+p5+p4+p3+p2+p1+p0
376   sum1 = vaddw_u8(sum1, vget_high_u8(p0));  // 7*p7+2*p6+p5+p4+p3+p2+p1+p0
377   sum0 = vaddw_u8(sum0, vget_low_u8(q0));   // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0
378   sum1 = vaddw_u8(sum1, vget_high_u8(q0));  // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0
379   t = vcombine_u8(vrshrn_n_u16(sum0, 4), vrshrn_n_u16(sum1, 4));
380   *op6 = vbslq_u8(flat2, t, p6);
381   *op5 = apply_15_tap_filter_16_kernel(flat2, p7, p6, p5, q1, p5, &sum0, &sum1);
382   *op4 = apply_15_tap_filter_16_kernel(flat2, p7, p5, p4, q2, p4, &sum0, &sum1);
383   *op3 = apply_15_tap_filter_16_kernel(flat2, p7, p4, p3, q3, p3, &sum0, &sum1);
384   *op2 =
385       apply_15_tap_filter_16_kernel(flat2, p7, p3, p2, q4, *op2, &sum0, &sum1);
386   *op1 =
387       apply_15_tap_filter_16_kernel(flat2, p7, p2, p1, q5, *op1, &sum0, &sum1);
388   *op0 =
389       apply_15_tap_filter_16_kernel(flat2, p7, p1, p0, q6, *op0, &sum0, &sum1);
390   *oq0 =
391       apply_15_tap_filter_16_kernel(flat2, p7, p0, q0, q7, *oq0, &sum0, &sum1);
392   *oq1 =
393       apply_15_tap_filter_16_kernel(flat2, p6, q0, q1, q7, *oq1, &sum0, &sum1);
394   *oq2 =
395       apply_15_tap_filter_16_kernel(flat2, p5, q1, q2, q7, *oq2, &sum0, &sum1);
396   *oq3 = apply_15_tap_filter_16_kernel(flat2, p4, q2, q3, q7, q3, &sum0, &sum1);
397   *oq4 = apply_15_tap_filter_16_kernel(flat2, p3, q3, q4, q7, q4, &sum0, &sum1);
398   *oq5 = apply_15_tap_filter_16_kernel(flat2, p2, q4, q5, q7, q5, &sum0, &sum1);
399   *oq6 = apply_15_tap_filter_16_kernel(flat2, p1, q5, q6, q7, q6, &sum0, &sum1);
400 }
401 
402 #define FUN_FILTER4(w, r)                                                     \
403   static INLINE void filter4_##w(                                             \
404       const uint8x##w##_t mask, const uint8x##w##_t hev,                      \
405       const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \
406       const uint8x##w##_t q1, uint8x##w##_t *op1, uint8x##w##_t *op0,         \
407       uint8x##w##_t *oq0, uint8x##w##_t *oq1) {                               \
408     int8x##w##_t filter, filter1, filter2, t;                                 \
409     int8x##w##_t ps1 = flip_sign_##w(p1);                                     \
410     int8x##w##_t ps0 = flip_sign_##w(p0);                                     \
411     int8x##w##_t qs0 = flip_sign_##w(q0);                                     \
412     int8x##w##_t qs1 = flip_sign_##w(q1);                                     \
413                                                                               \
414     /* add outer taps if we have high edge variance */                        \
415     filter = vqsub##r##s8(ps1, qs1);                                          \
416     filter = vand##r##s8(filter, vreinterpret##r##s8_u8(hev));                \
417     t = vqsub##r##s8(qs0, ps0);                                               \
418                                                                               \
419     /* inner taps */                                                          \
420     filter = vqadd##r##s8(filter, t);                                         \
421     filter = vqadd##r##s8(filter, t);                                         \
422     filter = vqadd##r##s8(filter, t);                                         \
423     filter = vand##r##s8(filter, vreinterpret##r##s8_u8(mask));               \
424                                                                               \
425     /* save bottom 3 bits so that we round one side +4 and the other +3 */    \
426     /* if it equals 4 we'll set it to adjust by -1 to account for the fact */ \
427     /* we'd round it by 3 the other way */                                    \
428     filter1 = vshr##r##n_s8(vqadd##r##s8(filter, vdup##r##n_s8(4)), 3);       \
429     filter2 = vshr##r##n_s8(vqadd##r##s8(filter, vdup##r##n_s8(3)), 3);       \
430                                                                               \
431     qs0 = vqsub##r##s8(qs0, filter1);                                         \
432     ps0 = vqadd##r##s8(ps0, filter2);                                         \
433     *oq0 = flip_sign_back_##w(qs0);                                           \
434     *op0 = flip_sign_back_##w(ps0);                                           \
435                                                                               \
436     /* outer tap adjustments */                                               \
437     filter = vrshr##r##n_s8(filter1, 1);                                      \
438     filter = vbic##r##s8(filter, vreinterpret##r##s8_u8(hev));                \
439                                                                               \
440     qs1 = vqsub##r##s8(qs1, filter);                                          \
441     ps1 = vqadd##r##s8(ps1, filter);                                          \
442     *oq1 = flip_sign_back_##w(qs1);                                           \
443     *op1 = flip_sign_back_##w(ps1);                                           \
444   }
445 
446 FUN_FILTER4(8, _)    // filter4_8
447 FUN_FILTER4(16, q_)  // filter4_16
448 #undef FUN_FILTER4
449 
450 #define FUN_FILTER8(w)                                                         \
451   static INLINE void filter8_##w(                                              \
452       const uint8x##w##_t mask, const uint8x##w##_t flat,                      \
453       const uint32_t flat_status, const uint8x##w##_t hev,                     \
454       const uint8x##w##_t p3, const uint8x##w##_t p2, const uint8x##w##_t p1,  \
455       const uint8x##w##_t p0, const uint8x##w##_t q0, const uint8x##w##_t q1,  \
456       const uint8x##w##_t q2, const uint8x##w##_t q3, uint8x##w##_t *op2,      \
457       uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0,              \
458       uint8x##w##_t *oq1, uint8x##w##_t *oq2) {                                \
459     if (flat_status != (uint32_t)-2) {                                         \
460       filter4_##w(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1);              \
461       *op2 = p2;                                                               \
462       *oq2 = q2;                                                               \
463       if (flat_status) {                                                       \
464         apply_7_tap_filter_##w(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, \
465                                op0, oq0, oq1, oq2);                            \
466       }                                                                        \
467     } else {                                                                   \
468       calc_7_tap_filter_##w(p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0,     \
469                             oq0, oq1, oq2);                                    \
470     }                                                                          \
471   }
472 
473 FUN_FILTER8(8)   // filter8_8
474 FUN_FILTER8(16)  // filter8_16
475 #undef FUN_FILTER8
476 
477 #define FUN_FILTER16(w)                                                        \
478   static INLINE void filter16_##w(                                             \
479       const uint8x##w##_t mask, const uint8x##w##_t flat,                      \
480       const uint32_t flat_status, const uint8x##w##_t flat2,                   \
481       const uint32_t flat2_status, const uint8x##w##_t hev,                    \
482       const uint8x##w##_t p7, const uint8x##w##_t p6, const uint8x##w##_t p5,  \
483       const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2,  \
484       const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0,  \
485       const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3,  \
486       const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6,  \
487       const uint8x##w##_t q7, uint8x##w##_t *op6, uint8x##w##_t *op5,          \
488       uint8x##w##_t *op4, uint8x##w##_t *op3, uint8x##w##_t *op2,              \
489       uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0,              \
490       uint8x##w##_t *oq1, uint8x##w##_t *oq2, uint8x##w##_t *oq3,              \
491       uint8x##w##_t *oq4, uint8x##w##_t *oq5, uint8x##w##_t *oq6) {            \
492     if (flat_status != (uint32_t)-2) {                                         \
493       filter4_##w(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1);              \
494     }                                                                          \
495                                                                                \
496     if (flat_status) {                                                         \
497       *op2 = p2;                                                               \
498       *oq2 = q2;                                                               \
499       if (flat2_status != (uint32_t)-2) {                                      \
500         apply_7_tap_filter_##w(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, \
501                                op0, oq0, oq1, oq2);                            \
502       }                                                                        \
503       if (flat2_status) {                                                      \
504         apply_15_tap_filter_##w(flat2, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, \
505                                 q2, q3, q4, q5, q6, q7, op6, op5, op4, op3,    \
506                                 op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5,   \
507                                 oq6);                                          \
508       }                                                                        \
509     }                                                                          \
510   }
511 
512 FUN_FILTER16(8)   // filter16_8
513 FUN_FILTER16(16)  // filter16_16
514 #undef FUN_FILTER16
515 
516 #define FUN_LOAD8(w, r)                                                    \
517   static INLINE void load_##w##x8(                                         \
518       const uint8_t *s, const int p, uint8x##w##_t *p3, uint8x##w##_t *p2, \
519       uint8x##w##_t *p1, uint8x##w##_t *p0, uint8x##w##_t *q0,             \
520       uint8x##w##_t *q1, uint8x##w##_t *q2, uint8x##w##_t *q3) {           \
521     *p3 = vld1##r##u8(s);                                                  \
522     s += p;                                                                \
523     *p2 = vld1##r##u8(s);                                                  \
524     s += p;                                                                \
525     *p1 = vld1##r##u8(s);                                                  \
526     s += p;                                                                \
527     *p0 = vld1##r##u8(s);                                                  \
528     s += p;                                                                \
529     *q0 = vld1##r##u8(s);                                                  \
530     s += p;                                                                \
531     *q1 = vld1##r##u8(s);                                                  \
532     s += p;                                                                \
533     *q2 = vld1##r##u8(s);                                                  \
534     s += p;                                                                \
535     *q3 = vld1##r##u8(s);                                                  \
536   }
537 
538 FUN_LOAD8(8, _)    // load_8x8
539 FUN_LOAD8(16, q_)  // load_16x8
540 #undef FUN_LOAD8
541 
542 #define FUN_LOAD16(w, r)                                                   \
543   static INLINE void load_##w##x16(                                        \
544       const uint8_t *s, const int p, uint8x##w##_t *s0, uint8x##w##_t *s1, \
545       uint8x##w##_t *s2, uint8x##w##_t *s3, uint8x##w##_t *s4,             \
546       uint8x##w##_t *s5, uint8x##w##_t *s6, uint8x##w##_t *s7,             \
547       uint8x##w##_t *s8, uint8x##w##_t *s9, uint8x##w##_t *s10,            \
548       uint8x##w##_t *s11, uint8x##w##_t *s12, uint8x##w##_t *s13,          \
549       uint8x##w##_t *s14, uint8x##w##_t *s15) {                            \
550     *s0 = vld1##r##u8(s);                                                  \
551     s += p;                                                                \
552     *s1 = vld1##r##u8(s);                                                  \
553     s += p;                                                                \
554     *s2 = vld1##r##u8(s);                                                  \
555     s += p;                                                                \
556     *s3 = vld1##r##u8(s);                                                  \
557     s += p;                                                                \
558     *s4 = vld1##r##u8(s);                                                  \
559     s += p;                                                                \
560     *s5 = vld1##r##u8(s);                                                  \
561     s += p;                                                                \
562     *s6 = vld1##r##u8(s);                                                  \
563     s += p;                                                                \
564     *s7 = vld1##r##u8(s);                                                  \
565     s += p;                                                                \
566     *s8 = vld1##r##u8(s);                                                  \
567     s += p;                                                                \
568     *s9 = vld1##r##u8(s);                                                  \
569     s += p;                                                                \
570     *s10 = vld1##r##u8(s);                                                 \
571     s += p;                                                                \
572     *s11 = vld1##r##u8(s);                                                 \
573     s += p;                                                                \
574     *s12 = vld1##r##u8(s);                                                 \
575     s += p;                                                                \
576     *s13 = vld1##r##u8(s);                                                 \
577     s += p;                                                                \
578     *s14 = vld1##r##u8(s);                                                 \
579     s += p;                                                                \
580     *s15 = vld1##r##u8(s);                                                 \
581   }
582 
583 FUN_LOAD16(8, _)    // load_8x16
584 FUN_LOAD16(16, q_)  // load_16x16
585 #undef FUN_LOAD16
586 
587 #define FUN_STORE4(w, r)                                                       \
588   static INLINE void store_##w##x4(                                            \
589       uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \
590       const uint8x##w##_t s2, const uint8x##w##_t s3) {                        \
591     vst1##r##u8(s, s0);                                                        \
592     s += p;                                                                    \
593     vst1##r##u8(s, s1);                                                        \
594     s += p;                                                                    \
595     vst1##r##u8(s, s2);                                                        \
596     s += p;                                                                    \
597     vst1##r##u8(s, s3);                                                        \
598   }
599 
600 FUN_STORE4(8, _)    // store_8x4
601 FUN_STORE4(16, q_)  // store_16x4
602 #undef FUN_STORE4
603 
604 #define FUN_STORE6(w, r)                                                       \
605   static INLINE void store_##w##x6(                                            \
606       uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \
607       const uint8x##w##_t s2, const uint8x##w##_t s3, const uint8x##w##_t s4,  \
608       const uint8x##w##_t s5) {                                                \
609     vst1##r##u8(s, s0);                                                        \
610     s += p;                                                                    \
611     vst1##r##u8(s, s1);                                                        \
612     s += p;                                                                    \
613     vst1##r##u8(s, s2);                                                        \
614     s += p;                                                                    \
615     vst1##r##u8(s, s3);                                                        \
616     s += p;                                                                    \
617     vst1##r##u8(s, s4);                                                        \
618     s += p;                                                                    \
619     vst1##r##u8(s, s5);                                                        \
620   }
621 
622 FUN_STORE6(8, _)    // store_8x6
623 FUN_STORE6(16, q_)  // store_16x6
624 #undef FUN_STORE6
625 
store_4x8(uint8_t * s,const int p,const uint8x8_t p1,const uint8x8_t p0,const uint8x8_t q0,const uint8x8_t q1)626 static INLINE void store_4x8(uint8_t *s, const int p, const uint8x8_t p1,
627                              const uint8x8_t p0, const uint8x8_t q0,
628                              const uint8x8_t q1) {
629   uint8x8x4_t o;
630 
631   o.val[0] = p1;
632   o.val[1] = p0;
633   o.val[2] = q0;
634   o.val[3] = q1;
635   vst4_lane_u8(s, o, 0);
636   s += p;
637   vst4_lane_u8(s, o, 1);
638   s += p;
639   vst4_lane_u8(s, o, 2);
640   s += p;
641   vst4_lane_u8(s, o, 3);
642   s += p;
643   vst4_lane_u8(s, o, 4);
644   s += p;
645   vst4_lane_u8(s, o, 5);
646   s += p;
647   vst4_lane_u8(s, o, 6);
648   s += p;
649   vst4_lane_u8(s, o, 7);
650 }
651 
store_6x8(uint8_t * s,const int p,const uint8x8_t s0,const uint8x8_t s1,const uint8x8_t s2,const uint8x8_t s3,const uint8x8_t s4,const uint8x8_t s5)652 static INLINE void store_6x8(uint8_t *s, const int p, const uint8x8_t s0,
653                              const uint8x8_t s1, const uint8x8_t s2,
654                              const uint8x8_t s3, const uint8x8_t s4,
655                              const uint8x8_t s5) {
656   uint8x8x3_t o0, o1;
657 
658   o0.val[0] = s0;
659   o0.val[1] = s1;
660   o0.val[2] = s2;
661   o1.val[0] = s3;
662   o1.val[1] = s4;
663   o1.val[2] = s5;
664   vst3_lane_u8(s - 3, o0, 0);
665   vst3_lane_u8(s + 0, o1, 0);
666   s += p;
667   vst3_lane_u8(s - 3, o0, 1);
668   vst3_lane_u8(s + 0, o1, 1);
669   s += p;
670   vst3_lane_u8(s - 3, o0, 2);
671   vst3_lane_u8(s + 0, o1, 2);
672   s += p;
673   vst3_lane_u8(s - 3, o0, 3);
674   vst3_lane_u8(s + 0, o1, 3);
675   s += p;
676   vst3_lane_u8(s - 3, o0, 4);
677   vst3_lane_u8(s + 0, o1, 4);
678   s += p;
679   vst3_lane_u8(s - 3, o0, 5);
680   vst3_lane_u8(s + 0, o1, 5);
681   s += p;
682   vst3_lane_u8(s - 3, o0, 6);
683   vst3_lane_u8(s + 0, o1, 6);
684   s += p;
685   vst3_lane_u8(s - 3, o0, 7);
686   vst3_lane_u8(s + 0, o1, 7);
687 }
688 
689 #define FUN_STORE8(w, r)                                                       \
690   static INLINE void store_##w##x8(                                            \
691       uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \
692       const uint8x##w##_t s2, const uint8x##w##_t s3, const uint8x##w##_t s4,  \
693       const uint8x##w##_t s5, const uint8x##w##_t s6,                          \
694       const uint8x##w##_t s7) {                                                \
695     vst1##r##u8(s, s0);                                                        \
696     s += p;                                                                    \
697     vst1##r##u8(s, s1);                                                        \
698     s += p;                                                                    \
699     vst1##r##u8(s, s2);                                                        \
700     s += p;                                                                    \
701     vst1##r##u8(s, s3);                                                        \
702     s += p;                                                                    \
703     vst1##r##u8(s, s4);                                                        \
704     s += p;                                                                    \
705     vst1##r##u8(s, s5);                                                        \
706     s += p;                                                                    \
707     vst1##r##u8(s, s6);                                                        \
708     s += p;                                                                    \
709     vst1##r##u8(s, s7);                                                        \
710   }
711 
712 FUN_STORE8(8, _)    // store_8x8
713 FUN_STORE8(16, q_)  // store_16x8
714 #undef FUN_STORE8
715 
716 #define FUN_STORE14(w, r)                                                      \
717   static INLINE void store_##w##x14(                                           \
718       uint8_t *s, const int p, const uint8x##w##_t p6, const uint8x##w##_t p5, \
719       const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2,  \
720       const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0,  \
721       const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3,  \
722       const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6,  \
723       const uint32_t flat_status, const uint32_t flat2_status) {               \
724     if (flat_status) {                                                         \
725       if (flat2_status) {                                                      \
726         vst1##r##u8(s - 7 * p, p6);                                            \
727         vst1##r##u8(s - 6 * p, p5);                                            \
728         vst1##r##u8(s - 5 * p, p4);                                            \
729         vst1##r##u8(s - 4 * p, p3);                                            \
730         vst1##r##u8(s + 3 * p, q3);                                            \
731         vst1##r##u8(s + 4 * p, q4);                                            \
732         vst1##r##u8(s + 5 * p, q5);                                            \
733         vst1##r##u8(s + 6 * p, q6);                                            \
734       }                                                                        \
735       vst1##r##u8(s - 3 * p, p2);                                              \
736       vst1##r##u8(s + 2 * p, q2);                                              \
737     }                                                                          \
738     vst1##r##u8(s - 2 * p, p1);                                                \
739     vst1##r##u8(s - 1 * p, p0);                                                \
740     vst1##r##u8(s + 0 * p, q0);                                                \
741     vst1##r##u8(s + 1 * p, q1);                                                \
742   }
743 
744 FUN_STORE14(8, _)    // store_8x14
745 FUN_STORE14(16, q_)  // store_16x14
746 #undef FUN_STORE14
747 
store_16x16(uint8_t * s,const int p,const uint8x16_t s0,const uint8x16_t s1,const uint8x16_t s2,const uint8x16_t s3,const uint8x16_t s4,const uint8x16_t s5,const uint8x16_t s6,const uint8x16_t s7,const uint8x16_t s8,const uint8x16_t s9,const uint8x16_t s10,const uint8x16_t s11,const uint8x16_t s12,const uint8x16_t s13,const uint8x16_t s14,const uint8x16_t s15)748 static INLINE void store_16x16(uint8_t *s, const int p, const uint8x16_t s0,
749                                const uint8x16_t s1, const uint8x16_t s2,
750                                const uint8x16_t s3, const uint8x16_t s4,
751                                const uint8x16_t s5, const uint8x16_t s6,
752                                const uint8x16_t s7, const uint8x16_t s8,
753                                const uint8x16_t s9, const uint8x16_t s10,
754                                const uint8x16_t s11, const uint8x16_t s12,
755                                const uint8x16_t s13, const uint8x16_t s14,
756                                const uint8x16_t s15) {
757   vst1q_u8(s, s0);
758   s += p;
759   vst1q_u8(s, s1);
760   s += p;
761   vst1q_u8(s, s2);
762   s += p;
763   vst1q_u8(s, s3);
764   s += p;
765   vst1q_u8(s, s4);
766   s += p;
767   vst1q_u8(s, s5);
768   s += p;
769   vst1q_u8(s, s6);
770   s += p;
771   vst1q_u8(s, s7);
772   s += p;
773   vst1q_u8(s, s8);
774   s += p;
775   vst1q_u8(s, s9);
776   s += p;
777   vst1q_u8(s, s10);
778   s += p;
779   vst1q_u8(s, s11);
780   s += p;
781   vst1q_u8(s, s12);
782   s += p;
783   vst1q_u8(s, s13);
784   s += p;
785   vst1q_u8(s, s14);
786   s += p;
787   vst1q_u8(s, s15);
788 }
789 
790 #define FUN_HOR_4_KERNEL(name, w)                                           \
791   static INLINE void lpf_horizontal_4##name##kernel(                        \
792       uint8_t *s, const int p, const uint8x##w##_t blimit,                  \
793       const uint8x##w##_t limit, const uint8x##w##_t thresh) {              \
794     uint8x##w##_t p3, p2, p1, p0, q0, q1, q2, q3, mask, hev;                \
795                                                                             \
796     load_##w##x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);     \
797     filter_hev_mask4_##w(limit, blimit, thresh, p3, p2, p1, p0, q0, q1, q2, \
798                          q3, &hev, &mask);                                  \
799     filter4_##w(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1);             \
800     store_##w##x4(s - 2 * p, p, p1, p0, q0, q1);                            \
801   }
802 
803 FUN_HOR_4_KERNEL(_, 8)        // lpf_horizontal_4_kernel
804 FUN_HOR_4_KERNEL(_dual_, 16)  // lpf_horizontal_4_dual_kernel
805 #undef FUN_HOR_4_KERNEL
806 
vpx_lpf_horizontal_4_neon(uint8_t * s,int p,const uint8_t * blimit,const uint8_t * limit,const uint8_t * thresh)807 void vpx_lpf_horizontal_4_neon(uint8_t *s, int p, const uint8_t *blimit,
808                                const uint8_t *limit, const uint8_t *thresh) {
809   uint8x8_t blimit_vec, limit_vec, thresh_vec;
810   load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec);
811   lpf_horizontal_4_kernel(s, p, blimit_vec, limit_vec, thresh_vec);
812 }
813 
vpx_lpf_horizontal_4_dual_neon(uint8_t * s,int p,const uint8_t * blimit0,const uint8_t * limit0,const uint8_t * thresh0,const uint8_t * blimit1,const uint8_t * limit1,const uint8_t * thresh1)814 void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
815                                     const uint8_t *limit0,
816                                     const uint8_t *thresh0,
817                                     const uint8_t *blimit1,
818                                     const uint8_t *limit1,
819                                     const uint8_t *thresh1) {
820   uint8x16_t blimit_vec, limit_vec, thresh_vec;
821   load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
822                      &blimit_vec, &limit_vec, &thresh_vec);
823   lpf_horizontal_4_dual_kernel(s, p, blimit_vec, limit_vec, thresh_vec);
824 }
825 
vpx_lpf_vertical_4_neon(uint8_t * s,int p,const uint8_t * blimit,const uint8_t * limit,const uint8_t * thresh)826 void vpx_lpf_vertical_4_neon(uint8_t *s, int p, const uint8_t *blimit,
827                              const uint8_t *limit, const uint8_t *thresh) {
828   uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
829       mask, hev;
830   load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec);
831   load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
832   transpose_u8_8x8(&p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
833   filter_hev_mask4_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1,
834                      q2, q3, &hev, &mask);
835   filter4_8(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1);
836   store_4x8(s - 2, p, p1, p0, q0, q1);
837 }
838 
vpx_lpf_vertical_4_dual_neon(uint8_t * s,int p,const uint8_t * blimit0,const uint8_t * limit0,const uint8_t * thresh0,const uint8_t * blimit1,const uint8_t * limit1,const uint8_t * thresh1)839 void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
840                                   const uint8_t *limit0, const uint8_t *thresh0,
841                                   const uint8_t *blimit1, const uint8_t *limit1,
842                                   const uint8_t *thresh1) {
843   uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
844       mask, hev;
845   uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
846       s15;
847 
848   load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
849                      &blimit_vec, &limit_vec, &thresh_vec);
850   load_8x16(s - 4, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10,
851             &s11, &s12, &s13, &s14, &s15);
852   transpose_u8_8x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
853                     s14, s15, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
854   filter_hev_mask4_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1,
855                       q2, q3, &hev, &mask);
856   filter4_16(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1);
857   s -= 2;
858   store_4x8(s, p, vget_low_u8(p1), vget_low_u8(p0), vget_low_u8(q0),
859             vget_low_u8(q1));
860   store_4x8(s + 8 * p, p, vget_high_u8(p1), vget_high_u8(p0), vget_high_u8(q0),
861             vget_high_u8(q1));
862 }
863 
vpx_lpf_horizontal_8_neon(uint8_t * s,int p,const uint8_t * blimit,const uint8_t * limit,const uint8_t * thresh)864 void vpx_lpf_horizontal_8_neon(uint8_t *s, int p, const uint8_t *blimit,
865                                const uint8_t *limit, const uint8_t *thresh) {
866   uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
867       op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
868   uint32_t flat_status;
869 
870   load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec);
871   load_8x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
872   mask = filter_flat_hev_mask_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1,
873                                 p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
874   filter8_8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
875             &op1, &op0, &oq0, &oq1, &oq2);
876   store_8x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2);
877 }
878 
vpx_lpf_horizontal_8_dual_neon(uint8_t * s,int p,const uint8_t * blimit0,const uint8_t * limit0,const uint8_t * thresh0,const uint8_t * blimit1,const uint8_t * limit1,const uint8_t * thresh1)879 void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
880                                     const uint8_t *limit0,
881                                     const uint8_t *thresh0,
882                                     const uint8_t *blimit1,
883                                     const uint8_t *limit1,
884                                     const uint8_t *thresh1) {
885   uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
886       op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
887   uint32_t flat_status;
888 
889   load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
890                      &blimit_vec, &limit_vec, &thresh_vec);
891   load_16x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
892   mask = filter_flat_hev_mask_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1,
893                                  p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
894   filter8_16(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
895              &op1, &op0, &oq0, &oq1, &oq2);
896   store_16x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2);
897 }
898 
vpx_lpf_vertical_8_neon(uint8_t * s,int p,const uint8_t * blimit,const uint8_t * limit,const uint8_t * thresh)899 void vpx_lpf_vertical_8_neon(uint8_t *s, int p, const uint8_t *blimit,
900                              const uint8_t *limit, const uint8_t *thresh) {
901   uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
902       op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
903   uint32_t flat_status;
904 
905   load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec);
906   load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
907   transpose_u8_8x8(&p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
908   mask = filter_flat_hev_mask_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1,
909                                 p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
910   filter8_8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
911             &op1, &op0, &oq0, &oq1, &oq2);
912   // Note: transpose + store_8x8() is faster than store_6x8().
913   transpose_u8_8x8(&p3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &q3);
914   store_8x8(s - 4, p, p3, op2, op1, op0, oq0, oq1, oq2, q3);
915 }
916 
vpx_lpf_vertical_8_dual_neon(uint8_t * s,int p,const uint8_t * blimit0,const uint8_t * limit0,const uint8_t * thresh0,const uint8_t * blimit1,const uint8_t * limit1,const uint8_t * thresh1)917 void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
918                                   const uint8_t *limit0, const uint8_t *thresh0,
919                                   const uint8_t *blimit1, const uint8_t *limit1,
920                                   const uint8_t *thresh1) {
921   uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
922       op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
923   uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
924       s15;
925   uint32_t flat_status;
926 
927   load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
928                      &blimit_vec, &limit_vec, &thresh_vec);
929   load_8x16(s - 4, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10,
930             &s11, &s12, &s13, &s14, &s15);
931   transpose_u8_8x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
932                     s14, s15, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
933   mask = filter_flat_hev_mask_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1,
934                                  p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
935   filter8_16(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
936              &op1, &op0, &oq0, &oq1, &oq2);
937   // Note: store_6x8() twice is faster than transpose + store_8x16().
938   store_6x8(s, p, vget_low_u8(op2), vget_low_u8(op1), vget_low_u8(op0),
939             vget_low_u8(oq0), vget_low_u8(oq1), vget_low_u8(oq2));
940   store_6x8(s + 8 * p, p, vget_high_u8(op2), vget_high_u8(op1),
941             vget_high_u8(op0), vget_high_u8(oq0), vget_high_u8(oq1),
942             vget_high_u8(oq2));
943 }
944 
945 #define FUN_LPF_16_KERNEL(name, w)                                             \
946   static INLINE void lpf_16##name##kernel(                                     \
947       const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh,      \
948       const uint8x##w##_t p7, const uint8x##w##_t p6, const uint8x##w##_t p5,  \
949       const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2,  \
950       const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0,  \
951       const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3,  \
952       const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6,  \
953       const uint8x##w##_t q7, uint8x##w##_t *op6, uint8x##w##_t *op5,          \
954       uint8x##w##_t *op4, uint8x##w##_t *op3, uint8x##w##_t *op2,              \
955       uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0,              \
956       uint8x##w##_t *oq1, uint8x##w##_t *oq2, uint8x##w##_t *oq3,              \
957       uint8x##w##_t *oq4, uint8x##w##_t *oq5, uint8x##w##_t *oq6,              \
958       uint32_t *flat_status, uint32_t *flat2_status) {                         \
959     uint8x##w##_t blimit_vec, limit_vec, thresh_vec, mask, flat, flat2, hev;   \
960                                                                                \
961     load_thresh_##w(blimit, limit, thresh, &blimit_vec, &limit_vec,            \
962                     &thresh_vec);                                              \
963     mask = filter_flat_hev_mask_##w(limit_vec, blimit_vec, thresh_vec, p3, p2, \
964                                     p1, p0, q0, q1, q2, q3, &flat,             \
965                                     flat_status, &hev);                        \
966     flat2 = flat_mask5_##w(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat,       \
967                            flat2_status);                                      \
968     filter16_##w(mask, flat, *flat_status, flat2, *flat2_status, hev, p7, p6,  \
969                  p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6,  \
970                  op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5,   \
971                  oq6);                                                         \
972   }
973 
974 FUN_LPF_16_KERNEL(_, 8)        // lpf_16_kernel
975 FUN_LPF_16_KERNEL(_dual_, 16)  // lpf_16_dual_kernel
976 #undef FUN_LPF_16_KERNEL
977 
vpx_lpf_horizontal_16_neon(uint8_t * s,int p,const uint8_t * blimit,const uint8_t * limit,const uint8_t * thresh)978 void vpx_lpf_horizontal_16_neon(uint8_t *s, int p, const uint8_t *blimit,
979                                 const uint8_t *limit, const uint8_t *thresh) {
980   uint8x8_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6,
981       op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6;
982   uint32_t flat_status, flat2_status;
983 
984   load_8x16(s - 8 * p, p, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &q1, &q2,
985             &q3, &q4, &q5, &q6, &q7);
986   lpf_16_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1,
987                 q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2, &op1,
988                 &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, &flat_status,
989                 &flat2_status);
990   store_8x14(s, p, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4,
991              oq5, oq6, flat_status, flat2_status);
992 }
993 
vpx_lpf_horizontal_16_dual_neon(uint8_t * s,int p,const uint8_t * blimit,const uint8_t * limit,const uint8_t * thresh)994 void vpx_lpf_horizontal_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
995                                      const uint8_t *limit,
996                                      const uint8_t *thresh) {
997   uint8x16_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7,
998       op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6;
999   uint32_t flat_status, flat2_status;
1000 
1001   load_16x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
1002   p7 = vld1q_u8(s - 8 * p);
1003   p6 = vld1q_u8(s - 7 * p);
1004   p5 = vld1q_u8(s - 6 * p);
1005   p4 = vld1q_u8(s - 5 * p);
1006   q4 = vld1q_u8(s + 4 * p);
1007   q5 = vld1q_u8(s + 5 * p);
1008   q6 = vld1q_u8(s + 6 * p);
1009   q7 = vld1q_u8(s + 7 * p);
1010   lpf_16_dual_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0,
1011                      q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2,
1012                      &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6,
1013                      &flat_status, &flat2_status);
1014   store_16x14(s, p, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4,
1015               oq5, oq6, flat_status, flat2_status);
1016 }
1017 
vpx_lpf_vertical_16_neon(uint8_t * s,int p,const uint8_t * blimit,const uint8_t * limit,const uint8_t * thresh)1018 void vpx_lpf_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit,
1019                               const uint8_t *limit, const uint8_t *thresh) {
1020   uint8x8_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6,
1021       op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6;
1022   uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7;
1023   uint32_t flat_status, flat2_status;
1024 
1025   s -= 8;
1026   load_16x8(s, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
1027   transpose_u8_16x8(s0, s1, s2, s3, s4, s5, s6, s7, &p7, &p6, &p5, &p4, &p3,
1028                     &p2, &p1, &p0, &q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
1029   lpf_16_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1,
1030                 q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2, &op1,
1031                 &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, &flat_status,
1032                 &flat2_status);
1033   if (flat_status) {
1034     if (flat2_status) {
1035       transpose_u8_8x16(p7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2,
1036                         oq3, oq4, oq5, oq6, q7, &s0, &s1, &s2, &s3, &s4, &s5,
1037                         &s6, &s7);
1038       store_16x8(s, p, s0, s1, s2, s3, s4, s5, s6, s7);
1039     } else {
1040       // Note: transpose + store_8x8() is faster than store_6x8().
1041       transpose_u8_8x8(&p3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &q3);
1042       store_8x8(s + 4, p, p3, op2, op1, op0, oq0, oq1, oq2, q3);
1043     }
1044   } else {
1045     store_4x8(s + 6, p, op1, op0, oq0, oq1);
1046   }
1047 }
1048 
vpx_lpf_vertical_16_dual_neon(uint8_t * s,int p,const uint8_t * blimit,const uint8_t * limit,const uint8_t * thresh)1049 void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
1050                                    const uint8_t *limit,
1051                                    const uint8_t *thresh) {
1052   uint8x16_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7,
1053       op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6;
1054   uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
1055       s15;
1056   uint32_t flat_status, flat2_status;
1057 
1058   s -= 8;
1059   load_16x16(s, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10, &s11,
1060              &s12, &s13, &s14, &s15);
1061   transpose_u8_16x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
1062                      s14, s15, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &q1,
1063                      &q2, &q3, &q4, &q5, &q6, &q7);
1064   lpf_16_dual_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0,
1065                      q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2,
1066                      &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6,
1067                      &flat_status, &flat2_status);
1068   if (flat_status) {
1069     if (flat2_status) {
1070       transpose_u8_16x16(p7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2,
1071                          oq3, oq4, oq5, oq6, q7, &s0, &s1, &s2, &s3, &s4, &s5,
1072                          &s6, &s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14,
1073                          &s15);
1074       store_16x16(s, p, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
1075                   s13, s14, s15);
1076     } else {
1077       // Note: store_6x8() twice is faster than transpose + store_8x16().
1078       s += 8;
1079       store_6x8(s, p, vget_low_u8(op2), vget_low_u8(op1), vget_low_u8(op0),
1080                 vget_low_u8(oq0), vget_low_u8(oq1), vget_low_u8(oq2));
1081       store_6x8(s + 8 * p, p, vget_high_u8(op2), vget_high_u8(op1),
1082                 vget_high_u8(op0), vget_high_u8(oq0), vget_high_u8(oq1),
1083                 vget_high_u8(oq2));
1084     }
1085   } else {
1086     s += 6;
1087     store_4x8(s, p, vget_low_u8(op1), vget_low_u8(op0), vget_low_u8(oq0),
1088               vget_low_u8(oq1));
1089     store_4x8(s + 8 * p, p, vget_high_u8(op1), vget_high_u8(op0),
1090               vget_high_u8(oq0), vget_high_u8(oq1));
1091   }
1092 }
1093