1 // Copyright 2012 Google Inc. All Rights Reserved.
2 //
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
9 //
10 // ARM NEON version of dsp functions and loop filtering.
11 //
12 // Authors: Somnath Banerjee (somnath@google.com)
13 //          Johann Koenig (johannkoenig@google.com)
14 
15 #include "src/dsp/dsp.h"
16 
17 #if defined(WEBP_USE_NEON)
18 
19 #include "src/dsp/neon.h"
20 #include "src/dec/vp8i_dec.h"
21 
22 //------------------------------------------------------------------------------
23 // NxM Loading functions
24 
25 #if !defined(WORK_AROUND_GCC)
26 
27 // This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation
28 // (register alloc, probably). The variants somewhat mitigate the problem, but
29 // not quite. HFilter16i() remains problematic.
Load4x8_NEON(const uint8_t * const src,int stride)30 static WEBP_INLINE uint8x8x4_t Load4x8_NEON(const uint8_t* const src,
31                                             int stride) {
32   const uint8x8_t zero = vdup_n_u8(0);
33   uint8x8x4_t out;
34   INIT_VECTOR4(out, zero, zero, zero, zero);
35   out = vld4_lane_u8(src + 0 * stride, out, 0);
36   out = vld4_lane_u8(src + 1 * stride, out, 1);
37   out = vld4_lane_u8(src + 2 * stride, out, 2);
38   out = vld4_lane_u8(src + 3 * stride, out, 3);
39   out = vld4_lane_u8(src + 4 * stride, out, 4);
40   out = vld4_lane_u8(src + 5 * stride, out, 5);
41   out = vld4_lane_u8(src + 6 * stride, out, 6);
42   out = vld4_lane_u8(src + 7 * stride, out, 7);
43   return out;
44 }
45 
Load4x16_NEON(const uint8_t * const src,int stride,uint8x16_t * const p1,uint8x16_t * const p0,uint8x16_t * const q0,uint8x16_t * const q1)46 static WEBP_INLINE void Load4x16_NEON(const uint8_t* const src, int stride,
47                                       uint8x16_t* const p1,
48                                       uint8x16_t* const p0,
49                                       uint8x16_t* const q0,
50                                       uint8x16_t* const q1) {
51   // row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7]
52   // row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15]
53   const uint8x8x4_t row0 = Load4x8_NEON(src - 2 + 0 * stride, stride);
54   const uint8x8x4_t row8 = Load4x8_NEON(src - 2 + 8 * stride, stride);
55   *p1 = vcombine_u8(row0.val[0], row8.val[0]);
56   *p0 = vcombine_u8(row0.val[1], row8.val[1]);
57   *q0 = vcombine_u8(row0.val[2], row8.val[2]);
58   *q1 = vcombine_u8(row0.val[3], row8.val[3]);
59 }
60 
61 #else  // WORK_AROUND_GCC
62 
63 #define LOADQ_LANE_32b(VALUE, LANE) do {                             \
64   (VALUE) = vld1q_lane_u32((const uint32_t*)src, (VALUE), (LANE));   \
65   src += stride;                                                     \
66 } while (0)
67 
Load4x16_NEON(const uint8_t * src,int stride,uint8x16_t * const p1,uint8x16_t * const p0,uint8x16_t * const q0,uint8x16_t * const q1)68 static WEBP_INLINE void Load4x16_NEON(const uint8_t* src, int stride,
69                                       uint8x16_t* const p1,
70                                       uint8x16_t* const p0,
71                                       uint8x16_t* const q0,
72                                       uint8x16_t* const q1) {
73   const uint32x4_t zero = vdupq_n_u32(0);
74   uint32x4x4_t in;
75   INIT_VECTOR4(in, zero, zero, zero, zero);
76   src -= 2;
77   LOADQ_LANE_32b(in.val[0], 0);
78   LOADQ_LANE_32b(in.val[1], 0);
79   LOADQ_LANE_32b(in.val[2], 0);
80   LOADQ_LANE_32b(in.val[3], 0);
81   LOADQ_LANE_32b(in.val[0], 1);
82   LOADQ_LANE_32b(in.val[1], 1);
83   LOADQ_LANE_32b(in.val[2], 1);
84   LOADQ_LANE_32b(in.val[3], 1);
85   LOADQ_LANE_32b(in.val[0], 2);
86   LOADQ_LANE_32b(in.val[1], 2);
87   LOADQ_LANE_32b(in.val[2], 2);
88   LOADQ_LANE_32b(in.val[3], 2);
89   LOADQ_LANE_32b(in.val[0], 3);
90   LOADQ_LANE_32b(in.val[1], 3);
91   LOADQ_LANE_32b(in.val[2], 3);
92   LOADQ_LANE_32b(in.val[3], 3);
93   // Transpose four 4x4 parts:
94   {
95     const uint8x16x2_t row01 = vtrnq_u8(vreinterpretq_u8_u32(in.val[0]),
96                                         vreinterpretq_u8_u32(in.val[1]));
97     const uint8x16x2_t row23 = vtrnq_u8(vreinterpretq_u8_u32(in.val[2]),
98                                         vreinterpretq_u8_u32(in.val[3]));
99     const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
100                                          vreinterpretq_u16_u8(row23.val[0]));
101     const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
102                                          vreinterpretq_u16_u8(row23.val[1]));
103     *p1 = vreinterpretq_u8_u16(row02.val[0]);
104     *p0 = vreinterpretq_u8_u16(row13.val[0]);
105     *q0 = vreinterpretq_u8_u16(row02.val[1]);
106     *q1 = vreinterpretq_u8_u16(row13.val[1]);
107   }
108 }
109 #undef LOADQ_LANE_32b
110 
111 #endif  // !WORK_AROUND_GCC
112 
Load8x16_NEON(const uint8_t * const src,int stride,uint8x16_t * const p3,uint8x16_t * const p2,uint8x16_t * const p1,uint8x16_t * const p0,uint8x16_t * const q0,uint8x16_t * const q1,uint8x16_t * const q2,uint8x16_t * const q3)113 static WEBP_INLINE void Load8x16_NEON(
114     const uint8_t* const src, int stride,
115     uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
116     uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
117     uint8x16_t* const q2, uint8x16_t* const q3) {
118   Load4x16_NEON(src - 2, stride, p3, p2, p1, p0);
119   Load4x16_NEON(src + 2, stride, q0, q1, q2, q3);
120 }
121 
Load16x4_NEON(const uint8_t * const src,int stride,uint8x16_t * const p1,uint8x16_t * const p0,uint8x16_t * const q0,uint8x16_t * const q1)122 static WEBP_INLINE void Load16x4_NEON(const uint8_t* const src, int stride,
123                                       uint8x16_t* const p1,
124                                       uint8x16_t* const p0,
125                                       uint8x16_t* const q0,
126                                       uint8x16_t* const q1) {
127   *p1 = vld1q_u8(src - 2 * stride);
128   *p0 = vld1q_u8(src - 1 * stride);
129   *q0 = vld1q_u8(src + 0 * stride);
130   *q1 = vld1q_u8(src + 1 * stride);
131 }
132 
Load16x8_NEON(const uint8_t * const src,int stride,uint8x16_t * const p3,uint8x16_t * const p2,uint8x16_t * const p1,uint8x16_t * const p0,uint8x16_t * const q0,uint8x16_t * const q1,uint8x16_t * const q2,uint8x16_t * const q3)133 static WEBP_INLINE void Load16x8_NEON(
134     const uint8_t* const src, int stride,
135     uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
136     uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
137     uint8x16_t* const q2, uint8x16_t* const q3) {
138   Load16x4_NEON(src - 2  * stride, stride, p3, p2, p1, p0);
139   Load16x4_NEON(src + 2  * stride, stride, q0, q1, q2, q3);
140 }
141 
Load8x8x2_NEON(const uint8_t * const u,const uint8_t * const v,int stride,uint8x16_t * const p3,uint8x16_t * const p2,uint8x16_t * const p1,uint8x16_t * const p0,uint8x16_t * const q0,uint8x16_t * const q1,uint8x16_t * const q2,uint8x16_t * const q3)142 static WEBP_INLINE void Load8x8x2_NEON(
143     const uint8_t* const u, const uint8_t* const v, int stride,
144     uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
145     uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
146     uint8x16_t* const q2, uint8x16_t* const q3) {
147   // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
148   // and the v-samples on the higher half.
149   *p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride));
150   *p2 = vcombine_u8(vld1_u8(u - 3 * stride), vld1_u8(v - 3 * stride));
151   *p1 = vcombine_u8(vld1_u8(u - 2 * stride), vld1_u8(v - 2 * stride));
152   *p0 = vcombine_u8(vld1_u8(u - 1 * stride), vld1_u8(v - 1 * stride));
153   *q0 = vcombine_u8(vld1_u8(u + 0 * stride), vld1_u8(v + 0 * stride));
154   *q1 = vcombine_u8(vld1_u8(u + 1 * stride), vld1_u8(v + 1 * stride));
155   *q2 = vcombine_u8(vld1_u8(u + 2 * stride), vld1_u8(v + 2 * stride));
156   *q3 = vcombine_u8(vld1_u8(u + 3 * stride), vld1_u8(v + 3 * stride));
157 }
158 
159 #if !defined(WORK_AROUND_GCC)
160 
// Loads eight bytes of row 'ROW' starting four bytes to the left of 'u', and
// the same span relative to 'v', returning them as one 16-byte vector (u in
// the low half, v in the high half). 'u', 'v' and 'stride' are taken from the
// enclosing scope.
#define LOAD_UV_8(ROW) \
  vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride))

// Loads an 8x8 block around the vertical edge of both chroma planes and
// transposes it, so that each output vector holds one column: *p3 is the
// column at offset -4 from 'u'/'v', *p0 the column at offset -1, *q0 the
// column at offset 0 and *q3 the column at offset +3. In every output the
// eight u samples occupy the low half and the eight v samples the high half.
static WEBP_INLINE void Load8x8x2T_NEON(
    const uint8_t* const u, const uint8_t* const v, int stride,
    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
    uint8x16_t* const q2, uint8x16_t* const q3) {
  // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
  // and the v-samples on the higher half.
  const uint8x16_t row0 = LOAD_UV_8(0);
  const uint8x16_t row1 = LOAD_UV_8(1);
  const uint8x16_t row2 = LOAD_UV_8(2);
  const uint8x16_t row3 = LOAD_UV_8(3);
  const uint8x16_t row4 = LOAD_UV_8(4);
  const uint8x16_t row5 = LOAD_UV_8(5);
  const uint8x16_t row6 = LOAD_UV_8(6);
  const uint8x16_t row7 = LOAD_UV_8(7);
  // Perform two side-by-side 8x8 transposes
  // u00 u01 u02 u03 u04 u05 u06 u07 | v00 v01 v02 v03 v04 v05 v06 v07
  // u10 u11 u12 u13 u14 u15 u16 u17 | v10 v11 v12 ...
  // u20 u21 u22 u23 u24 u25 u26 u27 | v20 v21 ...
  // u30 u31 u32 u33 u34 u35 u36 u37 | ...
  // u40 u41 u42 u43 u44 u45 u46 u47 | ...
  // u50 u51 u52 u53 u54 u55 u56 u57 | ...
  // u60 u61 u62 u63 u64 u65 u66 u67 | v60 ...
  // u70 u71 u72 u73 u74 u75 u76 u77 | v70 v71 v72 ...
  // Stage 1: swap single bytes between adjacent row pairs.
  const uint8x16x2_t row01 = vtrnq_u8(row0, row1);  // u00 u10 u02 u12 ...
                                                    // u01 u11 u03 u13 ...
  const uint8x16x2_t row23 = vtrnq_u8(row2, row3);  // u20 u30 u22 u32 ...
                                                    // u21 u31 u23 u33 ...
  const uint8x16x2_t row45 = vtrnq_u8(row4, row5);  // ...
  const uint8x16x2_t row67 = vtrnq_u8(row6, row7);  // ...
  // Stage 2: swap 16-bit pairs, giving runs of four transposed samples.
  const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
                                       vreinterpretq_u16_u8(row23.val[0]));
  const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
                                       vreinterpretq_u16_u8(row23.val[1]));
  const uint16x8x2_t row46 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[0]),
                                       vreinterpretq_u16_u8(row67.val[0]));
  const uint16x8x2_t row57 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[1]),
                                       vreinterpretq_u16_u8(row67.val[1]));
  // Stage 3: swap 32-bit groups; val[0] of each result is one of columns
  // 0..3 and val[1] is the column four positions further right.
  const uint32x4x2_t row04 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[0]),
                                       vreinterpretq_u32_u16(row46.val[0]));
  const uint32x4x2_t row26 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[1]),
                                       vreinterpretq_u32_u16(row46.val[1]));
  const uint32x4x2_t row15 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[0]),
                                       vreinterpretq_u32_u16(row57.val[0]));
  const uint32x4x2_t row37 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[1]),
                                       vreinterpretq_u32_u16(row57.val[1]));
  // Columns 0..3 are the p3..p0 samples, columns 4..7 the q0..q3 samples.
  *p3 = vreinterpretq_u8_u32(row04.val[0]);
  *p2 = vreinterpretq_u8_u32(row15.val[0]);
  *p1 = vreinterpretq_u8_u32(row26.val[0]);
  *p0 = vreinterpretq_u8_u32(row37.val[0]);
  *q0 = vreinterpretq_u8_u32(row04.val[1]);
  *q1 = vreinterpretq_u8_u32(row15.val[1]);
  *q2 = vreinterpretq_u8_u32(row26.val[1]);
  *q3 = vreinterpretq_u8_u32(row37.val[1]);
}
#undef LOAD_UV_8
220 
221 #endif  // !WORK_AROUND_GCC
222 
Store2x8_NEON(const uint8x8x2_t v,uint8_t * const dst,int stride)223 static WEBP_INLINE void Store2x8_NEON(const uint8x8x2_t v,
224                                       uint8_t* const dst, int stride) {
225   vst2_lane_u8(dst + 0 * stride, v, 0);
226   vst2_lane_u8(dst + 1 * stride, v, 1);
227   vst2_lane_u8(dst + 2 * stride, v, 2);
228   vst2_lane_u8(dst + 3 * stride, v, 3);
229   vst2_lane_u8(dst + 4 * stride, v, 4);
230   vst2_lane_u8(dst + 5 * stride, v, 5);
231   vst2_lane_u8(dst + 6 * stride, v, 6);
232   vst2_lane_u8(dst + 7 * stride, v, 7);
233 }
234 
Store2x16_NEON(const uint8x16_t p0,const uint8x16_t q0,uint8_t * const dst,int stride)235 static WEBP_INLINE void Store2x16_NEON(const uint8x16_t p0, const uint8x16_t q0,
236                                        uint8_t* const dst, int stride) {
237   uint8x8x2_t lo, hi;
238   lo.val[0] = vget_low_u8(p0);
239   lo.val[1] = vget_low_u8(q0);
240   hi.val[0] = vget_high_u8(p0);
241   hi.val[1] = vget_high_u8(q0);
242   Store2x8_NEON(lo, dst - 1 + 0 * stride, stride);
243   Store2x8_NEON(hi, dst - 1 + 8 * stride, stride);
244 }
245 
246 #if !defined(WORK_AROUND_GCC)
Store4x8_NEON(const uint8x8x4_t v,uint8_t * const dst,int stride)247 static WEBP_INLINE void Store4x8_NEON(const uint8x8x4_t v,
248                                       uint8_t* const dst, int stride) {
249   vst4_lane_u8(dst + 0 * stride, v, 0);
250   vst4_lane_u8(dst + 1 * stride, v, 1);
251   vst4_lane_u8(dst + 2 * stride, v, 2);
252   vst4_lane_u8(dst + 3 * stride, v, 3);
253   vst4_lane_u8(dst + 4 * stride, v, 4);
254   vst4_lane_u8(dst + 5 * stride, v, 5);
255   vst4_lane_u8(dst + 6 * stride, v, 6);
256   vst4_lane_u8(dst + 7 * stride, v, 7);
257 }
258 
Store4x16_NEON(const uint8x16_t p1,const uint8x16_t p0,const uint8x16_t q0,const uint8x16_t q1,uint8_t * const dst,int stride)259 static WEBP_INLINE void Store4x16_NEON(const uint8x16_t p1, const uint8x16_t p0,
260                                        const uint8x16_t q0, const uint8x16_t q1,
261                                        uint8_t* const dst, int stride) {
262   uint8x8x4_t lo, hi;
263   INIT_VECTOR4(lo,
264                vget_low_u8(p1), vget_low_u8(p0),
265                vget_low_u8(q0), vget_low_u8(q1));
266   INIT_VECTOR4(hi,
267                vget_high_u8(p1), vget_high_u8(p0),
268                vget_high_u8(q0), vget_high_u8(q1));
269   Store4x8_NEON(lo, dst - 2 + 0 * stride, stride);
270   Store4x8_NEON(hi, dst - 2 + 8 * stride, stride);
271 }
272 #endif  // !WORK_AROUND_GCC
273 
Store16x2_NEON(const uint8x16_t p0,const uint8x16_t q0,uint8_t * const dst,int stride)274 static WEBP_INLINE void Store16x2_NEON(const uint8x16_t p0, const uint8x16_t q0,
275                                        uint8_t* const dst, int stride) {
276   vst1q_u8(dst - stride, p0);
277   vst1q_u8(dst, q0);
278 }
279 
Store16x4_NEON(const uint8x16_t p1,const uint8x16_t p0,const uint8x16_t q0,const uint8x16_t q1,uint8_t * const dst,int stride)280 static WEBP_INLINE void Store16x4_NEON(const uint8x16_t p1, const uint8x16_t p0,
281                                        const uint8x16_t q0, const uint8x16_t q1,
282                                        uint8_t* const dst, int stride) {
283   Store16x2_NEON(p1, p0, dst - stride, stride);
284   Store16x2_NEON(q0, q1, dst + stride, stride);
285 }
286 
Store8x2x2_NEON(const uint8x16_t p0,const uint8x16_t q0,uint8_t * const u,uint8_t * const v,int stride)287 static WEBP_INLINE void Store8x2x2_NEON(const uint8x16_t p0,
288                                         const uint8x16_t q0,
289                                         uint8_t* const u, uint8_t* const v,
290                                         int stride) {
291   // p0 and q0 contain the u+v samples packed in low/high halves.
292   vst1_u8(u - stride, vget_low_u8(p0));
293   vst1_u8(u,          vget_low_u8(q0));
294   vst1_u8(v - stride, vget_high_u8(p0));
295   vst1_u8(v,          vget_high_u8(q0));
296 }
297 
Store8x4x2_NEON(const uint8x16_t p1,const uint8x16_t p0,const uint8x16_t q0,const uint8x16_t q1,uint8_t * const u,uint8_t * const v,int stride)298 static WEBP_INLINE void Store8x4x2_NEON(const uint8x16_t p1,
299                                         const uint8x16_t p0,
300                                         const uint8x16_t q0,
301                                         const uint8x16_t q1,
302                                         uint8_t* const u, uint8_t* const v,
303                                         int stride) {
304   // The p1...q1 registers contain the u+v samples packed in low/high halves.
305   Store8x2x2_NEON(p1, p0, u - stride, v - stride, stride);
306   Store8x2x2_NEON(q0, q1, u + stride, v + stride, stride);
307 }
308 
309 #if !defined(WORK_AROUND_GCC)
310 
311 #define STORE6_LANE(DST, VAL0, VAL1, LANE) do {   \
312   vst3_lane_u8((DST) - 3, (VAL0), (LANE));        \
313   vst3_lane_u8((DST) + 0, (VAL1), (LANE));        \
314   (DST) += stride;                                \
315 } while (0)
316 
Store6x8x2_NEON(const uint8x16_t p2,const uint8x16_t p1,const uint8x16_t p0,const uint8x16_t q0,const uint8x16_t q1,const uint8x16_t q2,uint8_t * u,uint8_t * v,int stride)317 static WEBP_INLINE void Store6x8x2_NEON(
318     const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
319     const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
320     uint8_t* u, uint8_t* v, int stride) {
321   uint8x8x3_t u0, u1, v0, v1;
322   INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0));
323   INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2));
324   INIT_VECTOR3(v0, vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0));
325   INIT_VECTOR3(v1, vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2));
326   STORE6_LANE(u, u0, u1, 0);
327   STORE6_LANE(u, u0, u1, 1);
328   STORE6_LANE(u, u0, u1, 2);
329   STORE6_LANE(u, u0, u1, 3);
330   STORE6_LANE(u, u0, u1, 4);
331   STORE6_LANE(u, u0, u1, 5);
332   STORE6_LANE(u, u0, u1, 6);
333   STORE6_LANE(u, u0, u1, 7);
334   STORE6_LANE(v, v0, v1, 0);
335   STORE6_LANE(v, v0, v1, 1);
336   STORE6_LANE(v, v0, v1, 2);
337   STORE6_LANE(v, v0, v1, 3);
338   STORE6_LANE(v, v0, v1, 4);
339   STORE6_LANE(v, v0, v1, 5);
340   STORE6_LANE(v, v0, v1, 6);
341   STORE6_LANE(v, v0, v1, 7);
342 }
343 #undef STORE6_LANE
344 
Store4x8x2_NEON(const uint8x16_t p1,const uint8x16_t p0,const uint8x16_t q0,const uint8x16_t q1,uint8_t * const u,uint8_t * const v,int stride)345 static WEBP_INLINE void Store4x8x2_NEON(const uint8x16_t p1,
346                                         const uint8x16_t p0,
347                                         const uint8x16_t q0,
348                                         const uint8x16_t q1,
349                                         uint8_t* const u, uint8_t* const v,
350                                         int stride) {
351   uint8x8x4_t u0, v0;
352   INIT_VECTOR4(u0,
353                vget_low_u8(p1), vget_low_u8(p0),
354                vget_low_u8(q0), vget_low_u8(q1));
355   INIT_VECTOR4(v0,
356                vget_high_u8(p1), vget_high_u8(p0),
357                vget_high_u8(q0), vget_high_u8(q1));
358   vst4_lane_u8(u - 2 + 0 * stride, u0, 0);
359   vst4_lane_u8(u - 2 + 1 * stride, u0, 1);
360   vst4_lane_u8(u - 2 + 2 * stride, u0, 2);
361   vst4_lane_u8(u - 2 + 3 * stride, u0, 3);
362   vst4_lane_u8(u - 2 + 4 * stride, u0, 4);
363   vst4_lane_u8(u - 2 + 5 * stride, u0, 5);
364   vst4_lane_u8(u - 2 + 6 * stride, u0, 6);
365   vst4_lane_u8(u - 2 + 7 * stride, u0, 7);
366   vst4_lane_u8(v - 2 + 0 * stride, v0, 0);
367   vst4_lane_u8(v - 2 + 1 * stride, v0, 1);
368   vst4_lane_u8(v - 2 + 2 * stride, v0, 2);
369   vst4_lane_u8(v - 2 + 3 * stride, v0, 3);
370   vst4_lane_u8(v - 2 + 4 * stride, v0, 4);
371   vst4_lane_u8(v - 2 + 5 * stride, v0, 5);
372   vst4_lane_u8(v - 2 + 6 * stride, v0, 6);
373   vst4_lane_u8(v - 2 + 7 * stride, v0, 7);
374 }
375 
376 #endif  // !WORK_AROUND_GCC
377 
378 // Zero extend 'v' to an int16x8_t.
ConvertU8ToS16_NEON(uint8x8_t v)379 static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint8x8_t v) {
380   return vreinterpretq_s16_u16(vmovl_u8(v));
381 }
382 
383 // Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
384 // to the corresponding rows of 'dst'.
SaturateAndStore4x4_NEON(uint8_t * const dst,const int16x8_t dst01,const int16x8_t dst23)385 static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
386                                                  const int16x8_t dst01,
387                                                  const int16x8_t dst23) {
388   // Unsigned saturate to 8b.
389   const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
390   const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
391 
392   // Store the results.
393   vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
394   vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
395   vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
396   vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
397 }
398 
Add4x4_NEON(const int16x8_t row01,const int16x8_t row23,uint8_t * const dst)399 static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
400                                     const int16x8_t row23,
401                                     uint8_t* const dst) {
402   uint32x2_t dst01 = vdup_n_u32(0);
403   uint32x2_t dst23 = vdup_n_u32(0);
404 
405   // Load the source pixels.
406   dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0);
407   dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0);
408   dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1);
409   dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1);
410 
411   {
412     // Convert to 16b.
413     const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst01));
414     const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst23));
415 
416     // Descale with rounding.
417     const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
418     const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
419     // Add the inverse transform.
420     SaturateAndStore4x4_NEON(dst, out01, out23);
421   }
422 }
423 
424 //-----------------------------------------------------------------------------
425 // Simple In-loop filtering (Paragraph 15.2)
426 
// Returns a per-lane mask that is all-ones where
//   2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= thresh
// (both additions saturate at 255) and zero elsewhere. 'thresh' is truncated
// to 8 bits before the comparison.
static uint8x16_t NeedsFilter_NEON(const uint8x16_t p1, const uint8x16_t p0,
                                   const uint8x16_t q0, const uint8x16_t q1,
                                   int thresh) {
  const uint8x16_t diff_p0q0 = vabdq_u8(p0, q0);
  const uint8x16_t diff_p1q1 = vabdq_u8(p1, q1);
  const uint8x16_t twice_p0q0 = vqaddq_u8(diff_p0q0, diff_p0q0);
  const uint8x16_t half_p1q1 = vshrq_n_u8(diff_p1q1, 1);
  const uint8x16_t activity = vqaddq_u8(twice_p0q0, half_p1q1);
  const uint8x16_t limit = vdupq_n_u8((uint8_t)thresh);
  return vcgeq_u8(limit, activity);
}
439 
FlipSign_NEON(const uint8x16_t v)440 static int8x16_t FlipSign_NEON(const uint8x16_t v) {
441   const uint8x16_t sign_bit = vdupq_n_u8(0x80);
442   return vreinterpretq_s8_u8(veorq_u8(v, sign_bit));
443 }
444 
FlipSignBack_NEON(const int8x16_t v)445 static uint8x16_t FlipSignBack_NEON(const int8x16_t v) {
446   const int8x16_t sign_bit = vdupq_n_s8(0x80);
447   return vreinterpretq_u8_s8(veorq_s8(v, sign_bit));
448 }
449 
GetBaseDelta_NEON(const int8x16_t p1,const int8x16_t p0,const int8x16_t q0,const int8x16_t q1)450 static int8x16_t GetBaseDelta_NEON(const int8x16_t p1, const int8x16_t p0,
451                                    const int8x16_t q0, const int8x16_t q1) {
452   const int8x16_t q0_p0 = vqsubq_s8(q0, p0);      // (q0-p0)
453   const int8x16_t p1_q1 = vqsubq_s8(p1, q1);      // (p1-q1)
454   const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0);   // (p1-q1) + 1 * (q0 - p0)
455   const int8x16_t s2 = vqaddq_s8(q0_p0, s1);      // (p1-q1) + 2 * (q0 - p0)
456   const int8x16_t s3 = vqaddq_s8(q0_p0, s2);      // (p1-q1) + 3 * (q0 - p0)
457   return s3;
458 }
459 
// Returns 3 * (q0 - p0) per lane, built from saturating signed 8-bit
// operations (every intermediate step clamps).
static int8x16_t GetBaseDelta0_NEON(const int8x16_t p0, const int8x16_t q0) {
  const int8x16_t inner = vqsubq_s8(q0, p0);      // q0 - p0
  const int8x16_t twice = vqaddq_s8(inner, inner);  // 2 * inner
  return vqaddq_s8(inner, twice);                 // 3 * inner
}
466 
467 //------------------------------------------------------------------------------
468 
// Applies the two-pixel filter step in the signed domain:
//   *op0 = p0s + ((delta + 3) >> 3)
//   *oq0 = q0s - ((delta + 4) >> 3)
// All additions and subtractions saturate; the shift is arithmetic. The
// outputs stay signed (no sign flip back to unsigned pixels).
static void ApplyFilter2NoFlip_NEON(const int8x16_t p0s, const int8x16_t q0s,
                                    const int8x16_t delta,
                                    int8x16_t* const op0,
                                    int8x16_t* const oq0) {
  const int8x16_t plus3 = vqaddq_s8(delta, vdupq_n_s8(0x03));
  const int8x16_t plus4 = vqaddq_s8(delta, vdupq_n_s8(0x04));
  const int8x16_t step_p0 = vshrq_n_s8(plus3, 3);
  const int8x16_t step_q0 = vshrq_n_s8(plus4, 3);
  *op0 = vqaddq_s8(p0s, step_p0);
  *oq0 = vqsubq_s8(q0s, step_q0);
}
482 
483 #if defined(WEBP_USE_INTRINSICS)
484 
// Same filter step as ApplyFilter2NoFlip_NEON(), i.e.
//   p0s + ((delta + 3) >> 3)   and   q0s - ((delta + 4) >> 3),
// but the two results are converted back to unsigned pixel values before being
// stored to '*op0' and '*oq0'.
static void ApplyFilter2_NEON(const int8x16_t p0s, const int8x16_t q0s,
                              const int8x16_t delta,
                              uint8x16_t* const op0, uint8x16_t* const oq0) {
  int8x16_t signed_p0, signed_q0;
  ApplyFilter2NoFlip_NEON(p0s, q0s, delta, &signed_p0, &signed_q0);
  *op0 = FlipSignBack_NEON(signed_p0);
  *oq0 = FlipSignBack_NEON(signed_q0);
}
499 
// Runs the two-pixel filter on sixteen lanes: converts the four input pixel
// vectors to the signed domain, computes the base delta, zeroes it in lanes
// where 'mask' is clear, and stores the updated p0 / q0 pixels (unsigned) to
// '*op0' and '*oq0'.
static void DoFilter2_NEON(const uint8x16_t p1, const uint8x16_t p0,
                           const uint8x16_t q0, const uint8x16_t q1,
                           const uint8x16_t mask,
                           uint8x16_t* const op0, uint8x16_t* const oq0) {
  const int8x16_t sp1 = FlipSign_NEON(p1);
  const int8x16_t sp0 = FlipSign_NEON(p0);
  const int8x16_t sq0 = FlipSign_NEON(q0);
  const int8x16_t sq1 = FlipSign_NEON(q1);
  const int8x16_t raw_delta = GetBaseDelta_NEON(sp1, sp0, sq0, sq1);
  // Lanes that must not be filtered get a zero delta.
  const int8x16_t masked_delta =
      vandq_s8(raw_delta, vreinterpretq_s8_u8(mask));
  ApplyFilter2_NEON(sp0, sq0, masked_delta, op0, oq0);
}
512 
SimpleVFilter16_NEON(uint8_t * p,int stride,int thresh)513 static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
514   uint8x16_t p1, p0, q0, q1, op0, oq0;
515   Load16x4_NEON(p, stride, &p1, &p0, &q0, &q1);
516   {
517     const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
518     DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
519   }
520   Store16x2_NEON(op0, oq0, p, stride);
521 }
522 
SimpleHFilter16_NEON(uint8_t * p,int stride,int thresh)523 static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
524   uint8x16_t p1, p0, q0, q1, oq0, op0;
525   Load4x16_NEON(p, stride, &p1, &p0, &q0, &q1);
526   {
527     const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
528     DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
529   }
530   Store2x16_NEON(op0, oq0, p, stride);
531 }
532 
533 #else
534 
// Load/Store vertical edge
//
// LOAD8x4 expands to eight "vld4.8" single-lane loads (lanes 0 through 7)
// into the four registers c1..c4. The address operand alternates between 'b1'
// (even lanes) and 'b2' (odd lanes), and 'stride' is appended as the trailing
// operand of every load (presumably the post-increment register -- confirm
// against the call sites).
#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride)                                \
  "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \
  "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \
  "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \
  "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \
  "vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \
  "vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \
  "vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \
  "vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n"
545 
// STORE8x2 expands to eight "vst2.8" single-lane stores (lanes 0 through 7)
// of the register pair c1/c2. Every store uses the address operand 'p'
// followed by 'stride' (presumably the post-increment register -- confirm
// against the call sites).
#define STORE8x2(c1, c2, p, stride)                                            \
  "vst2.8   {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n"                    \
  "vst2.8   {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n"                    \
  "vst2.8   {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n"                    \
  "vst2.8   {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n"                    \
  "vst2.8   {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n"                    \
  "vst2.8   {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n"                    \
  "vst2.8   {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n"                    \
  "vst2.8   {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n"
555 
// Comma-separated list of NEON register names: q0-q3 and q8-q15 (q4-q7 are
// not listed). NOTE(review): presumably meant for the clobber list of the
// inline-asm statements below -- confirm at the use sites.
#define QRegs "q0", "q1", "q2", "q3",                                          \
              "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
558 
// XORs registers 'a' and 'b' in place with register 's'. DO_FILTER2 passes a
// register filled with 0x80 as 's', which toggles the sign bit of every byte.
#define FLIP_SIGN_BIT2(a, b, s)                                                \
  "veor     " #a "," #a "," #s "               \n"                             \
  "veor     " #b "," #b "," #s "               \n"                             \

// Same as FLIP_SIGN_BIT2, applied to the four registers 'a', 'b', 'c', 'd'.
#define FLIP_SIGN_BIT4(a, b, c, d, s)                                          \
  FLIP_SIGN_BIT2(a, b, s)                                                      \
  FLIP_SIGN_BIT2(c, d, s)                                                      \
566 
// Computes into 'mask' the per-byte predicate
//   2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= thresh
// (all-ones where true). Uses q14 and q15 as scratch registers, so neither
// may be passed as an input or as 'mask' holding live data.
#define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask)                             \
  "vabd.u8    q15," #p0 "," #q0 "         \n"  /* abs(p0 - q0) */              \
  "vabd.u8    q14," #p1 "," #q1 "         \n"  /* abs(p1 - q1) */              \
  "vqadd.u8   q15, q15, q15               \n"  /* abs(p0 - q0) * 2 */          \
  "vshr.u8    q14, q14, #1                \n"  /* abs(p1 - q1) / 2 */          \
  "vqadd.u8   q15, q15, q14     \n"  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
  "vdup.8     q14, " #thresh "            \n"                                  \
  "vcge.u8   " #mask ", q14, q15          \n"  /* mask = (sum <= thresh) */
575 
// Computes o = (p1 - q1) + 3 * (q0 - p0) with saturating signed 8-bit
// arithmetic. Uses q15 as a scratch register to hold (q0 - p0).
#define GET_BASE_DELTA(p1, p0, q0, q1, o)                                      \
  "vqsub.s8   q15," #q0 "," #p0 "         \n"  /* (q0 - p0) */                 \
  "vqsub.s8  " #o "," #p1 "," #q1 "       \n"  /* (p1 - q1) */                 \
  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 1 * (q0 - p0) */ \
  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 2 * (q0 - p0) */ \
  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 3 * (q0 - p0) */
582 
// Updates p0 and q0 in place from the (already masked) filter value 'fl':
//   p0 += (fl + 3) >> 3
//   q0 -= (fl + 4) >> 3
// using saturating signed 8-bit arithmetic. Uses q15 as a scratch register.
#define DO_SIMPLE_FILTER(p0, q0, fl)                                           \
  "vmov.i8    q15, #0x03                  \n"                                  \
  "vqadd.s8   q15, q15, " #fl "           \n"  /* filter1 = filter + 3 */      \
  "vshr.s8    q15, q15, #3                \n"  /* filter1 >> 3 */              \
  "vqadd.s8  " #p0 "," #p0 ", q15         \n"  /* p0 += filter1 */             \
                                                                               \
  "vmov.i8    q15, #0x04                  \n"                                  \
  "vqadd.s8   q15, q15, " #fl "           \n"  /* filter2 = filter + 4 */      \
  "vshr.s8    q15, q15, #3                \n"  /* filter2 >> 3 */              \
  "vqsub.s8  " #q0 "," #q0 ", q15         \n"  /* q0 -= filter2 */
593 
// Applies filter on 2 pixels (p0 and q0)
// The sequence builds the filter mask in q9, flips the sign bit of all four
// inputs (q10 = 0x80 per byte), computes the base delta in q11, masks it and
// updates p0/q0 before flipping their sign bit back. Registers q9, q10, q11,
// q14 and q15 are overwritten; p1 and q1 are left in their sign-flipped form.
#define DO_FILTER2(p1, p0, q0, q1, thresh)                                     \
  NEEDS_FILTER(p1, p0, q0, q1, thresh, q9)     /* filter mask in q9 */         \
  "vmov.i8    q10, #0x80                  \n"  /* sign bit */                  \
  FLIP_SIGN_BIT4(p1, p0, q0, q1, q10)          /* convert to signed value */   \
  GET_BASE_DELTA(p1, p0, q0, q1, q11)          /* get filter level  */         \
  "vand       q9, q9, q11                 \n"  /* apply filter mask */         \
  DO_SIMPLE_FILTER(p0, q0, q9)                 /* apply filter */              \
  FLIP_SIGN_BIT2(p0, q0, q10)
603 
// Simple loop filter over 16 pixels, inline-assembly version.
// Loads four 16-byte rows starting at p - 2 * stride (p1, p0, q0, q1), runs
// DO_FILTER2 on them and writes the two updated rows back: p0 at p - stride
// and q0 at p. 'thresh' is the filter threshold handed to DO_FILTER2.
static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
  __asm__ volatile (
    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride

    "vld1.u8    {q1}, [%[p]], %[stride]        \n"  // p1
    "vld1.u8    {q2}, [%[p]], %[stride]        \n"  // p0
    "vld1.u8    {q3}, [%[p]], %[stride]        \n"  // q0
    "vld1.u8    {q12}, [%[p]]                  \n"  // q1

    DO_FILTER2(q1, q2, q3, q12, %[thresh])

    // The loads advanced p by 3 * stride in total; step back to the p0 row.
    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride

    "vst1.u8    {q2}, [%[p]], %[stride]        \n"  // store op0
    "vst1.u8    {q3}, [%[p]]                   \n"  // store oq0
    : [p] "+r"(p)
    : [stride] "r"(stride), [thresh] "r"(thresh)
    : "memory", QRegs
  );
}
624 
// Simple loop filter, inline-assembly version working on pixels that sit side
// by side within each row: the reads start at p - 2 and the writes at p - 1.
// Two LOAD8x4 invocations fetch the data through the even/odd row pointers
// r4/r5 (each advanced by 2 * stride), the vswp instructions regroup it into
// the p1/p0/q0/q1 registers expected by DO_FILTER2, and two STORE8x2
// invocations write p0/q0 back.
// NOTE(review): LOAD8x4 and STORE8x2 are defined earlier in the file; the
// exact number of rows each one covers should be confirmed there.
// r4, r5 and r6 are used as hard-coded scratch registers and are declared as
// clobbered.
static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
  __asm__ volatile (
    "sub        r4, %[p], #2                   \n"  // base1 = p - 2
    "lsl        r6, %[stride], #1              \n"  // r6 = 2 * stride
    "add        r5, r4, %[stride]              \n"  // base2 = base1 + stride

    LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
    LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)
    "vswp       d3, d24                        \n"  // p1:q1 p0:q3
    "vswp       d5, d26                        \n"  // q0:q2 q1:q4
    "vswp       q2, q12                        \n"  // p1:q1 p0:q2 q0:q3 q1:q4

    DO_FILTER2(q1, q2, q12, q13, %[thresh])

    "sub        %[p], %[p], #1                 \n"  // p - 1

    // Regroup the filtered p0 (q2) and q0 (q12) halves for the paired stores.
    "vswp        d5, d24                       \n"
    STORE8x2(d4, d5, [%[p]], %[stride])
    STORE8x2(d24, d25, [%[p]], %[stride])

    : [p] "+r"(p)
    : [stride] "r"(stride), [thresh] "r"(thresh)
    : "memory", "r4", "r5", "r6", QRegs
  );
}
650 
651 #undef LOAD8x4
652 #undef STORE8x2
653 
654 #endif    // WEBP_USE_INTRINSICS
655 
// Runs SimpleVFilter16_NEON on three edges located 4, 8 and 12 rows below 'p'.
static void SimpleVFilter16i_NEON(uint8_t* p, int stride, int thresh) {
  int edge;
  for (edge = 0; edge < 3; ++edge) {
    p += 4 * stride;   // move down to the next edge
    SimpleVFilter16_NEON(p, stride, thresh);
  }
}
663 
// Runs SimpleHFilter16_NEON on three edges located 4, 8 and 12 pixels to the
// right of 'p'.
static void SimpleHFilter16i_NEON(uint8_t* p, int stride, int thresh) {
  int edge;
  for (edge = 0; edge < 3; ++edge) {
    p += 4;   // move right to the next edge
    SimpleHFilter16_NEON(p, stride, thresh);
  }
}
671 
672 //------------------------------------------------------------------------------
673 // Complex In-loop filtering (Paragraph 15.3)
674 
// Returns a per-lane mask that is all ones where
//   max(abs(p1 - p0), abs(q1 - q0)) > hev_thresh
// and zero elsewhere. 'hev_thresh' is truncated to 8 bits before the compare.
static uint8x16_t NeedsHev_NEON(const uint8x16_t p1, const uint8x16_t p0,
                                const uint8x16_t q0, const uint8x16_t q1,
                                int hev_thresh) {
  const uint8x16_t step_p = vabdq_u8(p1, p0);
  const uint8x16_t step_q = vabdq_u8(q1, q0);
  const uint8x16_t limit = vdupq_n_u8((uint8_t)hev_thresh);
  return vcgtq_u8(vmaxq_u8(step_p, step_q), limit);
}
685 
NeedsFilter2_NEON(const uint8x16_t p3,const uint8x16_t p2,const uint8x16_t p1,const uint8x16_t p0,const uint8x16_t q0,const uint8x16_t q1,const uint8x16_t q2,const uint8x16_t q3,int ithresh,int thresh)686 static uint8x16_t NeedsFilter2_NEON(const uint8x16_t p3, const uint8x16_t p2,
687                                     const uint8x16_t p1, const uint8x16_t p0,
688                                     const uint8x16_t q0, const uint8x16_t q1,
689                                     const uint8x16_t q2, const uint8x16_t q3,
690                                     int ithresh, int thresh) {
691   const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh);
692   const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2);  // abs(p3 - p2)
693   const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1);  // abs(p2 - p1)
694   const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0);  // abs(p1 - p0)
695   const uint8x16_t a_q3_q2 = vabdq_u8(q3, q2);  // abs(q3 - q2)
696   const uint8x16_t a_q2_q1 = vabdq_u8(q2, q1);  // abs(q2 - q1)
697   const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0);  // abs(q1 - q0)
698   const uint8x16_t max1 = vmaxq_u8(a_p3_p2, a_p2_p1);
699   const uint8x16_t max2 = vmaxq_u8(a_p1_p0, a_q3_q2);
700   const uint8x16_t max3 = vmaxq_u8(a_q2_q1, a_q1_q0);
701   const uint8x16_t max12 = vmaxq_u8(max1, max2);
702   const uint8x16_t max123 = vmaxq_u8(max12, max3);
703   const uint8x16_t mask2 = vcgeq_u8(ithresh_v, max123);
704   const uint8x16_t mask1 = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
705   const uint8x16_t mask = vandq_u8(mask1, mask2);
706   return mask;
707 }
708 
709 //  4-points filter
710 
// Applies the 4-pixel correction derived from 'delta0' to signed pixels:
//   a1 = (delta0 + 4) >> 3,  a2 = (delta0 + 3) >> 3,  a3 = (a1 + 1) >> 1
//   p0 += a2,  q0 -= a1,  p1 += a3,  q1 -= a3      (all saturating)
// Each result goes through FlipSignBack_NEON before being stored in the
// corresponding output.
static void ApplyFilter4_NEON(
    const int8x16_t p1, const int8x16_t p0,
    const int8x16_t q0, const int8x16_t q1,
    const int8x16_t delta0,
    uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1) {
  const int8x16_t a1 = vshrq_n_s8(vqaddq_s8(delta0, vdupq_n_s8(4)), 3);
  const int8x16_t a2 = vshrq_n_s8(vqaddq_s8(delta0, vdupq_n_s8(3)), 3);
  const int8x16_t a3 = vrshrq_n_s8(a1, 1);   // rounding halving of a1
  const int8x16_t new_p0 = vqaddq_s8(p0, a2);
  const int8x16_t new_q0 = vqsubq_s8(q0, a1);
  const int8x16_t new_p1 = vqaddq_s8(p1, a3);
  const int8x16_t new_q1 = vqsubq_s8(q1, a3);
  *op0 = FlipSignBack_NEON(new_p0);
  *oq0 = FlipSignBack_NEON(new_q0);
  *op1 = FlipSignBack_NEON(new_p1);
  *oq1 = FlipSignBack_NEON(new_q1);
}
729 
// Filters four pixels across an edge. Lanes selected by both 'mask' and
// 'hev_mask' get the simple 2-pixel filter (only p0/q0 change); lanes selected
// by 'mask' but not by 'hev_mask' then get the 4-pixel filter. The results are
// written to op1, op0, oq0 and oq1.
static void DoFilter4_NEON(
    const uint8x16_t p1, const uint8x16_t p0,
    const uint8x16_t q0, const uint8x16_t q1,
    const uint8x16_t mask, const uint8x16_t hev_mask,
    uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1) {
  // Work on sign-flipped copies of the pixels.
  const int8x16_t sp1 = FlipSign_NEON(p1);
  const int8x16_t sq1 = FlipSign_NEON(q1);
  int8x16_t sp0 = FlipSign_NEON(p0);
  int8x16_t sq0 = FlipSign_NEON(q0);
  const uint8x16_t hev_lanes = vandq_u8(mask, hev_mask);
  // (mask & hev_mask) ^ mask == mask & ~hev_mask
  const uint8x16_t smooth_lanes = veorq_u8(hev_lanes, mask);
  const int8x16_t hev_delta =
      vandq_s8(GetBaseDelta_NEON(sp1, sp0, sq0, sq1),
               vreinterpretq_s8_u8(hev_lanes));
  int8x16_t smooth_delta;

  // First stage: simple filter, updates sp0/sq0 in place.
  ApplyFilter2NoFlip_NEON(sp0, sq0, hev_delta, &sp0, &sq0);

  // Second stage: the delta is recomputed from the updated sp0/sq0.
  smooth_delta = vandq_s8(GetBaseDelta0_NEON(sp0, sq0),
                          vreinterpretq_s8_u8(smooth_lanes));
  ApplyFilter4_NEON(sp1, sp0, sq0, sq1, smooth_delta, op1, op0, oq0, oq1);
}
761 
762 //  6-points filter
763 
// Applies the 6-pixel correction derived from 'delta' (called 'a' below) to
// signed pixels and stores the results after FlipSignBack_NEON:
//   p0 += a1, q0 -= a1, p1 += a2, q1 -= a2, p2 += a3, q2 -= a3  (saturating)
// where a1 = (27*a + 63) >> 7, a2 = (18*a + 63) >> 7, a3 = (9*a + 63) >> 7.
static void ApplyFilter6_NEON(
    const int8x16_t p2, const int8x16_t p1, const int8x16_t p0,
    const int8x16_t q0, const int8x16_t q1, const int8x16_t q2,
    const int8x16_t delta,
    uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
  // With S = 9*a - 1 held in 16 bits, the rounding narrowing shift
  // vqrshrn_n_s16 yields all three weights:
  //   a3 = (S + 64) >> 7,  a2 = (S + 32) >> 6,  a1 = (S + 18*a + 64) >> 7
  const int16x8_t minus_one = vdupq_n_s16(-1);
  const int8x8_t nine = vdup_n_s8(9);
  const int8x8_t eighteen = vdup_n_s8(18);
  const int8x8_t lo = vget_low_s8(delta);
  const int8x8_t hi = vget_high_s8(delta);
  const int16x8_t s_lo = vmlal_s8(minus_one, nine, lo);    // 9*a - 1
  const int16x8_t s_hi = vmlal_s8(minus_one, nine, hi);
  const int16x8_t t_lo = vmlal_s8(s_lo, eighteen, lo);     // 27*a - 1
  const int16x8_t t_hi = vmlal_s8(s_hi, eighteen, hi);
  const int8x16_t a1 =
      vcombine_s8(vqrshrn_n_s16(t_lo, 7), vqrshrn_n_s16(t_hi, 7));
  const int8x16_t a2 =
      vcombine_s8(vqrshrn_n_s16(s_lo, 6), vqrshrn_n_s16(s_hi, 6));
  const int8x16_t a3 =
      vcombine_s8(vqrshrn_n_s16(s_lo, 7), vqrshrn_n_s16(s_hi, 7));

  *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a1));
  *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1));
  *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a2));
  *op1 = FlipSignBack_NEON(vqaddq_s8(p1, a2));
  *oq2 = FlipSignBack_NEON(vqsubq_s8(q2, a3));
  *op2 = FlipSignBack_NEON(vqaddq_s8(p2, a3));
}
800 
// Filters six pixels across an edge. Lanes selected by both 'mask' and
// 'hev_mask' get the simple 2-pixel filter (only p0/q0 change); lanes selected
// by 'mask' but not by 'hev_mask' get the 6-pixel filter. Both stages use the
// same base delta, computed once from the unfiltered pixels.
static void DoFilter6_NEON(
    const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
    const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
    const uint8x16_t mask, const uint8x16_t hev_mask,
    uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
  // Work on sign-flipped copies of the pixels.
  const int8x16_t sp2 = FlipSign_NEON(p2);
  const int8x16_t sp1 = FlipSign_NEON(p1);
  const int8x16_t sq1 = FlipSign_NEON(q1);
  const int8x16_t sq2 = FlipSign_NEON(q2);
  int8x16_t sp0 = FlipSign_NEON(p0);
  int8x16_t sq0 = FlipSign_NEON(q0);
  const uint8x16_t hev_lanes = vandq_u8(mask, hev_mask);
  // (mask & hev_mask) ^ mask == mask & ~hev_mask
  const uint8x16_t smooth_lanes = veorq_u8(hev_lanes, mask);
  const int8x16_t base_delta = GetBaseDelta_NEON(sp1, sp0, sq0, sq1);
  const int8x16_t hev_delta =
      vandq_s8(base_delta, vreinterpretq_s8_u8(hev_lanes));
  const int8x16_t smooth_delta =
      vandq_s8(base_delta, vreinterpretq_s8_u8(smooth_lanes));

  // First stage: simple filter, updates sp0/sq0 in place.
  ApplyFilter2NoFlip_NEON(sp0, sq0, hev_delta, &sp0, &sq0);
  // Second stage: 6-pixel filter on the remaining selected lanes.
  ApplyFilter6_NEON(sp2, sp1, sp0, sq0, sq1, sq2, smooth_delta,
                    op2, op1, op0, oq0, oq1, oq2);
}
834 
835 // on macroblock edges
836 
VFilter16_NEON(uint8_t * p,int stride,int thresh,int ithresh,int hev_thresh)837 static void VFilter16_NEON(uint8_t* p, int stride,
838                            int thresh, int ithresh, int hev_thresh) {
839   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
840   Load16x8_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
841   {
842     const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
843                                               ithresh, thresh);
844     const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
845     uint8x16_t op2, op1, op0, oq0, oq1, oq2;
846     DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
847                    &op2, &op1, &op0, &oq0, &oq1, &oq2);
848     Store16x2_NEON(op2, op1, p - 2 * stride, stride);
849     Store16x2_NEON(op0, oq0, p + 0 * stride, stride);
850     Store16x2_NEON(oq1, oq2, p + 2 * stride, stride);
851   }
852 }
853 
HFilter16_NEON(uint8_t * p,int stride,int thresh,int ithresh,int hev_thresh)854 static void HFilter16_NEON(uint8_t* p, int stride,
855                            int thresh, int ithresh, int hev_thresh) {
856   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
857   Load8x16_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
858   {
859     const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
860                                               ithresh, thresh);
861     const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
862     uint8x16_t op2, op1, op0, oq0, oq1, oq2;
863     DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
864                    &op2, &op1, &op0, &oq0, &oq1, &oq2);
865     Store2x16_NEON(op2, op1, p - 2, stride);
866     Store2x16_NEON(op0, oq0, p + 0, stride);
867     Store2x16_NEON(oq1, oq2, p + 2, stride);
868   }
869 }
870 
871 // on three inner edges
VFilter16i_NEON(uint8_t * p,int stride,int thresh,int ithresh,int hev_thresh)872 static void VFilter16i_NEON(uint8_t* p, int stride,
873                             int thresh, int ithresh, int hev_thresh) {
874   uint32_t k;
875   uint8x16_t p3, p2, p1, p0;
876   Load16x4_NEON(p + 2  * stride, stride, &p3, &p2, &p1, &p0);
877   for (k = 3; k != 0; --k) {
878     uint8x16_t q0, q1, q2, q3;
879     p += 4 * stride;
880     Load16x4_NEON(p + 2  * stride, stride, &q0, &q1, &q2, &q3);
881     {
882       const uint8x16_t mask =
883           NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
884       const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
885       // p3 and p2 are not just temporary variables here: they will be
886       // re-used for next span. And q2/q3 will become p1/p0 accordingly.
887       DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
888       Store16x4_NEON(p1, p0, p3, p2, p, stride);
889       p1 = q2;
890       p0 = q3;
891     }
892   }
893 }
894 
895 #if !defined(WORK_AROUND_GCC)
HFilter16i_NEON(uint8_t * p,int stride,int thresh,int ithresh,int hev_thresh)896 static void HFilter16i_NEON(uint8_t* p, int stride,
897                             int thresh, int ithresh, int hev_thresh) {
898   uint32_t k;
899   uint8x16_t p3, p2, p1, p0;
900   Load4x16_NEON(p + 2, stride, &p3, &p2, &p1, &p0);
901   for (k = 3; k != 0; --k) {
902     uint8x16_t q0, q1, q2, q3;
903     p += 4;
904     Load4x16_NEON(p + 2, stride, &q0, &q1, &q2, &q3);
905     {
906       const uint8x16_t mask =
907           NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
908       const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
909       DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
910       Store4x16_NEON(p1, p0, p3, p2, p, stride);
911       p1 = q2;
912       p0 = q3;
913     }
914   }
915 }
916 #endif  // !WORK_AROUND_GCC
917 
918 // 8-pixels wide variant, for chroma filtering
VFilter8_NEON(uint8_t * u,uint8_t * v,int stride,int thresh,int ithresh,int hev_thresh)919 static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
920                           int thresh, int ithresh, int hev_thresh) {
921   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
922   Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
923   {
924     const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
925                                               ithresh, thresh);
926     const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
927     uint8x16_t op2, op1, op0, oq0, oq1, oq2;
928     DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
929                    &op2, &op1, &op0, &oq0, &oq1, &oq2);
930     Store8x2x2_NEON(op2, op1, u - 2 * stride, v - 2 * stride, stride);
931     Store8x2x2_NEON(op0, oq0, u + 0 * stride, v + 0 * stride, stride);
932     Store8x2x2_NEON(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
933   }
934 }
VFilter8i_NEON(uint8_t * u,uint8_t * v,int stride,int thresh,int ithresh,int hev_thresh)935 static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
936                            int thresh, int ithresh, int hev_thresh) {
937   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
938   u += 4 * stride;
939   v += 4 * stride;
940   Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
941   {
942     const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
943                                               ithresh, thresh);
944     const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
945     uint8x16_t op1, op0, oq0, oq1;
946     DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
947     Store8x4x2_NEON(op1, op0, oq0, oq1, u, v, stride);
948   }
949 }
950 
951 #if !defined(WORK_AROUND_GCC)
HFilter8_NEON(uint8_t * u,uint8_t * v,int stride,int thresh,int ithresh,int hev_thresh)952 static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
953                           int thresh, int ithresh, int hev_thresh) {
954   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
955   Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
956   {
957     const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
958                                               ithresh, thresh);
959     const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
960     uint8x16_t op2, op1, op0, oq0, oq1, oq2;
961     DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
962                    &op2, &op1, &op0, &oq0, &oq1, &oq2);
963     Store6x8x2_NEON(op2, op1, op0, oq0, oq1, oq2, u, v, stride);
964   }
965 }
966 
HFilter8i_NEON(uint8_t * u,uint8_t * v,int stride,int thresh,int ithresh,int hev_thresh)967 static void HFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
968                            int thresh, int ithresh, int hev_thresh) {
969   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
970   u += 4;
971   v += 4;
972   Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
973   {
974     const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
975                                               ithresh, thresh);
976     const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
977     uint8x16_t op1, op0, oq0, oq1;
978     DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
979     Store4x8x2_NEON(op1, op0, oq0, oq1, u, v, stride);
980   }
981 }
982 #endif  // !WORK_AROUND_GCC
983 
984 //-----------------------------------------------------------------------------
985 // Inverse transforms (Paragraph 14.4)
986 
987 // Technically these are unsigned but vqdmulh is only available in signed.
988 // vqdmulh returns high half (effectively >> 16) but also doubles the value,
989 // changing the >> 16 to >> 15 and requiring an additional >> 1.
990 // We use this to our advantage with kC2. The canonical value is 35468.
991 // However, the high bit is set so treating it as signed will give incorrect
992 // results. We avoid this by down shifting by 1 here to clear the highest bit.
993 // Combined with the doubling effect of vqdmulh we get >> 16.
994 // This can not be applied to kC1 because the lowest bit is set. Down shifting
995 // the constant would reduce precision.
996 
997 // libwebp uses a trick to avoid some extra addition that libvpx does.
998 // Instead of:
999 // temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
1000 // libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
1001 // same issue with kC1 and vqdmulh that we work around by down shifting kC2
1002 
// Inverse-transform multipliers in the form expected by vqdmulh (see the
// explanation above): kC1 is used as is, with the doubling undone by a later
// shift; kC2 is stored pre-halved.
static const int16_t kC1 = 20091;
static const int16_t kC2 = 17734;  // 35468 >> 1, i.e. half of the canonical kC2.
1005 
1006 #if defined(WEBP_USE_INTRINSICS)
Transpose8x2_NEON(const int16x8_t in0,const int16x8_t in1,int16x8x2_t * const out)1007 static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
1008                                           const int16x8_t in1,
1009                                           int16x8x2_t* const out) {
1010   // a0 a1 a2 a3 | b0 b1 b2 b3   => a0 b0 c0 d0 | a1 b1 c1 d1
1011   // c0 c1 c2 c3 | d0 d1 d2 d3      a2 b2 c2 d2 | a3 b3 c3 d3
1012   const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
1013                                                   // b0 d0 b1 d1 b2 d2 ...
1014   *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
1015 }
1016 
TransformPass_NEON(int16x8x2_t * const rows)1017 static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
1018   // {rows} = in0 | in4
1019   //          in8 | in12
1020   // B1 = in4 | in12
1021   const int16x8_t B1 =
1022       vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
1023   // C0 = kC1 * in4 | kC1 * in12
1024   // C1 = kC2 * in4 | kC2 * in12
1025   const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
1026   const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
1027   const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
1028                                 vget_low_s16(rows->val[1]));   // in0 + in8
1029   const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
1030                                 vget_low_s16(rows->val[1]));   // in0 - in8
1031   // c = kC2 * in4 - kC1 * in12
1032   // d = kC1 * in4 + kC2 * in12
1033   const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
1034   const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
1035   const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
1036   const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
1037   const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
1038   const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
1039   const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
1040   Transpose8x2_NEON(E0, E1, rows);
1041 }
1042 
TransformOne_NEON(const int16_t * in,uint8_t * dst)1043 static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
1044   int16x8x2_t rows;
1045   INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
1046   TransformPass_NEON(&rows);
1047   TransformPass_NEON(&rows);
1048   Add4x4_NEON(rows.val[0], rows.val[1], dst);
1049 }
1050 
1051 #else
1052 
TransformOne_NEON(const int16_t * in,uint8_t * dst)1053 static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
1054   const int kBPS = BPS;
1055   // kC1, kC2. Padded because vld1.16 loads 8 bytes
1056   const int16_t constants[4] = { kC1, kC2, 0, 0 };
1057   /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */
1058   __asm__ volatile (
1059     "vld1.16         {q1, q2}, [%[in]]           \n"
1060     "vld1.16         {d0}, [%[constants]]        \n"
1061 
1062     /* d2: in[0]
1063      * d3: in[8]
1064      * d4: in[4]
1065      * d5: in[12]
1066      */
1067     "vswp            d3, d4                      \n"
1068 
1069     /* q8 = {in[4], in[12]} * kC1 * 2 >> 16
1070      * q9 = {in[4], in[12]} * kC2 >> 16
1071      */
1072     "vqdmulh.s16     q8, q2, d0[0]               \n"
1073     "vqdmulh.s16     q9, q2, d0[1]               \n"
1074 
1075     /* d22 = a = in[0] + in[8]
1076      * d23 = b = in[0] - in[8]
1077      */
1078     "vqadd.s16       d22, d2, d3                 \n"
1079     "vqsub.s16       d23, d2, d3                 \n"
1080 
1081     /* The multiplication should be x * kC1 >> 16
1082      * However, with vqdmulh we get x * kC1 * 2 >> 16
1083      * (multiply, double, return high half)
1084      * We avoided this in kC2 by pre-shifting the constant.
1085      * q8 = in[4]/[12] * kC1 >> 16
1086      */
1087     "vshr.s16        q8, q8, #1                  \n"
1088 
1089     /* Add {in[4], in[12]} back after the multiplication. This is handled by
1090      * adding 1 << 16 to kC1 in the libwebp C code.
1091      */
1092     "vqadd.s16       q8, q2, q8                  \n"
1093 
1094     /* d20 = c = in[4]*kC2 - in[12]*kC1
1095      * d21 = d = in[4]*kC1 + in[12]*kC2
1096      */
1097     "vqsub.s16       d20, d18, d17               \n"
1098     "vqadd.s16       d21, d19, d16               \n"
1099 
1100     /* d2 = tmp[0] = a + d
1101      * d3 = tmp[1] = b + c
1102      * d4 = tmp[2] = b - c
1103      * d5 = tmp[3] = a - d
1104      */
1105     "vqadd.s16       d2, d22, d21                \n"
1106     "vqadd.s16       d3, d23, d20                \n"
1107     "vqsub.s16       d4, d23, d20                \n"
1108     "vqsub.s16       d5, d22, d21                \n"
1109 
1110     "vzip.16         q1, q2                      \n"
1111     "vzip.16         q1, q2                      \n"
1112 
1113     "vswp            d3, d4                      \n"
1114 
1115     /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
1116      * q9 = {tmp[4], tmp[12]} * kC2 >> 16
1117      */
1118     "vqdmulh.s16     q8, q2, d0[0]               \n"
1119     "vqdmulh.s16     q9, q2, d0[1]               \n"
1120 
1121     /* d22 = a = tmp[0] + tmp[8]
1122      * d23 = b = tmp[0] - tmp[8]
1123      */
1124     "vqadd.s16       d22, d2, d3                 \n"
1125     "vqsub.s16       d23, d2, d3                 \n"
1126 
1127     /* See long winded explanations prior */
1128     "vshr.s16        q8, q8, #1                  \n"
1129     "vqadd.s16       q8, q2, q8                  \n"
1130 
1131     /* d20 = c = in[4]*kC2 - in[12]*kC1
1132      * d21 = d = in[4]*kC1 + in[12]*kC2
1133      */
1134     "vqsub.s16       d20, d18, d17               \n"
1135     "vqadd.s16       d21, d19, d16               \n"
1136 
1137     /* d2 = tmp[0] = a + d
1138      * d3 = tmp[1] = b + c
1139      * d4 = tmp[2] = b - c
1140      * d5 = tmp[3] = a - d
1141      */
1142     "vqadd.s16       d2, d22, d21                \n"
1143     "vqadd.s16       d3, d23, d20                \n"
1144     "vqsub.s16       d4, d23, d20                \n"
1145     "vqsub.s16       d5, d22, d21                \n"
1146 
1147     "vld1.32         d6[0], [%[dst]], %[kBPS]    \n"
1148     "vld1.32         d6[1], [%[dst]], %[kBPS]    \n"
1149     "vld1.32         d7[0], [%[dst]], %[kBPS]    \n"
1150     "vld1.32         d7[1], [%[dst]], %[kBPS]    \n"
1151 
1152     "sub         %[dst], %[dst], %[kBPS], lsl #2 \n"
1153 
1154     /* (val) + 4 >> 3 */
1155     "vrshr.s16       d2, d2, #3                  \n"
1156     "vrshr.s16       d3, d3, #3                  \n"
1157     "vrshr.s16       d4, d4, #3                  \n"
1158     "vrshr.s16       d5, d5, #3                  \n"
1159 
1160     "vzip.16         q1, q2                      \n"
1161     "vzip.16         q1, q2                      \n"
1162 
1163     /* Must accumulate before saturating */
1164     "vmovl.u8        q8, d6                      \n"
1165     "vmovl.u8        q9, d7                      \n"
1166 
1167     "vqadd.s16       q1, q1, q8                  \n"
1168     "vqadd.s16       q2, q2, q9                  \n"
1169 
1170     "vqmovun.s16     d0, q1                      \n"
1171     "vqmovun.s16     d1, q2                      \n"
1172 
1173     "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
1174     "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
1175     "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
1176     "vst1.32         d1[1], [%[dst]]             \n"
1177 
1178     : [in] "+r"(in), [dst] "+r"(dst)  /* modified registers */
1179     : [kBPS] "r"(kBPS), [constants] "r"(constants)  /* constants */
1180     : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  /* clobbered */
1181   );
1182 }
1183 
1184 #endif    // WEBP_USE_INTRINSICS
1185 
// Transforms the block at 'in' into 'dst' and, when 'do_two' is non-zero, also
// the next block (16 coefficients further) into the pixels 4 bytes to the
// right of 'dst'.
static void TransformTwo_NEON(const int16_t* in, uint8_t* dst, int do_two) {
  TransformOne_NEON(in, dst);
  if (!do_two) return;
  TransformOne_NEON(in + 16, dst + 4);
}
1192 
TransformDC_NEON(const int16_t * in,uint8_t * dst)1193 static void TransformDC_NEON(const int16_t* in, uint8_t* dst) {
1194   const int16x8_t DC = vdupq_n_s16(in[0]);
1195   Add4x4_NEON(DC, DC, dst);
1196 }
1197 
1198 //------------------------------------------------------------------------------
1199 
// Stores lane 'col' of rows.val[0..3] to dst[0], dst[16], dst[32] and dst[48]
// (relative to the value of 'dst' on entry), and leaves 'dst' advanced by 64
// elements. 'dst' must be a modifiable pointer lvalue and 'col' a
// compile-time lane index. All parameters are parenthesized so that
// expression arguments expand safely.
#define STORE_WHT(dst, col, rows) do {                      \
  *(dst) = vgetq_lane_s32((rows).val[0], (col)); (dst) += 16; \
  *(dst) = vgetq_lane_s32((rows).val[1], (col)); (dst) += 16; \
  *(dst) = vgetq_lane_s32((rows).val[2], (col)); (dst) += 16; \
  *(dst) = vgetq_lane_s32((rows).val[3], (col)); (dst) += 16; \
} while (0)
1206 
TransformWHT_NEON(const int16_t * in,int16_t * out)1207 static void TransformWHT_NEON(const int16_t* in, int16_t* out) {
1208   int32x4x4_t tmp;
1209 
1210   {
1211     // Load the source.
1212     const int16x4_t in00_03 = vld1_s16(in + 0);
1213     const int16x4_t in04_07 = vld1_s16(in + 4);
1214     const int16x4_t in08_11 = vld1_s16(in + 8);
1215     const int16x4_t in12_15 = vld1_s16(in + 12);
1216     const int32x4_t a0 = vaddl_s16(in00_03, in12_15);  // in[0..3] + in[12..15]
1217     const int32x4_t a1 = vaddl_s16(in04_07, in08_11);  // in[4..7] + in[8..11]
1218     const int32x4_t a2 = vsubl_s16(in04_07, in08_11);  // in[4..7] - in[8..11]
1219     const int32x4_t a3 = vsubl_s16(in00_03, in12_15);  // in[0..3] - in[12..15]
1220     tmp.val[0] = vaddq_s32(a0, a1);
1221     tmp.val[1] = vaddq_s32(a3, a2);
1222     tmp.val[2] = vsubq_s32(a0, a1);
1223     tmp.val[3] = vsubq_s32(a3, a2);
1224     // Arrange the temporary results column-wise.
1225     tmp = Transpose4x4_NEON(tmp);
1226   }
1227 
1228   {
1229     const int32x4_t kCst3 = vdupq_n_s32(3);
1230     const int32x4_t dc = vaddq_s32(tmp.val[0], kCst3);  // add rounder
1231     const int32x4_t a0 = vaddq_s32(dc, tmp.val[3]);
1232     const int32x4_t a1 = vaddq_s32(tmp.val[1], tmp.val[2]);
1233     const int32x4_t a2 = vsubq_s32(tmp.val[1], tmp.val[2]);
1234     const int32x4_t a3 = vsubq_s32(dc, tmp.val[3]);
1235 
1236     tmp.val[0] = vaddq_s32(a0, a1);
1237     tmp.val[1] = vaddq_s32(a3, a2);
1238     tmp.val[2] = vsubq_s32(a0, a1);
1239     tmp.val[3] = vsubq_s32(a3, a2);
1240 
1241     // right shift the results by 3.
1242     tmp.val[0] = vshrq_n_s32(tmp.val[0], 3);
1243     tmp.val[1] = vshrq_n_s32(tmp.val[1], 3);
1244     tmp.val[2] = vshrq_n_s32(tmp.val[2], 3);
1245     tmp.val[3] = vshrq_n_s32(tmp.val[3], 3);
1246 
1247     STORE_WHT(out, 0, tmp);
1248     STORE_WHT(out, 1, tmp);
1249     STORE_WHT(out, 2, tmp);
1250     STORE_WHT(out, 3, tmp);
1251   }
1252 }
1253 
1254 #undef STORE_WHT
1255 
1256 //------------------------------------------------------------------------------
1257 
#define MUL(a, b) (((a) * (b)) >> 16)
// Transform for blocks where only in[0], in[1] and in[4] are read.
// in[1] and in[4] are each scaled by the two fixed-point constants below; the
// in[1] terms vary per lane (+d1, +c1, -c1, -d1) while the in[4] terms vary
// per row (+d4, +c4, -c4, -d4). All vector additions saturate. The four rows
// are then passed to Add4x4_NEON() together with 'dst'.
static void TransformAC3_NEON(const int16_t* in, uint8_t* dst) {
  static const int kC1_full = 20091 + (1 << 16);
  static const int kC2_full = 35468;
  // Scalar products for both AC coefficients.
  const int c1 = MUL(in[1], kC2_full);
  const int d1 = MUL(in[1], kC1_full);
  const int c4 = MUL(in[4], kC2_full);
  const int d4 = MUL(in[4], kC1_full);
  // Per-lane terms, packed as four 16-bit fields (lane 0 in the low bits).
  const uint64_t lane0 = (uint64_t)(d1 & 0xffff);
  const uint64_t lane1 = (uint64_t)(c1 & 0xffff);
  const uint64_t lane2 = (uint64_t)(-c1 & 0xffff);
  const uint64_t lane3 = (uint64_t)(-d1 & 0xffff);
  const int16x4_t lane_terms =
      vcreate_s16(lane0 | (lane1 << 16) | (lane2 << 32) | (lane3 << 48));
  // Base row: in[0] in every lane plus the per-lane terms.
  const int16x4_t dc = vld1_dup_s16(in);
  const int16x4_t base = vqadd_s16(dc, lane_terms);
  // Per-row terms.
  const int16x4_t c4_row = vdup_n_s16(c4);
  const int16x4_t d4_row = vdup_n_s16(d4);
  const int16x4_t row0 = vqadd_s16(base, d4_row);
  const int16x4_t row1 = vqadd_s16(base, c4_row);
  const int16x4_t row2 = vqsub_s16(base, c4_row);
  const int16x4_t row3 = vqsub_s16(base, d4_row);
  Add4x4_NEON(vcombine_s16(row0, row1), vcombine_s16(row2, row3), dst);
}
#undef MUL
1278 
1279 //------------------------------------------------------------------------------
1280 // 4x4
1281 
// 4x4 DC prediction: fills the block at 'dst' with
// (sum of 4 top pixels + sum of 4 left pixels + 4) >> 3.
// Only lane 0 of the accumulator carries that sum; the other lanes hold
// by-products of the 8-byte loads and are discarded.
static void DC4_NEON(uint8_t* dst) {
  // Top row: after two pairwise additions lane 0 holds top[0] + ... + top[3].
  const uint8x8_t top = vld1_u8(dst - BPS);
  const uint16x4_t top_pairs = vpaddl_u8(top);
  const uint16x4_t top_quads = vpadd_u16(top_pairs, top_pairs);
  const uint16x8_t top_sum = vcombine_u16(top_quads, top_quads);
  uint16x8_t left_sum = vdupq_n_u16(0);
  int row;
  // Left column: each load starts at the pixel left of the row, so lane 0
  // accumulates the four left neighbours.
  for (row = 0; row < 4; row += 2) {
    const uint8x8_t even = vld1_u8(dst + (row + 0) * BPS - 1);
    const uint8x8_t odd = vld1_u8(dst + (row + 1) * BPS - 1);
    left_sum = vaddq_u16(left_sum, vaddl_u8(even, odd));
  }
  {
    // Rounding narrowing shift: (sum + 4) >> 3, then broadcast lane 0.
    const uint8x8_t dc0 = vrshrn_n_u16(vaddq_u16(left_sum, top_sum), 3);
    const uint32x2_t dc = vreinterpret_u32_u8(vdup_lane_u8(dc0, 0));
    uint8_t* out = dst;
    for (row = 0; row < 4; ++row, out += BPS) {
      vst1_lane_u32((uint32_t*)out, dc, 0);  // 4 bytes per row
    }
  }
}
1301 
1302 // TrueMotion (4x4 + 8x8)
// TrueMotion prediction for a block of 'size' rows, handled four rows at a
// time. Each output pixel is left[row] + top[col] - top_left, saturated to
// [0, 255]. When size == 4 only 4 bytes per row are written, otherwise 8.
static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, int size) {
  const uint8x8_t top_left = vld1_dup_u8(dst - BPS - 1);
  const uint8x8_t top = vld1_u8(dst - BPS);  // 8 bytes of the row above
  // top[col] - top_left, as signed 16-bit values.
  const int16x8_t delta = vreinterpretq_s16_u16(vsubl_u8(top, top_left));
  int y;
  for (y = 0; y < size; y += 4, dst += 4 * BPS) {
    uint32x2_t rows[4];
    int r;
    // Compute the four rows first (all left-edge loads happen before any
    // store) ...
    for (r = 0; r < 4; ++r) {
      const int16x8_t left =
          ConvertU8ToS16_NEON(vld1_dup_u8(dst + r * BPS - 1));
      const int16x8_t sum = vaddq_s16(left, delta);
      rows[r] = vreinterpret_u32_u8(vqmovun_s16(sum));  // saturate to u8
    }
    // ... then write them out.
    for (r = 0; r < 4; ++r) {
      uint32_t* const out = (uint32_t*)(dst + r * BPS);
      if (size != 4) {
        vst1_u32(out, rows[r]);
      } else {
        vst1_lane_u32(out, rows[r], 0);
      }
    }
  }
}
1337 
// 4x4 TrueMotion predictor: TrueMotion_NEON() with size 4.
static void TM4_NEON(uint8_t* dst) {
  TrueMotion_NEON(dst, 4);
}
1339 
// 4x4 vertical prediction: the row above (starting one pixel to the left of
// the block) is smoothed with a 3-tap filter, computed as
// rounding-average(halving-average(prev, next), center), and the first four
// results are copied into each of the four rows.
static void VE4_NEON(uint8_t* dst) {
  // NB: the load is done with vld1_u8 rather than vld1_u64, as an alignment
  // hint may be added to the latter -> SIGBUS.
  const uint8x8_t prev = vld1_u8(dst - BPS - 1);
  const uint64x1_t prev_u64 = vreinterpret_u64_u8(prev);
  // Same bytes shifted down by one and two lanes (zero-filled at the top).
  const uint8x8_t center = vreinterpret_u8_u64(vshr_n_u64(prev_u64, 8));
  const uint8x8_t next = vreinterpret_u8_u64(vshr_n_u64(prev_u64, 16));
  const uint8x8_t outer = vhadd_u8(prev, next);
  const uint32x2_t filtered = vreinterpret_u32_u8(vrhadd_u8(outer, center));
  uint8_t* out = dst;
  int row;
  for (row = 0; row < 4; ++row, out += BPS) {
    vst1_lane_u32((uint32_t*)out, filtered, 0);
  }
}
1355 
// 4x4 down-right prediction.
// The edge pixels are gathered into one 8-byte vector, in lane order
//   L K J I X A B C
// where I..L is the left column (top to bottom), X the top-left corner and
// A..D the row above. The vector is smoothed with the 3-tap filter
// rounding-average(halving-average(prev, next), center), and each row of the
// block is a 4-byte window of the result: row 3 starts at lane 0, row 0 at
// lane 3.
static void RD4_NEON(uint8_t* dst) {
  // X A B C D in lanes 0..4.
  const uint8x8_t top_u8 = vld1_u8(dst - BPS - 1);
  const uint64x1_t top = vreinterpret_u64_u8(top_u8);
  const uint8_t D = vget_lane_u8(top_u8, 4);
  // Left column pixels.
  const uint32_t I = dst[-1 + 0 * BPS];
  const uint32_t J = dst[-1 + 1 * BPS];
  const uint32_t K = dst[-1 + 2 * BPS];
  const uint32_t L = dst[-1 + 3 * BPS];
  const uint64_t left_bits = (uint64_t)L | (K << 8) | (J << 16) | (I << 24);
  // L K J I in lanes 0..3, X A B C in lanes 4..7.
  const uint64x1_t edge =
      vorr_u64(vcreate_u64(left_bits), vshl_n_u64(top, 32));
  const uint8x8_t prev = vreinterpret_u8_u64(edge);
  // K J I X A B C 0
  const uint8x8_t center = vreinterpret_u8_u64(vshr_n_u64(edge, 8));
  // J I X A B C D 0 (D patched into lane 6 after the shift).
  const uint8x8_t next =
      vset_lane_u8(D, vreinterpret_u8_u64(vshr_n_u64(edge, 16)), 6);
  const uint8x8_t outer = vhadd_u8(next, prev);
  const uint8x8_t filtered = vrhadd_u8(outer, center);
  const uint64x1_t filtered_u64 = vreinterpret_u64_u8(filtered);
  // Sliding 4-byte windows, one per row.
  const uint32x2_t row3 = vreinterpret_u32_u8(filtered);
  const uint32x2_t row2 = vreinterpret_u32_u64(vshr_n_u64(filtered_u64, 8));
  const uint32x2_t row1 = vreinterpret_u32_u64(vshr_n_u64(filtered_u64, 16));
  const uint32x2_t row0 = vreinterpret_u32_u64(vshr_n_u64(filtered_u64, 24));
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), row0, 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), row1, 0);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), row2, 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), row3, 0);
}
1384 
// 4x4 down-left prediction.
// The eight pixels A..H of the row above are smoothed with the 3-tap filter
// rounding-average(halving-average(prev, next), center); H is reused as the
// 'next' neighbour of lane 6. Row r of the block is the 4-byte window of the
// result starting at lane r.
static void LD4_NEON(uint8_t* dst) {
  // Three overlapping loads are used here instead of the shift trick of
  // VE4_NEON(), which was noted to be slower for this predictor.
  const uint8_t* const top = dst - BPS;
  const uint8x8_t prev = vld1_u8(top + 0);      // A B C D E F G H
  const uint8x8_t center = vld1_u8(top + 1);    // B C D E F G H .
  const uint8x8_t next_raw = vld1_u8(top + 2);  // C D E F G H . .
  const uint8x8_t next = vset_lane_u8(top[7], next_raw, 6);  // lane 6 <- H
  const uint8x8_t outer = vhadd_u8(prev, next);
  const uint8x8_t filtered = vrhadd_u8(outer, center);
  const uint64x1_t filtered_u64 = vreinterpret_u64_u8(filtered);
  // Sliding 4-byte windows, one per row.
  const uint32x2_t row0 = vreinterpret_u32_u8(filtered);
  const uint32x2_t row1 = vreinterpret_u32_u64(vshr_n_u64(filtered_u64, 8));
  const uint32x2_t row2 = vreinterpret_u32_u64(vshr_n_u64(filtered_u64, 16));
  const uint32x2_t row3 = vreinterpret_u32_u64(vshr_n_u64(filtered_u64, 24));
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), row0, 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), row1, 0);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), row2, 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), row3, 0);
}
1403 
1404 //------------------------------------------------------------------------------
1405 // Chroma
1406 
// 8x8 vertical prediction: copies the 8 pixels of the row above into each of
// the 8 rows of the block.
static void VE8uv_NEON(uint8_t* dst) {
  const uint8x8_t above = vld1_u8(dst - BPS);
  uint8_t* row = dst;
  int n;
  for (n = 8; n > 0; --n, row += BPS) {
    vst1_u8(row, above);
  }
}
1414 
// 8x8 horizontal prediction: each of the 8 rows is filled with the pixel
// immediately to its left.
static void HE8uv_NEON(uint8_t* dst) {
  int j;
  for (j = 0; j < 8; ++j) {
    uint8_t* const row = dst + j * BPS;
    vst1_u8(row, vld1_dup_u8(row - 1));
  }
}
1423 
// 8x8 DC prediction: fills the block at 'dst' with a single value derived
// from its borders.
//   do_top:  non-zero to use the 8 pixels of the row above (dst - BPS).
//   do_left: non-zero to use the 8 pixels of the column on the left
//            (dst[j * BPS - 1], j = 0..7).
// With both borders the value is (sum + 8) >> 4; with a single border it is
// (sum + 4) >> 3; with neither it is the constant 0x80.
// Both flags are compile-time constants at every call site in this file, so
// the unused branches disappear once the function is inlined.
static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
  uint16x8_t sum_top;   // only set (and read) when do_top != 0
  uint16x8_t sum_left;  // only set (and read) when do_left != 0
  uint8x8_t dc0;

  if (do_top) {
    const uint8x8_t A = vld1_u8(dst - BPS);  // top row
#if defined(__aarch64__)
    // Horizontal widening add, replicated to every lane.
    const uint16_t p2 = vaddlv_u8(A);
    sum_top = vdupq_n_u16(p2);
#else
    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    const uint16x4_t p2 = vpadd_u16(p1, p1);  // every lane = full sum
    sum_top = vcombine_u16(p2, p2);
#endif
  }

  if (do_left) {
    // Each load starts at the pixel left of the row: lane 0 of the widened
    // sums accumulates the left column. The other lanes are not used, since
    // only lane 0 of 'dc0' is broadcast below.
    const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);
    const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);
    const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);
    const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);
    const uint8x8_t L4 = vld1_u8(dst + 4 * BPS - 1);
    const uint8x8_t L5 = vld1_u8(dst + 5 * BPS - 1);
    const uint8x8_t L6 = vld1_u8(dst + 6 * BPS - 1);
    const uint8x8_t L7 = vld1_u8(dst + 7 * BPS - 1);
    const uint16x8_t s0 = vaddl_u8(L0, L1);
    const uint16x8_t s1 = vaddl_u8(L2, L3);
    const uint16x8_t s2 = vaddl_u8(L4, L5);
    const uint16x8_t s3 = vaddl_u8(L6, L7);
    const uint16x8_t s01 = vaddq_u16(s0, s1);
    const uint16x8_t s23 = vaddq_u16(s2, s3);
    sum_left = vaddq_u16(s01, s23);
  }

  // Rounding, narrowing shift: 16 samples -> >> 4, 8 samples -> >> 3.
  if (do_top && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 4);
  } else if (do_top) {
    dc0 = vrshrn_n_u16(sum_top, 3);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 3);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 8; ++i) {
      // Byte-wise store: writes the same 8 bytes as a 32-bit-lane store would,
      // without casting 'dst' to a wider pointer type.
      vst1_u8(dst + i * BPS, dc);
    }
  }
}
1479 
// 8x8 DC predictor using both the top row and the left column.
static void DC8uv_NEON(uint8_t* dst) {
  DC8_NEON(dst, /*do_top=*/1, /*do_left=*/1);
}
// 8x8 DC predictor using the left column only.
static void DC8uvNoTop_NEON(uint8_t* dst) {
  DC8_NEON(dst, /*do_top=*/0, /*do_left=*/1);
}
// 8x8 DC predictor using the top row only.
static void DC8uvNoLeft_NEON(uint8_t* dst) {
  DC8_NEON(dst, /*do_top=*/1, /*do_left=*/0);
}
// 8x8 DC predictor with no border available: fills the block with 0x80.
static void DC8uvNoTopLeft_NEON(uint8_t* dst) {
  DC8_NEON(dst, /*do_top=*/0, /*do_left=*/0);
}
1484 
// 8x8 TrueMotion predictor: TrueMotion_NEON() with size 8.
static void TM8uv_NEON(uint8_t* dst) {
  TrueMotion_NEON(dst, 8);
}
1486 
1487 //------------------------------------------------------------------------------
1488 // 16x16
1489 
// 16x16 vertical prediction: copies the 16 pixels of the row above into each
// of the 16 rows of the block.
static void VE16_NEON(uint8_t* dst) {
  const uint8x16_t above = vld1q_u8(dst - BPS);
  uint8_t* row = dst;
  int n;
  for (n = 16; n > 0; --n, row += BPS) {
    vst1q_u8(row, above);
  }
}
1497 
// 16x16 horizontal prediction: each of the 16 rows is filled with the pixel
// immediately to its left.
static void HE16_NEON(uint8_t* dst) {
  int j;
  for (j = 0; j < 16; ++j) {
    uint8_t* const row = dst + j * BPS;
    vst1q_u8(row, vld1q_dup_u8(row - 1));
  }
}
1506 
// 16x16 DC prediction: fills the block at 'dst' with a single value derived
// from its borders.
//   do_top:  non-zero to use the 16 pixels of the row above (dst - BPS).
//   do_left: non-zero to use the 16 pixels of the column on the left
//            (dst[j * BPS - 1], j = 0..15).
// With both borders the value is (sum + 16) >> 5; with a single border it is
// (sum + 8) >> 4; with neither it is the constant 0x80.
static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) {
  uint16x8_t sum_top;   // only set (and read) when do_top != 0
  uint16x8_t sum_left;  // only set (and read) when do_left != 0
  uint8x8_t dc0;

  if (do_top) {
    const uint8x16_t top = vld1q_u8(dst - BPS);
#if defined(__aarch64__)
    // Horizontal widening add, replicated to every lane.
    sum_top = vdupq_n_u16(vaddlvq_u8(top));
#else
    // Cascading summation: 16 -> 8 -> 4 -> 2 -> 1 partial sums.
    const uint16x8_t part8 = vpaddlq_u8(top);
    const uint16x4_t part4 = vadd_u16(vget_low_u16(part8),
                                      vget_high_u16(part8));
    const uint16x4_t part2 = vpadd_u16(part4, part4);
    const uint16x4_t total = vpadd_u16(part2, part2);  // every lane = sum
    sum_top = vcombine_u16(total, total);
#endif
  }

  if (do_left) {
    int j;
    // Each load starts at the pixel left of the row, so lane 0 accumulates
    // the left column. The remaining lanes are never used, since only lane 0
    // of 'dc0' is broadcast below.
    sum_left = vdupq_n_u16(0);
    for (j = 0; j < 16; j += 2) {
      const uint8x8_t even = vld1_u8(dst + (j + 0) * BPS - 1);
      const uint8x8_t odd = vld1_u8(dst + (j + 1) * BPS - 1);
      sum_left = vaddq_u16(sum_left, vaddl_u8(even, odd));
    }
  }

  // Rounding, narrowing shift: 32 samples -> >> 5, 16 samples -> >> 4.
  if (!do_top && !do_left) {
    dc0 = vdup_n_u8(0x80);
  } else if (!do_left) {
    dc0 = vrshrn_n_u16(sum_top, 4);
  } else if (!do_top) {
    dc0 = vrshrn_n_u16(sum_left, 4);
  } else {
    dc0 = vrshrn_n_u16(vaddq_u16(sum_left, sum_top), 5);
  }

  {
    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
    uint8_t* row = dst;
    int i;
    for (i = 0; i < 16; ++i, row += BPS) {
      vst1q_u8(row, dc);
    }
  }
}
1568 
// 16x16 DC predictor using both the top row and the left column.
static void DC16TopLeft_NEON(uint8_t* dst) {
  DC16_NEON(dst, /*do_top=*/1, /*do_left=*/1);
}
// 16x16 DC predictor using the left column only.
static void DC16NoTop_NEON(uint8_t* dst) {
  DC16_NEON(dst, /*do_top=*/0, /*do_left=*/1);
}
// 16x16 DC predictor using the top row only.
static void DC16NoLeft_NEON(uint8_t* dst) {
  DC16_NEON(dst, /*do_top=*/1, /*do_left=*/0);
}
// 16x16 DC predictor with no border available: fills the block with 0x80.
static void DC16NoTopLeft_NEON(uint8_t* dst) {
  DC16_NEON(dst, /*do_top=*/0, /*do_left=*/0);
}
1573 
// 16x16 TrueMotion prediction, handled four rows at a time. Each output pixel
// is left[row] + top[col] - top_left, saturated to [0, 255].
static void TM16_NEON(uint8_t* dst) {
  const uint8x8_t top_left = vld1_dup_u8(dst - BPS - 1);
  const uint8x16_t top = vld1q_u8(dst - BPS);  // the 16 pixels above
  // top[col] - top_left as signed 16-bit values, for columns 0..7 and 8..15.
  const int16x8_t delta_lo =
      vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(top), top_left));
  const int16x8_t delta_hi =
      vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(top), top_left));
  int y;
  for (y = 0; y < 16; y += 4, dst += 4 * BPS) {
    int16x8_t left[4];
    int r;
    // Fetch the four left-edge pixels before anything is written.
    for (r = 0; r < 4; ++r) {
      left[r] = ConvertU8ToS16_NEON(vld1_dup_u8(dst + r * BPS - 1));
    }
    // Add, saturate to 8 bits and store each row.
    for (r = 0; r < 4; ++r) {
      const uint8x8_t lo = vqmovun_s16(vaddq_s16(left[r], delta_lo));
      const uint8x8_t hi = vqmovun_s16(vaddq_s16(left[r], delta_hi));
      vst1q_u8(dst + r * BPS, vcombine_u8(lo, hi));
    }
  }
}
1607 
1608 //------------------------------------------------------------------------------
1609 // Entry point
1610 
1611 extern void VP8DspInitNEON(void);
1612 
// Points the decoder's DSP function pointers at the NEON implementations from
// this file. Entries that are not assigned here are left as they were.
WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
  // Transforms.
  VP8Transform = TransformTwo_NEON;
  VP8TransformDC = TransformDC_NEON;
  VP8TransformAC3 = TransformAC3_NEON;
  VP8TransformWHT = TransformWHT_NEON;

  // Loop filters available in every configuration.
  VP8VFilter16 = VFilter16_NEON;
  VP8VFilter16i = VFilter16i_NEON;
  VP8HFilter16 = HFilter16_NEON;
  VP8VFilter8 = VFilter8_NEON;
  VP8VFilter8i = VFilter8i_NEON;
#if !defined(WORK_AROUND_GCC)
  // These horizontal filters are only installed when the gcc work-around is
  // not in effect.
  VP8HFilter16i = HFilter16i_NEON;
  VP8HFilter8 = HFilter8_NEON;
  VP8HFilter8i = HFilter8i_NEON;
#endif

  // Simple loop filters.
  VP8SimpleVFilter16 = SimpleVFilter16_NEON;
  VP8SimpleHFilter16 = SimpleHFilter16_NEON;
  VP8SimpleVFilter16i = SimpleVFilter16i_NEON;
  VP8SimpleHFilter16i = SimpleHFilter16i_NEON;

  // 4x4 luma predictors (only slots 0, 1, 2, 4 and 6 have a NEON version).
  VP8PredLuma4[0] = DC4_NEON;
  VP8PredLuma4[1] = TM4_NEON;
  VP8PredLuma4[2] = VE4_NEON;
  VP8PredLuma4[4] = RD4_NEON;
  VP8PredLuma4[6] = LD4_NEON;

  // 16x16 luma predictors.
  VP8PredLuma16[0] = DC16TopLeft_NEON;
  VP8PredLuma16[1] = TM16_NEON;
  VP8PredLuma16[2] = VE16_NEON;
  VP8PredLuma16[3] = HE16_NEON;
  VP8PredLuma16[4] = DC16NoTop_NEON;
  VP8PredLuma16[5] = DC16NoLeft_NEON;
  VP8PredLuma16[6] = DC16NoTopLeft_NEON;

  // 8x8 chroma predictors.
  VP8PredChroma8[0] = DC8uv_NEON;
  VP8PredChroma8[1] = TM8uv_NEON;
  VP8PredChroma8[2] = VE8uv_NEON;
  VP8PredChroma8[3] = HE8uv_NEON;
  VP8PredChroma8[4] = DC8uvNoTop_NEON;
  VP8PredChroma8[5] = DC8uvNoLeft_NEON;
  VP8PredChroma8[6] = DC8uvNoTopLeft_NEON;
}
1658 
1659 #else  // !WEBP_USE_NEON
1660 
1661 WEBP_DSP_INIT_STUB(VP8DspInitNEON)
1662 
1663 #endif  // WEBP_USE_NEON
1664