// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARM NEON version of speed-critical encoding functions.
//
// adapted from libvpx (http://www.webmproject.org/code/)

#include "src/dsp/dsp.h"

#if defined(WEBP_USE_NEON)

#include <assert.h>

#include "src/dsp/neon.h"
#include "src/enc/vp8i_enc.h"

//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)

// Inverse transform.
// This code is pretty much the same as TransformOne in dec_neon.c, except that
// the transform's output is added to the *ref samples and stored to *dst. See
// the comments there for algorithmic explanations.

static const int16_t kC1 = 20091;
static const int16_t kC2 = 17734;  // half of the real kC2 (35468). See below.
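// For reference, this is roughly the scalar computation being vectorized
// (a sketch, not compiled code): each pass does, per column/row,
//   a = in[0] + in[8]                    b = in[0] - in[8]
//   c = MUL2(in[4]) - MUL1(in[12])       d = MUL1(in[4]) + MUL2(in[12])
//   out[0] = a + d;  out[1] = b + c;  out[2] = b - c;  out[3] = a - d
// with MUL1(x) = x + ((x * 20091) >> 16) and MUL2(x) = (x * 35468) >> 16.
// 35468 does not fit in an int16_t, so kC2 stores half of it and the implicit
// doubling of vqdmulh restores the full factor; the kC1 product, whose
// constant is not halved, is instead shifted right by one before being
// accumulated, to undo that same doubling.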

// This code works but is *slower* than the inlined-asm version below
// (with gcc-4.6). So we disable it for now. Later, it'll be conditional on the
// WEBP_USE_INTRINSICS define.
// With gcc-4.8, it is a little faster than the inlined assembly.
#if defined(WEBP_USE_INTRINSICS)

// Treats 'v' as a uint8x8_t and zero extends it to an int16x8_t.
static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint32x2_t v) {
  return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
}

// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
// to the corresponding rows of 'dst'.
static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
                                                 const int16x8_t dst01,
                                                 const int16x8_t dst23) {
  // Unsigned saturate to 8b.
  const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
  const uint8x8_t dst23_u8 = vqmovun_s16(dst23);

  // Store the results.
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
}

static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
                                    const int16x8_t row23,
                                    const uint8_t* const ref,
                                    uint8_t* const dst) {
  uint32x2_t dst01 = vdup_n_u32(0);
  uint32x2_t dst23 = vdup_n_u32(0);

  // Load the source pixels.
  dst01 = vld1_lane_u32((uint32_t*)(ref + 0 * BPS), dst01, 0);
  dst23 = vld1_lane_u32((uint32_t*)(ref + 2 * BPS), dst23, 0);
  dst01 = vld1_lane_u32((uint32_t*)(ref + 1 * BPS), dst01, 1);
  dst23 = vld1_lane_u32((uint32_t*)(ref + 3 * BPS), dst23, 1);

  {
    // Convert to 16b.
    const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(dst01);
    const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(dst23);

    // Descale with rounding.
    const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
    const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
    // Add the inverse transform.
    SaturateAndStore4x4_NEON(dst, out01, out23);
  }
}

static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
                                          const int16x8_t in1,
                                          int16x8x2_t* const out) {
  // a0 a1 a2 a3 | b0 b1 b2 b3   => a0 b0 c0 d0 | a1 b1 c1 d1
  // c0 c1 c2 c3 | d0 d1 d2 d3      a2 b2 c2 d2 | a3 b3 c3 d3
  const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
                                                  // b0 d0 b1 d1 b2 d2 ...
  *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
}

static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
  // {rows} = in0 | in4
  //          in8 | in12
  // B1 = in4 | in12
  const int16x8_t B1 =
      vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
  // C0 = kC1 * in4 | kC1 * in12
  // C1 = kC2 * in4 | kC2 * in12
  const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
  const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
  const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));   // in0 + in8
  const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));   // in0 - in8
  // c = kC2 * in4 - kC1 * in12
  // d = kC1 * in4 + kC2 * in12
  const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
  const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
  const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
  const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
  const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
  const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
  const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
  Transpose8x2_NEON(E0, E1, rows);
}

static void ITransformOne_NEON(const uint8_t* ref,
                               const int16_t* in, uint8_t* dst) {
  int16x8x2_t rows;
  INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
  TransformPass_NEON(&rows);
  TransformPass_NEON(&rows);
  Add4x4_NEON(rows.val[0], rows.val[1], ref, dst);
}

#else

static void ITransformOne_NEON(const uint8_t* ref,
                               const int16_t* in, uint8_t* dst) {
  const int kBPS = BPS;
  const int16_t kC1C2[] = { kC1, kC2, 0, 0 };

  __asm__ volatile (
    "vld1.16         {q1, q2}, [%[in]]           \n"
    "vld1.16         {d0}, [%[kC1C2]]            \n"

    // d2: in[0]
    // d3: in[8]
    // d4: in[4]
    // d5: in[12]
    "vswp            d3, d4                      \n"

    // q8 = {in[4], in[12]} * kC1 * 2 >> 16
    // q9 = {in[4], in[12]} * kC2 >> 16
    "vqdmulh.s16     q8, q2, d0[0]               \n"
    "vqdmulh.s16     q9, q2, d0[1]               \n"

    // d22 = a = in[0] + in[8]
    // d23 = b = in[0] - in[8]
    "vqadd.s16       d22, d2, d3                 \n"
    "vqsub.s16       d23, d2, d3                 \n"

    //  q8 = in[4]/[12] * kC1 >> 16
    "vshr.s16        q8, q8, #1                  \n"

    // Add {in[4], in[12]} back after the multiplication.
    "vqadd.s16       q8, q2, q8                  \n"

    // d20 = c = in[4]*kC2 - in[12]*kC1
    // d21 = d = in[4]*kC1 + in[12]*kC2
    "vqsub.s16       d20, d18, d17               \n"
    "vqadd.s16       d21, d19, d16               \n"

    // d2 = tmp[0] = a + d
    // d3 = tmp[1] = b + c
    // d4 = tmp[2] = b - c
    // d5 = tmp[3] = a - d
    "vqadd.s16       d2, d22, d21                \n"
    "vqadd.s16       d3, d23, d20                \n"
    "vqsub.s16       d4, d23, d20                \n"
    "vqsub.s16       d5, d22, d21                \n"

    "vzip.16         q1, q2                      \n"
    "vzip.16         q1, q2                      \n"

    "vswp            d3, d4                      \n"

    // q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
    // q9 = {tmp[4], tmp[12]} * kC2 >> 16
    "vqdmulh.s16     q8, q2, d0[0]               \n"
    "vqdmulh.s16     q9, q2, d0[1]               \n"

    // d22 = a = tmp[0] + tmp[8]
    // d23 = b = tmp[0] - tmp[8]
    "vqadd.s16       d22, d2, d3                 \n"
    "vqsub.s16       d23, d2, d3                 \n"

    "vshr.s16        q8, q8, #1                  \n"
    "vqadd.s16       q8, q2, q8                  \n"

    // d20 = c = in[4]*kC2 - in[12]*kC1
    // d21 = d = in[4]*kC1 + in[12]*kC2
    "vqsub.s16       d20, d18, d17               \n"
    "vqadd.s16       d21, d19, d16               \n"

    // d2 = tmp[0] = a + d
    // d3 = tmp[1] = b + c
    // d4 = tmp[2] = b - c
    // d5 = tmp[3] = a - d
    "vqadd.s16       d2, d22, d21                \n"
    "vqadd.s16       d3, d23, d20                \n"
    "vqsub.s16       d4, d23, d20                \n"
    "vqsub.s16       d5, d22, d21                \n"

    "vld1.32         d6[0], [%[ref]], %[kBPS]    \n"
    "vld1.32         d6[1], [%[ref]], %[kBPS]    \n"
    "vld1.32         d7[0], [%[ref]], %[kBPS]    \n"
    "vld1.32         d7[1], [%[ref]], %[kBPS]    \n"

    "sub         %[ref], %[ref], %[kBPS], lsl #2 \n"

    // ((val) + 4) >> 3
218     "vrshr.s16       d2, d2, #3                  \n"
219     "vrshr.s16       d3, d3, #3                  \n"
220     "vrshr.s16       d4, d4, #3                  \n"
221     "vrshr.s16       d5, d5, #3                  \n"
222 
223     "vzip.16         q1, q2                      \n"
224     "vzip.16         q1, q2                      \n"
225 
226     // Must accumulate before saturating
227     "vmovl.u8        q8, d6                      \n"
228     "vmovl.u8        q9, d7                      \n"
229 
230     "vqadd.s16       q1, q1, q8                  \n"
231     "vqadd.s16       q2, q2, q9                  \n"
232 
233     "vqmovun.s16     d0, q1                      \n"
234     "vqmovun.s16     d1, q2                      \n"
235 
236     "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
237     "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
238     "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
239     "vst1.32         d1[1], [%[dst]]             \n"
240 
241     : [in] "+r"(in), [dst] "+r"(dst)               // modified registers
242     : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref)  // constants
243     : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  // clobbered
244   );
245 }

#endif    // WEBP_USE_INTRINSICS

static void ITransform_NEON(const uint8_t* ref,
                            const int16_t* in, uint8_t* dst, int do_two) {
  ITransformOne_NEON(ref, in, dst);
  if (do_two) {
    ITransformOne_NEON(ref + 4, in + 16, dst + 4);
  }
}

// Load all 4x4 pixels into a single uint8x16_t variable.
static uint8x16_t Load4x4_NEON(const uint8_t* src) {
  uint32x4_t out = vdupq_n_u32(0);
  out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
  out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
  out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2);
  out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3);
  return vreinterpretq_u8_u32(out);
}

// Forward transform.

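// For reference, the scalar version of this forward transform is roughly the
// following (a sketch, not compiled here). First pass, per row i, on the
// residual d[] = src[] - ref[]:
//   a0 = d0 + d3;  a1 = d1 + d2;  a2 = d1 - d2;  a3 = d0 - d3
//   tmp[0 + i * 4] = (a0 + a1) * 8
//   tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9
//   tmp[2 + i * 4] = (a0 - a1) * 8
//   tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 +  937) >> 9
// Second pass, per column i:
//   a0 = tmp[0 + i] + tmp[12 + i];  a1 = tmp[4 + i] + tmp[8 + i]
//   a2 = tmp[4 + i] - tmp[8 + i];   a3 = tmp[0 + i] - tmp[12 + i]
//   out[0 + i]  = (a0 + a1 + 7) >> 4
//   out[4 + i]  = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0)
//   out[8 + i]  = (a0 - a1 + 7) >> 4
//   out[12 + i] = (a3 * 2217 - a2 * 5352 + 51000) >> 16
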
#if defined(WEBP_USE_INTRINSICS)

static WEBP_INLINE void Transpose4x4_S16_NEON(const int16x4_t A,
                                              const int16x4_t B,
                                              const int16x4_t C,
                                              const int16x4_t D,
                                              int16x8_t* const out01,
                                              int16x8_t* const out32) {
  const int16x4x2_t AB = vtrn_s16(A, B);
  const int16x4x2_t CD = vtrn_s16(C, D);
  const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
                                     vreinterpret_s32_s16(CD.val[0]));
  const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]),
                                     vreinterpret_s32_s16(CD.val[1]));
  *out01 = vreinterpretq_s16_s64(
      vcombine_s64(vreinterpret_s64_s32(tmp02.val[0]),
                   vreinterpret_s64_s32(tmp13.val[0])));
  *out32 = vreinterpretq_s16_s64(
      vcombine_s64(vreinterpret_s64_s32(tmp13.val[1]),
                   vreinterpret_s64_s32(tmp02.val[1])));
}

static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a,
                                              const uint8x8_t b) {
  return vreinterpretq_s16_u16(vsubl_u8(a, b));
}

static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
                            int16_t* out) {
  int16x8_t d0d1, d3d2;   // working 4x4 int16 variables
  {
    const uint8x16_t S0 = Load4x4_NEON(src);
    const uint8x16_t R0 = Load4x4_NEON(ref);
    const int16x8_t D0D1 = DiffU8ToS16_NEON(vget_low_u8(S0), vget_low_u8(R0));
    const int16x8_t D2D3 = DiffU8ToS16_NEON(vget_high_u8(S0), vget_high_u8(R0));
    const int16x4_t D0 = vget_low_s16(D0D1);
    const int16x4_t D1 = vget_high_s16(D0D1);
    const int16x4_t D2 = vget_low_s16(D2D3);
    const int16x4_t D3 = vget_high_s16(D2D3);
    Transpose4x4_S16_NEON(D0, D1, D2, D3, &d0d1, &d3d2);
  }
  {    // first pass
    const int32x4_t kCst937 = vdupq_n_s32(937);
    const int32x4_t kCst1812 = vdupq_n_s32(1812);
    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
    const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3);
    const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
                                    vget_high_s16(a0a1_2));
    const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
                                    vget_high_s16(a0a1_2));
    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
    const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
    const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
    Transpose4x4_S16_NEON(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
  }
  {    // 2nd pass
    // The (1 << 16) addition folds the '+ (a3 != 0)' term into the constant:
    // +1 is added unconditionally here and cancelled below by the 0/-1 mask
    // when a3 == 0 (a3 != 0  <->  1 - (a3 == 0)).
    const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16));
    const int32x4_t kCst51000 = vdupq_n_s32(51000);
    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
    const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7));
    const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4);
    const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4);
    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
    const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
    const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000);
    const int16x4_t a3_eq_0 =
        vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0)));
    const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0);
    vst1_s16(out +  0, out0);
    vst1_s16(out +  4, out1);
    vst1_s16(out +  8, out2);
    vst1_s16(out + 12, out3);
  }
}

#else

// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
static const int16_t kCoeff16[] = {
  5352,  5352,  5352, 5352, 2217,  2217,  2217, 2217
};
static const int32_t kCoeff32[] = {
   1812,  1812,  1812,  1812,
    937,   937,   937,   937,
  12000, 12000, 12000, 12000,
  51000, 51000, 51000, 51000
};

static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
                            int16_t* out) {
  const int kBPS = BPS;
  const uint8_t* src_ptr = src;
  const uint8_t* ref_ptr = ref;
  const int16_t* coeff16 = kCoeff16;
  const int32_t* coeff32 = kCoeff32;

  __asm__ volatile (
    // load src into q4, q5 in high half
    "vld1.8 {d8},  [%[src_ptr]], %[kBPS]      \n"
    "vld1.8 {d10}, [%[src_ptr]], %[kBPS]      \n"
    "vld1.8 {d9},  [%[src_ptr]], %[kBPS]      \n"
    "vld1.8 {d11}, [%[src_ptr]]               \n"

    // load ref into q6, q7 in high half
    "vld1.8 {d12}, [%[ref_ptr]], %[kBPS]      \n"
    "vld1.8 {d14}, [%[ref_ptr]], %[kBPS]      \n"
    "vld1.8 {d13}, [%[ref_ptr]], %[kBPS]      \n"
    "vld1.8 {d15}, [%[ref_ptr]]               \n"

    // Pack the high values into q4 and q6
388     "vtrn.32     q4, q5                       \n"
389     "vtrn.32     q6, q7                       \n"
390 
391     // d[0-3] = src - ref
392     "vsubl.u8    q0, d8, d12                  \n"
393     "vsubl.u8    q1, d9, d13                  \n"
394 
395     // load coeff16 into q8(d16=5352, d17=2217)
396     "vld1.16     {q8}, [%[coeff16]]           \n"
397 
398     // load coeff32 high half into q9 = 1812, q10 = 937
399     "vld1.32     {q9, q10}, [%[coeff32]]!     \n"
400 
401     // load coeff32 low half into q11=12000, q12=51000
402     "vld1.32     {q11,q12}, [%[coeff32]]      \n"
403 
404     // part 1
405     // Transpose. Register dN is the same as dN in C
406     "vtrn.32         d0, d2                   \n"
407     "vtrn.32         d1, d3                   \n"
408     "vtrn.16         d0, d1                   \n"
409     "vtrn.16         d2, d3                   \n"
410 
411     "vadd.s16        d4, d0, d3               \n" // a0 = d0 + d3
412     "vadd.s16        d5, d1, d2               \n" // a1 = d1 + d2
413     "vsub.s16        d6, d1, d2               \n" // a2 = d1 - d2
414     "vsub.s16        d7, d0, d3               \n" // a3 = d0 - d3
415 
416     "vadd.s16        d0, d4, d5               \n" // a0 + a1
417     "vshl.s16        d0, d0, #3               \n" // temp[0+i*4] = (a0+a1) << 3
418     "vsub.s16        d2, d4, d5               \n" // a0 - a1
419     "vshl.s16        d2, d2, #3               \n" // (temp[2+i*4] = (a0-a1) << 3

    "vmlal.s16       q9, d7, d16              \n" // a3*5352 + 1812
    "vmlal.s16       q10, d7, d17             \n" // a3*2217 + 937
    "vmlal.s16       q9, d6, d17              \n" // a2*2217 + a3*5352 + 1812
    "vmlsl.s16       q10, d6, d16             \n" // a3*2217 + 937 - a2*5352

    // temp[1+i*4] = (d2*2217 + d3*5352 + 1812) >> 9
    // temp[3+i*4] = (d3*2217 + 937 - d2*5352) >> 9
    "vshrn.s32       d1, q9, #9               \n"
    "vshrn.s32       d3, q10, #9              \n"

    // part 2
    // transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
    "vtrn.32         d0, d2                   \n"
    "vtrn.32         d1, d3                   \n"
    "vtrn.16         d0, d1                   \n"
    "vtrn.16         d2, d3                   \n"

    "vmov.s16        d26, #7                  \n"

    "vadd.s16        d4, d0, d3               \n" // a1 = ip[0] + ip[12]
    "vadd.s16        d5, d1, d2               \n" // b1 = ip[4] + ip[8]
    "vsub.s16        d6, d1, d2               \n" // c1 = ip[4] - ip[8]
    "vadd.s16        d4, d4, d26              \n" // a1 + 7
    "vsub.s16        d7, d0, d3               \n" // d1 = ip[0] - ip[12]

    "vadd.s16        d0, d4, d5               \n" // op[0] = a1 + b1 + 7
    "vsub.s16        d2, d4, d5               \n" // op[8] = a1 - b1 + 7

    "vmlal.s16       q11, d7, d16             \n" // d1*5352 + 12000
    "vmlal.s16       q12, d7, d17             \n" // d1*2217 + 51000

    "vceq.s16        d4, d7, #0               \n"

    "vshr.s16        d0, d0, #4               \n"
    "vshr.s16        d2, d2, #4               \n"

    "vmlal.s16       q11, d6, d17             \n" // c1*2217 + d1*5352 + 12000
    "vmlsl.s16       q12, d6, d16             \n" // d1*2217 - c1*5352 + 51000

    "vmvn            d4, d4                   \n" // !(d1 == 0)
    // op[4] = (c1*2217 + d1*5352 + 12000)>>16
    "vshrn.s32       d1, q11, #16             \n"
    // op[4] += (d1!=0)
    "vsub.s16        d1, d1, d4               \n"
    // op[12]= (d1*2217 - c1*5352 + 51000)>>16
    "vshrn.s32       d3, q12, #16             \n"

    // set result to out array
    "vst1.16         {q0, q1}, [%[out]]   \n"
    : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr),
      [coeff32] "+r"(coeff32)          // modified registers
    : [kBPS] "r"(kBPS), [coeff16] "r"(coeff16),
      [out] "r"(out)                   // constants
    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
      "q10", "q11", "q12", "q13"       // clobbered
  );
}

#endif

#define LOAD_LANE_16b(VALUE, LANE) do {             \
  (VALUE) = vld1_lane_s16(src, (VALUE), (LANE));    \
  src += stride;                                    \
} while (0)

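// Forward Walsh-Hadamard transform of 16 input values gathered with a stride
// of 16 (in the encoder these are the DC terms of the sixteen 4x4 luma
// sub-blocks). Roughly, the scalar equivalent of each pass (a sketch, not
// compiled here) is the butterfly
//   a0 = x0 + x2;  a1 = x1 + x3;  a2 = x1 - x3;  a3 = x0 - x2
//   y0 = a0 + a1;  y1 = a3 + a2;  y2 = a3 - a2;  y3 = a0 - a1
// with the second pass additionally halving the results (the vhaddq_s32 /
// vhsubq_s32 calls below) before narrowing back to 16 bits.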
static void FTransformWHT_NEON(const int16_t* src, int16_t* out) {
  const int stride = 16;
  const int16x4_t zero = vdup_n_s16(0);
  int32x4x4_t tmp0;
  int16x4x4_t in;
  INIT_VECTOR4(in, zero, zero, zero, zero);
  LOAD_LANE_16b(in.val[0], 0);
  LOAD_LANE_16b(in.val[1], 0);
  LOAD_LANE_16b(in.val[2], 0);
  LOAD_LANE_16b(in.val[3], 0);
  LOAD_LANE_16b(in.val[0], 1);
  LOAD_LANE_16b(in.val[1], 1);
  LOAD_LANE_16b(in.val[2], 1);
  LOAD_LANE_16b(in.val[3], 1);
  LOAD_LANE_16b(in.val[0], 2);
  LOAD_LANE_16b(in.val[1], 2);
  LOAD_LANE_16b(in.val[2], 2);
  LOAD_LANE_16b(in.val[3], 2);
  LOAD_LANE_16b(in.val[0], 3);
  LOAD_LANE_16b(in.val[1], 3);
  LOAD_LANE_16b(in.val[2], 3);
  LOAD_LANE_16b(in.val[3], 3);

  {
    // a0 = in[0 * 16] + in[2 * 16]
    // a1 = in[1 * 16] + in[3 * 16]
    // a2 = in[1 * 16] - in[3 * 16]
    // a3 = in[0 * 16] - in[2 * 16]
    const int32x4_t a0 = vaddl_s16(in.val[0], in.val[2]);
    const int32x4_t a1 = vaddl_s16(in.val[1], in.val[3]);
    const int32x4_t a2 = vsubl_s16(in.val[1], in.val[3]);
    const int32x4_t a3 = vsubl_s16(in.val[0], in.val[2]);
    tmp0.val[0] = vaddq_s32(a0, a1);
    tmp0.val[1] = vaddq_s32(a3, a2);
    tmp0.val[2] = vsubq_s32(a3, a2);
    tmp0.val[3] = vsubq_s32(a0, a1);
  }
  {
    const int32x4x4_t tmp1 = Transpose4x4_NEON(tmp0);
    // a0 = tmp[0 + i] + tmp[ 8 + i]
    // a1 = tmp[4 + i] + tmp[12 + i]
    // a2 = tmp[4 + i] - tmp[12 + i]
    // a3 = tmp[0 + i] - tmp[ 8 + i]
    const int32x4_t a0 = vaddq_s32(tmp1.val[0], tmp1.val[2]);
    const int32x4_t a1 = vaddq_s32(tmp1.val[1], tmp1.val[3]);
    const int32x4_t a2 = vsubq_s32(tmp1.val[1], tmp1.val[3]);
    const int32x4_t a3 = vsubq_s32(tmp1.val[0], tmp1.val[2]);
    const int32x4_t b0 = vhaddq_s32(a0, a1);  // (a0 + a1) >> 1
    const int32x4_t b1 = vhaddq_s32(a3, a2);  // (a3 + a2) >> 1
    const int32x4_t b2 = vhsubq_s32(a3, a2);  // (a3 - a2) >> 1
    const int32x4_t b3 = vhsubq_s32(a0, a1);  // (a0 - a1) >> 1
    const int16x4_t out0 = vmovn_s32(b0);
    const int16x4_t out1 = vmovn_s32(b1);
    const int16x4_t out2 = vmovn_s32(b2);
    const int16x4_t out3 = vmovn_s32(b3);

    vst1_s16(out +  0, out0);
    vst1_s16(out +  4, out1);
    vst1_s16(out +  8, out2);
    vst1_s16(out + 12, out3);
  }
}
#undef LOAD_LANE_16b

//------------------------------------------------------------------------------
// Texture distortion
//
// We try to match the spectral content (weighted) between source and
// reconstructed samples.
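//
// Roughly, the scalar computation being vectorized here (a sketch, not
// compiled code) is:
//   Disto4x4(a, b, w) = |Sum(a, w) - Sum(b, w)| >> 5
// where Sum(x, w) applies a 4x4 Hadamard transform to the block x and returns
// the sum over all 16 coefficients of w[i] * abs(coeff[i]).
// The NEON version keeps the 'a' block in the low halves and the 'b' block in
// the high halves of the same vectors, so DistoSum_NEON() below computes both
// weighted sums and their difference in one go (vmull_s16 / vmlsl_s16).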

// a 0123, b 0123
// a 4567, b 4567
// a 89ab, b 89ab
// a cdef, b cdef
//
// transpose
//
// a 048c, b 048c
// a 159d, b 159d
// a 26ae, b 26ae
// a 37bf, b 37bf
//
static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16_NEON(int16x8x4_t q4_in) {
  const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);
  const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);
  const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),
                                        vreinterpretq_s32_s16(q2_tmp1.val[0]));
  const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[1]),
                                        vreinterpretq_s32_s16(q2_tmp1.val[1]));
  q4_in.val[0] = vreinterpretq_s16_s32(q2_tmp2.val[0]);
  q4_in.val[2] = vreinterpretq_s16_s32(q2_tmp2.val[1]);
  q4_in.val[1] = vreinterpretq_s16_s32(q2_tmp3.val[0]);
  q4_in.val[3] = vreinterpretq_s16_s32(q2_tmp3.val[1]);
  return q4_in;
}

static WEBP_INLINE int16x8x4_t DistoHorizontalPass_NEON(
    const int16x8x4_t q4_in) {
  // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
  // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
  const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
  const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]);
  const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]);
  const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]);
  int16x8x4_t q4_out;
  // tmp[0] = a0 + a1
  // tmp[1] = a3 + a2
  // tmp[2] = a3 - a2
  // tmp[3] = a0 - a1
  INIT_VECTOR4(q4_out,
               vabsq_s16(vaddq_s16(q_a0, q_a1)),
               vabsq_s16(vaddq_s16(q_a3, q_a2)),
               vabdq_s16(q_a3, q_a2), vabdq_s16(q_a0, q_a1));
  return q4_out;
}

static WEBP_INLINE int16x8x4_t DistoVerticalPass_NEON(const uint8x8x4_t q4_in) {
  const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[0],
                                                        q4_in.val[2]));
  const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[1],
                                                        q4_in.val[3]));
  const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[1],
                                                        q4_in.val[3]));
  const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[0],
                                                        q4_in.val[2]));
  int16x8x4_t q4_out;

  INIT_VECTOR4(q4_out,
               vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2),
               vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1));
  return q4_out;
}

static WEBP_INLINE int16x4x4_t DistoLoadW_NEON(const uint16_t* w) {
  const uint16x8_t q_w07 = vld1q_u16(&w[0]);
  const uint16x8_t q_w8f = vld1q_u16(&w[8]);
  int16x4x4_t d4_w;
  INIT_VECTOR4(d4_w,
               vget_low_s16(vreinterpretq_s16_u16(q_w07)),
               vget_high_s16(vreinterpretq_s16_u16(q_w07)),
               vget_low_s16(vreinterpretq_s16_u16(q_w8f)),
               vget_high_s16(vreinterpretq_s16_u16(q_w8f)));
  return d4_w;
}

static WEBP_INLINE int32x2_t DistoSum_NEON(const int16x8x4_t q4_in,
                                           const int16x4x4_t d4_w) {
  int32x2_t d_sum;
  // sum += w[ 0] * abs(b0);
  // sum += w[ 4] * abs(b1);
  // sum += w[ 8] * abs(b2);
  // sum += w[12] * abs(b3);
  int32x4_t q_sum0 = vmull_s16(d4_w.val[0], vget_low_s16(q4_in.val[0]));
  int32x4_t q_sum1 = vmull_s16(d4_w.val[1], vget_low_s16(q4_in.val[1]));
  int32x4_t q_sum2 = vmull_s16(d4_w.val[2], vget_low_s16(q4_in.val[2]));
  int32x4_t q_sum3 = vmull_s16(d4_w.val[3], vget_low_s16(q4_in.val[3]));
  q_sum0 = vmlsl_s16(q_sum0, d4_w.val[0], vget_high_s16(q4_in.val[0]));
  q_sum1 = vmlsl_s16(q_sum1, d4_w.val[1], vget_high_s16(q4_in.val[1]));
  q_sum2 = vmlsl_s16(q_sum2, d4_w.val[2], vget_high_s16(q4_in.val[2]));
  q_sum3 = vmlsl_s16(q_sum3, d4_w.val[3], vget_high_s16(q4_in.val[3]));

  q_sum0 = vaddq_s32(q_sum0, q_sum1);
  q_sum2 = vaddq_s32(q_sum2, q_sum3);
  q_sum2 = vaddq_s32(q_sum0, q_sum2);
  d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2));
  d_sum = vpadd_s32(d_sum, d_sum);
  return d_sum;
}

#define LOAD_LANE_32b(src, VALUE, LANE) \
    (VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE))

// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b,
                         const uint16_t* const w) {
  uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
  uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
  uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
  uint32x2_t d_in_ab_cdef = vdup_n_u32(0);
  uint8x8x4_t d4_in;

  // load data a, b
  LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0);
  LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0);
  LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0);
  LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0);
  LOAD_LANE_32b(b + 0 * BPS, d_in_ab_0123, 1);
  LOAD_LANE_32b(b + 1 * BPS, d_in_ab_4567, 1);
  LOAD_LANE_32b(b + 2 * BPS, d_in_ab_89ab, 1);
  LOAD_LANE_32b(b + 3 * BPS, d_in_ab_cdef, 1);
  INIT_VECTOR4(d4_in,
               vreinterpret_u8_u32(d_in_ab_0123),
               vreinterpret_u8_u32(d_in_ab_4567),
               vreinterpret_u8_u32(d_in_ab_89ab),
               vreinterpret_u8_u32(d_in_ab_cdef));

  {
    // Vertical pass first to avoid a transpose (vertical and horizontal passes
    // are commutative because w/kWeightY is symmetric) and subsequent
    // transpose.
    const int16x8x4_t q4_v = DistoVerticalPass_NEON(d4_in);
    const int16x4x4_t d4_w = DistoLoadW_NEON(w);
    // horizontal pass
    const int16x8x4_t q4_t = DistoTranspose4x4S16_NEON(q4_v);
    const int16x8x4_t q4_h = DistoHorizontalPass_NEON(q4_t);
    int32x2_t d_sum = DistoSum_NEON(q4_h, d4_w);

    // abs(sum2 - sum1) >> 5
    d_sum = vabs_s32(d_sum);
    d_sum = vshr_n_s32(d_sum, 5);
    return vget_lane_s32(d_sum, 0);
  }
}
#undef LOAD_LANE_32b

static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b,
                           const uint16_t* const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    for (x = 0; x < 16; x += 4) {
      D += Disto4x4_NEON(a + x + y, b + x + y, w);
    }
  }
  return D;
}

//------------------------------------------------------------------------------

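// Histogram collection: for each 4x4 block in [start_block, end_block), the
// residual is forward-transformed and each coefficient contributes
// min(abs(coeff) >> 3, MAX_COEFF_THRESH) to the distribution bins.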
static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred,
                                  int start_block, int end_block,
                                  VP8Histogram* const histo) {
  const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
  int j;
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  for (j = start_block; j < end_block; ++j) {
    int16_t out[16];
    FTransform_NEON(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
    {
      int k;
      const int16x8_t a0 = vld1q_s16(out + 0);
      const int16x8_t b0 = vld1q_s16(out + 8);
      const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0));
      const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0));
      const uint16x8_t a2 = vshrq_n_u16(a1, 3);
      const uint16x8_t b2 = vshrq_n_u16(b1, 3);
      const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh);
      const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh);
      vst1q_s16(out + 0, vreinterpretq_s16_u16(a3));
      vst1q_s16(out + 8, vreinterpretq_s16_u16(b3));
      // Convert coefficients to bin.
      for (k = 0; k < 16; ++k) {
        ++distribution[out[k]];
      }
    }
  }
  VP8SetHistogramData(distribution, histo);
}

//------------------------------------------------------------------------------

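// Sum of squared errors (SSE): for each row, the absolute difference |a - b|
// is computed with vabd, squared by multiplying it with itself (vmull_u8),
// then widened and accumulated into a running uint32x4_t sum.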
static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a,
                                             const uint8_t* const b,
                                             uint32x4_t* const sum) {
  const uint8x16_t a0 = vld1q_u8(a);
  const uint8x16_t b0 = vld1q_u8(b);
  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
  const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
                                    vget_low_u8(abs_diff));
  const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
                                    vget_high_u8(abs_diff));
  /* pair-wise adds and widen */
  const uint32x4_t sum1 = vpaddlq_u16(prod1);
  const uint32x4_t sum2 = vpaddlq_u16(prod2);
  *sum = vaddq_u32(*sum, vaddq_u32(sum1, sum2));
}

// Horizontal sum of all four uint32_t values in 'sum'.
static int SumToInt_NEON(uint32x4_t sum) {
  const uint64x2_t sum2 = vpaddlq_u32(sum);
  const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1);
  return (int)sum3;
}

static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 16; ++y) {
    AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
  }
  return SumToInt_NEON(sum);
}

static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 8; ++y) {
    AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
  }
  return SumToInt_NEON(sum);
}

static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 8; ++y) {
    const uint8x8_t a0 = vld1_u8(a + y * BPS);
    const uint8x8_t b0 = vld1_u8(b + y * BPS);
    const uint8x8_t abs_diff = vabd_u8(a0, b0);
    const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
    sum = vpadalq_u16(sum, prod);
  }
  return SumToInt_NEON(sum);
}

static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
  const uint8x16_t a0 = Load4x4_NEON(a);
  const uint8x16_t b0 = Load4x4_NEON(b);
  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
  const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
                                    vget_low_u8(abs_diff));
  const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
                                    vget_high_u8(abs_diff));
  /* pair-wise adds and widen */
  const uint32x4_t sum1 = vpaddlq_u16(prod1);
  const uint32x4_t sum2 = vpaddlq_u16(prod2);
  return SumToInt_NEON(vaddq_u32(sum1, sum2));
}

//------------------------------------------------------------------------------

// Compilation with gcc-4.6.x is problematic for now.
#if !defined(WORK_AROUND_GCC)

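// Quantizes 8 coefficients starting at 'offset' within 'in'. Per coefficient,
// this is roughly the scalar logic (a sketch, not compiled code):
//   coeff = abs(in[j]) + sharpen[j]
//   level = min((coeff * iq[j] + bias[j]) >> QFIX, MAX_LEVEL)
//   level = copysign(level, in[j])
//   in[j] = level * q[j]    // dequantized value, written back for the
//                           // reconstruction
//   returned lane = level
// The '>> QFIX' (QFIX == 17) is done below in two steps: vhaddq_u32 (>> 1)
// while adding the bias, then vshrn_n_u32(..., 16).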
static int16x8_t Quantize_NEON(int16_t* const in,
                               const VP8Matrix* const mtx, int offset) {
  const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
  const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
  const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
  const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]);
  const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]);

  const int16x8_t a = vld1q_s16(in + offset);                // in
  const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a));  // coeff = abs(in)
  const int16x8_t sign = vshrq_n_s16(a, 15);                 // sign
  const uint16x8_t c = vaddq_u16(b, sharp);                  // + sharpen
  const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));
  const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));
  const uint32x4_t m2 = vhaddq_u32(m0, bias0);
  const uint32x4_t m3 = vhaddq_u32(m1, bias1);     // (coeff * iQ + bias) >> 1
  const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16),
                                     vshrn_n_u32(m3, 16));   // QFIX=17 = 16+1
  const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));
  const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);
  const int16x8_t c3 = vsubq_s16(c2, sign);                  // restore sign
  const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));
  vst1q_s16(in + offset, c4);
  assert(QFIX == 17);  // this function can't work as is if QFIX != 16+1
  return c3;
}

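// Byte-pair indices into the concatenated (out0 | out1) quantized vectors,
// implementing the zigzag order {0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11,
// 14, 15} on the 16-bit coefficients.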
static const uint8_t kShuffles[4][8] = {
  { 0,   1,  2,  3,  8,  9, 16, 17 },
  { 10, 11,  4,  5,  6,  7, 12, 13 },
  { 18, 19, 24, 25, 26, 27, 20, 21 },
  { 14, 15, 22, 23, 28, 29, 30, 31 }
};

static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
                              const VP8Matrix* const mtx) {
  const int16x8_t out0 = Quantize_NEON(in, mtx, 0);
  const int16x8_t out1 = Quantize_NEON(in, mtx, 8);
  uint8x8x4_t shuffles;
  // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
  // non-standard versions there.
#if defined(__APPLE__) && defined(__aarch64__) && \
    defined(__apple_build_version__) && (__apple_build_version__ < 6020037)
  uint8x16x2_t all_out;
  INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));
  INIT_VECTOR4(shuffles,
               vtbl2q_u8(all_out, vld1_u8(kShuffles[0])),
               vtbl2q_u8(all_out, vld1_u8(kShuffles[1])),
               vtbl2q_u8(all_out, vld1_u8(kShuffles[2])),
               vtbl2q_u8(all_out, vld1_u8(kShuffles[3])));
#else
  uint8x8x4_t all_out;
  INIT_VECTOR4(all_out,
               vreinterpret_u8_s16(vget_low_s16(out0)),
               vreinterpret_u8_s16(vget_high_s16(out0)),
               vreinterpret_u8_s16(vget_low_s16(out1)),
               vreinterpret_u8_s16(vget_high_s16(out1)));
  INIT_VECTOR4(shuffles,
               vtbl4_u8(all_out, vld1_u8(kShuffles[0])),
               vtbl4_u8(all_out, vld1_u8(kShuffles[1])),
               vtbl4_u8(all_out, vld1_u8(kShuffles[2])),
               vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
#endif
  // Zigzag reordering
  vst1_u8((uint8_t*)(out +  0), shuffles.val[0]);
  vst1_u8((uint8_t*)(out +  4), shuffles.val[1]);
  vst1_u8((uint8_t*)(out +  8), shuffles.val[2]);
  vst1_u8((uint8_t*)(out + 12), shuffles.val[3]);
  // test zeros
  if (*(uint64_t*)(out +  0) != 0) return 1;
  if (*(uint64_t*)(out +  4) != 0) return 1;
  if (*(uint64_t*)(out +  8) != 0) return 1;
  if (*(uint64_t*)(out + 12) != 0) return 1;
  return 0;
}

static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
                                const VP8Matrix* const mtx) {
  int nz;
  nz  = QuantizeBlock_NEON(in + 0 * 16, out + 0 * 16, mtx) << 0;
  nz |= QuantizeBlock_NEON(in + 1 * 16, out + 1 * 16, mtx) << 1;
  return nz;
}

#endif   // !WORK_AROUND_GCC

//------------------------------------------------------------------------------
// Entry point

extern void VP8EncDspInitNEON(void);

WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
  VP8ITransform = ITransform_NEON;
  VP8FTransform = FTransform_NEON;

  VP8FTransformWHT = FTransformWHT_NEON;

  VP8TDisto4x4 = Disto4x4_NEON;
  VP8TDisto16x16 = Disto16x16_NEON;
  VP8CollectHistogram = CollectHistogram_NEON;

  VP8SSE16x16 = SSE16x16_NEON;
  VP8SSE16x8 = SSE16x8_NEON;
  VP8SSE8x8 = SSE8x8_NEON;
  VP8SSE4x4 = SSE4x4_NEON;

#if !defined(WORK_AROUND_GCC)
  VP8EncQuantizeBlock = QuantizeBlock_NEON;
  VP8EncQuantize2Blocks = Quantize2Blocks_NEON;
#endif
}

#else  // !WEBP_USE_NEON

WEBP_DSP_INIT_STUB(VP8EncDspInitNEON)

#endif  // WEBP_USE_NEON
