1 /*
2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <arm_neon.h>
12 
13 #include <assert.h>
14 #include "./vpx_config.h"
15 #include "./vpx_dsp_rtcd.h"
16 #include "vpx/vpx_integer.h"
17 #include "vpx_dsp/arm/mem_neon.h"
18 #include "vpx_dsp/arm/sum_neon.h"
19 
// Load 4 bytes from each of two (possibly unaligned) buffers and pack them
// into one 8-byte vector: buf0's bytes occupy lanes 0-3, buf1's lanes 4-7.
static INLINE uint8x8_t load_unaligned_2_buffers(const void *const buf0,
                                                 const void *const buf1) {
  uint32_t lo, hi;
  uint32x2_t packed;
  // memcpy performs the unaligned reads without invoking undefined behavior.
  memcpy(&lo, buf0, 4);
  memcpy(&hi, buf1, 4);
  packed = vset_lane_u32(lo, vdup_n_u32(0), 0);
  packed = vset_lane_u32(hi, packed, 1);
  return vreinterpret_u8_u32(packed);
}
30 
// Compute the 4-wide SADs of src_ptr against all four references at once.
// The 4 source bytes of each row are duplicated into both halves of a d-reg
// so one vabal_u8 accumulates two references simultaneously.
static INLINE void sad4x_4d(const uint8_t *const src_ptr, const int src_stride,
                            const uint8_t *const ref_array[4],
                            const int ref_stride, const int height,
                            uint32_t *const res) {
  int row;
  uint16x8_t acc01 = vdupq_n_u16(0);  // accumulators for refs 0 and 1
  uint16x8_t acc23 = vdupq_n_u16(0);  // accumulators for refs 2 and 3
  uint16x4_t fold01, fold23;

  // The source rows are loaded as whole 32-bit words, so they must be
  // word-aligned.
  assert(!((intptr_t)src_ptr % sizeof(uint32_t)));
  assert(!(src_stride % sizeof(uint32_t)));

  for (row = 0; row < height; ++row) {
    const uint8x8_t src_row = vreinterpret_u8_u32(
        vld1_dup_u32((const uint32_t *)(src_ptr + row * src_stride)));
    const uint8x8_t refs01 = load_unaligned_2_buffers(
        ref_array[0] + row * ref_stride, ref_array[1] + row * ref_stride);
    const uint8x8_t refs23 = load_unaligned_2_buffers(
        ref_array[2] + row * ref_stride, ref_array[3] + row * ref_stride);
    acc01 = vabal_u8(acc01, src_row, refs01);
    acc23 = vabal_u8(acc23, src_row, refs23);
  }

  // Horizontal reduction: fold each accumulator to one 16-bit partial per
  // reference, then widen to 32 bits and store the four totals.
  fold01 = vpadd_u16(vget_low_u16(acc01), vget_high_u16(acc01));
  fold23 = vpadd_u16(vget_low_u16(acc23), vget_high_u16(acc23));
  vst1q_u32(res, vpaddlq_u16(vcombine_u16(fold01, fold23)));
}
59 
// 4x4 SAD against four reference blocks; the four sums are written to res.
void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride,
                        const uint8_t *const ref_array[4], int ref_stride,
                        uint32_t *res) {
  sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 4, res);
}
65 
// 4x8 SAD against four reference blocks; the four sums are written to res.
void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride,
                        const uint8_t *const ref_array[4], int ref_stride,
                        uint32_t *res) {
  sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 8, res);
}
71 
72 ////////////////////////////////////////////////////////////////////////////////
73 
74 // Can handle 512 pixels' sad sum (such as 16x32 or 32x16)
// Can handle 512 pixels' sad sum (such as 16x32 or 32x16)
static INLINE void sad_512_pel_final_neon(const uint16x8_t *sum /*[4]*/,
                                          uint32_t *const res) {
  uint16x4_t half[4];
  int k;

  // Fold each 8-lane accumulator down to 4 lanes; totals still fit in 16
  // bits at this pixel count.
  for (k = 0; k < 4; ++k) {
    half[k] = vadd_u16(vget_low_u16(sum[k]), vget_high_u16(sum[k]));
  }
  // Pairwise-add twice more, widening to 32 bits for the final store.
  vst1q_u32(res, vpaddlq_u16(vcombine_u16(vpadd_u16(half[0], half[1]),
                                          vpadd_u16(half[2], half[3]))));
}
86 
87 // Can handle 1024 pixels' sad sum (such as 32x32)
// Can handle 1024 pixels' sad sum (such as 32x32)
static INLINE void sad_1024_pel_final_neon(const uint16x8_t *sum /*[4]*/,
                                           uint32_t *const res) {
  uint16x4_t folded[4];
  uint32x4_t wide_lo, wide_hi;
  int k;

  // Pairwise-fold each accumulator to 4 lanes of 16 bits each.
  for (k = 0; k < 4; ++k) {
    folded[k] = vpadd_u16(vget_low_u16(sum[k]), vget_high_u16(sum[k]));
  }
  // Widen to 32 bits before the last fold so the totals cannot overflow.
  wide_lo = vpaddlq_u16(vcombine_u16(folded[0], folded[1]));
  wide_hi = vpaddlq_u16(vcombine_u16(folded[2], folded[3]));
  vst1q_u32(res,
            vcombine_u32(vpadd_u32(vget_low_u32(wide_lo), vget_high_u32(wide_lo)),
                         vpadd_u32(vget_low_u32(wide_hi), vget_high_u32(wide_hi))));
}
100 
101 // Can handle 2048 pixels' sad sum (such as 32x64 or 64x32)
// Can handle 2048 pixels' sad sum (such as 32x64 or 64x32)
static INLINE void sad_2048_pel_final_neon(const uint16x8_t *sum /*[4]*/,
                                           uint32_t *const res) {
  uint32x2_t total[4];
  int k;

  // Widen every accumulator to 32 bits first, then fold each one down to a
  // single running pair.
  for (k = 0; k < 4; ++k) {
    const uint32x4_t wide = vpaddlq_u16(sum[k]);
    total[k] = vadd_u32(vget_low_u32(wide), vget_high_u32(wide));
  }
  vst1q_u32(res, vcombine_u32(vpadd_u32(total[0], total[1]),
                              vpadd_u32(total[2], total[3])));
}
116 
117 // Can handle 4096 pixels' sad sum (such as 64x64)
// Can handle 4096 pixels' sad sum (such as 64x64)
static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/,
                                           uint32_t *const res) {
  uint32x2_t total[4];
  int k;

  // Each reference block owns two 16-bit accumulators (sum[2k], sum[2k+1]);
  // widen both to 32 bits, merge them, and fold to a single pair.
  for (k = 0; k < 4; ++k) {
    const uint32x4_t merged =
        vaddq_u32(vpaddlq_u16(sum[2 * k]), vpaddlq_u16(sum[2 * k + 1]));
    total[k] = vadd_u32(vget_low_u32(merged), vget_high_u32(merged));
  }
  vst1q_u32(res, vcombine_u32(vpadd_u32(total[0], total[1]),
                              vpadd_u32(total[2], total[3])));
}
140 
// 8-wide SAD of src_ptr against four references over `height` rows.
static INLINE void sad8x_4d(const uint8_t *src_ptr, int src_stride,
                            const uint8_t *const ref_array[4], int ref_stride,
                            uint32_t *res, const int height) {
  int row, ref;
  const uint8_t *refs[4] = { ref_array[0], ref_array[1], ref_array[2],
                             ref_array[3] };
  uint16x8_t sums[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                         vdupq_n_u16(0) };

  for (row = 0; row < height; ++row) {
    const uint8x8_t src_row = vld1_u8(src_ptr);
    src_ptr += src_stride;
    for (ref = 0; ref < 4; ++ref) {
      const uint8x8_t ref_row = vld1_u8(refs[ref]);
      refs[ref] += ref_stride;
      sums[ref] = vabal_u8(sums[ref], src_row, ref_row);
    }
  }

  sad_512_pel_final_neon(sums, res);
}
162 
// 8x4 SAD against four reference blocks; the four sums are written to res.
void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride,
                        const uint8_t *const ref_array[4], int ref_stride,
                        uint32_t *res) {
  sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 4);
}
168 
// 8x8 SAD against four reference blocks; the four sums are written to res.
void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride,
                        const uint8_t *const ref_array[4], int ref_stride,
                        uint32_t *res) {
  sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 8);
}
174 
// 8x16 SAD against four reference blocks; the four sums are written to res.
void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride,
                         const uint8_t *const ref_array[4], int ref_stride,
                         uint32_t *res) {
  sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16);
}
180 
181 ////////////////////////////////////////////////////////////////////////////////
182 
// Accumulate the absolute differences of one 16-byte row of src_ptr against
// the 16 bytes at ref_ptr into *sum (two vabal_u8 halves).
static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr,
                              uint16x8_t *const sum) {
  const uint8x16_t ref_row = vld1q_u8(ref_ptr);
  *sum = vabal_u8(*sum, vget_low_u8(src_ptr), vget_low_u8(ref_row));
  *sum = vabal_u8(*sum, vget_high_u8(src_ptr), vget_high_u8(ref_row));
}
189 
// 16-wide SAD of src_ptr against four references over `height` rows.
static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride,
                             const uint8_t *const ref_array[4], int ref_stride,
                             uint32_t *res, const int height) {
  int row, ref;
  const uint8_t *refs[4] = { ref_array[0], ref_array[1], ref_array[2],
                             ref_array[3] };
  uint16x8_t sums[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                         vdupq_n_u16(0) };

  for (row = 0; row < height; ++row) {
    const uint8x16_t src_row = vld1q_u8(src_ptr);
    src_ptr += src_stride;
    for (ref = 0; ref < 4; ++ref) {
      sad16_neon(refs[ref], src_row, &sums[ref]);
      refs[ref] += ref_stride;
    }
  }

  sad_512_pel_final_neon(sums, res);
}
210 
// 16x8 SAD against four reference blocks; the four sums are written to res.
void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride,
                         const uint8_t *const ref_array[4], int ref_stride,
                         uint32_t *res) {
  sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 8);
}
216 
// 16x16 SAD against four reference blocks; the four sums are written to res.
void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t *res) {
  sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16);
}
222 
// 16x32 SAD against four reference blocks; the four sums are written to res.
void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t *res) {
  sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 32);
}
228 
229 ////////////////////////////////////////////////////////////////////////////////
230 
// 32-wide SAD accumulation against four references over `height` rows.
// Leaves one 16-bit accumulator per reference in sum[0..3]; the caller picks
// the final reduction appropriate for the pixel count.
static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride,
                             const uint8_t *const ref_array[4], int ref_stride,
                             const int height, uint16x8_t *const sum) {
  int row, half, ref;
  const uint8_t *refs[4] = { ref_array[0], ref_array[1], ref_array[2],
                             ref_array[3] };

  sum[0] = sum[1] = sum[2] = sum[3] = vdupq_n_u16(0);

  for (row = 0; row < height; ++row) {
    // Process the row as two 16-byte chunks, reusing each source load for
    // all four references.
    for (half = 0; half < 2; ++half) {
      const uint8x16_t src_chunk = vld1q_u8(src_ptr + half * 16);
      for (ref = 0; ref < 4; ++ref) {
        sad16_neon(refs[ref] + half * 16, src_chunk, &sum[ref]);
      }
    }
    src_ptr += src_stride;
    for (ref = 0; ref < 4; ++ref) {
      refs[ref] += ref_stride;
    }
  }
}
262 
// 32x16 SAD against four reference blocks; the four sums are written to res.
void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t *res) {
  uint16x8_t sums[4];

  sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 16, sums);
  // 32x16 = 512 pixels per reference.
  sad_512_pel_final_neon(sums, res);
}
270 
// 32x32 SAD against four reference blocks; the four sums are written to res.
void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t *res) {
  uint16x8_t sums[4];

  sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 32, sums);
  // 32x32 = 1024 pixels per reference.
  sad_1024_pel_final_neon(sums, res);
}
278 
// 32x64 SAD against four reference blocks; the four sums are written to res.
void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t *res) {
  uint16x8_t sums[4];

  sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 64, sums);
  // 32x64 = 2048 pixels per reference.
  sad_2048_pel_final_neon(sums, res);
}
286 
287 ////////////////////////////////////////////////////////////////////////////////
288 
// 64x32 SAD against four reference blocks; the four sums are written to res.
void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t *res) {
  int row, chunk, ref;
  const uint8_t *refs[4] = { ref_array[0], ref_array[1], ref_array[2],
                             ref_array[3] };
  uint16x8_t sums[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                         vdupq_n_u16(0) };

  for (row = 0; row < 32; ++row) {
    // Each 64-pixel row is processed as four 16-byte chunks; every source
    // load is shared by all four references.
    for (chunk = 0; chunk < 4; ++chunk) {
      const uint8x16_t src_chunk = vld1q_u8(src_ptr + chunk * 16);
      for (ref = 0; ref < 4; ++ref) {
        sad16_neon(refs[ref] + chunk * 16, src_chunk, &sums[ref]);
      }
    }
    src_ptr += src_stride;
    for (ref = 0; ref < 4; ++ref) {
      refs[ref] += ref_stride;
    }
  }

  // 64x32 = 2048 pixels per reference.
  sad_2048_pel_final_neon(sums, res);
}
334 
// 64x64 SAD against four reference blocks; the four sums are written to res.
void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t *res) {
  int row, chunk, ref;
  const uint8_t *refs[4] = { ref_array[0], ref_array[1], ref_array[2],
                             ref_array[3] };
  uint16x8_t sums[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                         vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                         vdupq_n_u16(0), vdupq_n_u16(0) };

  for (row = 0; row < 64; ++row) {
    // Each 64-pixel row is processed as four 16-byte chunks. At 64x64 a
    // single 16-bit accumulator per reference could overflow, so each
    // reference uses two: chunks 0-1 feed sums[2*ref], chunks 2-3 feed
    // sums[2*ref + 1].
    for (chunk = 0; chunk < 4; ++chunk) {
      const uint8x16_t src_chunk = vld1q_u8(src_ptr + chunk * 16);
      for (ref = 0; ref < 4; ++ref) {
        sad16_neon(refs[ref] + chunk * 16, src_chunk,
                   &sums[2 * ref + (chunk >> 1)]);
      }
    }
    src_ptr += src_stride;
    for (ref = 0; ref < 4; ++ref) {
      refs[ref] += ref_stride;
    }
  }

  // 64x64 = 4096 pixels per reference, split across paired accumulators.
  sad_4096_pel_final_neon(sums, res);
}
381