/*
 *  Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef AOM_AV1_COMMON_ARM_MEM_NEON_H_
#define AOM_AV1_COMMON_ARM_MEM_NEON_H_

#include <arm_neon.h>
#include <string.h>
#include "aom_dsp/aom_dsp_common.h"
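
// Load/store helpers for NEON AV1 code: each helper reads or writes a small
// block of pixels or coefficients row by row, advancing the pointer by the
// given stride between rows.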

static INLINE void store_row2_u8_8x8(uint8_t *s, int p, const uint8x8_t s0,
                                     const uint8x8_t s1) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
}

/* These intrinsics require immediate values, so we must use #defines
   to enforce that. */
#define load_u8_4x1(s, s0, lane)                                           \
  do {                                                                     \
    *(s0) = vreinterpret_u8_u32(                                           \
        vld1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(*(s0)), lane)); \
  } while (0)
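
/* Illustrative sketch (the variable names below are hypothetical): merge 4
   bytes from src into lane 0 of an existing vector, leaving the upper lanes
   untouched.

     uint8x8_t d = vdup_n_u8(0);
     load_u8_4x1(src, &d, 0);  // bytes 0..3 of d now hold src[0..3]
*/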

static INLINE void load_u8_8x8(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5,
                               uint8x8_t *const s6, uint8x8_t *const s7) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
  s += p;
  *s6 = vld1_u8(s);
  s += p;
  *s7 = vld1_u8(s);
}

static INLINE void load_u8_8x16(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
}

static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
}

static INLINE void load_u16_4x4(const uint16_t *s, const ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
}

static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2, uint16x8_t *const s3) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
}

static INLINE void load_s16_4x8(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4, int16x4_t *const s5,
                                int16x4_t *const s6, int16x4_t *const s7) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
  s += p;
  *s7 = vld1_s16(s);
}

static INLINE void load_s16_4x4(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
}

/* These intrinsics require immediate values, so we must use #defines
   to enforce that. */
#define store_u8_4x1(s, s0, lane)                                  \
  do {                                                             \
    vst1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(s0), lane); \
  } while (0)
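
/* Illustrative sketch (hypothetical names): write the low 4 bytes of a
   uint8x8_t to dst.

     store_u8_4x1(dst, d, 0);  // dst[0..3] = bytes 0..3 of d
*/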

static INLINE void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1, const uint8x8_t s2,
                                const uint8x8_t s3, const uint8x8_t s4,
                                const uint8x8_t s5, const uint8x8_t s6,
                                const uint8x8_t s7) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
  s += p;
  vst1_u8(s, s4);
  s += p;
  vst1_u8(s, s5);
  s += p;
  vst1_u8(s, s6);
  s += p;
  vst1_u8(s, s7);
}

static INLINE void store_u8_8x4(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1, const uint8x8_t s2,
                                const uint8x8_t s3) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
}

static INLINE void store_u8_8x16(uint8_t *s, ptrdiff_t p, const uint8x16_t s0,
                                 const uint8x16_t s1, const uint8x16_t s2,
                                 const uint8x16_t s3) {
  vst1q_u8(s, s0);
  s += p;
  vst1q_u8(s, s1);
  s += p;
  vst1q_u8(s, s2);
  s += p;
  vst1q_u8(s, s3);
}

static INLINE void store_u16_8x8(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2, const uint16x8_t s3,
                                 const uint16x8_t s4, const uint16x8_t s5,
                                 const uint16x8_t s6, const uint16x8_t s7) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
  s += dst_stride;
  vst1q_u16(s, s2);
  s += dst_stride;
  vst1q_u16(s, s3);
  s += dst_stride;
  vst1q_u16(s, s4);
  s += dst_stride;
  vst1q_u16(s, s5);
  s += dst_stride;
  vst1q_u16(s, s6);
  s += dst_stride;
  vst1q_u16(s, s7);
}

static INLINE void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x4_t s0, const uint16x4_t s1,
                                 const uint16x4_t s2, const uint16x4_t s3) {
  vst1_u16(s, s0);
  s += dst_stride;
  vst1_u16(s, s1);
  s += dst_stride;
  vst1_u16(s, s2);
  s += dst_stride;
  vst1_u16(s, s3);
}

static INLINE void store_u16_8x4(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2, const uint16x8_t s3) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
  s += dst_stride;
  vst1q_u16(s, s2);
  s += dst_stride;
  vst1q_u16(s, s3);
}

static INLINE void store_s16_8x8(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1,
                                 const int16x8_t s2, const int16x8_t s3,
                                 const int16x8_t s4, const int16x8_t s5,
                                 const int16x8_t s6, const int16x8_t s7) {
  vst1q_s16(s, s0);
  s += dst_stride;
  vst1q_s16(s, s1);
  s += dst_stride;
  vst1q_s16(s, s2);
  s += dst_stride;
  vst1q_s16(s, s3);
  s += dst_stride;
  vst1q_s16(s, s4);
  s += dst_stride;
  vst1q_s16(s, s5);
  s += dst_stride;
  vst1q_s16(s, s6);
  s += dst_stride;
  vst1q_s16(s, s7);
}

static INLINE void store_s16_4x4(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x4_t s0, const int16x4_t s1,
                                 const int16x4_t s2, const int16x4_t s3) {
  vst1_s16(s, s0);
  s += dst_stride;
  vst1_s16(s, s1);
  s += dst_stride;
  vst1_s16(s, s2);
  s += dst_stride;
  vst1_s16(s, s3);
}

static INLINE void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1,
                                 const int16x8_t s2, const int16x8_t s3) {
  vst1q_s16(s, s0);
  s += dst_stride;
  vst1q_s16(s, s1);
  s += dst_stride;
  vst1q_s16(s, s2);
  s += dst_stride;
  vst1q_s16(s, s3);
}

static INLINE void load_s16_8x8(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4, int16x8_t *const s5,
                                int16x8_t *const s6, int16x8_t *const s7) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
}

static INLINE void load_s16_8x4(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
}

// Load 4 sets of 4 bytes when alignment is not guaranteed.
static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
  uint32_t a;
  uint32x4_t a_u32 = vdupq_n_u32(0);
  if (stride == 4) return vld1q_u8(buf);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 0);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 2);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 3);
  return vreinterpretq_u8_u32(a_u32);
}
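
/* The memcpy into a scalar keeps each 4-byte row access alignment-safe and
   avoids aliasing a uint8_t buffer as uint32_t. A minimal usage sketch
   (hypothetical names):

     const uint8x16_t block = load_unaligned_u8q(src, src_stride);
*/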

static INLINE void load_unaligned_u8_4x8(const uint8_t *buf, int stride,
                                         uint32x2_t *tu0, uint32x2_t *tu1,
                                         uint32x2_t *tu2, uint32x2_t *tu3) {
  uint32_t a;

  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 0);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu1 = vset_lane_u32(a, *tu1, 0);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu1 = vset_lane_u32(a, *tu1, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu2 = vset_lane_u32(a, *tu2, 0);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu2 = vset_lane_u32(a, *tu2, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu3 = vset_lane_u32(a, *tu3, 0);
  memcpy(&a, buf, 4);
  *tu3 = vset_lane_u32(a, *tu3, 1);
}

static INLINE void load_unaligned_u8_4x4(const uint8_t *buf, int stride,
                                         uint32x2_t *tu0, uint32x2_t *tu1) {
  uint32_t a;

  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 0);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu1 = vset_lane_u32(a, *tu1, 0);
  memcpy(&a, buf, 4);
  *tu1 = vset_lane_u32(a, *tu1, 1);
}

static INLINE void load_unaligned_u8_4x1(const uint8_t *buf, int stride,
                                         uint32x2_t *tu0) {
  uint32_t a;

  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 0);
}

static INLINE void load_unaligned_u8_4x2(const uint8_t *buf, int stride,
                                         uint32x2_t *tu0) {
  uint32_t a;

  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 0);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 1);
}

/* These intrinsics require immediate values, so we must use #defines
   to enforce that. */
#define store_unaligned_u8_4x1(dst, src, lane)         \
  do {                                                 \
    uint32_t a;                                        \
    a = vget_lane_u32(vreinterpret_u32_u8(src), lane); \
    memcpy(dst, &a, 4);                                \
  } while (0)
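
/* Illustrative sketch (hypothetical names): scatter the two 32-bit halves of
   a uint8x8_t to two possibly unaligned rows.

     store_unaligned_u8_4x1(dst, d, 0);
     store_unaligned_u8_4x1(dst + dst_stride, d, 1);
*/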

static INLINE void load_unaligned_u8_2x2(const uint8_t *buf, int stride,
                                         uint16x4_t *tu0) {
  uint16_t a;

  memcpy(&a, buf, 2);
  buf += stride;
  *tu0 = vset_lane_u16(a, *tu0, 0);
  memcpy(&a, buf, 2);
  buf += stride;
  *tu0 = vset_lane_u16(a, *tu0, 1);
}

static INLINE void load_u8_16x8(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3,
                                uint8x16_t *const s4, uint8x16_t *const s5,
                                uint8x16_t *const s6, uint8x16_t *const s7) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
  s += p;
  *s4 = vld1q_u8(s);
  s += p;
  *s5 = vld1q_u8(s);
  s += p;
  *s6 = vld1q_u8(s);
  s += p;
  *s7 = vld1q_u8(s);
}

static INLINE void load_u8_16x4(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
}

static INLINE void load_unaligned_u16_4x4(const uint16_t *buf, uint32_t stride,
                                          uint64x2_t *tu0, uint64x2_t *tu1) {
  uint64_t a;

  memcpy(&a, buf, 8);
  buf += stride;
  *tu0 = vsetq_lane_u64(a, *tu0, 0);
  memcpy(&a, buf, 8);
  buf += stride;
  *tu0 = vsetq_lane_u64(a, *tu0, 1);
  memcpy(&a, buf, 8);
  buf += stride;
  *tu1 = vsetq_lane_u64(a, *tu1, 0);
  memcpy(&a, buf, 8);
  *tu1 = vsetq_lane_u64(a, *tu1, 1);
}

static INLINE void load_s32_4x4(int32_t *s, int32_t p, int32x4_t *s1,
                                int32x4_t *s2, int32x4_t *s3, int32x4_t *s4) {
  *s1 = vld1q_s32(s);
  s += p;
  *s2 = vld1q_s32(s);
  s += p;
  *s3 = vld1q_s32(s);
  s += p;
  *s4 = vld1q_s32(s);
}

static INLINE void store_s32_4x4(int32_t *s, int32_t p, int32x4_t s1,
                                 int32x4_t s2, int32x4_t s3, int32x4_t s4) {
  vst1q_s32(s, s1);
  s += p;
  vst1q_s32(s, s2);
  s += p;
  vst1q_s32(s, s3);
  s += p;
  vst1q_s32(s, s4);
}

static INLINE void load_u32_4x4(uint32_t *s, int32_t p, uint32x4_t *s1,
                                uint32x4_t *s2, uint32x4_t *s3,
                                uint32x4_t *s4) {
  *s1 = vld1q_u32(s);
  s += p;
  *s2 = vld1q_u32(s);
  s += p;
  *s3 = vld1q_u32(s);
  s += p;
  *s4 = vld1q_u32(s);
}

static INLINE void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1,
                                 uint32x4_t s2, uint32x4_t s3, uint32x4_t s4) {
  vst1q_u32(s, s1);
  s += p;
  vst1q_u32(s, s2);
  s += p;
  vst1q_u32(s, s3);
  s += p;
  vst1q_u32(s, s4);
}

static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
  const int32x4_t v0 = vld1q_s32(buf);
  const int32x4_t v1 = vld1q_s32(buf + 4);
  const int16x4_t s0 = vmovn_s32(v0);
  const int16x4_t s1 = vmovn_s32(v1);
  return vcombine_s16(s0, s1);
}

static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
  const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
  const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
  vst1q_s32(buf, v0);
  vst1q_s32(buf + 4, v1);
}
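
/* These two helpers assume tran_low_t is a 32-bit type: coefficients are
   narrowed to int16x8_t on load and widened back on store. A minimal
   round-trip sketch (hypothetical buffer name):

     int16x8_t coeffs = load_tran_low_to_s16q(coeff_buf);
     store_s16q_to_tran_low(coeff_buf, coeffs);
*/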

#endif  // AOM_AV1_COMMON_ARM_MEM_NEON_H_