/*===---- arm_acle.h - ARM Non-Neon intrinsics -----------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __ARM_ACLE_H
#define __ARM_ACLE_H

#ifndef __ARM_ACLE
#error "ACLE intrinsics support not enabled."
#endif

#include <stdint.h>

#if defined(__cplusplus)
extern "C" {
#endif

/* 8 SYNCHRONIZATION, BARRIER AND HINT INTRINSICS */
/* 8.3 Memory barriers */
#if !defined(_MSC_VER)
#define __dmb(i) __builtin_arm_dmb(i)
#define __dsb(i) __builtin_arm_dsb(i)
#define __isb(i) __builtin_arm_isb(i)
#endif
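
/*
 * Illustrative usage (not part of the ACLE definitions above): the argument
 * selects the barrier domain and access kinds; for example, 15 encodes the
 * full-system "SY" variant.
 *
 *   __dmb(15);  // full-system data memory barrier, all observers
 */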

/* 8.4 Hints */

#if !defined(_MSC_VER)
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfi(void) {
  __builtin_arm_wfi();
}

static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfe(void) {
  __builtin_arm_wfe();
}

static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sev(void) {
  __builtin_arm_sev();
}

static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sevl(void) {
  __builtin_arm_sevl();
}

static __inline__ void __attribute__((__always_inline__, __nodebug__)) __yield(void) {
  __builtin_arm_yield();
}
#endif
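
/*
 * Illustrative usage (not part of the ACLE definitions above): __yield() is
 * typically placed in spin-wait loops as a hint that the core may give
 * resources to another thread; flag_is_set() below is a hypothetical
 * predicate used only for illustration.
 *
 *   while (!flag_is_set())
 *     __yield();
 */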

#if __ARM_32BIT_STATE
#define __dbg(t) __builtin_arm_dbg(t)
#endif

/* 8.5 Swap */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__swp(uint32_t __x, volatile uint32_t *__p) {
  uint32_t __v;
  do
    __v = __builtin_arm_ldrex(__p);
  while (__builtin_arm_strex(__x, __p));
  return __v;
}
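
/*
 * Illustrative usage (not part of the ACLE definitions above): atomically
 * store a new value and return the previous one.
 *
 *   volatile uint32_t word;
 *   uint32_t old = __swp(1, &word);  // word = 1; old = previous contents
 *
 * New code is usually better served by the C11 <stdatomic.h> operations.
 */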

/* 8.6 Memory prefetch intrinsics */
/* 8.6.1 Data prefetch */
#define __pld(addr) __pldx(0, 0, 0, addr)

#if __ARM_32BIT_STATE
#define __pldx(access_kind, cache_level, retention_policy, addr) \
  __builtin_arm_prefetch(addr, access_kind, 1)
#else
#define __pldx(access_kind, cache_level, retention_policy, addr) \
  __builtin_arm_prefetch(addr, access_kind, cache_level, retention_policy, 1)
#endif
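
/*
 * Illustrative usage (not part of the ACLE definitions above), using the
 * ACLE operand encodings (access_kind: 0 = read, 1 = write; cache_level:
 * 0 = L1; retention_policy: 0 = temporal/keep):
 *
 *   __pld(p);            // prefetch *p for reading
 *   __pldx(1, 0, 0, p);  // prefetch *p for writing, into L1, temporal
 */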

/* 8.6.2 Instruction prefetch */
#define __pli(addr) __plix(0, 0, addr)

#if __ARM_32BIT_STATE
#define __plix(cache_level, retention_policy, addr) \
  __builtin_arm_prefetch(addr, 0, 0)
#else
#define __plix(cache_level, retention_policy, addr) \
  __builtin_arm_prefetch(addr, 0, cache_level, retention_policy, 0)
#endif

/* 8.7 NOP */
#if !defined(_MSC_VER) || !defined(__aarch64__)
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __nop(void) {
  __builtin_arm_nop();
}
#endif

/* 9 DATA-PROCESSING INTRINSICS */
/* 9.2 Miscellaneous data-processing intrinsics */
/* ROR */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__ror(uint32_t __x, uint32_t __y) {
  __y %= 32;
  if (__y == 0)
    return __x;
  return (__x >> __y) | (__x << (32 - __y));
}

static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rorll(uint64_t __x, uint32_t __y) {
  __y %= 64;
  if (__y == 0)
    return __x;
  return (__x >> __y) | (__x << (64 - __y));
}

static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rorl(unsigned long __x, uint32_t __y) {
#if __SIZEOF_LONG__ == 4
  return __ror(__x, __y);
#else
  return __rorll(__x, __y);
#endif
}
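
/*
 * Illustrative example (not part of the ACLE definitions above):
 *
 *   __ror(0x80000001U, 4) == 0x18000000U  // low 4 bits wrap to the top
 */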

/* CLZ */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__clz(uint32_t __t) {
  return __builtin_clz(__t);
}

static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__clzl(unsigned long __t) {
  return __builtin_clzl(__t);
}

static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__clzll(uint64_t __t) {
  return __builtin_clzll(__t);
}

/* CLS */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__cls(uint32_t __t) {
  return __builtin_arm_cls(__t);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__clsl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __builtin_arm_cls(__t);
#else
  return __builtin_arm_cls64(__t);
#endif
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__clsll(uint64_t __t) {
  return __builtin_arm_cls64(__t);
}
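
/*
 * Illustrative examples (not part of the ACLE definitions above):
 *
 *   __clz(0x00800000U) == 8    // bit 23 is the highest set bit: 8 leading zeros
 *   __cls(0xFFFF0000U) == 15   // leading sign bits, not counting bit 31
 */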

/* REV */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__rev(uint32_t __t) {
  return __builtin_bswap32(__t);
}

static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__revl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __builtin_bswap32(__t);
#else
  return __builtin_bswap64(__t);
#endif
}

static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__revll(uint64_t __t) {
  return __builtin_bswap64(__t);
}

/* REV16 */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__rev16(uint32_t __t) {
  return __ror(__rev(__t), 16);
}

static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rev16ll(uint64_t __t) {
  return (((uint64_t)__rev16(__t >> 32)) << 32) | __rev16((uint32_t)__t);
}

static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rev16l(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __rev16(__t);
#else
  return __rev16ll(__t);
#endif
}
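
/*
 * Illustrative examples (not part of the ACLE definitions above):
 *
 *   __rev(0x11223344U)   == 0x44332211U  // byte-reverse the whole word
 *   __rev16(0x11223344U) == 0x22114433U  // byte-reverse each halfword
 */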

/* REVSH */
static __inline__ int16_t __attribute__((__always_inline__, __nodebug__))
__revsh(int16_t __t) {
  return __builtin_bswap16(__t);
}

/* RBIT */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__rbit(uint32_t __t) {
  return __builtin_arm_rbit(__t);
}

static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rbitll(uint64_t __t) {
#if __ARM_32BIT_STATE
  return (((uint64_t)__builtin_arm_rbit((uint32_t)__t)) << 32) |
         __builtin_arm_rbit(__t >> 32);
#else
  return __builtin_arm_rbit64(__t);
#endif
}

static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rbitl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __rbit(__t);
#else
  return __rbitll(__t);
#endif
}
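
/*
 * Illustrative example (not part of the ACLE definitions above):
 *
 *   __rbit(0x00000001U) == 0x80000000U  // bit 0 moves to bit 31
 */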

/*
 * 9.3 16-bit multiplications
 */
#if __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smulbb(int32_t __a, int32_t __b) {
  return __builtin_arm_smulbb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smulbt(int32_t __a, int32_t __b) {
  return __builtin_arm_smulbt(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smultb(int32_t __a, int32_t __b) {
  return __builtin_arm_smultb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smultt(int32_t __a, int32_t __b) {
  return __builtin_arm_smultt(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smulwb(int32_t __a, int32_t __b) {
  return __builtin_arm_smulwb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smulwt(int32_t __a, int32_t __b) {
  return __builtin_arm_smulwt(__a, __b);
}
#endif
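
/*
 * Illustrative example (not part of the ACLE definitions above): the b/t
 * suffixes select the bottom/top halfword of each operand, so __smulbb
 * multiplies the two bottom halfwords:
 *
 *   __smulbb(0x00020003, 0x00040005) == 15  // 0x0003 * 0x0005
 */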

/*
 * 9.4 Saturating intrinsics
 *
 * FIXME: Change guard to their corresponding __ARM_FEATURE flag when Q flag
 * intrinsics are implemented and the flag is enabled.
 */
/* 9.4.1 Width-specified saturation intrinsics */
#if __ARM_FEATURE_SAT
#define __ssat(x, y) __builtin_arm_ssat(x, y)
#define __usat(x, y) __builtin_arm_usat(x, y)
#endif
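
/*
 * Illustrative examples (not part of the ACLE definitions above): the
 * second operand gives the saturation bit-width:
 *
 *   __ssat(300, 8) == 127  // clamped to the signed 8-bit range [-128, 127]
 *   __usat(300, 8) == 255  // clamped to the unsigned 8-bit range [0, 255]
 */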

/* 9.4.2 Saturating addition and subtraction intrinsics */
#if __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__qadd(int32_t __t, int32_t __v) {
  return __builtin_arm_qadd(__t, __v);
}

static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__qsub(int32_t __t, int32_t __v) {
  return __builtin_arm_qsub(__t, __v);
}

static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__qdbl(int32_t __t) {
  return __builtin_arm_qadd(__t, __t);
}
#endif
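
/*
 * Illustrative example (not part of the ACLE definitions above): the result
 * saturates instead of wrapping, setting the sticky Q flag on overflow:
 *
 *   __qadd(0x7FFFFFFF, 1) == 0x7FFFFFFF  // stays at INT32_MAX
 */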

/* 9.4.3 Accumulating multiplications */
#if __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlabb(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlabb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlabt(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlabt(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlatb(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlatb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlatt(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlatt(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlawb(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlawb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlawt(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlawt(__a, __b, __c);
}
#endif
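
/*
 * Illustrative example (not part of the ACLE definitions above): multiply
 * the selected halfwords and accumulate into the third operand:
 *
 *   __smlabb(0x00020003, 0x00040005, 100) == 115  // 0x0003 * 0x0005 + 100
 */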

/* 9.5.4 Parallel 16-bit saturation */
#if __ARM_FEATURE_SIMD32
#define __ssat16(x, y) __builtin_arm_ssat16(x, y)
#define __usat16(x, y) __builtin_arm_usat16(x, y)
#endif

/* 9.5.5 Packing and unpacking */
#if __ARM_FEATURE_SIMD32
typedef int32_t int8x4_t;
typedef int32_t int16x2_t;
typedef uint32_t uint8x4_t;
typedef uint32_t uint16x2_t;

static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sxtab16(int16x2_t __a, int8x4_t __b) {
  return __builtin_arm_sxtab16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sxtb16(int8x4_t __a) {
  return __builtin_arm_sxtb16(__a);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__uxtab16(int16x2_t __a, int8x4_t __b) {
  return __builtin_arm_uxtab16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__uxtb16(int8x4_t __a) {
  return __builtin_arm_uxtb16(__a);
}
#endif
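
/*
 * Illustrative example (not part of the ACLE definitions above): __sxtb16
 * sign-extends bytes 0 and 2 of its argument into the two halfword lanes
 * of the result:
 *
 *   __sxtb16(0x00FF0080) == (int16x2_t)0xFFFFFF80  // lanes -1 and -128
 */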

/* 9.5.6 Parallel selection */
#if __ARM_FEATURE_SIMD32
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__sel(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_sel(__a, __b);
}
#endif
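
/*
 * Illustrative idiom (not part of the ACLE definitions above): __sel picks
 * each byte from its first operand where the corresponding GE flag was set
 * by a preceding parallel instruction, and from the second otherwise.
 * Since __usub8 sets GE per byte where a >= b, the pair computes a
 * per-byte maximum:
 *
 *   __usub8(a, b);              // compare byte lanes, setting the GE flags
 *   uint8x4_t m = __sel(a, b);  // per-byte max(a, b)
 */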

/* 9.5.7 Parallel 8-bit addition and subtraction */
#if __ARM_FEATURE_SIMD32
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__qadd8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_qadd8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__qsub8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_qsub8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__sadd8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_sadd8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__shadd8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_shadd8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__shsub8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_shsub8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__ssub8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_ssub8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uadd8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uadd8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uhadd8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uhadd8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uhsub8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uhsub8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uqadd8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uqadd8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uqsub8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uqsub8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__usub8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_usub8(__a, __b);
}
#endif
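
/*
 * Illustrative example (not part of the ACLE definitions above): the
 * q-prefixed forms saturate each byte lane instead of wrapping:
 *
 *   __uqadd8(0xFF010203U, 0x02020202U) == 0xFF030405U  // top lane clamps
 */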

/* 9.5.8 Sum of 8-bit absolute differences */
#if __ARM_FEATURE_SIMD32
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__usad8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_usad8(__a, __b);
}
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__usada8(uint8x4_t __a, uint8x4_t __b, uint32_t __c) {
  return __builtin_arm_usada8(__a, __b, __c);
}
#endif
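
/*
 * Illustrative example (not part of the ACLE definitions above):
 *
 *   __usad8(0x01020304U, 0x04030201U) == 8  // |1-4| + |2-3| + |3-2| + |4-1|
 */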

/* 9.5.9 Parallel 16-bit addition and subtraction */
#if __ARM_FEATURE_SIMD32
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qadd16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qadd16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qasx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qasx(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qsax(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qsax(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qsub16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qsub16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sadd16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_sadd16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sasx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_sasx(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shadd16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_shadd16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shasx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_shasx(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shsax(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_shsax(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shsub16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_shsub16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__ssax(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_ssax(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__ssub16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_ssub16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uadd16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uadd16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uasx(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uasx(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhadd16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uhadd16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhasx(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uhasx(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhsax(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uhsax(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhsub16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uhsub16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqadd16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uqadd16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqasx(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uqasx(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqsax(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uqsax(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqsub16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uqsub16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__usax(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_usax(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__usub16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_usub16(__a, __b);
}
#endif
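
/*
 * Illustrative example (not part of the ACLE definitions above): each
 * halfword lane is computed independently; the q-prefixed forms saturate:
 *
 *   __qadd16(0x7FFF0001, 0x00010001) == 0x7FFF0002  // top lane clamps
 */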

/* 9.5.10 Parallel 16-bit multiplications */
#if __ARM_FEATURE_SIMD32
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlad(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smlad(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smladx(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smladx(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlald(int16x2_t __a, int16x2_t __b, int64_t __c) {
  return __builtin_arm_smlald(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlaldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
  return __builtin_arm_smlaldx(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlsd(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smlsd(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlsdx(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smlsdx(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlsld(int16x2_t __a, int16x2_t __b, int64_t __c) {
  return __builtin_arm_smlsld(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlsldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
  return __builtin_arm_smlsldx(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smuad(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_smuad(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smuadx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_smuadx(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smusd(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_smusd(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smusdx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_smusdx(__a, __b);
}
#endif
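
/*
 * Illustrative example (not part of the ACLE definitions above): __smuad
 * multiplies the two halfword lane pairs and sums the products (the
 * x-suffixed forms swap the halfwords of the second operand first):
 *
 *   __smuad(0x00020003, 0x00040005) == 23  // 2*4 + 3*5
 */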

/* 9.7 CRC32 intrinsics */
#if __ARM_FEATURE_CRC32
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__crc32b(uint32_t __a, uint8_t __b) {
  return __builtin_arm_crc32b(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__crc32h(uint32_t __a, uint16_t __b) {
  return __builtin_arm_crc32h(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__crc32w(uint32_t __a, uint32_t __b) {
  return __builtin_arm_crc32w(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__crc32d(uint32_t __a, uint64_t __b) {
  return __builtin_arm_crc32d(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__crc32cb(uint32_t __a, uint8_t __b) {
  return __builtin_arm_crc32cb(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__crc32ch(uint32_t __a, uint16_t __b) {
  return __builtin_arm_crc32ch(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__crc32cw(uint32_t __a, uint32_t __b) {
  return __builtin_arm_crc32cw(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__crc32cd(uint32_t __a, uint64_t __b) {
  return __builtin_arm_crc32cd(__a, __b);
}
#endif
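
/*
 * Illustrative usage (not part of the ACLE definitions above): accumulate a
 * CRC over a buffer. The all-ones seed and final inversion shown here are
 * the common CRC-32 convention; the __crc32c* forms use the Castagnoli
 * polynomial instead.
 *
 *   uint32_t crc = 0xFFFFFFFFU;
 *   for (size_t i = 0; i < len; ++i)
 *     crc = __crc32b(crc, buf[i]);
 *   crc = ~crc;
 */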

/* Armv8.3-A Javascript conversion intrinsic */
#if __ARM_64BIT_STATE && defined(__ARM_FEATURE_JCVT)
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__jcvt(double __a) {
  return __builtin_arm_jcvt(__a);
}
#endif
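
/*
 * Illustrative example (not part of the ACLE definitions above): __jcvt
 * converts with ECMAScript ToInt32 semantics, i.e. truncation toward zero
 * with out-of-range values wrapping modulo 2^32:
 *
 *   __jcvt(3.9) == 3
 */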

/* 10.1 Special register intrinsics */
#define __arm_rsr(sysreg) __builtin_arm_rsr(sysreg)
#define __arm_rsr64(sysreg) __builtin_arm_rsr64(sysreg)
#define __arm_rsrp(sysreg) __builtin_arm_rsrp(sysreg)
#define __arm_rsrf(sysreg) __builtin_bit_cast(float, __arm_rsr(sysreg))
#define __arm_rsrf64(sysreg) __builtin_bit_cast(double, __arm_rsr64(sysreg))
#define __arm_wsr(sysreg, v) __builtin_arm_wsr(sysreg, v)
#define __arm_wsr64(sysreg, v) __builtin_arm_wsr64(sysreg, v)
#define __arm_wsrp(sysreg, v) __builtin_arm_wsrp(sysreg, v)
#define __arm_wsrf(sysreg, v) __arm_wsr(sysreg, __builtin_bit_cast(uint32_t, v))
#define __arm_wsrf64(sysreg, v) __arm_wsr64(sysreg, __builtin_bit_cast(uint64_t, v))
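
/*
 * Illustrative usage (not part of the ACLE definitions above): sysreg is a
 * string constant naming the register. For example, on AArch64 the virtual
 * counter is normally readable from EL0:
 *
 *   uint64_t ticks = __arm_rsr64("CNTVCT_EL0");
 */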

/* Memory Tagging Extensions (MTE) Intrinsics */
#if __ARM_FEATURE_MEMORY_TAGGING
#define __arm_mte_create_random_tag(__ptr, __mask)  __builtin_arm_irg(__ptr, __mask)
#define __arm_mte_increment_tag(__ptr, __tag_offset)  __builtin_arm_addg(__ptr, __tag_offset)
#define __arm_mte_exclude_tag(__ptr, __excluded)  __builtin_arm_gmi(__ptr, __excluded)
#define __arm_mte_get_tag(__ptr) __builtin_arm_ldg(__ptr)
#define __arm_mte_set_tag(__ptr) __builtin_arm_stg(__ptr)
#define __arm_mte_ptrdiff(__ptra, __ptrb) __builtin_arm_subp(__ptra, __ptrb)
#endif
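
/*
 * Illustrative usage (not part of the ACLE definitions above), assuming p
 * already points into memory with tag checking enabled:
 *
 *   int *t = __arm_mte_create_random_tag(p, 0);  // choose a random tag
 *   __arm_mte_set_tag(t);                        // tag the granule at t
 *   int *q = __arm_mte_get_tag(p);               // load the stored tag
 */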

/* Transactional Memory Extension (TME) Intrinsics */
#if __ARM_FEATURE_TME

#define _TMFAILURE_REASON  0x00007fffu
#define _TMFAILURE_RTRY    0x00008000u
#define _TMFAILURE_CNCL    0x00010000u
#define _TMFAILURE_MEM     0x00020000u
#define _TMFAILURE_IMP     0x00040000u
#define _TMFAILURE_ERR     0x00080000u
#define _TMFAILURE_SIZE    0x00100000u
#define _TMFAILURE_NEST    0x00200000u
#define _TMFAILURE_DBG     0x00400000u
#define _TMFAILURE_INT     0x00800000u
#define _TMFAILURE_TRIVIAL 0x01000000u

#define __tstart()        __builtin_arm_tstart()
#define __tcommit()       __builtin_arm_tcommit()
#define __tcancel(__arg)  __builtin_arm_tcancel(__arg)
#define __ttest()         __builtin_arm_ttest()

#endif /* __ARM_FEATURE_TME */
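
/*
 * Illustrative usage (not part of the ACLE definitions above): __tstart()
 * returns 0 once the transaction has started, or a failure status whose
 * _TMFAILURE_RTRY bit indicates the transaction may succeed on retry:
 *
 *   uint64_t status;
 *   do {
 *     status = __tstart();
 *     if (status == 0) {
 *       // ... transactional region ...
 *       __tcommit();
 *       break;
 *     }
 *   } while (status & _TMFAILURE_RTRY);
 */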

#if defined(__cplusplus)
}
#endif

#endif /* __ARM_ACLE_H */