/*===---- ammintrin.h - SSE4a intrinsics -----------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

10 #ifndef __AMMINTRIN_H
11 #define __AMMINTRIN_H
12 
13 #include <pmmintrin.h>
14 
15 /* Define the default attributes for the functions in this file. */
16 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4a"), __min_vector_width__(128)))
17 
/// Extracts the specified bits from the lower 64 bits of the 128-bit
///    integer vector operand at the index \a idx and of the length \a len.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_extracti_si64(__m128i x, const int len, const int idx);
/// \endcode
///
/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
///
/// This is a macro: \a x is converted to a 64-bit-element vector, and
///    \a len and \a idx are each cast to \c char before being handed to the
///    builtin.
///
/// \param x
///    The value from which bits are extracted.
/// \param len
///    Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
///    are zero, the length is interpreted as 64.
/// \param idx
///    Bits [5:0] specify the index of the least significant bit; the other
///    bits are ignored. If the sum of the index and length is greater than 64,
///    the result is undefined. If the length and index are both zero, bits
///    [63:0] of parameter \a x are extracted. If the length is zero but the
///    index is non-zero, the result is undefined.
/// \returns A 128-bit integer vector whose lower 64 bits contain the bits
///    extracted from the source operand.
#define _mm_extracti_si64(x, len, idx) \
  ((__m128i)__builtin_ia32_extrqi((__v2di)(__m128i)(x), \
                                  (char)(len), (char)(idx)))

46 /// Extracts the specified bits from the lower 64 bits of the 128-bit
47 ///    integer vector operand at the index and of the length specified by
48 ///    \a __y.
49 ///
50 /// \headerfile <x86intrin.h>
51 ///
52 /// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
53 ///
54 /// \param __x
55 ///    The value from which bits are extracted.
56 /// \param __y
57 ///    Specifies the index of the least significant bit at [13:8] and the
58 ///    length at [5:0]; all other bits are ignored. If bits [5:0] are zero, the
59 ///    length is interpreted as 64. If the sum of the index and length is
60 ///    greater than 64, the result is undefined. If the length and index are
61 ///    both zero, bits [63:0] of parameter \a __x are extracted. If the length
62 ///    is zero but the index is non-zero, the result is undefined.
63 /// \returns A 128-bit vector whose lower 64 bits contain the bits extracted
64 ///    from the source operand.
65 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_extract_si64(__m128i __x,__m128i __y)66 _mm_extract_si64(__m128i __x, __m128i __y)
67 {
68   return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y);
69 }
70 
/// Inserts bits of a specified length from the source integer vector
///    \a y into the lower 64 bits of the destination integer vector \a x at
///    the index \a idx and of the length \a len.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_inserti_si64(__m128i x, __m128i y, const int len,
///                          const int idx);
/// \endcode
///
/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
///
/// This is a macro: \a x and \a y are converted to 64-bit-element vectors,
///    and \a len and \a idx are each cast to \c char before being handed to
///    the builtin.
///
/// \param x
///    The destination operand where bits will be inserted. The inserted bits
///    are defined by the length \a len and by the index \a idx specifying the
///    least significant bit.
/// \param y
///    The source operand containing the bits to be extracted. The extracted
///    bits are the least significant bits of operand \a y of length \a len.
/// \param len
///    Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
///    are zero, the length is interpreted as 64.
/// \param idx
///    Bits [5:0] specify the index of the least significant bit; the other
///    bits are ignored. If the sum of the index and length is greater than 64,
///    the result is undefined. If the length and index are both zero, bits
///    [63:0] of parameter \a y are inserted into parameter \a x. If the length
///    is zero but the index is non-zero, the result is undefined.
/// \returns A 128-bit integer vector containing the original lower 64-bits of
///    destination operand \a x with the specified bitfields replaced by the
///    lower bits of source operand \a y. The upper 64 bits of the return value
///    are undefined.
#define _mm_inserti_si64(x, y, len, idx) \
  ((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(x), \
                                    (__v2di)(__m128i)(y), \
                                    (char)(len), (char)(idx)))

109 /// Inserts bits of a specified length from the source integer vector
110 ///    \a __y into the lower 64 bits of the destination integer vector \a __x
111 ///    at the index and of the length specified by \a __y.
112 ///
113 /// \headerfile <x86intrin.h>
114 ///
115 /// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
116 ///
117 /// \param __x
118 ///    The destination operand where bits will be inserted. The inserted bits
119 ///    are defined by the length and by the index of the least significant bit
120 ///    specified by operand \a __y.
121 /// \param __y
122 ///    The source operand containing the bits to be extracted. The extracted
123 ///    bits are the least significant bits of operand \a __y with length
124 ///    specified by bits [69:64]. These are inserted into the destination at the
125 ///    index specified by bits [77:72]; all other bits are ignored. If bits
126 ///    [69:64] are zero, the length is interpreted as 64. If the sum of the
127 ///    index and length is greater than 64, the result is undefined. If the
128 ///    length and index are both zero, bits [63:0] of parameter \a __y are
129 ///    inserted into parameter \a __x. If the length is zero but the index is
130 ///    non-zero, the result is undefined.
131 /// \returns A 128-bit integer vector containing the original lower 64-bits of
132 ///    destination operand \a __x with the specified bitfields replaced by the
133 ///    lower bits of source operand \a __y. The upper 64 bits of the return
134 ///    value are undefined.
135 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_insert_si64(__m128i __x,__m128i __y)136 _mm_insert_si64(__m128i __x, __m128i __y)
137 {
138   return (__m128i)__builtin_ia32_insertq((__v2di)__x, (__v2di)__y);
139 }
140 
141 /// Stores a 64-bit double-precision value in a 64-bit memory location.
142 ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
143 ///    used again soon).
144 ///
145 /// \headerfile <x86intrin.h>
146 ///
147 /// This intrinsic corresponds to the <c> MOVNTSD </c> instruction.
148 ///
149 /// \param __p
150 ///    The 64-bit memory location used to store the register value.
151 /// \param __a
152 ///    The 64-bit double-precision floating-point register value to be stored.
153 static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_sd(double * __p,__m128d __a)154 _mm_stream_sd(double *__p, __m128d __a)
155 {
156   __builtin_ia32_movntsd(__p, (__v2df)__a);
157 }
158 
159 /// Stores a 32-bit single-precision floating-point value in a 32-bit
160 ///    memory location. To minimize caching, the data is flagged as
161 ///    non-temporal (unlikely to be used again soon).
162 ///
163 /// \headerfile <x86intrin.h>
164 ///
165 /// This intrinsic corresponds to the <c> MOVNTSS </c> instruction.
166 ///
167 /// \param __p
168 ///    The 32-bit memory location used to store the register value.
169 /// \param __a
170 ///    The 32-bit single-precision floating-point register value to be stored.
171 static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_ss(float * __p,__m128 __a)172 _mm_stream_ss(float *__p, __m128 __a)
173 {
174   __builtin_ia32_movntss(__p, (__v4sf)__a);
175 }
176 
177 #undef __DEFAULT_FN_ATTRS
178 
179 #endif /* __AMMINTRIN_H */
180