1 /*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 #ifndef __PMMINTRIN_H
11 #define __PMMINTRIN_H
12 
/* Reject compilation unless one of the x86 target macros is defined. */
#if !(defined(__i386__) || defined(__x86_64__))
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
16 
17 #include <emmintrin.h>
18 
/* Attribute set shared by every intrinsic function defined in this header:
 * always-inline, no-debug, the "sse3,no-evex512" target string and a minimum
 * vector width of 128. The macro is #undef'd again at the end of the file. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __min_vector_width__(128),    \
                 __target__("sse3,no-evex512")))
23 
24 /// Loads data from an unaligned memory location to elements in a 128-bit
25 ///    vector.
26 ///
27 ///    If the address of the data is not 16-byte aligned, the instruction may
28 ///    read two adjacent aligned blocks of memory to retrieve the requested
29 ///    data.
30 ///
31 /// \headerfile <x86intrin.h>
32 ///
33 /// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
34 ///
35 /// \param __p
36 ///    A pointer to a 128-bit integer vector containing integer values.
37 /// \returns A 128-bit vector containing the moved values.
38 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_lddqu_si128(__m128i_u const * __p)39 _mm_lddqu_si128(__m128i_u const *__p)
40 {
41   return (__m128i)__builtin_ia32_lddqu((char const *)__p);
42 }
43 
44 /// Adds the even-indexed values and subtracts the odd-indexed values of
45 ///    two 128-bit vectors of [4 x float].
46 ///
47 /// \headerfile <x86intrin.h>
48 ///
49 /// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
50 ///
51 /// \param __a
52 ///    A 128-bit vector of [4 x float] containing the left source operand.
53 /// \param __b
54 ///    A 128-bit vector of [4 x float] containing the right source operand.
55 /// \returns A 128-bit vector of [4 x float] containing the alternating sums and
56 ///    differences of both operands.
57 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_addsub_ps(__m128 __a,__m128 __b)58 _mm_addsub_ps(__m128 __a, __m128 __b)
59 {
60   return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b);
61 }
62 
63 /// Horizontally adds the adjacent pairs of values contained in two
64 ///    128-bit vectors of [4 x float].
65 ///
66 /// \headerfile <x86intrin.h>
67 ///
68 /// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
69 ///
70 /// \param __a
71 ///    A 128-bit vector of [4 x float] containing one of the source operands.
72 ///    The horizontal sums of the values are stored in the lower bits of the
73 ///    destination.
74 /// \param __b
75 ///    A 128-bit vector of [4 x float] containing one of the source operands.
76 ///    The horizontal sums of the values are stored in the upper bits of the
77 ///    destination.
78 /// \returns A 128-bit vector of [4 x float] containing the horizontal sums of
79 ///    both operands.
80 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_hadd_ps(__m128 __a,__m128 __b)81 _mm_hadd_ps(__m128 __a, __m128 __b)
82 {
83   return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b);
84 }
85 
86 /// Horizontally subtracts the adjacent pairs of values contained in two
87 ///    128-bit vectors of [4 x float].
88 ///
89 /// \headerfile <x86intrin.h>
90 ///
91 /// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
92 ///
93 /// \param __a
94 ///    A 128-bit vector of [4 x float] containing one of the source operands.
95 ///    The horizontal differences between the values are stored in the lower
96 ///    bits of the destination.
97 /// \param __b
98 ///    A 128-bit vector of [4 x float] containing one of the source operands.
99 ///    The horizontal differences between the values are stored in the upper
100 ///    bits of the destination.
101 /// \returns A 128-bit vector of [4 x float] containing the horizontal
102 ///    differences of both operands.
103 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_hsub_ps(__m128 __a,__m128 __b)104 _mm_hsub_ps(__m128 __a, __m128 __b)
105 {
106   return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b);
107 }
108 
109 /// Moves and duplicates odd-indexed values from a 128-bit vector
110 ///    of [4 x float] to float values stored in a 128-bit vector of
111 ///    [4 x float].
112 ///
113 /// \headerfile <x86intrin.h>
114 ///
115 /// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
116 ///
117 /// \param __a
118 ///    A 128-bit vector of [4 x float]. \n
119 ///    Bits [127:96] of the source are written to bits [127:96] and [95:64] of
120 ///    the destination. \n
121 ///    Bits [63:32] of the source are written to bits [63:32] and [31:0] of the
122 ///    destination.
123 /// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
124 ///    values.
125 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_movehdup_ps(__m128 __a)126 _mm_movehdup_ps(__m128 __a)
127 {
128   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
129 }
130 
131 /// Duplicates even-indexed values from a 128-bit vector of
132 ///    [4 x float] to float values stored in a 128-bit vector of [4 x float].
133 ///
134 /// \headerfile <x86intrin.h>
135 ///
136 /// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
137 ///
138 /// \param __a
139 ///    A 128-bit vector of [4 x float] \n
140 ///    Bits [95:64] of the source are written to bits [127:96] and [95:64] of
141 ///    the destination. \n
142 ///    Bits [31:0] of the source are written to bits [63:32] and [31:0] of the
143 ///    destination.
144 /// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
145 ///    values.
146 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_moveldup_ps(__m128 __a)147 _mm_moveldup_ps(__m128 __a)
148 {
149   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2);
150 }
151 
152 /// Adds the even-indexed values and subtracts the odd-indexed values of
153 ///    two 128-bit vectors of [2 x double].
154 ///
155 /// \headerfile <x86intrin.h>
156 ///
157 /// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
158 ///
159 /// \param __a
160 ///    A 128-bit vector of [2 x double] containing the left source operand.
161 /// \param __b
162 ///    A 128-bit vector of [2 x double] containing the right source operand.
163 /// \returns A 128-bit vector of [2 x double] containing the alternating sums
164 ///    and differences of both operands.
165 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_addsub_pd(__m128d __a,__m128d __b)166 _mm_addsub_pd(__m128d __a, __m128d __b)
167 {
168   return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b);
169 }
170 
171 /// Horizontally adds the pairs of values contained in two 128-bit
172 ///    vectors of [2 x double].
173 ///
174 /// \headerfile <x86intrin.h>
175 ///
176 /// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
177 ///
178 /// \param __a
179 ///    A 128-bit vector of [2 x double] containing one of the source operands.
180 ///    The horizontal sum of the values is stored in the lower bits of the
181 ///    destination.
182 /// \param __b
183 ///    A 128-bit vector of [2 x double] containing one of the source operands.
184 ///    The horizontal sum of the values is stored in the upper bits of the
185 ///    destination.
186 /// \returns A 128-bit vector of [2 x double] containing the horizontal sums of
187 ///    both operands.
188 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_hadd_pd(__m128d __a,__m128d __b)189 _mm_hadd_pd(__m128d __a, __m128d __b)
190 {
191   return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b);
192 }
193 
194 /// Horizontally subtracts the pairs of values contained in two 128-bit
195 ///    vectors of [2 x double].
196 ///
197 /// \headerfile <x86intrin.h>
198 ///
199 /// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
200 ///
201 /// \param __a
202 ///    A 128-bit vector of [2 x double] containing one of the source operands.
203 ///    The horizontal difference of the values is stored in the lower bits of
204 ///    the destination.
205 /// \param __b
206 ///    A 128-bit vector of [2 x double] containing one of the source operands.
207 ///    The horizontal difference of the values is stored in the upper bits of
208 ///    the destination.
209 /// \returns A 128-bit vector of [2 x double] containing the horizontal
210 ///    differences of both operands.
211 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_hsub_pd(__m128d __a,__m128d __b)212 _mm_hsub_pd(__m128d __a, __m128d __b)
213 {
214   return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b);
215 }
216 
/// Moves one double-precision value and duplicates it into the
///    double-precision values of a 128-bit vector of [2 x double].
///
///    This macro simply forwards its argument to \c _mm_load1_pd.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_loaddup_pd(double const *dp);
/// \endcode
///
/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
///
/// \param dp
///    A pointer to a double-precision value to be moved and duplicated.
/// \returns A 128-bit vector of [2 x double] containing the moved and
///    duplicated values.
#define _mm_loaddup_pd(dp) _mm_load1_pd(dp)
233 
234 /// Moves and duplicates the double-precision value in the lower bits of
235 ///    a 128-bit vector of [2 x double] to double-precision values stored in a
236 ///    128-bit vector of [2 x double].
237 ///
238 /// \headerfile <x86intrin.h>
239 ///
240 /// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
241 ///
242 /// \param __a
243 ///    A 128-bit vector of [2 x double]. Bits [63:0] are written to bits
244 ///    [127:64] and [63:0] of the destination.
245 /// \returns A 128-bit vector of [2 x double] containing the moved and
246 ///    duplicated values.
247 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_movedup_pd(__m128d __a)248 _mm_movedup_pd(__m128d __a)
249 {
250   return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
251 }
252 
253 /// Establishes a linear address memory range to be monitored and puts
254 ///    the processor in the monitor event pending state. Data stored in the
255 ///    monitored address range causes the processor to exit the pending state.
256 ///
257 /// The \c MONITOR instruction can be used in kernel mode, and in other modes
258 /// if MSR <c> C001_0015h[MonMwaitUserEn] </c> is set.
259 ///
260 /// \headerfile <x86intrin.h>
261 ///
262 /// This intrinsic corresponds to the \c MONITOR instruction.
263 ///
264 /// \param __p
265 ///    The memory range to be monitored. The size of the range is determined by
266 ///    CPUID function 0000_0005h.
267 /// \param __extensions
268 ///    Optional extensions for the monitoring state.
269 /// \param __hints
270 ///    Optional hints for the monitoring state.
271 static __inline__ void __DEFAULT_FN_ATTRS
_mm_monitor(void const * __p,unsigned __extensions,unsigned __hints)272 _mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
273 {
274   __builtin_ia32_monitor(__p, __extensions, __hints);
275 }
276 
277 /// Used with the \c MONITOR instruction to wait while the processor is in
278 ///    the monitor event pending state. Data stored in the monitored address
279 ///    range, or an interrupt, causes the processor to exit the pending state.
280 ///
281 /// The \c MWAIT instruction can be used in kernel mode, and in other modes if
282 /// MSR <c> C001_0015h[MonMwaitUserEn] </c> is set.
283 ///
284 /// \headerfile <x86intrin.h>
285 ///
286 /// This intrinsic corresponds to the \c MWAIT instruction.
287 ///
288 /// \param __extensions
289 ///    Optional extensions for the monitoring state, which can vary by
290 ///    processor.
291 /// \param __hints
292 ///    Optional hints for the monitoring state, which can vary by processor.
293 static __inline__ void __DEFAULT_FN_ATTRS
_mm_mwait(unsigned __extensions,unsigned __hints)294 _mm_mwait(unsigned __extensions, unsigned __hints)
295 {
296   __builtin_ia32_mwait(__extensions, __hints);
297 }
298 
299 #undef __DEFAULT_FN_ATTRS
300 
301 #endif /* __PMMINTRIN_H */
302