1 /*===---- fmaintrin.h - FMA intrinsics -------------------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10 #ifndef __IMMINTRIN_H
11 #error "Never use <fmaintrin.h> directly; include <immintrin.h> instead."
12 #endif
13
14 #ifndef __FMAINTRIN_H
15 #define __FMAINTRIN_H
16
/* Attribute sets applied to every intrinsic in this header. Both request
 * always-inline, nodebug and the "fma" target feature; they differ only in
 * the minimum vector width: 128 bits for the __m128/__m128d functions and
 * 256 bits for the __m256/__m256d functions. */
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("fma"),            \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("fma"),            \
                 __min_vector_width__(256)))
20
21 /// Computes a multiply-add of 128-bit vectors of [4 x float].
22 /// For each element, computes <c> (__A * __B) + __C </c>.
23 ///
24 /// \headerfile <immintrin.h>
25 ///
26 /// This intrinsic corresponds to the \c VFMADD213PS instruction.
27 ///
28 /// \param __A
29 /// A 128-bit vector of [4 x float] containing the multiplicand.
30 /// \param __B
31 /// A 128-bit vector of [4 x float] containing the multiplier.
32 /// \param __C
33 /// A 128-bit vector of [4 x float] containing the addend.
34 /// \returns A 128-bit vector of [4 x float] containing the result.
35 static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fmadd_ps(__m128 __A,__m128 __B,__m128 __C)36 _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
37 {
38 return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
39 }
40
41 /// Computes a multiply-add of 128-bit vectors of [2 x double].
42 /// For each element, computes <c> (__A * __B) + __C </c>.
43 ///
44 /// \headerfile <immintrin.h>
45 ///
46 /// This intrinsic corresponds to the \c VFMADD213PD instruction.
47 ///
48 /// \param __A
49 /// A 128-bit vector of [2 x double] containing the multiplicand.
50 /// \param __B
51 /// A 128-bit vector of [2 x double] containing the multiplier.
52 /// \param __C
53 /// A 128-bit vector of [2 x double] containing the addend.
54 /// \returns A 128-bit [2 x double] vector containing the result.
55 static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fmadd_pd(__m128d __A,__m128d __B,__m128d __C)56 _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
57 {
58 return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
59 }
60
61 /// Computes a scalar multiply-add of the single-precision values in the
62 /// low 32 bits of 128-bit vectors of [4 x float].
63 /// \code
64 /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
65 /// result[127:32] = __A[127:32]
66 /// \endcode
67 ///
68 /// \headerfile <immintrin.h>
69 ///
70 /// This intrinsic corresponds to the \c VFMADD213SS instruction.
71 ///
72 /// \param __A
73 /// A 128-bit vector of [4 x float] containing the multiplicand in the low
74 /// 32 bits.
75 /// \param __B
76 /// A 128-bit vector of [4 x float] containing the multiplier in the low
77 /// 32 bits.
78 /// \param __C
79 /// A 128-bit vector of [4 x float] containing the addend in the low
80 /// 32 bits.
81 /// \returns A 128-bit vector of [4 x float] containing the result in the low
82 /// 32 bits and a copy of \a __A[127:32] in the upper 96 bits.
83 static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fmadd_ss(__m128 __A,__m128 __B,__m128 __C)84 _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
85 {
86 return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
87 }
88
89 /// Computes a scalar multiply-add of the double-precision values in the
90 /// low 64 bits of 128-bit vectors of [2 x double].
91 /// \code
92 /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
93 /// result[127:64] = __A[127:64]
94 /// \endcode
95 ///
96 /// \headerfile <immintrin.h>
97 ///
98 /// This intrinsic corresponds to the \c VFMADD213SD instruction.
99 ///
100 /// \param __A
101 /// A 128-bit vector of [2 x double] containing the multiplicand in the low
102 /// 64 bits.
103 /// \param __B
104 /// A 128-bit vector of [2 x double] containing the multiplier in the low
105 /// 64 bits.
106 /// \param __C
107 /// A 128-bit vector of [2 x double] containing the addend in the low
108 /// 64 bits.
109 /// \returns A 128-bit vector of [2 x double] containing the result in the low
110 /// 64 bits and a copy of \a __A[127:64] in the upper 64 bits.
111 static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fmadd_sd(__m128d __A,__m128d __B,__m128d __C)112 _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
113 {
114 return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
115 }
116
117 /// Computes a multiply-subtract of 128-bit vectors of [4 x float].
118 /// For each element, computes <c> (__A * __B) - __C </c>.
119 ///
120 /// \headerfile <immintrin.h>
121 ///
122 /// This intrinsic corresponds to the \c VFMSUB213PS instruction.
123 ///
124 /// \param __A
125 /// A 128-bit vector of [4 x float] containing the multiplicand.
126 /// \param __B
127 /// A 128-bit vector of [4 x float] containing the multiplier.
128 /// \param __C
129 /// A 128-bit vector of [4 x float] containing the subtrahend.
130 /// \returns A 128-bit vector of [4 x float] containing the result.
131 static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fmsub_ps(__m128 __A,__m128 __B,__m128 __C)132 _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
133 {
134 return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
135 }
136
137 /// Computes a multiply-subtract of 128-bit vectors of [2 x double].
138 /// For each element, computes <c> (__A * __B) - __C </c>.
139 ///
140 /// \headerfile <immintrin.h>
141 ///
142 /// This intrinsic corresponds to the \c VFMSUB213PD instruction.
143 ///
144 /// \param __A
145 /// A 128-bit vector of [2 x double] containing the multiplicand.
146 /// \param __B
147 /// A 128-bit vector of [2 x double] containing the multiplier.
148 /// \param __C
149 /// A 128-bit vector of [2 x double] containing the addend.
150 /// \returns A 128-bit vector of [2 x double] containing the result.
151 static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fmsub_pd(__m128d __A,__m128d __B,__m128d __C)152 _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
153 {
154 return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
155 }
156
157 /// Computes a scalar multiply-subtract of the single-precision values in
158 /// the low 32 bits of 128-bit vectors of [4 x float].
159 /// \code
160 /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
161 /// result[127:32] = __A[127:32]
162 /// \endcode
163 ///
164 /// \headerfile <immintrin.h>
165 ///
166 /// This intrinsic corresponds to the \c VFMSUB213SS instruction.
167 ///
168 /// \param __A
169 /// A 128-bit vector of [4 x float] containing the multiplicand in the low
170 /// 32 bits.
171 /// \param __B
172 /// A 128-bit vector of [4 x float] containing the multiplier in the low
173 /// 32 bits.
174 /// \param __C
175 /// A 128-bit vector of [4 x float] containing the subtrahend in the low
176 /// 32 bits.
177 /// \returns A 128-bit vector of [4 x float] containing the result in the low
178 /// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
179 static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fmsub_ss(__m128 __A,__m128 __B,__m128 __C)180 _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
181 {
182 return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
183 }
184
185 /// Computes a scalar multiply-subtract of the double-precision values in
186 /// the low 64 bits of 128-bit vectors of [2 x double].
187 /// \code
188 /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
189 /// result[127:64] = __A[127:64]
190 /// \endcode
191 ///
192 /// \headerfile <immintrin.h>
193 ///
194 /// This intrinsic corresponds to the \c VFMSUB213SD instruction.
195 ///
196 /// \param __A
197 /// A 128-bit vector of [2 x double] containing the multiplicand in the low
198 /// 64 bits.
199 /// \param __B
200 /// A 128-bit vector of [2 x double] containing the multiplier in the low
201 /// 64 bits.
202 /// \param __C
203 /// A 128-bit vector of [2 x double] containing the subtrahend in the low
204 /// 64 bits.
205 /// \returns A 128-bit vector of [2 x double] containing the result in the low
206 /// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
207 static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fmsub_sd(__m128d __A,__m128d __B,__m128d __C)208 _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
209 {
210 return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
211 }
212
213 /// Computes a negated multiply-add of 128-bit vectors of [4 x float].
214 /// For each element, computes <c> -(__A * __B) + __C </c>.
215 ///
216 /// \headerfile <immintrin.h>
217 ///
218 /// This intrinsic corresponds to the \c VFNMADD213DPS instruction.
219 ///
220 /// \param __A
221 /// A 128-bit vector of [4 x float] containing the multiplicand.
222 /// \param __B
223 /// A 128-bit vector of [4 x float] containing the multiplier.
224 /// \param __C
225 /// A 128-bit vector of [4 x float] containing the addend.
226 /// \returns A 128-bit [4 x float] vector containing the result.
227 static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fnmadd_ps(__m128 __A,__m128 __B,__m128 __C)228 _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
229 {
230 return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
231 }
232
233 /// Computes a negated multiply-add of 128-bit vectors of [2 x double].
234 /// For each element, computes <c> -(__A * __B) + __C </c>.
235 ///
236 /// \headerfile <immintrin.h>
237 ///
238 /// This intrinsic corresponds to the \c VFNMADD213PD instruction.
239 ///
240 /// \param __A
241 /// A 128-bit vector of [2 x double] containing the multiplicand.
242 /// \param __B
243 /// A 128-bit vector of [2 x double] containing the multiplier.
244 /// \param __C
245 /// A 128-bit vector of [2 x double] containing the addend.
246 /// \returns A 128-bit vector of [2 x double] containing the result.
247 static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fnmadd_pd(__m128d __A,__m128d __B,__m128d __C)248 _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
249 {
250 return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
251 }
252
253 /// Computes a scalar negated multiply-add of the single-precision values in
254 /// the low 32 bits of 128-bit vectors of [4 x float].
255 /// \code
256 /// result[31:0] = -(__A[31:0] * __B[31:0]) + __C[31:0]
257 /// result[127:32] = __A[127:32]
258 /// \endcode
259 ///
260 /// \headerfile <immintrin.h>
261 ///
262 /// This intrinsic corresponds to the \c VFNMADD213SS instruction.
263 ///
264 /// \param __A
265 /// A 128-bit vector of [4 x float] containing the multiplicand in the low
266 /// 32 bits.
267 /// \param __B
268 /// A 128-bit vector of [4 x float] containing the multiplier in the low
269 /// 32 bits.
270 /// \param __C
271 /// A 128-bit vector of [4 x float] containing the addend in the low
272 /// 32 bits.
273 /// \returns A 128-bit vector of [4 x float] containing the result in the low
274 /// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
275 static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fnmadd_ss(__m128 __A,__m128 __B,__m128 __C)276 _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
277 {
278 return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
279 }
280
281 /// Computes a scalar negated multiply-add of the double-precision values
282 /// in the low 64 bits of 128-bit vectors of [2 x double].
283 /// \code
284 /// result[63:0] = -(__A[63:0] * __B[63:0]) + __C[63:0]
285 /// result[127:64] = __A[127:64]
286 /// \endcode
287 ///
288 /// \headerfile <immintrin.h>
289 ///
290 /// This intrinsic corresponds to the \c VFNMADD213SD instruction.
291 ///
292 /// \param __A
293 /// A 128-bit vector of [2 x double] containing the multiplicand in the low
294 /// 64 bits.
295 /// \param __B
296 /// A 128-bit vector of [2 x double] containing the multiplier in the low
297 /// 64 bits.
298 /// \param __C
299 /// A 128-bit vector of [2 x double] containing the addend in the low
300 /// 64 bits.
301 /// \returns A 128-bit vector of [2 x double] containing the result in the low
302 /// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
303 static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fnmadd_sd(__m128d __A,__m128d __B,__m128d __C)304 _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
305 {
306 return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
307 }
308
309 /// Computes a negated multiply-subtract of 128-bit vectors of [4 x float].
310 /// For each element, computes <c> -(__A * __B) - __C </c>.
311 ///
312 /// \headerfile <immintrin.h>
313 ///
314 /// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
315 ///
316 /// \param __A
317 /// A 128-bit vector of [4 x float] containing the multiplicand.
318 /// \param __B
319 /// A 128-bit vector of [4 x float] containing the multiplier.
320 /// \param __C
321 /// A 128-bit vector of [4 x float] containing the subtrahend.
322 /// \returns A 128-bit vector of [4 x float] containing the result.
323 static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fnmsub_ps(__m128 __A,__m128 __B,__m128 __C)324 _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
325 {
326 return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
327 }
328
329 /// Computes a negated multiply-subtract of 128-bit vectors of [2 x double].
330 /// For each element, computes <c> -(__A * __B) - __C </c>.
331 ///
332 /// \headerfile <immintrin.h>
333 ///
334 /// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
335 ///
336 /// \param __A
337 /// A 128-bit vector of [2 x double] containing the multiplicand.
338 /// \param __B
339 /// A 128-bit vector of [2 x double] containing the multiplier.
340 /// \param __C
341 /// A 128-bit vector of [2 x double] containing the subtrahend.
342 /// \returns A 128-bit vector of [2 x double] containing the result.
343 static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fnmsub_pd(__m128d __A,__m128d __B,__m128d __C)344 _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
345 {
346 return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
347 }
348
349 /// Computes a scalar negated multiply-subtract of the single-precision
350 /// values in the low 32 bits of 128-bit vectors of [4 x float].
351 /// \code
352 /// result[31:0] = -(__A[31:0] * __B[31:0]) - __C[31:0]
353 /// result[127:32] = __A[127:32]
354 /// \endcode
355 ///
356 /// \headerfile <immintrin.h>
357 ///
358 /// This intrinsic corresponds to the \c VFNMSUB213SS instruction.
359 ///
360 /// \param __A
361 /// A 128-bit vector of [4 x float] containing the multiplicand in the low
362 /// 32 bits.
363 /// \param __B
364 /// A 128-bit vector of [4 x float] containing the multiplier in the low
365 /// 32 bits.
366 /// \param __C
367 /// A 128-bit vector of [4 x float] containing the subtrahend in the low
368 /// 32 bits.
369 /// \returns A 128-bit vector of [4 x float] containing the result in the low
370 /// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
371 static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fnmsub_ss(__m128 __A,__m128 __B,__m128 __C)372 _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
373 {
374 return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
375 }
376
377 /// Computes a scalar negated multiply-subtract of the double-precision
378 /// values in the low 64 bits of 128-bit vectors of [2 x double].
379 /// \code
380 /// result[63:0] = -(__A[63:0] * __B[63:0]) - __C[63:0]
381 /// result[127:64] = __A[127:64]
382 /// \endcode
383 ///
384 /// \headerfile <immintrin.h>
385 ///
386 /// This intrinsic corresponds to the \c VFNMSUB213SD instruction.
387 ///
388 /// \param __A
389 /// A 128-bit vector of [2 x double] containing the multiplicand in the low
390 /// 64 bits.
391 /// \param __B
392 /// A 128-bit vector of [2 x double] containing the multiplier in the low
393 /// 64 bits.
394 /// \param __C
395 /// A 128-bit vector of [2 x double] containing the subtrahend in the low
396 /// 64 bits.
397 /// \returns A 128-bit vector of [2 x double] containing the result in the low
398 /// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
399 static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fnmsub_sd(__m128d __A,__m128d __B,__m128d __C)400 _mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
401 {
402 return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
403 }
404
405 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
406 /// [4 x float].
407 /// \code
408 /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
409 /// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
410 /// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
411 /// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
412 /// \endcode
413 ///
414 /// \headerfile <immintrin.h>
415 ///
416 /// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
417 ///
418 /// \param __A
419 /// A 128-bit vector of [4 x float] containing the multiplicand.
420 /// \param __B
421 /// A 128-bit vector of [4 x float] containing the multiplier.
422 /// \param __C
423 /// A 128-bit vector of [4 x float] containing the addend/subtrahend.
424 /// \returns A 128-bit vector of [4 x float] containing the result.
425 static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fmaddsub_ps(__m128 __A,__m128 __B,__m128 __C)426 _mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
427 {
428 return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
429 }
430
431 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
432 /// [2 x double].
433 /// \code
434 /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
435 /// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
436 /// \endcode
437 ///
438 /// \headerfile <immintrin.h>
439 ///
440 /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
441 ///
442 /// \param __A
443 /// A 128-bit vector of [2 x double] containing the multiplicand.
444 /// \param __B
445 /// A 128-bit vector of [2 x double] containing the multiplier.
446 /// \param __C
447 /// A 128-bit vector of [2 x double] containing the addend/subtrahend.
448 /// \returns A 128-bit vector of [2 x double] containing the result.
449 static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fmaddsub_pd(__m128d __A,__m128d __B,__m128d __C)450 _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
451 {
452 return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
453 }
454
455 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
456 /// [4 x float].
457 /// \code
458 /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
459 /// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
460 /// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
461 /// result[127:96 = (__A[127:96] * __B[127:96]) - __C[127:96]
462 /// \endcode
463 ///
464 /// \headerfile <immintrin.h>
465 ///
466 /// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
467 ///
468 /// \param __A
469 /// A 128-bit vector of [4 x float] containing the multiplicand.
470 /// \param __B
471 /// A 128-bit vector of [4 x float] containing the multiplier.
472 /// \param __C
473 /// A 128-bit vector of [4 x float] containing the addend/subtrahend.
474 /// \returns A 128-bit vector of [4 x float] containing the result.
475 static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fmsubadd_ps(__m128 __A,__m128 __B,__m128 __C)476 _mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
477 {
478 return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
479 }
480
481 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
482 /// [2 x double].
483 /// \code
484 /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
485 /// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
486 /// \endcode
487 ///
488 /// \headerfile <immintrin.h>
489 ///
490 /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
491 ///
492 /// \param __A
493 /// A 128-bit vector of [2 x double] containing the multiplicand.
494 /// \param __B
495 /// A 128-bit vector of [2 x double] containing the multiplier.
496 /// \param __C
497 /// A 128-bit vector of [2 x double] containing the addend/subtrahend.
498 /// \returns A 128-bit vector of [2 x double] containing the result.
499 static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fmsubadd_pd(__m128d __A,__m128d __B,__m128d __C)500 _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
501 {
502 return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
503 }
504
505 /// Computes a multiply-add of 256-bit vectors of [8 x float].
506 /// For each element, computes <c> (__A * __B) + __C </c>.
507 ///
508 /// \headerfile <immintrin.h>
509 ///
510 /// This intrinsic corresponds to the \c VFMADD213PS instruction.
511 ///
512 /// \param __A
513 /// A 256-bit vector of [8 x float] containing the multiplicand.
514 /// \param __B
515 /// A 256-bit vector of [8 x float] containing the multiplier.
516 /// \param __C
517 /// A 256-bit vector of [8 x float] containing the addend.
518 /// \returns A 256-bit vector of [8 x float] containing the result.
519 static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_fmadd_ps(__m256 __A,__m256 __B,__m256 __C)520 _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
521 {
522 return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
523 }
524
525 /// Computes a multiply-add of 256-bit vectors of [4 x double].
526 /// For each element, computes <c> (__A * __B) + __C </c>.
527 ///
528 /// \headerfile <immintrin.h>
529 ///
530 /// This intrinsic corresponds to the \c VFMADD213PD instruction.
531 ///
532 /// \param __A
533 /// A 256-bit vector of [4 x double] containing the multiplicand.
534 /// \param __B
535 /// A 256-bit vector of [4 x double] containing the multiplier.
536 /// \param __C
537 /// A 256-bit vector of [4 x double] containing the addend.
538 /// \returns A 256-bit vector of [4 x double] containing the result.
539 static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_fmadd_pd(__m256d __A,__m256d __B,__m256d __C)540 _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
541 {
542 return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
543 }
544
545 /// Computes a multiply-subtract of 256-bit vectors of [8 x float].
546 /// For each element, computes <c> (__A * __B) - __C </c>.
547 ///
548 /// \headerfile <immintrin.h>
549 ///
550 /// This intrinsic corresponds to the \c VFMSUB213PS instruction.
551 ///
552 /// \param __A
553 /// A 256-bit vector of [8 x float] containing the multiplicand.
554 /// \param __B
555 /// A 256-bit vector of [8 x float] containing the multiplier.
556 /// \param __C
557 /// A 256-bit vector of [8 x float] containing the subtrahend.
558 /// \returns A 256-bit vector of [8 x float] containing the result.
559 static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_fmsub_ps(__m256 __A,__m256 __B,__m256 __C)560 _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
561 {
562 return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
563 }
564
565 /// Computes a multiply-subtract of 256-bit vectors of [4 x double].
566 /// For each element, computes <c> (__A * __B) - __C </c>.
567 ///
568 /// \headerfile <immintrin.h>
569 ///
570 /// This intrinsic corresponds to the \c VFMSUB213PD instruction.
571 ///
572 /// \param __A
573 /// A 256-bit vector of [4 x double] containing the multiplicand.
574 /// \param __B
575 /// A 256-bit vector of [4 x double] containing the multiplier.
576 /// \param __C
577 /// A 256-bit vector of [4 x double] containing the subtrahend.
578 /// \returns A 256-bit vector of [4 x double] containing the result.
579 static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_fmsub_pd(__m256d __A,__m256d __B,__m256d __C)580 _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
581 {
582 return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
583 }
584
585 /// Computes a negated multiply-add of 256-bit vectors of [8 x float].
586 /// For each element, computes <c> -(__A * __B) + __C </c>.
587 ///
588 /// \headerfile <immintrin.h>
589 ///
590 /// This intrinsic corresponds to the \c VFNMADD213PS instruction.
591 ///
592 /// \param __A
593 /// A 256-bit vector of [8 x float] containing the multiplicand.
594 /// \param __B
595 /// A 256-bit vector of [8 x float] containing the multiplier.
596 /// \param __C
597 /// A 256-bit vector of [8 x float] containing the addend.
598 /// \returns A 256-bit vector of [8 x float] containing the result.
599 static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_fnmadd_ps(__m256 __A,__m256 __B,__m256 __C)600 _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
601 {
602 return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
603 }
604
605 /// Computes a negated multiply-add of 256-bit vectors of [4 x double].
606 /// For each element, computes <c> -(__A * __B) + __C </c>.
607 ///
608 /// \headerfile <immintrin.h>
609 ///
610 /// This intrinsic corresponds to the \c VFNMADD213PD instruction.
611 ///
612 /// \param __A
613 /// A 256-bit vector of [4 x double] containing the multiplicand.
614 /// \param __B
615 /// A 256-bit vector of [4 x double] containing the multiplier.
616 /// \param __C
617 /// A 256-bit vector of [4 x double] containing the addend.
618 /// \returns A 256-bit vector of [4 x double] containing the result.
619 static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_fnmadd_pd(__m256d __A,__m256d __B,__m256d __C)620 _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
621 {
622 return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
623 }
624
625 /// Computes a negated multiply-subtract of 256-bit vectors of [8 x float].
626 /// For each element, computes <c> -(__A * __B) - __C </c>.
627 ///
628 /// \headerfile <immintrin.h>
629 ///
630 /// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
631 ///
632 /// \param __A
633 /// A 256-bit vector of [8 x float] containing the multiplicand.
634 /// \param __B
635 /// A 256-bit vector of [8 x float] containing the multiplier.
636 /// \param __C
637 /// A 256-bit vector of [8 x float] containing the subtrahend.
638 /// \returns A 256-bit vector of [8 x float] containing the result.
639 static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_fnmsub_ps(__m256 __A,__m256 __B,__m256 __C)640 _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
641 {
642 return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
643 }
644
645 /// Computes a negated multiply-subtract of 256-bit vectors of [4 x double].
646 /// For each element, computes <c> -(__A * __B) - __C </c>.
647 ///
648 /// \headerfile <immintrin.h>
649 ///
650 /// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
651 ///
652 /// \param __A
653 /// A 256-bit vector of [4 x double] containing the multiplicand.
654 /// \param __B
655 /// A 256-bit vector of [4 x double] containing the multiplier.
656 /// \param __C
657 /// A 256-bit vector of [4 x double] containing the subtrahend.
658 /// \returns A 256-bit vector of [4 x double] containing the result.
659 static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_fnmsub_pd(__m256d __A,__m256d __B,__m256d __C)660 _mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
661 {
662 return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
663 }
664
665 /// Computes a multiply with alternating add/subtract of 256-bit vectors of
666 /// [8 x float].
667 /// \code
668 /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
669 /// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
670 /// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
671 /// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
672 /// result[159:128] = (__A[159:128] * __B[159:128]) - __C[159:128]
673 /// result[191:160] = (__A[191:160] * __B[191:160]) + __C[191:160]
674 /// result[223:192] = (__A[223:192] * __B[223:192]) - __C[223:192]
675 /// result[255:224] = (__A[255:224] * __B[255:224]) + __C[255:224]
676 /// \endcode
677 ///
678 /// \headerfile <immintrin.h>
679 ///
680 /// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
681 ///
682 /// \param __A
683 /// A 256-bit vector of [8 x float] containing the multiplicand.
684 /// \param __B
685 /// A 256-bit vector of [8 x float] containing the multiplier.
686 /// \param __C
687 /// A 256-bit vector of [8 x float] containing the addend/subtrahend.
688 /// \returns A 256-bit vector of [8 x float] containing the result.
689 static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_fmaddsub_ps(__m256 __A,__m256 __B,__m256 __C)690 _mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
691 {
692 return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
693 }
694
695 /// Computes a multiply with alternating add/subtract of 256-bit vectors of
696 /// [4 x double].
697 /// \code
698 /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
699 /// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
700 /// result[191:128] = (__A[191:128] * __B[191:128]) - __C[191:128]
701 /// result[255:192] = (__A[255:192] * __B[255:192]) + __C[255:192]
702 /// \endcode
703 ///
704 /// \headerfile <immintrin.h>
705 ///
706 /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
707 ///
708 /// \param __A
709 /// A 256-bit vector of [4 x double] containing the multiplicand.
710 /// \param __B
711 /// A 256-bit vector of [4 x double] containing the multiplier.
712 /// \param __C
713 /// A 256-bit vector of [4 x double] containing the addend/subtrahend.
714 /// \returns A 256-bit vector of [4 x double] containing the result.
715 static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_fmaddsub_pd(__m256d __A,__m256d __B,__m256d __C)716 _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
717 {
718 return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
719 }
720
721 /// Computes a vector multiply with alternating add/subtract of 256-bit
722 /// vectors of [8 x float].
723 /// \code
724 /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
725 /// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
726 /// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
727 /// result[127:96] = (__A[127:96] * __B[127:96]) - __C[127:96]
728 /// result[159:128] = (__A[159:128] * __B[159:128]) + __C[159:128]
729 /// result[191:160] = (__A[191:160] * __B[191:160]) - __C[191:160]
730 /// result[223:192] = (__A[223:192] * __B[223:192]) + __C[223:192]
731 /// result[255:224] = (__A[255:224] * __B[255:224]) - __C[255:224]
732 /// \endcode
733 ///
734 /// \headerfile <immintrin.h>
735 ///
736 /// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
737 ///
738 /// \param __A
739 /// A 256-bit vector of [8 x float] containing the multiplicand.
740 /// \param __B
741 /// A 256-bit vector of [8 x float] containing the multiplier.
742 /// \param __C
743 /// A 256-bit vector of [8 x float] containing the addend/subtrahend.
744 /// \returns A 256-bit vector of [8 x float] containing the result.
745 static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_fmsubadd_ps(__m256 __A,__m256 __B,__m256 __C)746 _mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
747 {
748 return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
749 }
750
751 /// Computes a vector multiply with alternating add/subtract of 256-bit
752 /// vectors of [4 x double].
753 /// \code
754 /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
755 /// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
756 /// result[191:128] = (__A[191:128] * __B[191:128]) + __C[191:128]
757 /// result[255:192] = (__A[255:192] * __B[255:192]) - __C[255:192]
758 /// \endcode
759 ///
760 /// \headerfile <immintrin.h>
761 ///
762 /// This intrinsic corresponds to the \c VFMSUBADD213PD instruction.
763 ///
764 /// \param __A
765 /// A 256-bit vector of [4 x double] containing the multiplicand.
766 /// \param __B
767 /// A 256-bit vector of [4 x double] containing the multiplier.
768 /// \param __C
769 /// A 256-bit vector of [4 x double] containing the addend/subtrahend.
770 /// \returns A 256-bit vector of [4 x double] containing the result.
771 static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_fmsubadd_pd(__m256d __A,__m256d __B,__m256d __C)772 _mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
773 {
774 return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
775 }
776
777 #undef __DEFAULT_FN_ATTRS128
778 #undef __DEFAULT_FN_ATTRS256
779
780 #endif /* __FMAINTRIN_H */
781