1 /*===----------- avxvnniint16intrin.h - AVXVNNIINT16 intrinsics-------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 #ifndef __IMMINTRIN_H
11 #error                                                                         \
12     "Never use <avxvnniint16intrin.h> directly; include <immintrin.h> instead."
13 #endif // __IMMINTRIN_H
14 
15 #ifndef __AVXVNNIINT16INTRIN_H
16 #define __AVXVNNIINT16INTRIN_H
17 
/* Attribute sets applied to every intrinsic defined in this file. Both request
 * forced inlining, no debug info, and the "avxvnniint16" target feature; they
 * differ only in the minimum vector width (128 or 256) that they declare. */
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avxvnniint16"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avxvnniint16"), __min_vector_width__(256)))
25 
26 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
27 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
28 ///    signed 16-bit results. Sum these 2 results with the corresponding
29 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
30 ///
31 /// \headerfile <immintrin.h>
32 ///
33 /// \code
34 /// __m128i _mm_dpwsud_epi32(__m128i __W, __m128i __A, __m128i __B)
35 /// \endcode
36 ///
37 /// This intrinsic corresponds to the \c VPDPWSUD instruction.
38 ///
39 /// \param __W
40 ///    A 128-bit vector of [4 x int].
41 /// \param __A
42 ///    A 128-bit vector of [8 x short].
43 /// \param __B
44 ///    A 128-bit vector of [8 x unsigned short].
45 /// \returns
46 ///    A 128-bit vector of [4 x int].
47 ///
48 /// \code{.operation}
49 /// FOR j := 0 to 3
50 /// 	tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
51 /// 	tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
52 /// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
53 /// ENDFOR
54 /// dst[MAX:128] := 0
55 /// \endcode
56 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsud_epi32(__m128i __W,
57                                                                  __m128i __A,
58                                                                  __m128i __B) {
59   return (__m128i)__builtin_ia32_vpdpwsud128((__v4si)__W, (__v4si)__A,
60                                              (__v4si)__B);
61 }
62 
63 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
64 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
65 ///    signed 16-bit results. Sum these 2 results with the corresponding
66 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
67 ///
68 /// \headerfile <immintrin.h>
69 ///
70 /// \code
71 /// __m256i _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B)
72 /// \endcode
73 ///
74 /// This intrinsic corresponds to the \c VPDPWSUD instruction.
75 ///
76 /// \param __W
77 ///    A 256-bit vector of [8 x int].
78 /// \param __A
79 ///    A 256-bit vector of [16 x short].
80 /// \param __B
81 ///    A 256-bit vector of [16 x unsigned short].
82 /// \returns
83 ///    A 256-bit vector of [8 x int].
84 ///
85 /// \code{.operation}
86 /// FOR j := 0 to 7
87 /// 	tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
88 /// 	tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
89 /// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
90 /// ENDFOR
91 /// dst[MAX:256] := 0
92 /// \endcode
93 static __inline__ __m256i __DEFAULT_FN_ATTRS256
94 _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
95   return (__m256i)__builtin_ia32_vpdpwsud256((__v8si)__W, (__v8si)__A,
96                                              (__v8si)__B);
97 }
98 
99 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
100 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
101 ///    signed 16-bit results. Sum these 2 results with the corresponding
102 ///    32-bit integer in \a __W with signed saturation, and store the packed
103 ///    32-bit results in \a dst.
104 ///
105 /// \headerfile <immintrin.h>
106 ///
107 /// \code
108 /// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B)
109 /// \endcode
110 ///
111 /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
112 ///
113 /// \param __W
114 ///    A 128-bit vector of [4 x int].
115 /// \param __A
116 ///    A 128-bit vector of [8 x short].
117 /// \param __B
118 ///    A 128-bit vector of [8 x unsigned short].
119 /// \returns
120 ///    A 128-bit vector of [4 x int].
121 ///
122 /// \code{.operation}
123 /// FOR j := 0 to 3
124 /// 	tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
125 /// 	tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
126 /// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
127 /// ENDFOR
128 /// dst[MAX:128] := 0
129 /// \endcode
130 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsuds_epi32(__m128i __W,
131                                                                   __m128i __A,
132                                                                   __m128i __B) {
133   return (__m128i)__builtin_ia32_vpdpwsuds128((__v4si)__W, (__v4si)__A,
134                                               (__v4si)__B);
135 }
136 
137 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
138 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
139 ///    signed 16-bit results. Sum these 2 results with the corresponding
140 ///    32-bit integer in \a __W with signed saturation, and store the packed
141 ///    32-bit results in \a dst.
142 ///
143 /// \headerfile <immintrin.h>
144 ///
145 /// \code
146 /// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B)
147 /// \endcode
148 ///
149 /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
150 ///
151 /// \param __W
152 ///    A 256-bit vector of [8 x int].
153 /// \param __A
154 ///    A 256-bit vector of [16 x short].
155 /// \param __B
156 ///    A 256-bit vector of [16 x unsigned short].
157 /// \returns
158 ///    A 256-bit vector of [8 x int].
159 ///
160 /// \code{.operation}
161 /// FOR j := 0 to 7
162 /// 	tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
163 /// 	tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
164 /// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
165 /// ENDFOR
166 /// dst[MAX:256] := 0
167 /// \endcode
168 static __inline__ __m256i __DEFAULT_FN_ATTRS256
169 _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
170   return (__m256i)__builtin_ia32_vpdpwsuds256((__v8si)__W, (__v8si)__A,
171                                               (__v8si)__B);
172 }
173 
174 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
175 ///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
176 ///    signed 16-bit results. Sum these 2 results with the corresponding
177 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
178 ///
179 /// \headerfile <immintrin.h>
180 ///
181 /// \code
182 /// __m128i _mm_dpbusd_epi32(__m128i __W, __m128i __A, __m128i __B)
183 /// \endcode
184 ///
185 /// This intrinsic corresponds to the \c VPDPWUSD instruction.
186 ///
187 /// \param __W
188 ///    A 128-bit vector of [4 x int].
189 /// \param __A
190 ///    A 128-bit vector of [8 x unsigned short].
191 /// \param __B
192 ///    A 128-bit vector of [8 x short].
193 /// \returns
194 ///    A 128-bit vector of [4 x int].
195 ///
196 /// \code{.operation}
197 /// FOR j := 0 to 3
198 /// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
199 /// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
200 /// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
201 /// ENDFOR
202 /// dst[MAX:128] := 0
203 /// \endcode
204 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusd_epi32(__m128i __W,
205                                                                  __m128i __A,
206                                                                  __m128i __B) {
207   return (__m128i)__builtin_ia32_vpdpwusd128((__v4si)__W, (__v4si)__A,
208                                              (__v4si)__B);
209 }
210 
211 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
212 ///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
213 ///    signed 16-bit results. Sum these 2 results with the corresponding
214 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
215 ///
216 /// \headerfile <immintrin.h>
217 ///
218 /// \code
219 /// __m256i _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B)
220 /// \endcode
221 ///
222 /// This intrinsic corresponds to the \c VPDPWUSD instruction.
223 ///
224 /// \param __W
225 ///    A 256-bit vector of [8 x int].
226 /// \param __A
227 ///    A 256-bit vector of [16 x unsigned short].
228 /// \param __B
229 ///    A 256-bit vector of [16 x short].
230 /// \returns
231 ///    A 256-bit vector of [8 x int].
232 ///
233 /// \code{.operation}
234 /// FOR j := 0 to 7
235 /// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
236 /// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
237 /// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
238 /// ENDFOR
239 /// dst[MAX:256] := 0
240 /// \endcode
241 static __inline__ __m256i __DEFAULT_FN_ATTRS256
242 _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B) {
243   return (__m256i)__builtin_ia32_vpdpwusd256((__v8si)__W, (__v8si)__A,
244                                              (__v8si)__B);
245 }
246 
247 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
248 ///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
249 ///    signed 16-bit results. Sum these 2 results with the corresponding
250 ///    32-bit integer in \a __W with signed saturation, and store the packed
251 ///    32-bit results in \a dst.
252 ///
253 /// \headerfile <immintrin.h>
254 ///
255 /// \code
256 /// __m128i _mm_dpwusds_epi32(__m128i __W, __m128i __A, __m128i __B)
257 /// \endcode
258 ///
259 /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
260 ///
261 /// \param __W
262 ///    A 128-bit vector of [4 x int].
263 /// \param __A
264 ///    A 128-bit vector of [8 x unsigned short].
265 /// \param __B
266 ///    A 128-bit vector of [8 x short].
267 /// \returns
268 ///    A 128-bit vector of [4 x int].
269 ///
270 /// \code{.operation}
271 /// FOR j := 0 to 3
272 /// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
273 /// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
274 /// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
275 /// ENDFOR
276 /// dst[MAX:128] := 0
277 /// \endcode
278 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusds_epi32(__m128i __W,
279                                                                   __m128i __A,
280                                                                   __m128i __B) {
281   return (__m128i)__builtin_ia32_vpdpwusds128((__v4si)__W, (__v4si)__A,
282                                               (__v4si)__B);
283 }
284 
285 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
286 ///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
287 ///    signed 16-bit results. Sum these 2 results with the corresponding
288 ///    32-bit integer in \a __W with signed saturation, and store the packed
289 ///    32-bit results in \a dst.
290 ///
291 /// \headerfile <immintrin.h>
292 ///
293 /// \code
294 /// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B)
295 /// \endcode
296 ///
297 /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
298 ///
299 /// \param __W
300 ///    A 256-bit vector of [8 x int].
301 /// \param __A
302 ///    A 256-bit vector of [16 x unsigned short].
303 /// \param __B
304 ///    A 256-bit vector of [16 x short].
305 /// \returns
306 ///    A 256-bit vector of [8 x int].
307 ///
308 /// \code{.operation}
309 /// FOR j := 0 to 7
310 /// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
311 /// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
312 /// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
313 /// ENDFOR
314 /// dst[MAX:256] := 0
315 /// \endcode
316 static __inline__ __m256i __DEFAULT_FN_ATTRS256
317 _mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B) {
318   return (__m256i)__builtin_ia32_vpdpwusds256((__v8si)__W, (__v8si)__A,
319                                               (__v8si)__B);
320 }
321 
322 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
323 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
324 ///    signed 16-bit results. Sum these 2 results with the corresponding
325 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
326 ///
327 /// \headerfile <immintrin.h>
328 ///
329 /// \code
330 /// __m128i _mm_dpwuud_epi32(__m128i __W, __m128i __A, __m128i __B)
331 /// \endcode
332 ///
333 /// This intrinsic corresponds to the \c VPDPWUUD instruction.
334 ///
335 /// \param __W
336 ///    A 128-bit vector of [4 x unsigned int].
337 /// \param __A
338 ///    A 128-bit vector of [8 x unsigned short].
339 /// \param __B
340 ///    A 128-bit vector of [8 x unsigned short].
341 /// \returns
342 ///    A 128-bit vector of [4 x unsigned int].
343 ///
344 /// \code{.operation}
345 /// FOR j := 0 to 3
346 /// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
347 /// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
348 /// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
349 /// ENDFOR
350 /// dst[MAX:128] := 0
351 /// \endcode
352 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuud_epi32(__m128i __W,
353                                                                  __m128i __A,
354                                                                  __m128i __B) {
355   return (__m128i)__builtin_ia32_vpdpwuud128((__v4si)__W, (__v4si)__A,
356                                              (__v4si)__B);
357 }
358 
359 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
360 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
361 ///    signed 16-bit results. Sum these 2 results with the corresponding
362 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
363 ///
364 /// \headerfile <immintrin.h>
365 ///
366 /// \code
367 /// __m256i _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B)
368 /// \endcode
369 ///
370 /// This intrinsic corresponds to the \c VPDPWUUD instruction.
371 ///
372 /// \param __W
373 ///    A 256-bit vector of [8 x unsigned int].
374 /// \param __A
375 ///    A 256-bit vector of [16 x unsigned short].
376 /// \param __B
377 ///    A 256-bit vector of [16 x unsigned short].
378 /// \returns
379 ///    A 256-bit vector of [8 x unsigned int].
380 ///
381 /// \code{.operation}
382 /// FOR j := 0 to 7
383 /// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
384 /// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
385 /// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
386 /// ENDFOR
387 /// dst[MAX:256] := 0
388 /// \endcode
389 static __inline__ __m256i __DEFAULT_FN_ATTRS256
390 _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
391   return (__m256i)__builtin_ia32_vpdpwuud256((__v8si)__W, (__v8si)__A,
392                                              (__v8si)__B);
393 }
394 
395 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
396 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
397 ///    signed 16-bit results. Sum these 2 results with the corresponding
398 ///    32-bit integer in \a __W with signed saturation, and store the packed
399 ///    32-bit results in \a dst.
400 ///
401 /// \headerfile <immintrin.h>
402 ///
403 /// \code
404 /// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B)
405 /// \endcode
406 ///
407 /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
408 ///
409 /// \param __W
410 ///    A 128-bit vector of [4 x unsigned int].
411 /// \param __A
412 ///    A 128-bit vector of [8 x unsigned short].
413 /// \param __B
414 ///    A 128-bit vector of [8 x unsigned short].
415 /// \returns
416 ///    A 128-bit vector of [4 x unsigned int].
417 ///
418 /// \code{.operation}
419 /// FOR j := 0 to 3
420 /// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
421 /// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
422 /// 	dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
423 /// ENDFOR
424 /// dst[MAX:128] := 0
425 /// \endcode
426 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuuds_epi32(__m128i __W,
427                                                                   __m128i __A,
428                                                                   __m128i __B) {
429   return (__m128i)__builtin_ia32_vpdpwuuds128((__v4si)__W, (__v4si)__A,
430                                               (__v4si)__B);
431 }
432 
433 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
434 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
435 ///    signed 16-bit results. Sum these 2 results with the corresponding
436 ///    32-bit integer in \a __W with signed saturation, and store the packed
437 ///    32-bit results in \a dst.
438 ///
439 /// \headerfile <immintrin.h>
440 ///
441 /// \code
442 /// __m256i _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B)
443 /// \endcode
444 ///
445 /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
446 ///
447 /// \param __W
448 ///    A 256-bit vector of [8 x unsigned int].
449 /// \param __A
450 ///    A 256-bit vector of [16 x unsigned short].
451 /// \param __B
452 ///    A 256-bit vector of [16 x unsigned short].
453 /// \returns
454 ///    A 256-bit vector of [8 x unsigned int].
455 ///
456 /// \code{.operation}
457 /// FOR j := 0 to 7
458 /// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
459 /// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
460 /// 	dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
461 /// ENDFOR
462 /// dst[MAX:256] := 0
463 /// \endcode
464 static __inline__ __m256i __DEFAULT_FN_ATTRS256
465 _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
466   return (__m256i)__builtin_ia32_vpdpwuuds256((__v8si)__W, (__v8si)__A,
467                                               (__v8si)__B);
468 }
469 
470 #undef __DEFAULT_FN_ATTRS128
471 #undef __DEFAULT_FN_ATTRS256
472 
473 #endif // __AVXVNNIINT16INTRIN_H
474