/* Copyright (C) 2013-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
23*38fd1498Szrj 
24*38fd1498Szrj #ifndef _IMMINTRIN_H_INCLUDED
25*38fd1498Szrj #error "Never use <avx512vbmi2intrin.h> directly; include <immintrin.h> instead."
26*38fd1498Szrj #endif
27*38fd1498Szrj 
28*38fd1498Szrj #ifndef __AVX512VBMI2INTRIN_H_INCLUDED
29*38fd1498Szrj #define __AVX512VBMI2INTRIN_H_INCLUDED
30*38fd1498Szrj 
31*38fd1498Szrj #if !defined(__AVX512VBMI2__)
32*38fd1498Szrj #pragma GCC push_options
33*38fd1498Szrj #pragma GCC target("avx512vbmi2")
34*38fd1498Szrj #define __DISABLE_AVX512VBMI2__
35*38fd1498Szrj #endif /* __AVX512VBMI2__ */
36*38fd1498Szrj 
37*38fd1498Szrj #ifdef __OPTIMIZE__
38*38fd1498Szrj extern __inline __m512i
39*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_shrdi_epi16(__m512i __A,__m512i __B,int __C)40*38fd1498Szrj _mm512_shrdi_epi16 (__m512i __A, __m512i __B, int __C)
41*38fd1498Szrj {
42*38fd1498Szrj   return (__m512i) __builtin_ia32_vpshrd_v32hi ((__v32hi)__A, (__v32hi) __B,
43*38fd1498Szrj 									__C);
44*38fd1498Szrj }
45*38fd1498Szrj 
46*38fd1498Szrj extern __inline __m512i
47*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_shrdi_epi32(__m512i __A,__m512i __B,int __C)48*38fd1498Szrj _mm512_shrdi_epi32 (__m512i __A, __m512i __B, int __C)
49*38fd1498Szrj {
50*38fd1498Szrj   return (__m512i) __builtin_ia32_vpshrd_v16si ((__v16si)__A, (__v16si) __B,
51*38fd1498Szrj 									__C);
52*38fd1498Szrj }
53*38fd1498Szrj 
54*38fd1498Szrj extern __inline __m512i
55*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_shrdi_epi32(__m512i __A,__mmask16 __B,__m512i __C,__m512i __D,int __E)56*38fd1498Szrj _mm512_mask_shrdi_epi32 (__m512i __A, __mmask16 __B, __m512i __C, __m512i __D,
57*38fd1498Szrj 								int __E)
58*38fd1498Szrj {
59*38fd1498Szrj   return (__m512i)__builtin_ia32_vpshrd_v16si_mask ((__v16si)__C,
60*38fd1498Szrj 			(__v16si) __D, __E, (__v16si) __A, (__mmask16)__B);
61*38fd1498Szrj }
62*38fd1498Szrj 
63*38fd1498Szrj extern __inline __m512i
64*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_shrdi_epi32(__mmask16 __A,__m512i __B,__m512i __C,int __D)65*38fd1498Szrj _mm512_maskz_shrdi_epi32 (__mmask16 __A, __m512i __B, __m512i __C, int __D)
66*38fd1498Szrj {
67*38fd1498Szrj   return (__m512i)__builtin_ia32_vpshrd_v16si_mask ((__v16si)__B,
68*38fd1498Szrj 	(__v16si) __C, __D, (__v16si) _mm512_setzero_si512 (), (__mmask16)__A);
69*38fd1498Szrj }
70*38fd1498Szrj 
71*38fd1498Szrj extern __inline __m512i
72*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_shrdi_epi64(__m512i __A,__m512i __B,int __C)73*38fd1498Szrj _mm512_shrdi_epi64 (__m512i __A, __m512i __B, int __C)
74*38fd1498Szrj {
75*38fd1498Szrj   return (__m512i) __builtin_ia32_vpshrd_v8di ((__v8di)__A, (__v8di) __B, __C);
76*38fd1498Szrj }
77*38fd1498Szrj 
78*38fd1498Szrj extern __inline __m512i
79*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_shrdi_epi64(__m512i __A,__mmask8 __B,__m512i __C,__m512i __D,int __E)80*38fd1498Szrj _mm512_mask_shrdi_epi64 (__m512i __A, __mmask8 __B, __m512i __C, __m512i __D,
81*38fd1498Szrj 								int __E)
82*38fd1498Szrj {
83*38fd1498Szrj   return (__m512i)__builtin_ia32_vpshrd_v8di_mask ((__v8di)__C, (__v8di) __D,
84*38fd1498Szrj 					__E, (__v8di) __A, (__mmask8)__B);
85*38fd1498Szrj }
86*38fd1498Szrj 
87*38fd1498Szrj extern __inline __m512i
88*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_shrdi_epi64(__mmask8 __A,__m512i __B,__m512i __C,int __D)89*38fd1498Szrj _mm512_maskz_shrdi_epi64 (__mmask8 __A, __m512i __B, __m512i __C, int __D)
90*38fd1498Szrj {
91*38fd1498Szrj   return (__m512i)__builtin_ia32_vpshrd_v8di_mask ((__v8di)__B, (__v8di) __C,
92*38fd1498Szrj 			__D, (__v8di) _mm512_setzero_si512 (), (__mmask8)__A);
93*38fd1498Szrj }
94*38fd1498Szrj 
95*38fd1498Szrj extern __inline __m512i
96*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_shldi_epi16(__m512i __A,__m512i __B,int __C)97*38fd1498Szrj _mm512_shldi_epi16 (__m512i __A, __m512i __B, int __C)
98*38fd1498Szrj {
99*38fd1498Szrj   return (__m512i) __builtin_ia32_vpshld_v32hi ((__v32hi)__A, (__v32hi) __B,
100*38fd1498Szrj 									__C);
101*38fd1498Szrj }
102*38fd1498Szrj 
103*38fd1498Szrj extern __inline __m512i
104*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_shldi_epi32(__m512i __A,__m512i __B,int __C)105*38fd1498Szrj _mm512_shldi_epi32 (__m512i __A, __m512i __B, int __C)
106*38fd1498Szrj {
107*38fd1498Szrj   return (__m512i) __builtin_ia32_vpshld_v16si ((__v16si)__A, (__v16si) __B,
108*38fd1498Szrj 									__C);
109*38fd1498Szrj }
110*38fd1498Szrj 
111*38fd1498Szrj extern __inline __m512i
112*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_shldi_epi32(__m512i __A,__mmask16 __B,__m512i __C,__m512i __D,int __E)113*38fd1498Szrj _mm512_mask_shldi_epi32 (__m512i __A, __mmask16 __B, __m512i __C, __m512i __D,
114*38fd1498Szrj 								int __E)
115*38fd1498Szrj {
116*38fd1498Szrj   return (__m512i)__builtin_ia32_vpshld_v16si_mask ((__v16si)__C,
117*38fd1498Szrj 			(__v16si) __D, __E, (__v16si) __A, (__mmask16)__B);
118*38fd1498Szrj }
119*38fd1498Szrj 
120*38fd1498Szrj extern __inline __m512i
121*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_shldi_epi32(__mmask16 __A,__m512i __B,__m512i __C,int __D)122*38fd1498Szrj _mm512_maskz_shldi_epi32 (__mmask16 __A, __m512i __B, __m512i __C, int __D)
123*38fd1498Szrj {
124*38fd1498Szrj   return (__m512i)__builtin_ia32_vpshld_v16si_mask ((__v16si)__B,
125*38fd1498Szrj 	(__v16si) __C, __D, (__v16si) _mm512_setzero_si512 (), (__mmask16)__A);
126*38fd1498Szrj }
127*38fd1498Szrj 
128*38fd1498Szrj extern __inline __m512i
129*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_shldi_epi64(__m512i __A,__m512i __B,int __C)130*38fd1498Szrj _mm512_shldi_epi64 (__m512i __A, __m512i __B, int __C)
131*38fd1498Szrj {
132*38fd1498Szrj   return (__m512i) __builtin_ia32_vpshld_v8di ((__v8di)__A, (__v8di) __B, __C);
133*38fd1498Szrj }
134*38fd1498Szrj 
135*38fd1498Szrj extern __inline __m512i
136*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_shldi_epi64(__m512i __A,__mmask8 __B,__m512i __C,__m512i __D,int __E)137*38fd1498Szrj _mm512_mask_shldi_epi64 (__m512i __A, __mmask8 __B, __m512i __C, __m512i __D,
138*38fd1498Szrj 								int __E)
139*38fd1498Szrj {
140*38fd1498Szrj   return (__m512i)__builtin_ia32_vpshld_v8di_mask ((__v8di)__C, (__v8di) __D,
141*38fd1498Szrj 					__E, (__v8di) __A, (__mmask8)__B);
142*38fd1498Szrj }
143*38fd1498Szrj 
144*38fd1498Szrj extern __inline __m512i
145*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_shldi_epi64(__mmask8 __A,__m512i __B,__m512i __C,int __D)146*38fd1498Szrj _mm512_maskz_shldi_epi64 (__mmask8 __A, __m512i __B, __m512i __C, int __D)
147*38fd1498Szrj {
148*38fd1498Szrj   return (__m512i)__builtin_ia32_vpshld_v8di_mask ((__v8di)__B, (__v8di) __C,
149*38fd1498Szrj 			__D, (__v8di) _mm512_setzero_si512 (), (__mmask8)__A);
150*38fd1498Szrj }
151*38fd1498Szrj #else
/* Macro forms of the immediate-count shrdi/shldi intrinsics, used when
   __OPTIMIZE__ is not defined in place of the inline functions of the
   same names.  Arguments are cast and forwarded to the builtins in the
   same order as the inline versions.  Every expansion is wrapped in one
   balanced outer pair of parentheses so that it is a complete
   expression.  */
#define _mm512_shrdi_epi16(A, B, C) \
  ((__m512i) __builtin_ia32_vpshrd_v32hi ((__v32hi)(__m512i)(A), \
					  (__v32hi)(__m512i)(B), (int)(C)))
#define _mm512_shrdi_epi32(A, B, C) \
  ((__m512i) __builtin_ia32_vpshrd_v16si ((__v16si)(__m512i)(A), \
					  (__v16si)(__m512i)(B), (int)(C)))
#define _mm512_mask_shrdi_epi32(A, B, C, D, E) \
  ((__m512i) __builtin_ia32_vpshrd_v16si_mask ((__v16si)(__m512i)(C), \
					       (__v16si)(__m512i)(D), \
					       (int)(E), \
					       (__v16si)(__m512i)(A), \
					       (__mmask16)(B)))
#define _mm512_maskz_shrdi_epi32(A, B, C, D) \
  ((__m512i) __builtin_ia32_vpshrd_v16si_mask ((__v16si)(__m512i)(B), \
	(__v16si)(__m512i)(C), (int)(D), \
	(__v16si)(__m512i)_mm512_setzero_si512 (), (__mmask16)(A)))
#define _mm512_shrdi_epi64(A, B, C) \
  ((__m512i) __builtin_ia32_vpshrd_v8di ((__v8di)(__m512i)(A), \
					 (__v8di)(__m512i)(B), (int)(C)))
#define _mm512_mask_shrdi_epi64(A, B, C, D, E) \
  ((__m512i) __builtin_ia32_vpshrd_v8di_mask ((__v8di)(__m512i)(C), \
					      (__v8di)(__m512i)(D), \
					      (int)(E), \
					      (__v8di)(__m512i)(A), \
					      (__mmask8)(B)))
#define _mm512_maskz_shrdi_epi64(A, B, C, D) \
  ((__m512i) __builtin_ia32_vpshrd_v8di_mask ((__v8di)(__m512i)(B), \
	(__v8di)(__m512i)(C), (int)(D), \
	(__v8di)(__m512i)_mm512_setzero_si512 (), (__mmask8)(A)))
#define _mm512_shldi_epi16(A, B, C) \
  ((__m512i) __builtin_ia32_vpshld_v32hi ((__v32hi)(__m512i)(A), \
					  (__v32hi)(__m512i)(B), (int)(C)))
#define _mm512_shldi_epi32(A, B, C) \
  ((__m512i) __builtin_ia32_vpshld_v16si ((__v16si)(__m512i)(A), \
					  (__v16si)(__m512i)(B), (int)(C)))
#define _mm512_mask_shldi_epi32(A, B, C, D, E) \
  ((__m512i) __builtin_ia32_vpshld_v16si_mask ((__v16si)(__m512i)(C), \
					       (__v16si)(__m512i)(D), \
					       (int)(E), \
					       (__v16si)(__m512i)(A), \
					       (__mmask16)(B)))
#define _mm512_maskz_shldi_epi32(A, B, C, D) \
  ((__m512i) __builtin_ia32_vpshld_v16si_mask ((__v16si)(__m512i)(B), \
	(__v16si)(__m512i)(C), (int)(D), \
	(__v16si)(__m512i)_mm512_setzero_si512 (), (__mmask16)(A)))
#define _mm512_shldi_epi64(A, B, C) \
  ((__m512i) __builtin_ia32_vpshld_v8di ((__v8di)(__m512i)(A), \
					 (__v8di)(__m512i)(B), (int)(C)))
#define _mm512_mask_shldi_epi64(A, B, C, D, E) \
  ((__m512i) __builtin_ia32_vpshld_v8di_mask ((__v8di)(__m512i)(C), \
					      (__v8di)(__m512i)(D), \
					      (int)(E), \
					      (__v8di)(__m512i)(A), \
					      (__mmask8)(B)))
#define _mm512_maskz_shldi_epi64(A, B, C, D) \
  ((__m512i) __builtin_ia32_vpshld_v8di_mask ((__v8di)(__m512i)(B), \
	(__v8di)(__m512i)(C), (int)(D), \
	(__v8di)(__m512i)_mm512_setzero_si512 (), (__mmask8)(A)))
198*38fd1498Szrj #endif
199*38fd1498Szrj 
200*38fd1498Szrj extern __inline __m512i
201*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_shrdv_epi16(__m512i __A,__m512i __B,__m512i __C)202*38fd1498Szrj _mm512_shrdv_epi16 (__m512i __A, __m512i __B, __m512i __C)
203*38fd1498Szrj {
204*38fd1498Szrj   return (__m512i) __builtin_ia32_vpshrdv_v32hi ((__v32hi)__A, (__v32hi) __B,
205*38fd1498Szrj 								(__v32hi) __C);
206*38fd1498Szrj }
207*38fd1498Szrj 
208*38fd1498Szrj extern __inline __m512i
209*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_shrdv_epi32(__m512i __A,__m512i __B,__m512i __C)210*38fd1498Szrj _mm512_shrdv_epi32 (__m512i __A, __m512i __B, __m512i __C)
211*38fd1498Szrj {
212*38fd1498Szrj   return (__m512i) __builtin_ia32_vpshrdv_v16si ((__v16si)__A, (__v16si) __B,
213*38fd1498Szrj 								(__v16si) __C);
214*38fd1498Szrj }
215*38fd1498Szrj 
216*38fd1498Szrj extern __inline __m512i
217*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_shrdv_epi32(__m512i __A,__mmask16 __B,__m512i __C,__m512i __D)218*38fd1498Szrj _mm512_mask_shrdv_epi32 (__m512i __A, __mmask16 __B, __m512i __C, __m512i __D)
219*38fd1498Szrj {
220*38fd1498Szrj   return (__m512i)__builtin_ia32_vpshrdv_v16si_mask ((__v16si)__A,
221*38fd1498Szrj 				(__v16si) __C, (__v16si) __D, (__mmask16)__B);
222*38fd1498Szrj }
223*38fd1498Szrj 
224*38fd1498Szrj extern __inline __m512i
225*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_shrdv_epi32(__mmask16 __A,__m512i __B,__m512i __C,__m512i __D)226*38fd1498Szrj _mm512_maskz_shrdv_epi32 (__mmask16 __A, __m512i __B, __m512i __C, __m512i __D)
227*38fd1498Szrj {
228*38fd1498Szrj   return (__m512i)__builtin_ia32_vpshrdv_v16si_maskz ((__v16si)__B,
229*38fd1498Szrj 				(__v16si) __C, (__v16si) __D, (__mmask16)__A);
230*38fd1498Szrj }
231*38fd1498Szrj 
232*38fd1498Szrj extern __inline __m512i
233*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_shrdv_epi64(__m512i __A,__m512i __B,__m512i __C)234*38fd1498Szrj _mm512_shrdv_epi64 (__m512i __A, __m512i __B, __m512i __C)
235*38fd1498Szrj {
236*38fd1498Szrj   return (__m512i) __builtin_ia32_vpshrdv_v8di ((__v8di)__A, (__v8di) __B,
237*38fd1498Szrj 								(__v8di) __C);
238*38fd1498Szrj }
239*38fd1498Szrj 
240*38fd1498Szrj extern __inline __m512i
241*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_shrdv_epi64(__m512i __A,__mmask8 __B,__m512i __C,__m512i __D)242*38fd1498Szrj _mm512_mask_shrdv_epi64 (__m512i __A, __mmask8 __B, __m512i __C, __m512i __D)
243*38fd1498Szrj {
244*38fd1498Szrj   return (__m512i)__builtin_ia32_vpshrdv_v8di_mask ((__v8di)__A, (__v8di) __C,
245*38fd1498Szrj 						(__v8di) __D, (__mmask8)__B);
246*38fd1498Szrj }
247*38fd1498Szrj 
248*38fd1498Szrj extern __inline __m512i
249*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_shrdv_epi64(__mmask8 __A,__m512i __B,__m512i __C,__m512i __D)250*38fd1498Szrj _mm512_maskz_shrdv_epi64 (__mmask8 __A, __m512i __B, __m512i __C, __m512i __D)
251*38fd1498Szrj {
252*38fd1498Szrj   return (__m512i)__builtin_ia32_vpshrdv_v8di_maskz ((__v8di)__B, (__v8di) __C,
253*38fd1498Szrj 						 (__v8di) __D, (__mmask8)__A);
254*38fd1498Szrj }
255*38fd1498Szrj extern __inline __m512i
256*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_shldv_epi16(__m512i __A,__m512i __B,__m512i __C)257*38fd1498Szrj _mm512_shldv_epi16 (__m512i __A, __m512i __B, __m512i __C)
258*38fd1498Szrj {
259*38fd1498Szrj   return (__m512i) __builtin_ia32_vpshldv_v32hi ((__v32hi)__A, (__v32hi) __B,
260*38fd1498Szrj 							 (__v32hi) __C);
261*38fd1498Szrj }
262*38fd1498Szrj 
263*38fd1498Szrj extern __inline __m512i
264*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_shldv_epi32(__m512i __A,__m512i __B,__m512i __C)265*38fd1498Szrj _mm512_shldv_epi32 (__m512i __A, __m512i __B, __m512i __C)
266*38fd1498Szrj {
267*38fd1498Szrj   return (__m512i) __builtin_ia32_vpshldv_v16si ((__v16si)__A, (__v16si) __B,
268*38fd1498Szrj 								(__v16si) __C);
269*38fd1498Szrj }
270*38fd1498Szrj 
271*38fd1498Szrj extern __inline __m512i
272*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_shldv_epi32(__m512i __A,__mmask16 __B,__m512i __C,__m512i __D)273*38fd1498Szrj _mm512_mask_shldv_epi32 (__m512i __A, __mmask16 __B, __m512i __C, __m512i __D)
274*38fd1498Szrj {
275*38fd1498Szrj   return (__m512i)__builtin_ia32_vpshldv_v16si_mask ((__v16si)__A,
276*38fd1498Szrj 				(__v16si) __C, (__v16si) __D, (__mmask16)__B);
277*38fd1498Szrj }
278*38fd1498Szrj 
279*38fd1498Szrj extern __inline __m512i
280*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_shldv_epi32(__mmask16 __A,__m512i __B,__m512i __C,__m512i __D)281*38fd1498Szrj _mm512_maskz_shldv_epi32 (__mmask16 __A, __m512i __B, __m512i __C, __m512i __D)
282*38fd1498Szrj {
283*38fd1498Szrj   return (__m512i)__builtin_ia32_vpshldv_v16si_maskz ((__v16si)__B,
284*38fd1498Szrj 				(__v16si) __C, (__v16si) __D, (__mmask16)__A);
285*38fd1498Szrj }
286*38fd1498Szrj 
287*38fd1498Szrj extern __inline __m512i
288*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_shldv_epi64(__m512i __A,__m512i __B,__m512i __C)289*38fd1498Szrj _mm512_shldv_epi64 (__m512i __A, __m512i __B, __m512i __C)
290*38fd1498Szrj {
291*38fd1498Szrj   return (__m512i) __builtin_ia32_vpshldv_v8di ((__v8di)__A, (__v8di) __B,
292*38fd1498Szrj 								(__v8di) __C);
293*38fd1498Szrj }
294*38fd1498Szrj 
295*38fd1498Szrj extern __inline __m512i
296*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_shldv_epi64(__m512i __A,__mmask8 __B,__m512i __C,__m512i __D)297*38fd1498Szrj _mm512_mask_shldv_epi64 (__m512i __A, __mmask8 __B, __m512i __C, __m512i __D)
298*38fd1498Szrj {
299*38fd1498Szrj   return (__m512i)__builtin_ia32_vpshldv_v8di_mask ((__v8di)__A, (__v8di) __C,
300*38fd1498Szrj 						(__v8di) __D, (__mmask8)__B);
301*38fd1498Szrj }
302*38fd1498Szrj 
303*38fd1498Szrj extern __inline __m512i
304*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_shldv_epi64(__mmask8 __A,__m512i __B,__m512i __C,__m512i __D)305*38fd1498Szrj _mm512_maskz_shldv_epi64 (__mmask8 __A, __m512i __B, __m512i __C, __m512i __D)
306*38fd1498Szrj {
307*38fd1498Szrj   return (__m512i)__builtin_ia32_vpshldv_v8di_maskz ((__v8di)__B, (__v8di) __C,
308*38fd1498Szrj 						(__v8di) __D, (__mmask8)__A);
309*38fd1498Szrj }
310*38fd1498Szrj 
311*38fd1498Szrj #ifdef __DISABLE_AVX512VBMI2__
312*38fd1498Szrj #undef __DISABLE_AVX512VBMI2__
313*38fd1498Szrj 
314*38fd1498Szrj #pragma GCC pop_options
315*38fd1498Szrj #endif /* __DISABLE_AVX512VBMI2__ */
316*38fd1498Szrj 
317*38fd1498Szrj #if !defined(__AVX512VBMI2__) || !defined(__AVX512BW__)
318*38fd1498Szrj #pragma GCC push_options
319*38fd1498Szrj #pragma GCC target("avx512vbmi2,avx512bw")
320*38fd1498Szrj #define __DISABLE_AVX512VBMI2BW__
321*38fd1498Szrj #endif /* __AVX512VBMI2BW__ */
322*38fd1498Szrj 
323*38fd1498Szrj extern __inline __m512i
324*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_compress_epi8(__m512i __A,__mmask64 __B,__m512i __C)325*38fd1498Szrj _mm512_mask_compress_epi8 (__m512i __A, __mmask64 __B, __m512i __C)
326*38fd1498Szrj {
327*38fd1498Szrj   return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi)__C,
328*38fd1498Szrj 						(__v64qi)__A, (__mmask64)__B);
329*38fd1498Szrj }
330*38fd1498Szrj 
331*38fd1498Szrj 
332*38fd1498Szrj extern __inline __m512i
333*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_compress_epi8(__mmask64 __A,__m512i __B)334*38fd1498Szrj _mm512_maskz_compress_epi8 (__mmask64 __A, __m512i __B)
335*38fd1498Szrj {
336*38fd1498Szrj   return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi)__B,
337*38fd1498Szrj 			(__v64qi)_mm512_setzero_si512 (), (__mmask64)__A);
338*38fd1498Szrj }
339*38fd1498Szrj 
340*38fd1498Szrj 
341*38fd1498Szrj extern __inline void
342*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_compressstoreu_epi8(void * __A,__mmask64 __B,__m512i __C)343*38fd1498Szrj _mm512_mask_compressstoreu_epi8 (void * __A, __mmask64 __B, __m512i __C)
344*38fd1498Szrj {
345*38fd1498Szrj   __builtin_ia32_compressstoreuqi512_mask ((__v64qi *) __A, (__v64qi) __C,
346*38fd1498Szrj 							(__mmask64) __B);
347*38fd1498Szrj }
348*38fd1498Szrj 
349*38fd1498Szrj extern __inline __m512i
350*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_compress_epi16(__m512i __A,__mmask32 __B,__m512i __C)351*38fd1498Szrj _mm512_mask_compress_epi16 (__m512i __A, __mmask32 __B, __m512i __C)
352*38fd1498Szrj {
353*38fd1498Szrj   return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi)__C,
354*38fd1498Szrj 						(__v32hi)__A, (__mmask32)__B);
355*38fd1498Szrj }
356*38fd1498Szrj 
357*38fd1498Szrj extern __inline __m512i
358*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_compress_epi16(__mmask32 __A,__m512i __B)359*38fd1498Szrj _mm512_maskz_compress_epi16 (__mmask32 __A, __m512i __B)
360*38fd1498Szrj {
361*38fd1498Szrj   return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi)__B,
362*38fd1498Szrj 			(__v32hi)_mm512_setzero_si512 (), (__mmask32)__A);
363*38fd1498Szrj }
364*38fd1498Szrj 
365*38fd1498Szrj extern __inline void
366*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_compressstoreu_epi16(void * __A,__mmask32 __B,__m512i __C)367*38fd1498Szrj _mm512_mask_compressstoreu_epi16 (void * __A, __mmask32 __B, __m512i __C)
368*38fd1498Szrj {
369*38fd1498Szrj   __builtin_ia32_compressstoreuhi512_mask ((__v32hi *) __A, (__v32hi) __C,
370*38fd1498Szrj 							(__mmask32) __B);
371*38fd1498Szrj }
372*38fd1498Szrj 
373*38fd1498Szrj extern __inline __m512i
374*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_expand_epi8(__m512i __A,__mmask64 __B,__m512i __C)375*38fd1498Szrj _mm512_mask_expand_epi8 (__m512i __A, __mmask64 __B, __m512i __C)
376*38fd1498Szrj {
377*38fd1498Szrj   return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __C,
378*38fd1498Szrj 						    (__v64qi) __A,
379*38fd1498Szrj 						    (__mmask64) __B);
380*38fd1498Szrj }
381*38fd1498Szrj 
382*38fd1498Szrj extern __inline __m512i
383*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_expand_epi8(__mmask64 __A,__m512i __B)384*38fd1498Szrj _mm512_maskz_expand_epi8 (__mmask64 __A, __m512i __B)
385*38fd1498Szrj {
386*38fd1498Szrj   return (__m512i) __builtin_ia32_expandqi512_maskz ((__v64qi) __B,
387*38fd1498Szrj 			(__v64qi) _mm512_setzero_si512 (), (__mmask64) __A);
388*38fd1498Szrj }
389*38fd1498Szrj 
390*38fd1498Szrj extern __inline __m512i
391*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_expandloadu_epi8(__m512i __A,__mmask64 __B,const void * __C)392*38fd1498Szrj _mm512_mask_expandloadu_epi8 (__m512i __A, __mmask64 __B, const void * __C)
393*38fd1498Szrj {
394*38fd1498Szrj   return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *) __C,
395*38fd1498Szrj 					(__v64qi) __A, (__mmask64) __B);
396*38fd1498Szrj }
397*38fd1498Szrj 
398*38fd1498Szrj extern __inline __m512i
399*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_expandloadu_epi8(__mmask64 __A,const void * __B)400*38fd1498Szrj _mm512_maskz_expandloadu_epi8 (__mmask64 __A, const void * __B)
401*38fd1498Szrj {
402*38fd1498Szrj   return (__m512i) __builtin_ia32_expandloadqi512_maskz ((const __v64qi *) __B,
403*38fd1498Szrj 			(__v64qi) _mm512_setzero_si512 (), (__mmask64) __A);
404*38fd1498Szrj }
405*38fd1498Szrj 
406*38fd1498Szrj extern __inline __m512i
407*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_expand_epi16(__m512i __A,__mmask32 __B,__m512i __C)408*38fd1498Szrj _mm512_mask_expand_epi16 (__m512i __A, __mmask32 __B, __m512i __C)
409*38fd1498Szrj {
410*38fd1498Szrj   return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __C,
411*38fd1498Szrj 						    (__v32hi) __A,
412*38fd1498Szrj 						    (__mmask32) __B);
413*38fd1498Szrj }
414*38fd1498Szrj 
415*38fd1498Szrj extern __inline __m512i
416*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_expand_epi16(__mmask32 __A,__m512i __B)417*38fd1498Szrj _mm512_maskz_expand_epi16 (__mmask32 __A, __m512i __B)
418*38fd1498Szrj {
419*38fd1498Szrj   return (__m512i) __builtin_ia32_expandhi512_maskz ((__v32hi) __B,
420*38fd1498Szrj 			(__v32hi) _mm512_setzero_si512 (), (__mmask32) __A);
421*38fd1498Szrj }
422*38fd1498Szrj 
423*38fd1498Szrj extern __inline __m512i
424*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_expandloadu_epi16(__m512i __A,__mmask32 __B,const void * __C)425*38fd1498Szrj _mm512_mask_expandloadu_epi16 (__m512i __A, __mmask32 __B, const void * __C)
426*38fd1498Szrj {
427*38fd1498Szrj   return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *) __C,
428*38fd1498Szrj 					(__v32hi) __A, (__mmask32) __B);
429*38fd1498Szrj }
430*38fd1498Szrj 
431*38fd1498Szrj extern __inline __m512i
432*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_expandloadu_epi16(__mmask32 __A,const void * __B)433*38fd1498Szrj _mm512_maskz_expandloadu_epi16 (__mmask32 __A, const void * __B)
434*38fd1498Szrj {
435*38fd1498Szrj   return (__m512i) __builtin_ia32_expandloadhi512_maskz ((const __v32hi *) __B,
436*38fd1498Szrj 			(__v32hi) _mm512_setzero_si512 (), (__mmask32) __A);
437*38fd1498Szrj }
438*38fd1498Szrj 
439*38fd1498Szrj #ifdef __OPTIMIZE__
440*38fd1498Szrj extern __inline __m512i
441*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_shrdi_epi16(__m512i __A,__mmask32 __B,__m512i __C,__m512i __D,int __E)442*38fd1498Szrj _mm512_mask_shrdi_epi16 (__m512i __A, __mmask32 __B, __m512i __C, __m512i __D,
443*38fd1498Szrj 								int __E)
444*38fd1498Szrj {
445*38fd1498Szrj   return (__m512i)__builtin_ia32_vpshrd_v32hi_mask ((__v32hi)__C,
446*38fd1498Szrj 			(__v32hi) __D, __E, (__v32hi) __A, (__mmask32)__B);
447*38fd1498Szrj }
448*38fd1498Szrj 
449*38fd1498Szrj extern __inline __m512i
450*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_shrdi_epi16(__mmask32 __A,__m512i __B,__m512i __C,int __D)451*38fd1498Szrj _mm512_maskz_shrdi_epi16 (__mmask32 __A, __m512i __B, __m512i __C, int __D)
452*38fd1498Szrj {
453*38fd1498Szrj   return (__m512i)__builtin_ia32_vpshrd_v32hi_mask ((__v32hi)__B,
454*38fd1498Szrj 	(__v32hi) __C, __D, (__v32hi) _mm512_setzero_si512 (), (__mmask32)__A);
455*38fd1498Szrj }
456*38fd1498Szrj 
457*38fd1498Szrj extern __inline __m512i
458*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_shldi_epi16(__m512i __A,__mmask32 __B,__m512i __C,__m512i __D,int __E)459*38fd1498Szrj _mm512_mask_shldi_epi16 (__m512i __A, __mmask32 __B, __m512i __C, __m512i __D,
460*38fd1498Szrj 								int __E)
461*38fd1498Szrj {
462*38fd1498Szrj   return (__m512i)__builtin_ia32_vpshld_v32hi_mask ((__v32hi)__C,
463*38fd1498Szrj 			(__v32hi) __D, __E, (__v32hi) __A, (__mmask32)__B);
464*38fd1498Szrj }
465*38fd1498Szrj 
466*38fd1498Szrj extern __inline __m512i
467*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_shldi_epi16(__mmask32 __A,__m512i __B,__m512i __C,int __D)468*38fd1498Szrj _mm512_maskz_shldi_epi16 (__mmask32 __A, __m512i __B, __m512i __C, int __D)
469*38fd1498Szrj {
470*38fd1498Szrj   return (__m512i)__builtin_ia32_vpshld_v32hi_mask ((__v32hi)__B,
471*38fd1498Szrj 	(__v32hi) __C, __D, (__v32hi) _mm512_setzero_si512 (), (__mmask32)__A);
472*38fd1498Szrj }
473*38fd1498Szrj 
474*38fd1498Szrj #else
/* Macro forms of the masked 16-bit immediate-count shrdi/shldi
   intrinsics, used when __OPTIMIZE__ is not defined in place of the
   inline functions of the same names.  Arguments are cast and forwarded
   in the same order as the inline versions.  Every expansion is wrapped
   in one balanced outer pair of parentheses so that it is a complete
   expression.  */
#define _mm512_mask_shrdi_epi16(A, B, C, D, E) \
  ((__m512i) __builtin_ia32_vpshrd_v32hi_mask ((__v32hi)(__m512i)(C), \
					       (__v32hi)(__m512i)(D), \
					       (int)(E), \
					       (__v32hi)(__m512i)(A), \
					       (__mmask32)(B)))
#define _mm512_maskz_shrdi_epi16(A, B, C, D) \
  ((__m512i) __builtin_ia32_vpshrd_v32hi_mask ((__v32hi)(__m512i)(B), \
	(__v32hi)(__m512i)(C), (int)(D), \
	(__v32hi)(__m512i)_mm512_setzero_si512 (), (__mmask32)(A)))
#define _mm512_mask_shldi_epi16(A, B, C, D, E) \
  ((__m512i) __builtin_ia32_vpshld_v32hi_mask ((__v32hi)(__m512i)(C), \
					       (__v32hi)(__m512i)(D), \
					       (int)(E), \
					       (__v32hi)(__m512i)(A), \
					       (__mmask32)(B)))
#define _mm512_maskz_shldi_epi16(A, B, C, D) \
  ((__m512i) __builtin_ia32_vpshld_v32hi_mask ((__v32hi)(__m512i)(B), \
	(__v32hi)(__m512i)(C), (int)(D), \
	(__v32hi)(__m512i)_mm512_setzero_si512 (), (__mmask32)(A)))
489*38fd1498Szrj #endif
490*38fd1498Szrj 
491*38fd1498Szrj extern __inline __m512i
492*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_shrdv_epi16(__m512i __A,__mmask32 __B,__m512i __C,__m512i __D)493*38fd1498Szrj _mm512_mask_shrdv_epi16 (__m512i __A, __mmask32 __B, __m512i __C, __m512i __D)
494*38fd1498Szrj {
495*38fd1498Szrj   return (__m512i)__builtin_ia32_vpshrdv_v32hi_mask ((__v32hi)__A,
496*38fd1498Szrj 				(__v32hi) __C, (__v32hi) __D, (__mmask32)__B);
497*38fd1498Szrj }
498*38fd1498Szrj 
499*38fd1498Szrj extern __inline __m512i
500*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_shrdv_epi16(__mmask32 __A,__m512i __B,__m512i __C,__m512i __D)501*38fd1498Szrj _mm512_maskz_shrdv_epi16 (__mmask32 __A, __m512i __B, __m512i __C, __m512i __D)
502*38fd1498Szrj {
503*38fd1498Szrj   return (__m512i)__builtin_ia32_vpshrdv_v32hi_maskz ((__v32hi)__B,
504*38fd1498Szrj 				(__v32hi) __C, (__v32hi) __D, (__mmask32)__A);
505*38fd1498Szrj }
506*38fd1498Szrj 
507*38fd1498Szrj extern __inline __m512i
508*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_shldv_epi16(__m512i __A,__mmask32 __B,__m512i __C,__m512i __D)509*38fd1498Szrj _mm512_mask_shldv_epi16 (__m512i __A, __mmask32 __B, __m512i __C, __m512i __D)
510*38fd1498Szrj {
511*38fd1498Szrj   return (__m512i)__builtin_ia32_vpshldv_v32hi_mask ((__v32hi)__A,
512*38fd1498Szrj 				(__v32hi) __C, (__v32hi) __D, (__mmask32)__B);
513*38fd1498Szrj }
514*38fd1498Szrj 
515*38fd1498Szrj extern __inline __m512i
516*38fd1498Szrj __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_shldv_epi16(__mmask32 __A,__m512i __B,__m512i __C,__m512i __D)517*38fd1498Szrj _mm512_maskz_shldv_epi16 (__mmask32 __A, __m512i __B, __m512i __C, __m512i __D)
518*38fd1498Szrj {
519*38fd1498Szrj   return (__m512i)__builtin_ia32_vpshldv_v32hi_maskz ((__v32hi)__B,
520*38fd1498Szrj 				(__v32hi) __C, (__v32hi) __D, (__mmask32)__A);
521*38fd1498Szrj }
522*38fd1498Szrj 
523*38fd1498Szrj #ifdef __DISABLE_AVX512VBMI2BW__
524*38fd1498Szrj #undef __DISABLE_AVX512VBMI2BW__
525*38fd1498Szrj 
526*38fd1498Szrj #pragma GCC pop_options
527*38fd1498Szrj #endif /* __DISABLE_AVX512VBMI2BW__ */
528*38fd1498Szrj 
529*38fd1498Szrj #endif /* __AVX512VBMI2INTRIN_H_INCLUDED */
530