1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-ALL
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-PERLANE
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ
11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ
12
13;
14; add
15;
16
17define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
18; SSE-LABEL: trunc_add_v4i64_v4i32:
19; SSE:       # %bb.0:
20; SSE-NEXT:    paddq %xmm3, %xmm1
21; SSE-NEXT:    paddq %xmm2, %xmm0
22; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
23; SSE-NEXT:    retq
24;
25; AVX1-LABEL: trunc_add_v4i64_v4i32:
26; AVX1:       # %bb.0:
27; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
28; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
29; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
30; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
31; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
32; AVX1-NEXT:    vzeroupper
33; AVX1-NEXT:    retq
34;
35; AVX2-SLOW-LABEL: trunc_add_v4i64_v4i32:
36; AVX2-SLOW:       # %bb.0:
37; AVX2-SLOW-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
38; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
39; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
40; AVX2-SLOW-NEXT:    vzeroupper
41; AVX2-SLOW-NEXT:    retq
42;
43; AVX2-FAST-ALL-LABEL: trunc_add_v4i64_v4i32:
44; AVX2-FAST-ALL:       # %bb.0:
45; AVX2-FAST-ALL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
46; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
47; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
48; AVX2-FAST-ALL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
49; AVX2-FAST-ALL-NEXT:    vzeroupper
50; AVX2-FAST-ALL-NEXT:    retq
51;
52; AVX2-FAST-PERLANE-LABEL: trunc_add_v4i64_v4i32:
53; AVX2-FAST-PERLANE:       # %bb.0:
54; AVX2-FAST-PERLANE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
55; AVX2-FAST-PERLANE-NEXT:    vextracti128 $1, %ymm0, %xmm1
56; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
57; AVX2-FAST-PERLANE-NEXT:    vzeroupper
58; AVX2-FAST-PERLANE-NEXT:    retq
59;
60; AVX512-LABEL: trunc_add_v4i64_v4i32:
61; AVX512:       # %bb.0:
62; AVX512-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
63; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
64; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
65; AVX512-NEXT:    vzeroupper
66; AVX512-NEXT:    retq
67  %1 = add <4 x i64> %a0, %a1
68  %2 = trunc <4 x i64> %1 to <4 x i32>
69  ret <4 x i32> %2
70}
71
72define <8 x i16> @trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
73; SSE-LABEL: trunc_add_v8i64_v8i16:
74; SSE:       # %bb.0:
75; SSE-NEXT:    paddq %xmm6, %xmm2
76; SSE-NEXT:    paddq %xmm7, %xmm3
77; SSE-NEXT:    paddq %xmm4, %xmm0
78; SSE-NEXT:    paddq %xmm5, %xmm1
79; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
80; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
81; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
82; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
83; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
84; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
85; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
86; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
87; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
88; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
89; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
90; SSE-NEXT:    retq
91;
92; AVX1-LABEL: trunc_add_v8i64_v8i16:
93; AVX1:       # %bb.0:
94; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm4
95; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
96; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
97; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
98; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm2
99; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
100; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
101; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
102; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
103; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
104; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
105; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
106; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
107; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
108; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
109; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
110; AVX1-NEXT:    vzeroupper
111; AVX1-NEXT:    retq
112;
113; AVX2-LABEL: trunc_add_v8i64_v8i16:
114; AVX2:       # %bb.0:
115; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
116; AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
117; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
118; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
119; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
120; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
121; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
122; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
123; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
124; AVX2-NEXT:    vzeroupper
125; AVX2-NEXT:    retq
126;
127; AVX512-LABEL: trunc_add_v8i64_v8i16:
128; AVX512:       # %bb.0:
129; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
130; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
131; AVX512-NEXT:    vzeroupper
132; AVX512-NEXT:    retq
133  %1 = add <8 x i64> %a0, %a1
134  %2 = trunc <8 x i64> %1 to <8 x i16>
135  ret <8 x i16> %2
136}
137
138define <8 x i16> @trunc_add_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
139; SSE-LABEL: trunc_add_v8i32_v8i16:
140; SSE:       # %bb.0:
141; SSE-NEXT:    paddd %xmm2, %xmm0
142; SSE-NEXT:    paddd %xmm3, %xmm1
143; SSE-NEXT:    pslld $16, %xmm1
144; SSE-NEXT:    psrad $16, %xmm1
145; SSE-NEXT:    pslld $16, %xmm0
146; SSE-NEXT:    psrad $16, %xmm0
147; SSE-NEXT:    packssdw %xmm1, %xmm0
148; SSE-NEXT:    retq
149;
150; AVX1-LABEL: trunc_add_v8i32_v8i16:
151; AVX1:       # %bb.0:
152; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
153; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
154; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
155; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
156; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
157; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
158; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
159; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
160; AVX1-NEXT:    vzeroupper
161; AVX1-NEXT:    retq
162;
163; AVX2-LABEL: trunc_add_v8i32_v8i16:
164; AVX2:       # %bb.0:
165; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
166; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
167; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
168; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
169; AVX2-NEXT:    vzeroupper
170; AVX2-NEXT:    retq
171;
172; AVX512-LABEL: trunc_add_v8i32_v8i16:
173; AVX512:       # %bb.0:
174; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
175; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
176; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
177; AVX512-NEXT:    vzeroupper
178; AVX512-NEXT:    retq
179  %1 = add <8 x i32> %a0, %a1
180  %2 = trunc <8 x i32> %1 to <8 x i16>
181  ret <8 x i16> %2
182}
183
184define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
185; SSE-LABEL: trunc_add_v16i64_v16i8:
186; SSE:       # %bb.0:
187; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm0
188; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm1
189; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm2
190; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm3
191; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm4
192; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm5
193; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm6
194; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm7
195; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
196; SSE-NEXT:    pand %xmm8, %xmm7
197; SSE-NEXT:    pand %xmm8, %xmm6
198; SSE-NEXT:    packuswb %xmm7, %xmm6
199; SSE-NEXT:    pand %xmm8, %xmm5
200; SSE-NEXT:    pand %xmm8, %xmm4
201; SSE-NEXT:    packuswb %xmm5, %xmm4
202; SSE-NEXT:    packuswb %xmm6, %xmm4
203; SSE-NEXT:    pand %xmm8, %xmm3
204; SSE-NEXT:    pand %xmm8, %xmm2
205; SSE-NEXT:    packuswb %xmm3, %xmm2
206; SSE-NEXT:    pand %xmm8, %xmm1
207; SSE-NEXT:    pand %xmm8, %xmm0
208; SSE-NEXT:    packuswb %xmm1, %xmm0
209; SSE-NEXT:    packuswb %xmm2, %xmm0
210; SSE-NEXT:    packuswb %xmm4, %xmm0
211; SSE-NEXT:    retq
212;
213; AVX1-LABEL: trunc_add_v16i64_v16i8:
214; AVX1:       # %bb.0:
215; AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm8
216; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm4
217; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
218; AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm0
219; AVX1-NEXT:    vpaddq %xmm5, %xmm1, %xmm4
220; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm5
221; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
222; AVX1-NEXT:    vpaddq %xmm5, %xmm1, %xmm1
223; AVX1-NEXT:    vpaddq %xmm6, %xmm2, %xmm5
224; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm6
225; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
226; AVX1-NEXT:    vpaddq %xmm6, %xmm2, %xmm2
227; AVX1-NEXT:    vpaddq %xmm7, %xmm3, %xmm6
228; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm7
229; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
230; AVX1-NEXT:    vpaddq %xmm7, %xmm3, %xmm3
231; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [255,255]
232; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
233; AVX1-NEXT:    vpand %xmm7, %xmm6, %xmm6
234; AVX1-NEXT:    vpackusdw %xmm3, %xmm6, %xmm3
235; AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
236; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
237; AVX1-NEXT:    vpackusdw %xmm2, %xmm5, %xmm2
238; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
239; AVX1-NEXT:    vpand %xmm7, %xmm1, %xmm1
240; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm3
241; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
242; AVX1-NEXT:    vpand %xmm7, %xmm0, %xmm0
243; AVX1-NEXT:    vpand %xmm7, %xmm8, %xmm3
244; AVX1-NEXT:    vpackusdw %xmm0, %xmm3, %xmm0
245; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
246; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
247; AVX1-NEXT:    vzeroupper
248; AVX1-NEXT:    retq
249;
250; AVX2-LABEL: trunc_add_v16i64_v16i8:
251; AVX2:       # %bb.0:
252; AVX2-NEXT:    vpaddq %ymm4, %ymm0, %ymm0
253; AVX2-NEXT:    vpaddq %ymm5, %ymm1, %ymm1
254; AVX2-NEXT:    vpaddq %ymm6, %ymm2, %ymm2
255; AVX2-NEXT:    vpaddq %ymm7, %ymm3, %ymm3
256; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
257; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
258; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
259; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
260; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
261; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
262; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
263; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
264; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
265; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
266; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
267; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
268; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
269; AVX2-NEXT:    vzeroupper
270; AVX2-NEXT:    retq
271;
272; AVX512-LABEL: trunc_add_v16i64_v16i8:
273; AVX512:       # %bb.0:
274; AVX512-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
275; AVX512-NEXT:    vpaddq %zmm3, %zmm1, %zmm1
276; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
277; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
278; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
279; AVX512-NEXT:    vzeroupper
280; AVX512-NEXT:    retq
281  %1 = add <16 x i64> %a0, %a1
282  %2 = trunc <16 x i64> %1 to <16 x i8>
283  ret <16 x i8> %2
284}
285
286define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
287; SSE-LABEL: trunc_add_v16i32_v16i8:
288; SSE:       # %bb.0:
289; SSE-NEXT:    paddd %xmm4, %xmm0
290; SSE-NEXT:    paddd %xmm5, %xmm1
291; SSE-NEXT:    paddd %xmm6, %xmm2
292; SSE-NEXT:    paddd %xmm7, %xmm3
293; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
294; SSE-NEXT:    pand %xmm4, %xmm3
295; SSE-NEXT:    pand %xmm4, %xmm2
296; SSE-NEXT:    packuswb %xmm3, %xmm2
297; SSE-NEXT:    pand %xmm4, %xmm1
298; SSE-NEXT:    pand %xmm4, %xmm0
299; SSE-NEXT:    packuswb %xmm1, %xmm0
300; SSE-NEXT:    packuswb %xmm2, %xmm0
301; SSE-NEXT:    retq
302;
303; AVX1-LABEL: trunc_add_v16i32_v16i8:
304; AVX1:       # %bb.0:
305; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm4
306; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
307; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
308; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
309; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm2
310; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
311; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
312; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
313; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255]
314; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
315; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
316; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
317; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
318; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm2
319; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
320; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
321; AVX1-NEXT:    vzeroupper
322; AVX1-NEXT:    retq
323;
324; AVX2-LABEL: trunc_add_v16i32_v16i8:
325; AVX2:       # %bb.0:
326; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
327; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
328; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
329; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
330; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
331; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
332; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
333; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
334; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
335; AVX2-NEXT:    vzeroupper
336; AVX2-NEXT:    retq
337;
338; AVX512-LABEL: trunc_add_v16i32_v16i8:
339; AVX512:       # %bb.0:
340; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
341; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
342; AVX512-NEXT:    vzeroupper
343; AVX512-NEXT:    retq
344  %1 = add <16 x i32> %a0, %a1
345  %2 = trunc <16 x i32> %1 to <16 x i8>
346  ret <16 x i8> %2
347}
348
349define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
350; SSE-LABEL: trunc_add_v16i16_v16i8:
351; SSE:       # %bb.0:
352; SSE-NEXT:    paddw %xmm2, %xmm0
353; SSE-NEXT:    paddw %xmm3, %xmm1
354; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
355; SSE-NEXT:    pand %xmm2, %xmm1
356; SSE-NEXT:    pand %xmm2, %xmm0
357; SSE-NEXT:    packuswb %xmm1, %xmm0
358; SSE-NEXT:    retq
359;
360; AVX1-LABEL: trunc_add_v16i16_v16i8:
361; AVX1:       # %bb.0:
362; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm2
363; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
364; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
365; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
366; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
367; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
368; AVX1-NEXT:    vpand %xmm1, %xmm2, %xmm1
369; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
370; AVX1-NEXT:    vzeroupper
371; AVX1-NEXT:    retq
372;
373; AVX2-LABEL: trunc_add_v16i16_v16i8:
374; AVX2:       # %bb.0:
375; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
376; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
377; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
378; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
379; AVX2-NEXT:    vzeroupper
380; AVX2-NEXT:    retq
381;
382; AVX512F-LABEL: trunc_add_v16i16_v16i8:
383; AVX512F:       # %bb.0:
384; AVX512F-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
385; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
386; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
387; AVX512F-NEXT:    vzeroupper
388; AVX512F-NEXT:    retq
389;
390; AVX512BW-LABEL: trunc_add_v16i16_v16i8:
391; AVX512BW:       # %bb.0:
392; AVX512BW-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
393; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
394; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
395; AVX512BW-NEXT:    vzeroupper
396; AVX512BW-NEXT:    retq
397;
398; AVX512DQ-LABEL: trunc_add_v16i16_v16i8:
399; AVX512DQ:       # %bb.0:
400; AVX512DQ-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
401; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
402; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
403; AVX512DQ-NEXT:    vzeroupper
404; AVX512DQ-NEXT:    retq
405  %1 = add <16 x i16> %a0, %a1
406  %2 = trunc <16 x i16> %1 to <16 x i8>
407  ret <16 x i8> %2
408}
409
410define <8 x i16> @trunc_add_v8i32_v8i16_sext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
411; SSE-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
412; SSE:       # %bb.0:
413; SSE-NEXT:    pslld $16, %xmm2
414; SSE-NEXT:    psrad $16, %xmm2
415; SSE-NEXT:    pslld $16, %xmm1
416; SSE-NEXT:    psrad $16, %xmm1
417; SSE-NEXT:    packssdw %xmm2, %xmm1
418; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
419; SSE-NEXT:    psraw $8, %xmm0
420; SSE-NEXT:    paddw %xmm1, %xmm0
421; SSE-NEXT:    retq
422;
423; AVX1-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
424; AVX1:       # %bb.0:
425; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
426; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
427; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
428; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
429; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
430; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
431; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
432; AVX1-NEXT:    vzeroupper
433; AVX1-NEXT:    retq
434;
435; AVX2-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
436; AVX2:       # %bb.0:
437; AVX2-NEXT:    vpmovsxbw %xmm0, %xmm0
438; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
439; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
440; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
441; AVX2-NEXT:    vzeroupper
442; AVX2-NEXT:    retq
443;
444; AVX512-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
445; AVX512:       # %bb.0:
446; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
447; AVX512-NEXT:    vpmovdw %zmm1, %ymm1
448; AVX512-NEXT:    vpmovsxbw %xmm0, %xmm0
449; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
450; AVX512-NEXT:    vzeroupper
451; AVX512-NEXT:    retq
452  %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
453  %2 = sext <8 x i8> %1 to <8 x i32>
454  %3 = add <8 x i32> %2, %a1
455  %4 = trunc <8 x i32> %3 to <8 x i16>
456  ret <8 x i16> %4
457}
458
459;
460; add to constant
461;
462
463define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
464; SSE-LABEL: trunc_add_const_v4i64_v4i32:
465; SSE:       # %bb.0:
466; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
467; SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
468; SSE-NEXT:    retq
469;
470; AVX1-LABEL: trunc_add_const_v4i64_v4i32:
471; AVX1:       # %bb.0:
472; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
473; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
474; AVX1-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
475; AVX1-NEXT:    vzeroupper
476; AVX1-NEXT:    retq
477;
478; AVX2-SLOW-LABEL: trunc_add_const_v4i64_v4i32:
479; AVX2-SLOW:       # %bb.0:
480; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
481; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
482; AVX2-SLOW-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
483; AVX2-SLOW-NEXT:    vzeroupper
484; AVX2-SLOW-NEXT:    retq
485;
486; AVX2-FAST-ALL-LABEL: trunc_add_const_v4i64_v4i32:
487; AVX2-FAST-ALL:       # %bb.0:
488; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
489; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
490; AVX2-FAST-ALL-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
491; AVX2-FAST-ALL-NEXT:    vzeroupper
492; AVX2-FAST-ALL-NEXT:    retq
493;
494; AVX2-FAST-PERLANE-LABEL: trunc_add_const_v4i64_v4i32:
495; AVX2-FAST-PERLANE:       # %bb.0:
496; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
497; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
498; AVX2-FAST-PERLANE-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
499; AVX2-FAST-PERLANE-NEXT:    vzeroupper
500; AVX2-FAST-PERLANE-NEXT:    retq
501;
502; AVX512-LABEL: trunc_add_const_v4i64_v4i32:
503; AVX512:       # %bb.0:
504; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
505; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
506; AVX512-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
507; AVX512-NEXT:    vzeroupper
508; AVX512-NEXT:    retq
509  %1 = add <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
510  %2 = trunc <4 x i64> %1 to <4 x i32>
511  ret <4 x i32> %2
512}
513
514define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
515; SSE-LABEL: trunc_add_const_v8i64_v8i16:
516; SSE:       # %bb.0:
517; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
518; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
519; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
520; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
521; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
522; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
523; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
524; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
525; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
526; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
527; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
528; SSE-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
529; SSE-NEXT:    retq
530;
531; AVX1-LABEL: trunc_add_const_v8i64_v8i16:
532; AVX1:       # %bb.0:
533; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
534; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
535; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
536; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
537; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
538; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
539; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
540; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
541; AVX1-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
542; AVX1-NEXT:    vzeroupper
543; AVX1-NEXT:    retq
544;
545; AVX2-LABEL: trunc_add_const_v8i64_v8i16:
546; AVX2:       # %bb.0:
547; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
548; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
549; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
550; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
551; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
552; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
553; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
554; AVX2-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
555; AVX2-NEXT:    vzeroupper
556; AVX2-NEXT:    retq
557;
558; AVX512-LABEL: trunc_add_const_v8i64_v8i16:
559; AVX512:       # %bb.0:
560; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
561; AVX512-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
562; AVX512-NEXT:    vzeroupper
563; AVX512-NEXT:    retq
564  %1 = add <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
565  %2 = trunc <8 x i64> %1 to <8 x i16>
566  ret <8 x i16> %2
567}
568
569define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
570; SSE-LABEL: trunc_add_const_v8i32_v8i16:
571; SSE:       # %bb.0:
572; SSE-NEXT:    pslld $16, %xmm1
573; SSE-NEXT:    psrad $16, %xmm1
574; SSE-NEXT:    pslld $16, %xmm0
575; SSE-NEXT:    psrad $16, %xmm0
576; SSE-NEXT:    packssdw %xmm1, %xmm0
577; SSE-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
578; SSE-NEXT:    retq
579;
580; AVX1-LABEL: trunc_add_const_v8i32_v8i16:
581; AVX1:       # %bb.0:
582; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
583; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
584; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
585; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
586; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
587; AVX1-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
588; AVX1-NEXT:    vzeroupper
589; AVX1-NEXT:    retq
590;
591; AVX2-LABEL: trunc_add_const_v8i32_v8i16:
592; AVX2:       # %bb.0:
593; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
594; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
595; AVX2-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
596; AVX2-NEXT:    vzeroupper
597; AVX2-NEXT:    retq
598;
599; AVX512-LABEL: trunc_add_const_v8i32_v8i16:
600; AVX512:       # %bb.0:
601; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
602; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
603; AVX512-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
604; AVX512-NEXT:    vzeroupper
605; AVX512-NEXT:    retq
606  %1 = add <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
607  %2 = trunc <8 x i32> %1 to <8 x i16>
608  ret <8 x i16> %2
609}
610
611define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
612; SSE-LABEL: trunc_add_const_v16i64_v16i8:
613; SSE:       # %bb.0:
614; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
615; SSE-NEXT:    pand %xmm8, %xmm7
616; SSE-NEXT:    pand %xmm8, %xmm6
617; SSE-NEXT:    packuswb %xmm7, %xmm6
618; SSE-NEXT:    pand %xmm8, %xmm5
619; SSE-NEXT:    pand %xmm8, %xmm4
620; SSE-NEXT:    packuswb %xmm5, %xmm4
621; SSE-NEXT:    packuswb %xmm6, %xmm4
622; SSE-NEXT:    pand %xmm8, %xmm3
623; SSE-NEXT:    pand %xmm8, %xmm2
624; SSE-NEXT:    packuswb %xmm3, %xmm2
625; SSE-NEXT:    pand %xmm8, %xmm1
626; SSE-NEXT:    pand %xmm8, %xmm0
627; SSE-NEXT:    packuswb %xmm1, %xmm0
628; SSE-NEXT:    packuswb %xmm2, %xmm0
629; SSE-NEXT:    packuswb %xmm4, %xmm0
630; SSE-NEXT:    paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
631; SSE-NEXT:    retq
632;
633; AVX1-LABEL: trunc_add_const_v16i64_v16i8:
634; AVX1:       # %bb.0:
635; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255]
636; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
637; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
638; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
639; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
640; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
641; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
642; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
643; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
644; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
645; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
646; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
647; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
648; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
649; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
650; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
651; AVX1-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
652; AVX1-NEXT:    vzeroupper
653; AVX1-NEXT:    retq
654;
655; AVX2-LABEL: trunc_add_const_v16i64_v16i8:
656; AVX2:       # %bb.0:
657; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
658; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
659; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
660; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
661; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
662; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
663; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
664; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
665; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
666; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
667; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
668; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
669; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
670; AVX2-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
671; AVX2-NEXT:    vzeroupper
672; AVX2-NEXT:    retq
673;
674; AVX512-LABEL: trunc_add_const_v16i64_v16i8:
675; AVX512:       # %bb.0:
676; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
677; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
678; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
679; AVX512-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
680; AVX512-NEXT:    vzeroupper
681; AVX512-NEXT:    retq
682  %1 = add <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
683  %2 = trunc <16 x i64> %1 to <16 x i8>
684  ret <16 x i8> %2
685}
686
687define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
688; SSE-LABEL: trunc_add_const_v16i32_v16i8:
689; SSE:       # %bb.0:
690; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
691; SSE-NEXT:    pand %xmm4, %xmm3
692; SSE-NEXT:    pand %xmm4, %xmm2
693; SSE-NEXT:    packuswb %xmm3, %xmm2
694; SSE-NEXT:    pand %xmm4, %xmm1
695; SSE-NEXT:    pand %xmm4, %xmm0
696; SSE-NEXT:    packuswb %xmm1, %xmm0
697; SSE-NEXT:    packuswb %xmm2, %xmm0
698; SSE-NEXT:    paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
699; SSE-NEXT:    retq
700;
701; AVX1-LABEL: trunc_add_const_v16i32_v16i8:
702; AVX1:       # %bb.0:
703; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
704; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
705; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
706; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
707; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
708; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
709; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
710; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
711; AVX1-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
712; AVX1-NEXT:    vzeroupper
713; AVX1-NEXT:    retq
714;
715; AVX2-LABEL: trunc_add_const_v16i32_v16i8:
716; AVX2:       # %bb.0:
717; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
718; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
719; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
720; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
721; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
722; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
723; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
724; AVX2-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
725; AVX2-NEXT:    vzeroupper
726; AVX2-NEXT:    retq
727;
728; AVX512-LABEL: trunc_add_const_v16i32_v16i8:
729; AVX512:       # %bb.0:
730; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
731; AVX512-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
732; AVX512-NEXT:    vzeroupper
733; AVX512-NEXT:    retq
734  %1 = add <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
735  %2 = trunc <16 x i32> %1 to <16 x i8>
736  ret <16 x i8> %2
737}
738
739define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
740; SSE-LABEL: trunc_add_const_v16i16_v16i8:
741; SSE:       # %bb.0:
742; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
743; SSE-NEXT:    pand %xmm2, %xmm1
744; SSE-NEXT:    pand %xmm2, %xmm0
745; SSE-NEXT:    packuswb %xmm1, %xmm0
746; SSE-NEXT:    paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
747; SSE-NEXT:    retq
748;
749; AVX1-LABEL: trunc_add_const_v16i16_v16i8:
750; AVX1:       # %bb.0:
751; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
752; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
753; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
754; AVX1-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
755; AVX1-NEXT:    vzeroupper
756; AVX1-NEXT:    retq
757;
758; AVX2-LABEL: trunc_add_const_v16i16_v16i8:
759; AVX2:       # %bb.0:
760; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
761; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
762; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
763; AVX2-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
764; AVX2-NEXT:    vzeroupper
765; AVX2-NEXT:    retq
766;
767; AVX512F-LABEL: trunc_add_const_v16i16_v16i8:
768; AVX512F:       # %bb.0:
769; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
770; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
771; AVX512F-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
772; AVX512F-NEXT:    vzeroupper
773; AVX512F-NEXT:    retq
774;
775; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8:
776; AVX512BW:       # %bb.0:
777; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
778; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
779; AVX512BW-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
780; AVX512BW-NEXT:    vzeroupper
781; AVX512BW-NEXT:    retq
782;
783; AVX512DQ-LABEL: trunc_add_const_v16i16_v16i8:
784; AVX512DQ:       # %bb.0:
785; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
786; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
787; AVX512DQ-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
788; AVX512DQ-NEXT:    vzeroupper
789; AVX512DQ-NEXT:    retq
790  %1 = add <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
791  %2 = trunc <16 x i16> %1 to <16 x i8>
792  ret <16 x i8> %2
793}
794
795;
796; sub
797;
798
799define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
800; SSE-LABEL: trunc_sub_v4i64_v4i32:
801; SSE:       # %bb.0:
802; SSE-NEXT:    psubq %xmm3, %xmm1
803; SSE-NEXT:    psubq %xmm2, %xmm0
804; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
805; SSE-NEXT:    retq
806;
807; AVX1-LABEL: trunc_sub_v4i64_v4i32:
808; AVX1:       # %bb.0:
809; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
810; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
811; AVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
812; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
813; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
814; AVX1-NEXT:    vzeroupper
815; AVX1-NEXT:    retq
816;
817; AVX2-SLOW-LABEL: trunc_sub_v4i64_v4i32:
818; AVX2-SLOW:       # %bb.0:
819; AVX2-SLOW-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
820; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
821; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
822; AVX2-SLOW-NEXT:    vzeroupper
823; AVX2-SLOW-NEXT:    retq
824;
825; AVX2-FAST-ALL-LABEL: trunc_sub_v4i64_v4i32:
826; AVX2-FAST-ALL:       # %bb.0:
827; AVX2-FAST-ALL-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
828; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
829; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
830; AVX2-FAST-ALL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
831; AVX2-FAST-ALL-NEXT:    vzeroupper
832; AVX2-FAST-ALL-NEXT:    retq
833;
834; AVX2-FAST-PERLANE-LABEL: trunc_sub_v4i64_v4i32:
835; AVX2-FAST-PERLANE:       # %bb.0:
836; AVX2-FAST-PERLANE-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
837; AVX2-FAST-PERLANE-NEXT:    vextracti128 $1, %ymm0, %xmm1
838; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
839; AVX2-FAST-PERLANE-NEXT:    vzeroupper
840; AVX2-FAST-PERLANE-NEXT:    retq
841;
842; AVX512-LABEL: trunc_sub_v4i64_v4i32:
843; AVX512:       # %bb.0:
844; AVX512-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
845; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
846; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
847; AVX512-NEXT:    vzeroupper
848; AVX512-NEXT:    retq
849  %1 = sub <4 x i64> %a0, %a1
850  %2 = trunc <4 x i64> %1 to <4 x i32>
851  ret <4 x i32> %2
852}
853
854define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
855; SSE-LABEL: trunc_sub_v8i64_v8i16:
856; SSE:       # %bb.0:
857; SSE-NEXT:    psubq %xmm6, %xmm2
858; SSE-NEXT:    psubq %xmm7, %xmm3
859; SSE-NEXT:    psubq %xmm4, %xmm0
860; SSE-NEXT:    psubq %xmm5, %xmm1
861; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
862; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
863; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
864; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
865; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
866; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
867; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
868; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
869; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
870; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
871; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
872; SSE-NEXT:    retq
873;
874; AVX1-LABEL: trunc_sub_v8i64_v8i16:
875; AVX1:       # %bb.0:
876; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm4
877; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
878; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
879; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
880; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm2
881; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
882; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
883; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm1
884; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
885; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
886; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
887; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
888; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
889; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
890; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
891; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
892; AVX1-NEXT:    vzeroupper
893; AVX1-NEXT:    retq
894;
895; AVX2-LABEL: trunc_sub_v8i64_v8i16:
896; AVX2:       # %bb.0:
897; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
898; AVX2-NEXT:    vpsubq %ymm3, %ymm1, %ymm1
899; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
900; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
901; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
902; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
903; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
904; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
905; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
906; AVX2-NEXT:    vzeroupper
907; AVX2-NEXT:    retq
908;
909; AVX512-LABEL: trunc_sub_v8i64_v8i16:
910; AVX512:       # %bb.0:
911; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
912; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
913; AVX512-NEXT:    vzeroupper
914; AVX512-NEXT:    retq
915  %1 = sub <8 x i64> %a0, %a1
916  %2 = trunc <8 x i64> %1 to <8 x i16>
917  ret <8 x i16> %2
918}
919
920define <8 x i16> @trunc_sub_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
921; SSE-LABEL: trunc_sub_v8i32_v8i16:
922; SSE:       # %bb.0:
923; SSE-NEXT:    psubd %xmm2, %xmm0
924; SSE-NEXT:    psubd %xmm3, %xmm1
925; SSE-NEXT:    pslld $16, %xmm1
926; SSE-NEXT:    psrad $16, %xmm1
927; SSE-NEXT:    pslld $16, %xmm0
928; SSE-NEXT:    psrad $16, %xmm0
929; SSE-NEXT:    packssdw %xmm1, %xmm0
930; SSE-NEXT:    retq
931;
932; AVX1-LABEL: trunc_sub_v8i32_v8i16:
933; AVX1:       # %bb.0:
934; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm2
935; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
936; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
937; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
938; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
939; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
940; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
941; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
942; AVX1-NEXT:    vzeroupper
943; AVX1-NEXT:    retq
944;
945; AVX2-LABEL: trunc_sub_v8i32_v8i16:
946; AVX2:       # %bb.0:
947; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
948; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
949; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
950; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
951; AVX2-NEXT:    vzeroupper
952; AVX2-NEXT:    retq
953;
954; AVX512-LABEL: trunc_sub_v8i32_v8i16:
955; AVX512:       # %bb.0:
956; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
957; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
958; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
959; AVX512-NEXT:    vzeroupper
960; AVX512-NEXT:    retq
961  %1 = sub <8 x i32> %a0, %a1
962  %2 = trunc <8 x i32> %1 to <8 x i16>
963  ret <8 x i16> %2
964}
965
966define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
967; SSE-LABEL: trunc_sub_v16i64_v16i8:
968; SSE:       # %bb.0:
969; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm0
970; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm1
971; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm2
972; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm3
973; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm4
974; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm5
975; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm6
976; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm7
977; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
978; SSE-NEXT:    pand %xmm8, %xmm7
979; SSE-NEXT:    pand %xmm8, %xmm6
980; SSE-NEXT:    packuswb %xmm7, %xmm6
981; SSE-NEXT:    pand %xmm8, %xmm5
982; SSE-NEXT:    pand %xmm8, %xmm4
983; SSE-NEXT:    packuswb %xmm5, %xmm4
984; SSE-NEXT:    packuswb %xmm6, %xmm4
985; SSE-NEXT:    pand %xmm8, %xmm3
986; SSE-NEXT:    pand %xmm8, %xmm2
987; SSE-NEXT:    packuswb %xmm3, %xmm2
988; SSE-NEXT:    pand %xmm8, %xmm1
989; SSE-NEXT:    pand %xmm8, %xmm0
990; SSE-NEXT:    packuswb %xmm1, %xmm0
991; SSE-NEXT:    packuswb %xmm2, %xmm0
992; SSE-NEXT:    packuswb %xmm4, %xmm0
993; SSE-NEXT:    retq
994;
995; AVX1-LABEL: trunc_sub_v16i64_v16i8:
996; AVX1:       # %bb.0:
997; AVX1-NEXT:    vpsubq %xmm4, %xmm0, %xmm8
998; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm4
999; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1000; AVX1-NEXT:    vpsubq %xmm4, %xmm0, %xmm0
1001; AVX1-NEXT:    vpsubq %xmm5, %xmm1, %xmm4
1002; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm5
1003; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1004; AVX1-NEXT:    vpsubq %xmm5, %xmm1, %xmm1
1005; AVX1-NEXT:    vpsubq %xmm6, %xmm2, %xmm5
1006; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm6
1007; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
1008; AVX1-NEXT:    vpsubq %xmm6, %xmm2, %xmm2
1009; AVX1-NEXT:    vpsubq %xmm7, %xmm3, %xmm6
1010; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm7
1011; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
1012; AVX1-NEXT:    vpsubq %xmm7, %xmm3, %xmm3
1013; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [255,255]
1014; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
1015; AVX1-NEXT:    vpand %xmm7, %xmm6, %xmm6
1016; AVX1-NEXT:    vpackusdw %xmm3, %xmm6, %xmm3
1017; AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
1018; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
1019; AVX1-NEXT:    vpackusdw %xmm2, %xmm5, %xmm2
1020; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
1021; AVX1-NEXT:    vpand %xmm7, %xmm1, %xmm1
1022; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm3
1023; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
1024; AVX1-NEXT:    vpand %xmm7, %xmm0, %xmm0
1025; AVX1-NEXT:    vpand %xmm7, %xmm8, %xmm3
1026; AVX1-NEXT:    vpackusdw %xmm0, %xmm3, %xmm0
1027; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
1028; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
1029; AVX1-NEXT:    vzeroupper
1030; AVX1-NEXT:    retq
1031;
1032; AVX2-LABEL: trunc_sub_v16i64_v16i8:
1033; AVX2:       # %bb.0:
1034; AVX2-NEXT:    vpsubq %ymm4, %ymm0, %ymm0
1035; AVX2-NEXT:    vpsubq %ymm5, %ymm1, %ymm1
1036; AVX2-NEXT:    vpsubq %ymm6, %ymm2, %ymm2
1037; AVX2-NEXT:    vpsubq %ymm7, %ymm3, %ymm3
1038; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
1039; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
1040; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
1041; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
1042; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
1043; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
1044; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
1045; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
1046; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1047; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
1048; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1049; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1050; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1051; AVX2-NEXT:    vzeroupper
1052; AVX2-NEXT:    retq
1053;
1054; AVX512-LABEL: trunc_sub_v16i64_v16i8:
1055; AVX512:       # %bb.0:
1056; AVX512-NEXT:    vpsubq %zmm2, %zmm0, %zmm0
1057; AVX512-NEXT:    vpsubq %zmm3, %zmm1, %zmm1
1058; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
1059; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
1060; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1061; AVX512-NEXT:    vzeroupper
1062; AVX512-NEXT:    retq
1063  %1 = sub <16 x i64> %a0, %a1
1064  %2 = trunc <16 x i64> %1 to <16 x i8>
1065  ret <16 x i8> %2
1066}
1067
1068define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
1069; SSE-LABEL: trunc_sub_v16i32_v16i8:
1070; SSE:       # %bb.0:
1071; SSE-NEXT:    psubd %xmm4, %xmm0
1072; SSE-NEXT:    psubd %xmm5, %xmm1
1073; SSE-NEXT:    psubd %xmm6, %xmm2
1074; SSE-NEXT:    psubd %xmm7, %xmm3
1075; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1076; SSE-NEXT:    pand %xmm4, %xmm3
1077; SSE-NEXT:    pand %xmm4, %xmm2
1078; SSE-NEXT:    packuswb %xmm3, %xmm2
1079; SSE-NEXT:    pand %xmm4, %xmm1
1080; SSE-NEXT:    pand %xmm4, %xmm0
1081; SSE-NEXT:    packuswb %xmm1, %xmm0
1082; SSE-NEXT:    packuswb %xmm2, %xmm0
1083; SSE-NEXT:    retq
1084;
1085; AVX1-LABEL: trunc_sub_v16i32_v16i8:
1086; AVX1:       # %bb.0:
1087; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm4
1088; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
1089; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1090; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
1091; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm2
1092; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
1093; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1094; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
1095; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255]
1096; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
1097; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
1098; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
1099; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
1100; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm2
1101; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
1102; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1103; AVX1-NEXT:    vzeroupper
1104; AVX1-NEXT:    retq
1105;
1106; AVX2-LABEL: trunc_sub_v16i32_v16i8:
1107; AVX2:       # %bb.0:
1108; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
1109; AVX2-NEXT:    vpsubd %ymm3, %ymm1, %ymm1
1110; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
1111; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
1112; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
1113; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
1114; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1115; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1116; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1117; AVX2-NEXT:    vzeroupper
1118; AVX2-NEXT:    retq
1119;
1120; AVX512-LABEL: trunc_sub_v16i32_v16i8:
1121; AVX512:       # %bb.0:
1122; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
1123; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
1124; AVX512-NEXT:    vzeroupper
1125; AVX512-NEXT:    retq
1126  %1 = sub <16 x i32> %a0, %a1
1127  %2 = trunc <16 x i32> %1 to <16 x i8>
1128  ret <16 x i8> %2
1129}
1130
1131define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
1132; SSE-LABEL: trunc_sub_v16i16_v16i8:
1133; SSE:       # %bb.0:
1134; SSE-NEXT:    psubw %xmm2, %xmm0
1135; SSE-NEXT:    psubw %xmm3, %xmm1
1136; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
1137; SSE-NEXT:    pand %xmm2, %xmm1
1138; SSE-NEXT:    pand %xmm2, %xmm0
1139; SSE-NEXT:    packuswb %xmm1, %xmm0
1140; SSE-NEXT:    retq
1141;
1142; AVX1-LABEL: trunc_sub_v16i16_v16i8:
1143; AVX1:       # %bb.0:
1144; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm2
1145; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1146; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1147; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
1148; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
1149; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
1150; AVX1-NEXT:    vpand %xmm1, %xmm2, %xmm1
1151; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
1152; AVX1-NEXT:    vzeroupper
1153; AVX1-NEXT:    retq
1154;
1155; AVX2-LABEL: trunc_sub_v16i16_v16i8:
1156; AVX2:       # %bb.0:
1157; AVX2-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
1158; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1159; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1160; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1161; AVX2-NEXT:    vzeroupper
1162; AVX2-NEXT:    retq
1163;
1164; AVX512F-LABEL: trunc_sub_v16i16_v16i8:
1165; AVX512F:       # %bb.0:
1166; AVX512F-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
1167; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1168; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
1169; AVX512F-NEXT:    vzeroupper
1170; AVX512F-NEXT:    retq
1171;
1172; AVX512BW-LABEL: trunc_sub_v16i16_v16i8:
1173; AVX512BW:       # %bb.0:
1174; AVX512BW-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
1175; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
1176; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1177; AVX512BW-NEXT:    vzeroupper
1178; AVX512BW-NEXT:    retq
1179;
1180; AVX512DQ-LABEL: trunc_sub_v16i16_v16i8:
1181; AVX512DQ:       # %bb.0:
1182; AVX512DQ-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
1183; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1184; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
1185; AVX512DQ-NEXT:    vzeroupper
1186; AVX512DQ-NEXT:    retq
1187  %1 = sub <16 x i16> %a0, %a1
1188  %2 = trunc <16 x i16> %1 to <16 x i8>
1189  ret <16 x i8> %2
1190}
1191
1192define <16 x i8> @trunc_ext_sub_v16i16_v16i8(<16 x i8> %x, <16 x i8> %y) {
1193; SSE-LABEL: trunc_ext_sub_v16i16_v16i8:
1194; SSE:       # %bb.0:
1195; SSE-NEXT:    psubb %xmm1, %xmm0
1196; SSE-NEXT:    retq
1197;
1198; AVX-LABEL: trunc_ext_sub_v16i16_v16i8:
1199; AVX:       # %bb.0:
1200; AVX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
1201; AVX-NEXT:    retq
1202  %a = zext <16 x i8> %x to <16 x i16>
1203  %b = zext <16 x i8> %y to <16 x i16>
1204  %c = sub <16 x i16> %a, %b
1205  %d = trunc <16 x i16> %c to <16 x i8>
1206  ret <16 x i8> %d
1207}
1208
1209;
1210; sub to constant
1211;
1212
1213define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
1214; SSE-LABEL: trunc_sub_const_v4i64_v4i32:
1215; SSE:       # %bb.0:
1216; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1217; SSE-NEXT:    psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1218; SSE-NEXT:    retq
1219;
1220; AVX1-LABEL: trunc_sub_const_v4i64_v4i32:
1221; AVX1:       # %bb.0:
1222; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1223; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1224; AVX1-NEXT:    vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1225; AVX1-NEXT:    vzeroupper
1226; AVX1-NEXT:    retq
1227;
1228; AVX2-SLOW-LABEL: trunc_sub_const_v4i64_v4i32:
1229; AVX2-SLOW:       # %bb.0:
1230; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
1231; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1232; AVX2-SLOW-NEXT:    vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1233; AVX2-SLOW-NEXT:    vzeroupper
1234; AVX2-SLOW-NEXT:    retq
1235;
1236; AVX2-FAST-ALL-LABEL: trunc_sub_const_v4i64_v4i32:
1237; AVX2-FAST-ALL:       # %bb.0:
1238; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
1239; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
1240; AVX2-FAST-ALL-NEXT:    vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1241; AVX2-FAST-ALL-NEXT:    vzeroupper
1242; AVX2-FAST-ALL-NEXT:    retq
1243;
1244; AVX2-FAST-PERLANE-LABEL: trunc_sub_const_v4i64_v4i32:
1245; AVX2-FAST-PERLANE:       # %bb.0:
1246; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
1247; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1248; AVX2-FAST-PERLANE-NEXT:    vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1249; AVX2-FAST-PERLANE-NEXT:    vzeroupper
1250; AVX2-FAST-PERLANE-NEXT:    retq
1251;
1252; AVX512-LABEL: trunc_sub_const_v4i64_v4i32:
1253; AVX512:       # %bb.0:
1254; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1255; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
1256; AVX512-NEXT:    vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1257; AVX512-NEXT:    vzeroupper
1258; AVX512-NEXT:    retq
1259  %1 = sub <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
1260  %2 = trunc <4 x i64> %1 to <4 x i32>
1261  ret <4 x i32> %2
1262}
1263
1264define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
1265; SSE-LABEL: trunc_sub_const_v8i64_v8i16:
1266; SSE:       # %bb.0:
1267; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1268; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
1269; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1270; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
1271; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
1272; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
1273; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
1274; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
1275; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
1276; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1277; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
1278; SSE-NEXT:    psubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1279; SSE-NEXT:    retq
1280;
1281; AVX1-LABEL: trunc_sub_const_v8i64_v8i16:
1282; AVX1:       # %bb.0:
1283; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
1284; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
1285; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
1286; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
1287; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
1288; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1289; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
1290; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
1291; AVX1-NEXT:    vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1292; AVX1-NEXT:    vzeroupper
1293; AVX1-NEXT:    retq
1294;
1295; AVX2-LABEL: trunc_sub_const_v8i64_v8i16:
1296; AVX2:       # %bb.0:
1297; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1298; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
1299; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
1300; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
1301; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1302; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
1303; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1304; AVX2-NEXT:    vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1305; AVX2-NEXT:    vzeroupper
1306; AVX2-NEXT:    retq
1307;
1308; AVX512-LABEL: trunc_sub_const_v8i64_v8i16:
1309; AVX512:       # %bb.0:
1310; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
1311; AVX512-NEXT:    vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1312; AVX512-NEXT:    vzeroupper
1313; AVX512-NEXT:    retq
1314  %1 = sub <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
1315  %2 = trunc <8 x i64> %1 to <8 x i16>
1316  ret <8 x i16> %2
1317}
1318
1319define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
1320; SSE-LABEL: trunc_sub_const_v8i32_v8i16:
1321; SSE:       # %bb.0:
1322; SSE-NEXT:    pslld $16, %xmm1
1323; SSE-NEXT:    psrad $16, %xmm1
1324; SSE-NEXT:    pslld $16, %xmm0
1325; SSE-NEXT:    psrad $16, %xmm0
1326; SSE-NEXT:    packssdw %xmm1, %xmm0
1327; SSE-NEXT:    psubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1328; SSE-NEXT:    retq
1329;
1330; AVX1-LABEL: trunc_sub_const_v8i32_v8i16:
1331; AVX1:       # %bb.0:
1332; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1333; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
1334; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1335; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1336; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1337; AVX1-NEXT:    vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1338; AVX1-NEXT:    vzeroupper
1339; AVX1-NEXT:    retq
1340;
1341; AVX2-LABEL: trunc_sub_const_v8i32_v8i16:
1342; AVX2:       # %bb.0:
1343; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
1344; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1345; AVX2-NEXT:    vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1346; AVX2-NEXT:    vzeroupper
1347; AVX2-NEXT:    retq
1348;
1349; AVX512-LABEL: trunc_sub_const_v8i32_v8i16:
1350; AVX512:       # %bb.0:
1351; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1352; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
1353; AVX512-NEXT:    vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1354; AVX512-NEXT:    vzeroupper
1355; AVX512-NEXT:    retq
1356  %1 = sub <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1357  %2 = trunc <8 x i32> %1 to <8 x i16>
1358  ret <8 x i16> %2
1359}
1360
1361define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
1362; SSE-LABEL: trunc_sub_const_v16i64_v16i8:
1363; SSE:       # %bb.0:
1364; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
1365; SSE-NEXT:    pand %xmm8, %xmm7
1366; SSE-NEXT:    pand %xmm8, %xmm6
1367; SSE-NEXT:    packuswb %xmm7, %xmm6
1368; SSE-NEXT:    pand %xmm8, %xmm5
1369; SSE-NEXT:    pand %xmm8, %xmm4
1370; SSE-NEXT:    packuswb %xmm5, %xmm4
1371; SSE-NEXT:    packuswb %xmm6, %xmm4
1372; SSE-NEXT:    pand %xmm8, %xmm3
1373; SSE-NEXT:    pand %xmm8, %xmm2
1374; SSE-NEXT:    packuswb %xmm3, %xmm2
1375; SSE-NEXT:    pand %xmm8, %xmm1
1376; SSE-NEXT:    pand %xmm8, %xmm0
1377; SSE-NEXT:    packuswb %xmm1, %xmm0
1378; SSE-NEXT:    packuswb %xmm2, %xmm0
1379; SSE-NEXT:    packuswb %xmm4, %xmm0
1380; SSE-NEXT:    psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1381; SSE-NEXT:    retq
1382;
1383; AVX1-LABEL: trunc_sub_const_v16i64_v16i8:
1384; AVX1:       # %bb.0:
1385; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255]
1386; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
1387; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
1388; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
1389; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
1390; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
1391; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
1392; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
1393; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
1394; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
1395; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
1396; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
1397; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
1398; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
1399; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
1400; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
1401; AVX1-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1402; AVX1-NEXT:    vzeroupper
1403; AVX1-NEXT:    retq
1404;
1405; AVX2-LABEL: trunc_sub_const_v16i64_v16i8:
1406; AVX2:       # %bb.0:
1407; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
1408; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
1409; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
1410; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
1411; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
1412; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
1413; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
1414; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
1415; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1416; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
1417; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1418; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1419; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1420; AVX2-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1421; AVX2-NEXT:    vzeroupper
1422; AVX2-NEXT:    retq
1423;
1424; AVX512-LABEL: trunc_sub_const_v16i64_v16i8:
1425; AVX512:       # %bb.0:
1426; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
1427; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
1428; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1429; AVX512-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1430; AVX512-NEXT:    vzeroupper
1431; AVX512-NEXT:    retq
1432  %1 = sub <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
1433  %2 = trunc <16 x i64> %1 to <16 x i8>
1434  ret <16 x i8> %2
1435}
1436
1437define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
1438; SSE-LABEL: trunc_sub_const_v16i32_v16i8:
1439; SSE:       # %bb.0:
1440; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1441; SSE-NEXT:    pand %xmm4, %xmm3
1442; SSE-NEXT:    pand %xmm4, %xmm2
1443; SSE-NEXT:    packuswb %xmm3, %xmm2
1444; SSE-NEXT:    pand %xmm4, %xmm1
1445; SSE-NEXT:    pand %xmm4, %xmm0
1446; SSE-NEXT:    packuswb %xmm1, %xmm0
1447; SSE-NEXT:    packuswb %xmm2, %xmm0
1448; SSE-NEXT:    psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1449; SSE-NEXT:    retq
1450;
1451; AVX1-LABEL: trunc_sub_const_v16i32_v16i8:
1452; AVX1:       # %bb.0:
1453; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
1454; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
1455; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
1456; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
1457; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
1458; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1459; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
1460; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1461; AVX1-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1462; AVX1-NEXT:    vzeroupper
1463; AVX1-NEXT:    retq
1464;
1465; AVX2-LABEL: trunc_sub_const_v16i32_v16i8:
1466; AVX2:       # %bb.0:
1467; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
1468; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
1469; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
1470; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
1471; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1472; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1473; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1474; AVX2-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1475; AVX2-NEXT:    vzeroupper
1476; AVX2-NEXT:    retq
1477;
1478; AVX512-LABEL: trunc_sub_const_v16i32_v16i8:
1479; AVX512:       # %bb.0:
1480; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
1481; AVX512-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1482; AVX512-NEXT:    vzeroupper
1483; AVX512-NEXT:    retq
1484  %1 = sub <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1485  %2 = trunc <16 x i32> %1 to <16 x i8>
1486  ret <16 x i8> %2
1487}
1488
1489define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
1490; SSE-LABEL: trunc_sub_const_v16i16_v16i8:
1491; SSE:       # %bb.0:
1492; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
1493; SSE-NEXT:    pand %xmm2, %xmm1
1494; SSE-NEXT:    pand %xmm2, %xmm0
1495; SSE-NEXT:    packuswb %xmm1, %xmm0
1496; SSE-NEXT:    psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1497; SSE-NEXT:    retq
1498;
1499; AVX1-LABEL: trunc_sub_const_v16i16_v16i8:
1500; AVX1:       # %bb.0:
1501; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1502; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1503; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1504; AVX1-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1505; AVX1-NEXT:    vzeroupper
1506; AVX1-NEXT:    retq
1507;
1508; AVX2-LABEL: trunc_sub_const_v16i16_v16i8:
1509; AVX2:       # %bb.0:
1510; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1511; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1512; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1513; AVX2-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1514; AVX2-NEXT:    vzeroupper
1515; AVX2-NEXT:    retq
1516;
1517; AVX512F-LABEL: trunc_sub_const_v16i16_v16i8:
1518; AVX512F:       # %bb.0:
1519; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1520; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
1521; AVX512F-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1522; AVX512F-NEXT:    vzeroupper
1523; AVX512F-NEXT:    retq
1524;
1525; AVX512BW-LABEL: trunc_sub_const_v16i16_v16i8:
1526; AVX512BW:       # %bb.0:
1527; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1528; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
1529; AVX512BW-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1530; AVX512BW-NEXT:    vzeroupper
1531; AVX512BW-NEXT:    retq
1532;
1533; AVX512DQ-LABEL: trunc_sub_const_v16i16_v16i8:
1534; AVX512DQ:       # %bb.0:
1535; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1536; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
1537; AVX512DQ-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1538; AVX512DQ-NEXT:    vzeroupper
1539; AVX512DQ-NEXT:    retq
1540  %1 = sub <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
1541  %2 = trunc <16 x i16> %1 to <16 x i8>
1542  ret <16 x i8> %2
1543}
1544
1545define <16 x i8> @trunc_ext_sub_const_rhs_v16i16_v16i8(<16 x i8> %x) {
1546; SSE-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
1547; SSE:       # %bb.0:
1548; SSE-NEXT:    psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1549; SSE-NEXT:    retq
1550;
1551; AVX-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
1552; AVX:       # %bb.0:
1553; AVX-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1554; AVX-NEXT:    retq
1555  %a = zext <16 x i8> %x to <16 x i16>
1556  %b = sub <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
1557  %c = trunc <16 x i16> %b to <16 x i8>
1558  ret <16 x i8> %c
1559}
1560
1561define <16 x i8> @trunc_ext_sub_const_lhs_v16i16_v16i8(<16 x i8> %x) {
1562; SSE-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
1563; SSE:       # %bb.0:
1564; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1565; SSE-NEXT:    psubb %xmm0, %xmm1
1566; SSE-NEXT:    movdqa %xmm1, %xmm0
1567; SSE-NEXT:    retq
1568;
1569; AVX-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
1570; AVX:       # %bb.0:
1571; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1572; AVX-NEXT:    vpsubb %xmm0, %xmm1, %xmm0
1573; AVX-NEXT:    retq
1574  %a = zext <16 x i8> %x to <16 x i16>
1575  %b = sub <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %a
1576  %c = trunc <16 x i16> %b to <16 x i8>
1577  ret <16 x i8> %c
1578}
1579
1580;
1581; mul
1582;
1583
1584define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
1585; SSE-LABEL: trunc_mul_v4i64_v4i32:
1586; SSE:       # %bb.0:
1587; SSE-NEXT:    pmuludq %xmm3, %xmm1
1588; SSE-NEXT:    pmuludq %xmm2, %xmm0
1589; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1590; SSE-NEXT:    retq
1591;
1592; AVX1-LABEL: trunc_mul_v4i64_v4i32:
1593; AVX1:       # %bb.0:
1594; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1595; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
1596; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1597; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
1598; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
1599; AVX1-NEXT:    vzeroupper
1600; AVX1-NEXT:    retq
1601;
1602; AVX2-SLOW-LABEL: trunc_mul_v4i64_v4i32:
1603; AVX2-SLOW:       # %bb.0:
1604; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
1605; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
1606; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm2
1607; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
1608; AVX2-SLOW-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
1609; AVX2-SLOW-NEXT:    vzeroupper
1610; AVX2-SLOW-NEXT:    retq
1611;
1612; AVX2-FAST-ALL-LABEL: trunc_mul_v4i64_v4i32:
1613; AVX2-FAST-ALL:       # %bb.0:
1614; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
1615; AVX2-FAST-ALL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
1616; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm2, %ymm0
1617; AVX2-FAST-ALL-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
1618; AVX2-FAST-ALL-NEXT:    vzeroupper
1619; AVX2-FAST-ALL-NEXT:    retq
1620;
1621; AVX2-FAST-PERLANE-LABEL: trunc_mul_v4i64_v4i32:
1622; AVX2-FAST-PERLANE:       # %bb.0:
1623; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm1, %xmm2
1624; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
1625; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm2
1626; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
1627; AVX2-FAST-PERLANE-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
1628; AVX2-FAST-PERLANE-NEXT:    vzeroupper
1629; AVX2-FAST-PERLANE-NEXT:    retq
1630;
1631; AVX512F-LABEL: trunc_mul_v4i64_v4i32:
1632; AVX512F:       # %bb.0:
1633; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
1634; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1635; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
1636; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
1637; AVX512F-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
1638; AVX512F-NEXT:    vzeroupper
1639; AVX512F-NEXT:    retq
1640;
1641; AVX512BW-LABEL: trunc_mul_v4i64_v4i32:
1642; AVX512BW:       # %bb.0:
1643; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
1644; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1645; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
1646; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
1647; AVX512BW-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
1648; AVX512BW-NEXT:    vzeroupper
1649; AVX512BW-NEXT:    retq
1650;
1651; AVX512DQ-LABEL: trunc_mul_v4i64_v4i32:
1652; AVX512DQ:       # %bb.0:
1653; AVX512DQ-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
1654; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1655; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
1656; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
1657; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1658; AVX512DQ-NEXT:    vzeroupper
1659; AVX512DQ-NEXT:    retq
1660  %1 = mul <4 x i64> %a0, %a1
1661  %2 = trunc <4 x i64> %1 to <4 x i32>
1662  ret <4 x i32> %2
1663}
1664
1665define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
1666; SSE-LABEL: trunc_mul_v8i64_v8i16:
1667; SSE:       # %bb.0:
1668; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
1669; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
1670; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
1671; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
1672; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
1673; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3]
1674; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,1,0,2,4,5,6,7]
1675; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
1676; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,2,4,5,6,7]
1677; SSE-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
1678; SSE-NEXT:    movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1]
1679; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1680; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
1681; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1682; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
1683; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
1684; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
1685; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
1686; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
1687; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
1688; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1689; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
1690; SSE-NEXT:    pmullw %xmm6, %xmm0
1691; SSE-NEXT:    retq
1692;
1693; AVX1-LABEL: trunc_mul_v8i64_v8i16:
1694; AVX1:       # %bb.0:
1695; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535]
1696; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
1697; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
1698; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
1699; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
1700; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
1701; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
1702; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
1703; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
1704; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
1705; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
1706; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
1707; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
1708; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
1709; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
1710; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
1711; AVX1-NEXT:    vzeroupper
1712; AVX1-NEXT:    retq
1713;
1714; AVX2-LABEL: trunc_mul_v8i64_v8i16:
1715; AVX2:       # %bb.0:
1716; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
1717; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7],ymm3[8],ymm4[9,10,11],ymm3[12],ymm4[13,14,15]
1718; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7],ymm2[8],ymm4[9,10,11],ymm2[12],ymm4[13,14,15]
1719; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
1720; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
1721; AVX2-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
1722; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1,2,3],ymm1[4],ymm4[5,6,7],ymm1[8],ymm4[9,10,11],ymm1[12],ymm4[13,14,15]
1723; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7],ymm0[8],ymm4[9,10,11],ymm0[12],ymm4[13,14,15]
1724; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
1725; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1726; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
1727; AVX2-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
1728; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1729; AVX2-NEXT:    vzeroupper
1730; AVX2-NEXT:    retq
1731;
1732; AVX512F-LABEL: trunc_mul_v8i64_v8i16:
1733; AVX512F:       # %bb.0:
1734; AVX512F-NEXT:    vpmovqw %zmm1, %xmm1
1735; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
1736; AVX512F-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1737; AVX512F-NEXT:    vzeroupper
1738; AVX512F-NEXT:    retq
1739;
1740; AVX512BW-LABEL: trunc_mul_v8i64_v8i16:
1741; AVX512BW:       # %bb.0:
1742; AVX512BW-NEXT:    vpmovqw %zmm1, %xmm1
1743; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
1744; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1745; AVX512BW-NEXT:    vzeroupper
1746; AVX512BW-NEXT:    retq
1747;
1748; AVX512DQ-LABEL: trunc_mul_v8i64_v8i16:
1749; AVX512DQ:       # %bb.0:
1750; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
1751; AVX512DQ-NEXT:    vpmovqw %zmm0, %xmm0
1752; AVX512DQ-NEXT:    vzeroupper
1753; AVX512DQ-NEXT:    retq
1754  %1 = mul <8 x i64> %a0, %a1
1755  %2 = trunc <8 x i64> %1 to <8 x i16>
1756  ret <8 x i16> %2
1757}
1758
1759define <8 x i16> @trunc_mul_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
1760; SSE-LABEL: trunc_mul_v8i32_v8i16:
1761; SSE:       # %bb.0:
1762; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
1763; SSE-NEXT:    pmuludq %xmm2, %xmm0
1764; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1765; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1766; SSE-NEXT:    pmuludq %xmm4, %xmm2
1767; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1768; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1769; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
1770; SSE-NEXT:    pmuludq %xmm3, %xmm1
1771; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1772; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
1773; SSE-NEXT:    pmuludq %xmm2, %xmm3
1774; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
1775; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1776; SSE-NEXT:    pslld $16, %xmm1
1777; SSE-NEXT:    psrad $16, %xmm1
1778; SSE-NEXT:    pslld $16, %xmm0
1779; SSE-NEXT:    psrad $16, %xmm0
1780; SSE-NEXT:    packssdw %xmm1, %xmm0
1781; SSE-NEXT:    retq
1782;
1783; AVX1-LABEL: trunc_mul_v8i32_v8i16:
1784; AVX1:       # %bb.0:
1785; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm2
1786; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1787; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1788; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
1789; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
1790; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
1791; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
1792; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1793; AVX1-NEXT:    vzeroupper
1794; AVX1-NEXT:    retq
1795;
1796; AVX2-LABEL: trunc_mul_v8i32_v8i16:
1797; AVX2:       # %bb.0:
1798; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
1799; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
1800; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1801; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1802; AVX2-NEXT:    vzeroupper
1803; AVX2-NEXT:    retq
1804;
1805; AVX512-LABEL: trunc_mul_v8i32_v8i16:
1806; AVX512:       # %bb.0:
1807; AVX512-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
1808; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
1809; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1810; AVX512-NEXT:    vzeroupper
1811; AVX512-NEXT:    retq
1812  %1 = mul <8 x i32> %a0, %a1
1813  %2 = trunc <8 x i32> %1 to <8 x i16>
1814  ret <8 x i16> %2
1815}
1816
1817define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
1818; SSE-LABEL: trunc_mul_v16i64_v16i8:
1819; SSE:       # %bb.0:
1820; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm0
1821; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm1
1822; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm2
1823; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm3
1824; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm4
1825; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm5
1826; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm6
1827; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm7
1828; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
1829; SSE-NEXT:    pand %xmm8, %xmm7
1830; SSE-NEXT:    pand %xmm8, %xmm6
1831; SSE-NEXT:    packuswb %xmm7, %xmm6
1832; SSE-NEXT:    pand %xmm8, %xmm5
1833; SSE-NEXT:    pand %xmm8, %xmm4
1834; SSE-NEXT:    packuswb %xmm5, %xmm4
1835; SSE-NEXT:    packuswb %xmm6, %xmm4
1836; SSE-NEXT:    pand %xmm8, %xmm3
1837; SSE-NEXT:    pand %xmm8, %xmm2
1838; SSE-NEXT:    packuswb %xmm3, %xmm2
1839; SSE-NEXT:    pand %xmm8, %xmm1
1840; SSE-NEXT:    pand %xmm8, %xmm0
1841; SSE-NEXT:    packuswb %xmm1, %xmm0
1842; SSE-NEXT:    packuswb %xmm2, %xmm0
1843; SSE-NEXT:    packuswb %xmm4, %xmm0
1844; SSE-NEXT:    retq
1845;
1846; AVX1-LABEL: trunc_mul_v16i64_v16i8:
1847; AVX1:       # %bb.0:
1848; AVX1-NEXT:    vpmuludq %xmm4, %xmm0, %xmm8
1849; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm4
1850; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1851; AVX1-NEXT:    vpmuludq %xmm4, %xmm0, %xmm0
1852; AVX1-NEXT:    vpmuludq %xmm5, %xmm1, %xmm4
1853; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm5
1854; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1855; AVX1-NEXT:    vpmuludq %xmm5, %xmm1, %xmm1
1856; AVX1-NEXT:    vpmuludq %xmm6, %xmm2, %xmm5
1857; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm6
1858; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
1859; AVX1-NEXT:    vpmuludq %xmm6, %xmm2, %xmm2
1860; AVX1-NEXT:    vpmuludq %xmm7, %xmm3, %xmm6
1861; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm7
1862; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
1863; AVX1-NEXT:    vpmuludq %xmm7, %xmm3, %xmm3
1864; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [255,255]
1865; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
1866; AVX1-NEXT:    vpand %xmm7, %xmm6, %xmm6
1867; AVX1-NEXT:    vpackusdw %xmm3, %xmm6, %xmm3
1868; AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
1869; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
1870; AVX1-NEXT:    vpackusdw %xmm2, %xmm5, %xmm2
1871; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
1872; AVX1-NEXT:    vpand %xmm7, %xmm1, %xmm1
1873; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm3
1874; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
1875; AVX1-NEXT:    vpand %xmm7, %xmm0, %xmm0
1876; AVX1-NEXT:    vpand %xmm7, %xmm8, %xmm3
1877; AVX1-NEXT:    vpackusdw %xmm0, %xmm3, %xmm0
1878; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
1879; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
1880; AVX1-NEXT:    vzeroupper
1881; AVX1-NEXT:    retq
1882;
1883; AVX2-LABEL: trunc_mul_v16i64_v16i8:
1884; AVX2:       # %bb.0:
1885; AVX2-NEXT:    vpmuludq %ymm4, %ymm0, %ymm0
1886; AVX2-NEXT:    vpmuludq %ymm5, %ymm1, %ymm1
1887; AVX2-NEXT:    vpmuludq %ymm6, %ymm2, %ymm2
1888; AVX2-NEXT:    vpmuludq %ymm7, %ymm3, %ymm3
1889; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
1890; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
1891; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
1892; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
1893; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
1894; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
1895; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
1896; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
1897; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1898; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
1899; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1900; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1901; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1902; AVX2-NEXT:    vzeroupper
1903; AVX2-NEXT:    retq
1904;
1905; AVX512F-LABEL: trunc_mul_v16i64_v16i8:
1906; AVX512F:       # %bb.0:
1907; AVX512F-NEXT:    vpmuludq %zmm2, %zmm0, %zmm0
1908; AVX512F-NEXT:    vpmuludq %zmm3, %zmm1, %zmm1
1909; AVX512F-NEXT:    vpmovqb %zmm1, %xmm1
1910; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
1911; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1912; AVX512F-NEXT:    vzeroupper
1913; AVX512F-NEXT:    retq
1914;
1915; AVX512BW-LABEL: trunc_mul_v16i64_v16i8:
1916; AVX512BW:       # %bb.0:
1917; AVX512BW-NEXT:    vpmuludq %zmm2, %zmm0, %zmm0
1918; AVX512BW-NEXT:    vpmuludq %zmm3, %zmm1, %zmm1
1919; AVX512BW-NEXT:    vpmovqb %zmm1, %xmm1
1920; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
1921; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1922; AVX512BW-NEXT:    vzeroupper
1923; AVX512BW-NEXT:    retq
1924;
1925; AVX512DQ-LABEL: trunc_mul_v16i64_v16i8:
1926; AVX512DQ:       # %bb.0:
1927; AVX512DQ-NEXT:    vpmullq %zmm2, %zmm0, %zmm0
1928; AVX512DQ-NEXT:    vpmullq %zmm3, %zmm1, %zmm1
1929; AVX512DQ-NEXT:    vpmovqb %zmm1, %xmm1
1930; AVX512DQ-NEXT:    vpmovqb %zmm0, %xmm0
1931; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1932; AVX512DQ-NEXT:    vzeroupper
1933; AVX512DQ-NEXT:    retq
1934  %1 = mul <16 x i64> %a0, %a1
1935  %2 = trunc <16 x i64> %1 to <16 x i8>
1936  ret <16 x i8> %2
1937}
1938
1939define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
1940; SSE-LABEL: trunc_mul_v16i32_v16i8:
1941; SSE:       # %bb.0:
1942; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
1943; SSE-NEXT:    pmuludq %xmm4, %xmm0
1944; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1945; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
1946; SSE-NEXT:    pmuludq %xmm8, %xmm4
1947; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
1948; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
1949; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
1950; SSE-NEXT:    pmuludq %xmm5, %xmm1
1951; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1952; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
1953; SSE-NEXT:    pmuludq %xmm4, %xmm5
1954; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
1955; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
1956; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
1957; SSE-NEXT:    pmuludq %xmm6, %xmm2
1958; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1959; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
1960; SSE-NEXT:    pmuludq %xmm4, %xmm5
1961; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
1962; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
1963; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
1964; SSE-NEXT:    pmuludq %xmm7, %xmm3
1965; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
1966; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
1967; SSE-NEXT:    pmuludq %xmm4, %xmm5
1968; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
1969; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
1970; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1971; SSE-NEXT:    pand %xmm4, %xmm3
1972; SSE-NEXT:    pand %xmm4, %xmm2
1973; SSE-NEXT:    packuswb %xmm3, %xmm2
1974; SSE-NEXT:    pand %xmm4, %xmm1
1975; SSE-NEXT:    pand %xmm4, %xmm0
1976; SSE-NEXT:    packuswb %xmm1, %xmm0
1977; SSE-NEXT:    packuswb %xmm2, %xmm0
1978; SSE-NEXT:    retq
1979;
1980; AVX1-LABEL: trunc_mul_v16i32_v16i8:
1981; AVX1:       # %bb.0:
1982; AVX1-NEXT:    vpmulld %xmm2, %xmm0, %xmm4
1983; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
1984; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1985; AVX1-NEXT:    vpmulld %xmm2, %xmm0, %xmm0
1986; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm2
1987; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
1988; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1989; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm1
1990; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255]
1991; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
1992; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
1993; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
1994; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
1995; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm2
1996; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
1997; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1998; AVX1-NEXT:    vzeroupper
1999; AVX1-NEXT:    retq
2000;
2001; AVX2-LABEL: trunc_mul_v16i32_v16i8:
2002; AVX2:       # %bb.0:
2003; AVX2-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
2004; AVX2-NEXT:    vpmulld %ymm3, %ymm1, %ymm1
2005; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
2006; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
2007; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
2008; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
2009; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2010; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2011; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2012; AVX2-NEXT:    vzeroupper
2013; AVX2-NEXT:    retq
2014;
2015; AVX512-LABEL: trunc_mul_v16i32_v16i8:
2016; AVX512:       # %bb.0:
2017; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
2018; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
2019; AVX512-NEXT:    vzeroupper
2020; AVX512-NEXT:    retq
2021  %1 = mul <16 x i32> %a0, %a1
2022  %2 = trunc <16 x i32> %1 to <16 x i8>
2023  ret <16 x i8> %2
2024}
2025
2026define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
2027; SSE-LABEL: trunc_mul_v16i16_v16i8:
2028; SSE:       # %bb.0:
2029; SSE-NEXT:    pmullw %xmm2, %xmm0
2030; SSE-NEXT:    pmullw %xmm3, %xmm1
2031; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
2032; SSE-NEXT:    pand %xmm2, %xmm1
2033; SSE-NEXT:    pand %xmm2, %xmm0
2034; SSE-NEXT:    packuswb %xmm1, %xmm0
2035; SSE-NEXT:    retq
2036;
2037; AVX1-LABEL: trunc_mul_v16i16_v16i8:
2038; AVX1:       # %bb.0:
2039; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm2
2040; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
2041; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2042; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2043; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
2044; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
2045; AVX1-NEXT:    vpand %xmm1, %xmm2, %xmm1
2046; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
2047; AVX1-NEXT:    vzeroupper
2048; AVX1-NEXT:    retq
2049;
2050; AVX2-LABEL: trunc_mul_v16i16_v16i8:
2051; AVX2:       # %bb.0:
2052; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
2053; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2054; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2055; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2056; AVX2-NEXT:    vzeroupper
2057; AVX2-NEXT:    retq
2058;
2059; AVX512F-LABEL: trunc_mul_v16i16_v16i8:
2060; AVX512F:       # %bb.0:
2061; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
2062; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2063; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
2064; AVX512F-NEXT:    vzeroupper
2065; AVX512F-NEXT:    retq
2066;
2067; AVX512BW-LABEL: trunc_mul_v16i16_v16i8:
2068; AVX512BW:       # %bb.0:
2069; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
2070; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
2071; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2072; AVX512BW-NEXT:    vzeroupper
2073; AVX512BW-NEXT:    retq
2074;
2075; AVX512DQ-LABEL: trunc_mul_v16i16_v16i8:
2076; AVX512DQ:       # %bb.0:
2077; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
2078; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2079; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
2080; AVX512DQ-NEXT:    vzeroupper
2081; AVX512DQ-NEXT:    retq
2082  %1 = mul <16 x i16> %a0, %a1
2083  %2 = trunc <16 x i16> %1 to <16 x i8>
2084  ret <16 x i8> %2
2085}
2086
2087define <8 x i16> @trunc_mul_v8i32_v8i16_zext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
2088; SSE-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
2089; SSE:       # %bb.0:
2090; SSE-NEXT:    pxor %xmm3, %xmm3
2091; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2092; SSE-NEXT:    pslld $16, %xmm2
2093; SSE-NEXT:    psrad $16, %xmm2
2094; SSE-NEXT:    pslld $16, %xmm1
2095; SSE-NEXT:    psrad $16, %xmm1
2096; SSE-NEXT:    packssdw %xmm2, %xmm1
2097; SSE-NEXT:    pmullw %xmm1, %xmm0
2098; SSE-NEXT:    retq
2099;
2100; AVX1-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
2101; AVX1:       # %bb.0:
2102; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2103; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
2104; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2105; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
2106; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
2107; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2108; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2109; AVX1-NEXT:    vzeroupper
2110; AVX1-NEXT:    retq
2111;
2112; AVX2-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
2113; AVX2:       # %bb.0:
2114; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2115; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
2116; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
2117; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2118; AVX2-NEXT:    vzeroupper
2119; AVX2-NEXT:    retq
2120;
2121; AVX512-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
2122; AVX512:       # %bb.0:
2123; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
2124; AVX512-NEXT:    vpmovdw %zmm1, %ymm1
2125; AVX512-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2126; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2127; AVX512-NEXT:    vzeroupper
2128; AVX512-NEXT:    retq
2129  %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2130  %2 = zext <8 x i8> %1 to <8 x i32>
2131  %3 = mul <8 x i32> %2, %a1
2132  %4 = trunc <8 x i32> %3 to <8 x i16>
2133  ret <8 x i16> %4
2134}
2135
2136;
2137; mul to constant
2138;
2139
2140define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
2141; SSE-LABEL: trunc_mul_const_v4i64_v4i32:
2142; SSE:       # %bb.0:
2143; SSE-NEXT:    xorps %xmm2, %xmm2
2144; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
2145; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2146; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
2147; SSE-NEXT:    movaps %xmm2, %xmm0
2148; SSE-NEXT:    retq
2149;
2150; AVX1-LABEL: trunc_mul_const_v4i64_v4i32:
2151; AVX1:       # %bb.0:
2152; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2153; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2154; AVX1-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2155; AVX1-NEXT:    vzeroupper
2156; AVX1-NEXT:    retq
2157;
2158; AVX2-SLOW-LABEL: trunc_mul_const_v4i64_v4i32:
2159; AVX2-SLOW:       # %bb.0:
2160; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
2161; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2162; AVX2-SLOW-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2163; AVX2-SLOW-NEXT:    vzeroupper
2164; AVX2-SLOW-NEXT:    retq
2165;
2166; AVX2-FAST-ALL-LABEL: trunc_mul_const_v4i64_v4i32:
2167; AVX2-FAST-ALL:       # %bb.0:
2168; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
2169; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
2170; AVX2-FAST-ALL-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2171; AVX2-FAST-ALL-NEXT:    vzeroupper
2172; AVX2-FAST-ALL-NEXT:    retq
2173;
2174; AVX2-FAST-PERLANE-LABEL: trunc_mul_const_v4i64_v4i32:
2175; AVX2-FAST-PERLANE:       # %bb.0:
2176; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
2177; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2178; AVX2-FAST-PERLANE-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2179; AVX2-FAST-PERLANE-NEXT:    vzeroupper
2180; AVX2-FAST-PERLANE-NEXT:    retq
2181;
2182; AVX512-LABEL: trunc_mul_const_v4i64_v4i32:
2183; AVX512:       # %bb.0:
2184; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
2185; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
2186; AVX512-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2187; AVX512-NEXT:    vzeroupper
2188; AVX512-NEXT:    retq
2189  %1 = mul <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
2190  %2 = trunc <4 x i64> %1 to <4 x i32>
2191  ret <4 x i32> %2
2192}
2193
2194define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
2195; SSE-LABEL: trunc_mul_const_v8i64_v8i16:
2196; SSE:       # %bb.0:
2197; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2198; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
2199; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2200; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
2201; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
2202; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
2203; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
2204; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
2205; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
2206; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2207; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
2208; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2209; SSE-NEXT:    retq
2210;
2211; AVX1-LABEL: trunc_mul_const_v8i64_v8i16:
2212; AVX1:       # %bb.0:
2213; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
2214; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
2215; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
2216; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
2217; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
2218; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2219; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
2220; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2221; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2222; AVX1-NEXT:    vzeroupper
2223; AVX1-NEXT:    retq
2224;
2225; AVX2-LABEL: trunc_mul_const_v8i64_v8i16:
2226; AVX2:       # %bb.0:
2227; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
2228; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
2229; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
2230; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
2231; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2232; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2233; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2234; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2235; AVX2-NEXT:    vzeroupper
2236; AVX2-NEXT:    retq
2237;
2238; AVX512-LABEL: trunc_mul_const_v8i64_v8i16:
2239; AVX512:       # %bb.0:
2240; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
2241; AVX512-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2242; AVX512-NEXT:    vzeroupper
2243; AVX512-NEXT:    retq
2244  %1 = mul <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
2245  %2 = trunc <8 x i64> %1 to <8 x i16>
2246  ret <8 x i16> %2
2247}
2248
2249define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
2250; SSE-LABEL: trunc_mul_const_v8i32_v8i16:
2251; SSE:       # %bb.0:
2252; SSE-NEXT:    pslld $16, %xmm1
2253; SSE-NEXT:    psrad $16, %xmm1
2254; SSE-NEXT:    pslld $16, %xmm0
2255; SSE-NEXT:    psrad $16, %xmm0
2256; SSE-NEXT:    packssdw %xmm1, %xmm0
2257; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2258; SSE-NEXT:    retq
2259;
2260; AVX1-LABEL: trunc_mul_const_v8i32_v8i16:
2261; AVX1:       # %bb.0:
2262; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2263; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
2264; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
2265; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
2266; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2267; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2268; AVX1-NEXT:    vzeroupper
2269; AVX1-NEXT:    retq
2270;
2271; AVX2-LABEL: trunc_mul_const_v8i32_v8i16:
2272; AVX2:       # %bb.0:
2273; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
2274; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2275; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2276; AVX2-NEXT:    vzeroupper
2277; AVX2-NEXT:    retq
2278;
2279; AVX512-LABEL: trunc_mul_const_v8i32_v8i16:
2280; AVX512:       # %bb.0:
2281; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
2282; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
2283; AVX512-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2284; AVX512-NEXT:    vzeroupper
2285; AVX512-NEXT:    retq
2286  %1 = mul <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2287  %2 = trunc <8 x i32> %1 to <8 x i16>
2288  ret <8 x i16> %2
2289}
2290
2291define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
2292; SSE-LABEL: trunc_mul_const_v16i64_v16i8:
2293; SSE:       # %bb.0:
2294; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2295; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
2296; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
2297; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
2298; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
2299; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
2300; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7
2301; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
2302; SSE-NEXT:    pand %xmm8, %xmm7
2303; SSE-NEXT:    pand %xmm8, %xmm6
2304; SSE-NEXT:    packuswb %xmm7, %xmm6
2305; SSE-NEXT:    pand %xmm8, %xmm5
2306; SSE-NEXT:    pand %xmm8, %xmm4
2307; SSE-NEXT:    packuswb %xmm5, %xmm4
2308; SSE-NEXT:    packuswb %xmm6, %xmm4
2309; SSE-NEXT:    pand %xmm8, %xmm3
2310; SSE-NEXT:    pand %xmm8, %xmm2
2311; SSE-NEXT:    packuswb %xmm3, %xmm2
2312; SSE-NEXT:    pand %xmm8, %xmm1
2313; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2314; SSE-NEXT:    packuswb %xmm1, %xmm0
2315; SSE-NEXT:    packuswb %xmm2, %xmm0
2316; SSE-NEXT:    packuswb %xmm4, %xmm0
2317; SSE-NEXT:    retq
2318;
2319; AVX1-LABEL: trunc_mul_const_v16i64_v16i8:
2320; AVX1:       # %bb.0:
2321; AVX1-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm8
2322; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2323; AVX1-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2324; AVX1-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm5
2325; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
2326; AVX1-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2327; AVX1-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm6
2328; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
2329; AVX1-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
2330; AVX1-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm7
2331; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
2332; AVX1-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
2333; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255]
2334; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
2335; AVX1-NEXT:    vpand %xmm4, %xmm7, %xmm7
2336; AVX1-NEXT:    vpackusdw %xmm3, %xmm7, %xmm3
2337; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
2338; AVX1-NEXT:    vpand %xmm4, %xmm6, %xmm6
2339; AVX1-NEXT:    vpackusdw %xmm2, %xmm6, %xmm2
2340; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
2341; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
2342; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm3
2343; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
2344; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
2345; AVX1-NEXT:    vpand %xmm4, %xmm8, %xmm3
2346; AVX1-NEXT:    vpackusdw %xmm0, %xmm3, %xmm0
2347; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2348; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
2349; AVX1-NEXT:    vzeroupper
2350; AVX1-NEXT:    retq
2351;
2352; AVX2-LABEL: trunc_mul_const_v16i64_v16i8:
2353; AVX2:       # %bb.0:
2354; AVX2-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2355; AVX2-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2356; AVX2-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
2357; AVX2-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
2358; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
2359; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
2360; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
2361; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
2362; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
2363; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
2364; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
2365; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
2366; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
2367; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
2368; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2369; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2370; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2371; AVX2-NEXT:    vzeroupper
2372; AVX2-NEXT:    retq
2373;
2374; AVX512F-LABEL: trunc_mul_const_v16i64_v16i8:
2375; AVX512F:       # %bb.0:
2376; AVX512F-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2377; AVX512F-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2378; AVX512F-NEXT:    vpmovqb %zmm1, %xmm1
2379; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
2380; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2381; AVX512F-NEXT:    vzeroupper
2382; AVX512F-NEXT:    retq
2383;
2384; AVX512BW-LABEL: trunc_mul_const_v16i64_v16i8:
2385; AVX512BW:       # %bb.0:
2386; AVX512BW-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2387; AVX512BW-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2388; AVX512BW-NEXT:    vpmovqb %zmm1, %xmm1
2389; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
2390; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2391; AVX512BW-NEXT:    vzeroupper
2392; AVX512BW-NEXT:    retq
2393;
2394; AVX512DQ-LABEL: trunc_mul_const_v16i64_v16i8:
2395; AVX512DQ:       # %bb.0:
2396; AVX512DQ-NEXT:    vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2397; AVX512DQ-NEXT:    vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2398; AVX512DQ-NEXT:    vpmovqb %zmm1, %xmm1
2399; AVX512DQ-NEXT:    vpmovqb %zmm0, %xmm0
2400; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2401; AVX512DQ-NEXT:    vzeroupper
2402; AVX512DQ-NEXT:    retq
2403  %1 = mul <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
2404  %2 = trunc <16 x i64> %1 to <16 x i8>
2405  ret <16 x i8> %2
2406}
2407
2408define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
2409; SSE-LABEL: trunc_mul_const_v16i32_v16i8:
2410; SSE:       # %bb.0:
2411; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [0,1,2,3]
2412; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
2413; SSE-NEXT:    pmuludq %xmm4, %xmm0
2414; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2415; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2416; SSE-NEXT:    pmuludq %xmm5, %xmm4
2417; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2418; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
2419; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [4,5,6,7]
2420; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
2421; SSE-NEXT:    pmuludq %xmm4, %xmm1
2422; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2423; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2424; SSE-NEXT:    pmuludq %xmm5, %xmm4
2425; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2426; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
2427; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [8,9,10,11]
2428; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
2429; SSE-NEXT:    pmuludq %xmm4, %xmm2
2430; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2431; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2432; SSE-NEXT:    pmuludq %xmm5, %xmm4
2433; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2434; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
2435; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [12,13,14,15]
2436; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
2437; SSE-NEXT:    pmuludq %xmm4, %xmm3
2438; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
2439; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2440; SSE-NEXT:    pmuludq %xmm5, %xmm4
2441; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2442; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
2443; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
2444; SSE-NEXT:    pand %xmm4, %xmm3
2445; SSE-NEXT:    pand %xmm4, %xmm2
2446; SSE-NEXT:    packuswb %xmm3, %xmm2
2447; SSE-NEXT:    pand %xmm4, %xmm1
2448; SSE-NEXT:    pand %xmm4, %xmm0
2449; SSE-NEXT:    packuswb %xmm1, %xmm0
2450; SSE-NEXT:    packuswb %xmm2, %xmm0
2451; SSE-NEXT:    retq
2452;
2453; AVX1-LABEL: trunc_mul_const_v16i32_v16i8:
2454; AVX1:       # %bb.0:
2455; AVX1-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
2456; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2457; AVX1-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2458; AVX1-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
2459; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
2460; AVX1-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2461; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255]
2462; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
2463; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
2464; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
2465; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
2466; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
2467; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
2468; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2469; AVX1-NEXT:    vzeroupper
2470; AVX1-NEXT:    retq
2471;
2472; AVX2-LABEL: trunc_mul_const_v16i32_v16i8:
2473; AVX2:       # %bb.0:
2474; AVX2-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2475; AVX2-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2476; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
2477; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
2478; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
2479; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
2480; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2481; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2482; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2483; AVX2-NEXT:    vzeroupper
2484; AVX2-NEXT:    retq
2485;
2486; AVX512-LABEL: trunc_mul_const_v16i32_v16i8:
2487; AVX512:       # %bb.0:
2488; AVX512-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2489; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
2490; AVX512-NEXT:    vzeroupper
2491; AVX512-NEXT:    retq
2492  %1 = mul <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2493  %2 = trunc <16 x i32> %1 to <16 x i8>
2494  ret <16 x i8> %2
2495}
2496
2497define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
2498; SSE-LABEL: trunc_mul_const_v16i16_v16i8:
2499; SSE:       # %bb.0:
2500; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2501; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2502; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
2503; SSE-NEXT:    pand %xmm2, %xmm1
2504; SSE-NEXT:    pand %xmm2, %xmm0
2505; SSE-NEXT:    packuswb %xmm1, %xmm0
2506; SSE-NEXT:    retq
2507;
2508; AVX1-LABEL: trunc_mul_const_v16i16_v16i8:
2509; AVX1:       # %bb.0:
2510; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
2511; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2512; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2513; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
2514; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
2515; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
2516; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
2517; AVX1-NEXT:    vzeroupper
2518; AVX1-NEXT:    retq
2519;
2520; AVX2-LABEL: trunc_mul_const_v16i16_v16i8:
2521; AVX2:       # %bb.0:
2522; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2523; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2524; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2525; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2526; AVX2-NEXT:    vzeroupper
2527; AVX2-NEXT:    retq
2528;
2529; AVX512F-LABEL: trunc_mul_const_v16i16_v16i8:
2530; AVX512F:       # %bb.0:
2531; AVX512F-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2532; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2533; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
2534; AVX512F-NEXT:    vzeroupper
2535; AVX512F-NEXT:    retq
2536;
2537; AVX512BW-LABEL: trunc_mul_const_v16i16_v16i8:
2538; AVX512BW:       # %bb.0:
2539; AVX512BW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2540; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
2541; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2542; AVX512BW-NEXT:    vzeroupper
2543; AVX512BW-NEXT:    retq
2544;
2545; AVX512DQ-LABEL: trunc_mul_const_v16i16_v16i8:
2546; AVX512DQ:       # %bb.0:
2547; AVX512DQ-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2548; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2549; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
2550; AVX512DQ-NEXT:    vzeroupper
2551; AVX512DQ-NEXT:    retq
2552  %1 = mul <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
2553  %2 = trunc <16 x i16> %1 to <16 x i8>
2554  ret <16 x i8> %2
2555}
2556
2557;
2558; and
2559;
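; Both inputs are variable, so the checks keep the 'and' at the source element
; width (andps/pand/vpand/vpandq) and only then truncate, via shuffles and packs
; on SSE/AVX and via vpmovq*/vpmovd*/vpmovwb on AVX512.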
2560
2561define <4 x i32> @trunc_and_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2562; SSE-LABEL: trunc_and_v4i64_v4i32:
2563; SSE:       # %bb.0:
2564; SSE-NEXT:    andps %xmm3, %xmm1
2565; SSE-NEXT:    andps %xmm2, %xmm0
2566; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2567; SSE-NEXT:    retq
2568;
2569; AVX1-LABEL: trunc_and_v4i64_v4i32:
2570; AVX1:       # %bb.0:
2571; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
2572; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2573; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2574; AVX1-NEXT:    vzeroupper
2575; AVX1-NEXT:    retq
2576;
2577; AVX2-SLOW-LABEL: trunc_and_v4i64_v4i32:
2578; AVX2-SLOW:       # %bb.0:
2579; AVX2-SLOW-NEXT:    vandps %ymm1, %ymm0, %ymm0
2580; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
2581; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2582; AVX2-SLOW-NEXT:    vzeroupper
2583; AVX2-SLOW-NEXT:    retq
2584;
2585; AVX2-FAST-ALL-LABEL: trunc_and_v4i64_v4i32:
2586; AVX2-FAST-ALL:       # %bb.0:
2587; AVX2-FAST-ALL-NEXT:    vandps %ymm1, %ymm0, %ymm0
2588; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
2589; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
2590; AVX2-FAST-ALL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2591; AVX2-FAST-ALL-NEXT:    vzeroupper
2592; AVX2-FAST-ALL-NEXT:    retq
2593;
2594; AVX2-FAST-PERLANE-LABEL: trunc_and_v4i64_v4i32:
2595; AVX2-FAST-PERLANE:       # %bb.0:
2596; AVX2-FAST-PERLANE-NEXT:    vandps %ymm1, %ymm0, %ymm0
2597; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
2598; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2599; AVX2-FAST-PERLANE-NEXT:    vzeroupper
2600; AVX2-FAST-PERLANE-NEXT:    retq
2601;
2602; AVX512-LABEL: trunc_and_v4i64_v4i32:
2603; AVX512:       # %bb.0:
2604; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
2605; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
2606; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2607; AVX512-NEXT:    vzeroupper
2608; AVX512-NEXT:    retq
2609  %1 = and <4 x i64> %a0, %a1
2610  %2 = trunc <4 x i64> %1 to <4 x i32>
2611  ret <4 x i32> %2
2612}
2613
2614define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
2615; SSE-LABEL: trunc_and_v8i64_v8i16:
2616; SSE:       # %bb.0:
2617; SSE-NEXT:    pand %xmm6, %xmm2
2618; SSE-NEXT:    pand %xmm7, %xmm3
2619; SSE-NEXT:    pand %xmm4, %xmm0
2620; SSE-NEXT:    pand %xmm5, %xmm1
2621; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2622; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
2623; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2624; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
2625; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
2626; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
2627; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
2628; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
2629; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
2630; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2631; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
2632; SSE-NEXT:    retq
2633;
2634; AVX1-LABEL: trunc_and_v8i64_v8i16:
2635; AVX1:       # %bb.0:
2636; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535]
2637; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
2638; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
2639; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
2640; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
2641; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
2642; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
2643; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2644; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
2645; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2646; AVX1-NEXT:    vzeroupper
2647; AVX1-NEXT:    retq
2648;
2649; AVX2-LABEL: trunc_and_v8i64_v8i16:
2650; AVX2:       # %bb.0:
2651; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
2652; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
2653; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
2654; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
2655; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
2656; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
2657; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2658; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2659; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2660; AVX2-NEXT:    vzeroupper
2661; AVX2-NEXT:    retq
2662;
2663; AVX512-LABEL: trunc_and_v8i64_v8i16:
2664; AVX512:       # %bb.0:
2665; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
2666; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
2667; AVX512-NEXT:    vzeroupper
2668; AVX512-NEXT:    retq
2669  %1 = and <8 x i64> %a0, %a1
2670  %2 = trunc <8 x i64> %1 to <8 x i16>
2671  ret <8 x i16> %2
2672}
2673
2674define <8 x i16> @trunc_and_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
2675; SSE-LABEL: trunc_and_v8i32_v8i16:
2676; SSE:       # %bb.0:
2677; SSE-NEXT:    pand %xmm2, %xmm0
2678; SSE-NEXT:    pand %xmm3, %xmm1
2679; SSE-NEXT:    pslld $16, %xmm1
2680; SSE-NEXT:    psrad $16, %xmm1
2681; SSE-NEXT:    pslld $16, %xmm0
2682; SSE-NEXT:    psrad $16, %xmm0
2683; SSE-NEXT:    packssdw %xmm1, %xmm0
2684; SSE-NEXT:    retq
2685;
2686; AVX1-LABEL: trunc_and_v8i32_v8i16:
2687; AVX1:       # %bb.0:
2688; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
2689; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2690; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
2691; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
2692; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
2693; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2694; AVX1-NEXT:    vzeroupper
2695; AVX1-NEXT:    retq
2696;
2697; AVX2-LABEL: trunc_and_v8i32_v8i16:
2698; AVX2:       # %bb.0:
2699; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
2700; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
2701; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2702; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2703; AVX2-NEXT:    vzeroupper
2704; AVX2-NEXT:    retq
2705;
2706; AVX512-LABEL: trunc_and_v8i32_v8i16:
2707; AVX512:       # %bb.0:
2708; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
2709; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
2710; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2711; AVX512-NEXT:    vzeroupper
2712; AVX512-NEXT:    retq
2713  %1 = and <8 x i32> %a0, %a1
2714  %2 = trunc <8 x i32> %1 to <8 x i16>
2715  ret <8 x i16> %2
2716}
2717
2718define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
2719; SSE-LABEL: trunc_and_v16i64_v16i8:
2720; SSE:       # %bb.0:
2721; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm0
2722; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm1
2723; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm2
2724; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm3
2725; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm4
2726; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm5
2727; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm6
2728; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm7
2729; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
2730; SSE-NEXT:    pand %xmm8, %xmm7
2731; SSE-NEXT:    pand %xmm8, %xmm6
2732; SSE-NEXT:    packuswb %xmm7, %xmm6
2733; SSE-NEXT:    pand %xmm8, %xmm5
2734; SSE-NEXT:    pand %xmm8, %xmm4
2735; SSE-NEXT:    packuswb %xmm5, %xmm4
2736; SSE-NEXT:    packuswb %xmm6, %xmm4
2737; SSE-NEXT:    pand %xmm8, %xmm3
2738; SSE-NEXT:    pand %xmm8, %xmm2
2739; SSE-NEXT:    packuswb %xmm3, %xmm2
2740; SSE-NEXT:    pand %xmm8, %xmm1
2741; SSE-NEXT:    pand %xmm8, %xmm0
2742; SSE-NEXT:    packuswb %xmm1, %xmm0
2743; SSE-NEXT:    packuswb %xmm2, %xmm0
2744; SSE-NEXT:    packuswb %xmm4, %xmm0
2745; SSE-NEXT:    retq
2746;
2747; AVX1-LABEL: trunc_and_v16i64_v16i8:
2748; AVX1:       # %bb.0:
2749; AVX1-NEXT:    vmovaps {{.*#+}} ymm8 = [255,255,255,255]
2750; AVX1-NEXT:    vandps %ymm7, %ymm8, %ymm7
2751; AVX1-NEXT:    vandps %ymm7, %ymm3, %ymm3
2752; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm7
2753; AVX1-NEXT:    vpackusdw %xmm7, %xmm3, %xmm3
2754; AVX1-NEXT:    vandps %ymm6, %ymm8, %ymm6
2755; AVX1-NEXT:    vandps %ymm6, %ymm2, %ymm2
2756; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
2757; AVX1-NEXT:    vpackusdw %xmm6, %xmm2, %xmm2
2758; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
2759; AVX1-NEXT:    vandps %ymm5, %ymm8, %ymm3
2760; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
2761; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
2762; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
2763; AVX1-NEXT:    vandps %ymm4, %ymm8, %ymm3
2764; AVX1-NEXT:    vandps %ymm3, %ymm0, %ymm0
2765; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
2766; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
2767; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2768; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
2769; AVX1-NEXT:    vzeroupper
2770; AVX1-NEXT:    retq
2771;
2772; AVX2-LABEL: trunc_and_v16i64_v16i8:
2773; AVX2:       # %bb.0:
2774; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [255,255,255,255]
2775; AVX2-NEXT:    vpand %ymm7, %ymm8, %ymm7
2776; AVX2-NEXT:    vpand %ymm7, %ymm3, %ymm3
2777; AVX2-NEXT:    vpand %ymm6, %ymm8, %ymm6
2778; AVX2-NEXT:    vpand %ymm6, %ymm2, %ymm2
2779; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
2780; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
2781; AVX2-NEXT:    vpand %ymm5, %ymm8, %ymm3
2782; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
2783; AVX2-NEXT:    vpand %ymm4, %ymm8, %ymm3
2784; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
2785; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
2786; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
2787; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
2788; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2789; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2790; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2791; AVX2-NEXT:    vzeroupper
2792; AVX2-NEXT:    retq
2793;
2794; AVX512-LABEL: trunc_and_v16i64_v16i8:
2795; AVX512:       # %bb.0:
2796; AVX512-NEXT:    vpandq %zmm2, %zmm0, %zmm0
2797; AVX512-NEXT:    vpandq %zmm3, %zmm1, %zmm1
2798; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
2799; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
2800; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2801; AVX512-NEXT:    vzeroupper
2802; AVX512-NEXT:    retq
2803  %1 = and <16 x i64> %a0, %a1
2804  %2 = trunc <16 x i64> %1 to <16 x i8>
2805  ret <16 x i8> %2
2806}
2807
2808define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
2809; SSE-LABEL: trunc_and_v16i32_v16i8:
2810; SSE:       # %bb.0:
2811; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
2812; SSE-NEXT:    pand %xmm8, %xmm7
2813; SSE-NEXT:    pand %xmm3, %xmm7
2814; SSE-NEXT:    pand %xmm8, %xmm6
2815; SSE-NEXT:    pand %xmm2, %xmm6
2816; SSE-NEXT:    packuswb %xmm7, %xmm6
2817; SSE-NEXT:    pand %xmm8, %xmm5
2818; SSE-NEXT:    pand %xmm1, %xmm5
2819; SSE-NEXT:    pand %xmm8, %xmm4
2820; SSE-NEXT:    pand %xmm4, %xmm0
2821; SSE-NEXT:    packuswb %xmm5, %xmm0
2822; SSE-NEXT:    packuswb %xmm6, %xmm0
2823; SSE-NEXT:    retq
2824;
2825; AVX1-LABEL: trunc_and_v16i32_v16i8:
2826; AVX1:       # %bb.0:
2827; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
2828; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
2829; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
2830; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
2831; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
2832; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
2833; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
2834; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2835; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
2836; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2837; AVX1-NEXT:    vzeroupper
2838; AVX1-NEXT:    retq
2839;
2840; AVX2-LABEL: trunc_and_v16i32_v16i8:
2841; AVX2:       # %bb.0:
2842; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
2843; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
2844; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
2845; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
2846; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
2847; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
2848; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2849; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2850; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2851; AVX2-NEXT:    vzeroupper
2852; AVX2-NEXT:    retq
2853;
2854; AVX512-LABEL: trunc_and_v16i32_v16i8:
2855; AVX512:       # %bb.0:
2856; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
2857; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
2858; AVX512-NEXT:    vzeroupper
2859; AVX512-NEXT:    retq
2860  %1 = and <16 x i32> %a0, %a1
2861  %2 = trunc <16 x i32> %1 to <16 x i8>
2862  ret <16 x i8> %2
2863}
2864
2865define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
2866; SSE-LABEL: trunc_and_v16i16_v16i8:
2867; SSE:       # %bb.0:
2868; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
2869; SSE-NEXT:    pand %xmm4, %xmm3
2870; SSE-NEXT:    pand %xmm1, %xmm3
2871; SSE-NEXT:    pand %xmm4, %xmm2
2872; SSE-NEXT:    pand %xmm2, %xmm0
2873; SSE-NEXT:    packuswb %xmm3, %xmm0
2874; SSE-NEXT:    retq
2875;
2876; AVX1-LABEL: trunc_and_v16i16_v16i8:
2877; AVX1:       # %bb.0:
2878; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
2879; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2880; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2881; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2882; AVX1-NEXT:    vzeroupper
2883; AVX1-NEXT:    retq
2884;
2885; AVX2-LABEL: trunc_and_v16i16_v16i8:
2886; AVX2:       # %bb.0:
2887; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
2888; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2889; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2890; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2891; AVX2-NEXT:    vzeroupper
2892; AVX2-NEXT:    retq
2893;
2894; AVX512F-LABEL: trunc_and_v16i16_v16i8:
2895; AVX512F:       # %bb.0:
2896; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
2897; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2898; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
2899; AVX512F-NEXT:    vzeroupper
2900; AVX512F-NEXT:    retq
2901;
2902; AVX512BW-LABEL: trunc_and_v16i16_v16i8:
2903; AVX512BW:       # %bb.0:
2904; AVX512BW-NEXT:    vpand %ymm1, %ymm0, %ymm0
2905; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
2906; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2907; AVX512BW-NEXT:    vzeroupper
2908; AVX512BW-NEXT:    retq
2909;
2910; AVX512DQ-LABEL: trunc_and_v16i16_v16i8:
2911; AVX512DQ:       # %bb.0:
2912; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
2913; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2914; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
2915; AVX512DQ-NEXT:    vzeroupper
2916; AVX512DQ-NEXT:    retq
2917  %1 = and <16 x i16> %a0, %a1
2918  %2 = trunc <16 x i16> %1 to <16 x i8>
2919  ret <16 x i8> %2
2920}
2921
2922;
2923; and to constant
2924;
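; With constant operands the 'and' can instead be done after the truncation, on
; the narrow type: each prefix below performs its final 'and' against a
; constant-pool (LCPI) operand.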
2925
2926define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
2927; SSE-LABEL: trunc_and_const_v4i64_v4i32:
2928; SSE:       # %bb.0:
2929; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2930; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2931; SSE-NEXT:    retq
2932;
2933; AVX1-LABEL: trunc_and_const_v4i64_v4i32:
2934; AVX1:       # %bb.0:
2935; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2936; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2937; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2938; AVX1-NEXT:    vzeroupper
2939; AVX1-NEXT:    retq
2940;
2941; AVX2-SLOW-LABEL: trunc_and_const_v4i64_v4i32:
2942; AVX2-SLOW:       # %bb.0:
2943; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
2944; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2945; AVX2-SLOW-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2946; AVX2-SLOW-NEXT:    vzeroupper
2947; AVX2-SLOW-NEXT:    retq
2948;
2949; AVX2-FAST-ALL-LABEL: trunc_and_const_v4i64_v4i32:
2950; AVX2-FAST-ALL:       # %bb.0:
2951; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
2952; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
2953; AVX2-FAST-ALL-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2954; AVX2-FAST-ALL-NEXT:    vzeroupper
2955; AVX2-FAST-ALL-NEXT:    retq
2956;
2957; AVX2-FAST-PERLANE-LABEL: trunc_and_const_v4i64_v4i32:
2958; AVX2-FAST-PERLANE:       # %bb.0:
2959; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
2960; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2961; AVX2-FAST-PERLANE-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2962; AVX2-FAST-PERLANE-NEXT:    vzeroupper
2963; AVX2-FAST-PERLANE-NEXT:    retq
2964;
2965; AVX512-LABEL: trunc_and_const_v4i64_v4i32:
2966; AVX512:       # %bb.0:
2967; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
2968; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
2969; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2970; AVX512-NEXT:    vzeroupper
2971; AVX512-NEXT:    retq
2972  %1 = and <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
2973  %2 = trunc <4 x i64> %1 to <4 x i32>
2974  ret <4 x i32> %2
2975}
2976
2977define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
2978; SSE-LABEL: trunc_and_const_v8i64_v8i16:
2979; SSE:       # %bb.0:
2980; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2981; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
2982; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2983; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
2984; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
2985; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
2986; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
2987; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
2988; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
2989; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2990; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
2991; SSE-NEXT:    andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2992; SSE-NEXT:    retq
2993;
2994; AVX1-LABEL: trunc_and_const_v8i64_v8i16:
2995; AVX1:       # %bb.0:
2996; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
2997; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
2998; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
2999; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
3000; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
3001; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3002; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
3003; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3004; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3005; AVX1-NEXT:    vzeroupper
3006; AVX1-NEXT:    retq
3007;
3008; AVX2-LABEL: trunc_and_const_v8i64_v8i16:
3009; AVX2:       # %bb.0:
3010; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
3011; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
3012; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
3013; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
3014; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3015; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3016; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3017; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3018; AVX2-NEXT:    vzeroupper
3019; AVX2-NEXT:    retq
3020;
3021; AVX512-LABEL: trunc_and_const_v8i64_v8i16:
3022; AVX512:       # %bb.0:
3023; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
3024; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3025; AVX512-NEXT:    vzeroupper
3026; AVX512-NEXT:    retq
3027  %1 = and <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
3028  %2 = trunc <8 x i64> %1 to <8 x i16>
3029  ret <8 x i16> %2
3030}
3031
3032define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
3033; SSE-LABEL: trunc_and_const_v8i32_v8i16:
3034; SSE:       # %bb.0:
3035; SSE-NEXT:    pslld $16, %xmm1
3036; SSE-NEXT:    psrad $16, %xmm1
3037; SSE-NEXT:    pslld $16, %xmm0
3038; SSE-NEXT:    psrad $16, %xmm0
3039; SSE-NEXT:    packssdw %xmm1, %xmm0
3040; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3041; SSE-NEXT:    retq
3042;
3043; AVX1-LABEL: trunc_and_const_v8i32_v8i16:
3044; AVX1:       # %bb.0:
3045; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3046; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
3047; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
3048; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
3049; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3050; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3051; AVX1-NEXT:    vzeroupper
3052; AVX1-NEXT:    retq
3053;
3054; AVX2-LABEL: trunc_and_const_v8i32_v8i16:
3055; AVX2:       # %bb.0:
3056; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
3057; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3058; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3059; AVX2-NEXT:    vzeroupper
3060; AVX2-NEXT:    retq
3061;
3062; AVX512-LABEL: trunc_and_const_v8i32_v8i16:
3063; AVX512:       # %bb.0:
3064; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
3065; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
3066; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3067; AVX512-NEXT:    vzeroupper
3068; AVX512-NEXT:    retq
3069  %1 = and <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3070  %2 = trunc <8 x i32> %1 to <8 x i16>
3071  ret <8 x i16> %2
3072}
3073
3074define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
3075; SSE-LABEL: trunc_and_const_v16i64_v16i8:
3076; SSE:       # %bb.0:
3077; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3078; SSE-NEXT:    pand %xmm8, %xmm7
3079; SSE-NEXT:    pand %xmm8, %xmm6
3080; SSE-NEXT:    packuswb %xmm7, %xmm6
3081; SSE-NEXT:    pand %xmm8, %xmm5
3082; SSE-NEXT:    pand %xmm8, %xmm4
3083; SSE-NEXT:    packuswb %xmm5, %xmm4
3084; SSE-NEXT:    packuswb %xmm6, %xmm4
3085; SSE-NEXT:    pand %xmm8, %xmm3
3086; SSE-NEXT:    pand %xmm8, %xmm2
3087; SSE-NEXT:    packuswb %xmm3, %xmm2
3088; SSE-NEXT:    pand %xmm8, %xmm1
3089; SSE-NEXT:    pand %xmm8, %xmm0
3090; SSE-NEXT:    packuswb %xmm1, %xmm0
3091; SSE-NEXT:    packuswb %xmm2, %xmm0
3092; SSE-NEXT:    packuswb %xmm4, %xmm0
3093; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3094; SSE-NEXT:    retq
3095;
3096; AVX1-LABEL: trunc_and_const_v16i64_v16i8:
3097; AVX1:       # %bb.0:
3098; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255]
3099; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
3100; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
3101; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
3102; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
3103; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
3104; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
3105; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
3106; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
3107; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3108; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
3109; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
3110; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
3111; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
3112; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3113; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
3114; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3115; AVX1-NEXT:    vzeroupper
3116; AVX1-NEXT:    retq
3117;
3118; AVX2-LABEL: trunc_and_const_v16i64_v16i8:
3119; AVX2:       # %bb.0:
3120; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
3121; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
3122; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
3123; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
3124; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
3125; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
3126; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
3127; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
3128; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
3129; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
3130; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3131; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3132; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3133; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3134; AVX2-NEXT:    vzeroupper
3135; AVX2-NEXT:    retq
3136;
3137; AVX512-LABEL: trunc_and_const_v16i64_v16i8:
3138; AVX512:       # %bb.0:
3139; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
3140; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
3141; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3142; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3143; AVX512-NEXT:    vzeroupper
3144; AVX512-NEXT:    retq
3145  %1 = and <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
3146  %2 = trunc <16 x i64> %1 to <16 x i8>
3147  ret <16 x i8> %2
3148}
3149
3150define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
3151; SSE-LABEL: trunc_and_const_v16i32_v16i8:
3152; SSE:       # %bb.0:
3153; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3154; SSE-NEXT:    pand %xmm4, %xmm3
3155; SSE-NEXT:    pand %xmm4, %xmm2
3156; SSE-NEXT:    packuswb %xmm3, %xmm2
3157; SSE-NEXT:    pand %xmm4, %xmm1
3158; SSE-NEXT:    pand %xmm4, %xmm0
3159; SSE-NEXT:    packuswb %xmm1, %xmm0
3160; SSE-NEXT:    packuswb %xmm2, %xmm0
3161; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3162; SSE-NEXT:    retq
3163;
3164; AVX1-LABEL: trunc_and_const_v16i32_v16i8:
3165; AVX1:       # %bb.0:
3166; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
3167; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
3168; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3169; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
3170; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
3171; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3172; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
3173; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3174; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3175; AVX1-NEXT:    vzeroupper
3176; AVX1-NEXT:    retq
3177;
3178; AVX2-LABEL: trunc_and_const_v16i32_v16i8:
3179; AVX2:       # %bb.0:
3180; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
3181; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
3182; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
3183; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
3184; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3185; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3186; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3187; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3188; AVX2-NEXT:    vzeroupper
3189; AVX2-NEXT:    retq
3190;
3191; AVX512-LABEL: trunc_and_const_v16i32_v16i8:
3192; AVX512:       # %bb.0:
3193; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
3194; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3195; AVX512-NEXT:    vzeroupper
3196; AVX512-NEXT:    retq
3197  %1 = and <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3198  %2 = trunc <16 x i32> %1 to <16 x i8>
3199  ret <16 x i8> %2
3200}
3201
3202define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
3203; SSE-LABEL: trunc_and_const_v16i16_v16i8:
3204; SSE:       # %bb.0:
3205; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
3206; SSE-NEXT:    pand %xmm2, %xmm1
3207; SSE-NEXT:    pand %xmm2, %xmm0
3208; SSE-NEXT:    packuswb %xmm1, %xmm0
3209; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3210; SSE-NEXT:    retq
3211;
3212; AVX1-LABEL: trunc_and_const_v16i16_v16i8:
3213; AVX1:       # %bb.0:
3214; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3215; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3216; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3217; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3218; AVX1-NEXT:    vzeroupper
3219; AVX1-NEXT:    retq
3220;
3221; AVX2-LABEL: trunc_and_const_v16i16_v16i8:
3222; AVX2:       # %bb.0:
3223; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3224; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3225; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3226; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3227; AVX2-NEXT:    vzeroupper
3228; AVX2-NEXT:    retq
3229;
3230; AVX512F-LABEL: trunc_and_const_v16i16_v16i8:
3231; AVX512F:       # %bb.0:
3232; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3233; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
3234; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3235; AVX512F-NEXT:    vzeroupper
3236; AVX512F-NEXT:    retq
3237;
3238; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8:
3239; AVX512BW:       # %bb.0:
3240; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
3241; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
3242; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3243; AVX512BW-NEXT:    vzeroupper
3244; AVX512BW-NEXT:    retq
3245;
3246; AVX512DQ-LABEL: trunc_and_const_v16i16_v16i8:
3247; AVX512DQ:       # %bb.0:
3248; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3249; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
3250; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3251; AVX512DQ-NEXT:    vzeroupper
3252; AVX512DQ-NEXT:    retq
3253  %1 = and <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
3254  %2 = trunc <16 x i16> %1 to <16 x i8>
3255  ret <16 x i8> %2
3256}
3257
3258;
3259; xor
3260;
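; Same structure as the 'and' tests above: the xor is performed at the source
; width (xorps/pxor/vpxor/vpxorq) and the result is then truncated.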
3261
3262define <4 x i32> @trunc_xor_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
3263; SSE-LABEL: trunc_xor_v4i64_v4i32:
3264; SSE:       # %bb.0:
3265; SSE-NEXT:    xorps %xmm3, %xmm1
3266; SSE-NEXT:    xorps %xmm2, %xmm0
3267; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3268; SSE-NEXT:    retq
3269;
3270; AVX1-LABEL: trunc_xor_v4i64_v4i32:
3271; AVX1:       # %bb.0:
3272; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
3273; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3274; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3275; AVX1-NEXT:    vzeroupper
3276; AVX1-NEXT:    retq
3277;
3278; AVX2-SLOW-LABEL: trunc_xor_v4i64_v4i32:
3279; AVX2-SLOW:       # %bb.0:
3280; AVX2-SLOW-NEXT:    vxorps %ymm1, %ymm0, %ymm0
3281; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
3282; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3283; AVX2-SLOW-NEXT:    vzeroupper
3284; AVX2-SLOW-NEXT:    retq
3285;
3286; AVX2-FAST-ALL-LABEL: trunc_xor_v4i64_v4i32:
3287; AVX2-FAST-ALL:       # %bb.0:
3288; AVX2-FAST-ALL-NEXT:    vxorps %ymm1, %ymm0, %ymm0
3289; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
3290; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
3291; AVX2-FAST-ALL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3292; AVX2-FAST-ALL-NEXT:    vzeroupper
3293; AVX2-FAST-ALL-NEXT:    retq
3294;
3295; AVX2-FAST-PERLANE-LABEL: trunc_xor_v4i64_v4i32:
3296; AVX2-FAST-PERLANE:       # %bb.0:
3297; AVX2-FAST-PERLANE-NEXT:    vxorps %ymm1, %ymm0, %ymm0
3298; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
3299; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3300; AVX2-FAST-PERLANE-NEXT:    vzeroupper
3301; AVX2-FAST-PERLANE-NEXT:    retq
3302;
3303; AVX512-LABEL: trunc_xor_v4i64_v4i32:
3304; AVX512:       # %bb.0:
3305; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3306; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
3307; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3308; AVX512-NEXT:    vzeroupper
3309; AVX512-NEXT:    retq
3310  %1 = xor <4 x i64> %a0, %a1
3311  %2 = trunc <4 x i64> %1 to <4 x i32>
3312  ret <4 x i32> %2
3313}
3314
3315define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
3316; SSE-LABEL: trunc_xor_v8i64_v8i16:
3317; SSE:       # %bb.0:
3318; SSE-NEXT:    pxor %xmm6, %xmm2
3319; SSE-NEXT:    pxor %xmm7, %xmm3
3320; SSE-NEXT:    pxor %xmm4, %xmm0
3321; SSE-NEXT:    pxor %xmm5, %xmm1
3322; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3323; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
3324; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3325; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
3326; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
3327; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
3328; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
3329; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
3330; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
3331; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3332; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
3333; SSE-NEXT:    retq
3334;
3335; AVX1-LABEL: trunc_xor_v8i64_v8i16:
3336; AVX1:       # %bb.0:
3337; AVX1-NEXT:    vxorps %ymm2, %ymm0, %ymm0
3338; AVX1-NEXT:    vxorps %ymm3, %ymm1, %ymm1
3339; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
3340; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
3341; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3342; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
3343; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
3344; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3345; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
3346; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3347; AVX1-NEXT:    vzeroupper
3348; AVX1-NEXT:    retq
3349;
3350; AVX2-LABEL: trunc_xor_v8i64_v8i16:
3351; AVX2:       # %bb.0:
3352; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
3353; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
3354; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
3355; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
3356; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
3357; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
3358; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3359; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3360; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3361; AVX2-NEXT:    vzeroupper
3362; AVX2-NEXT:    retq
3363;
3364; AVX512-LABEL: trunc_xor_v8i64_v8i16:
3365; AVX512:       # %bb.0:
3366; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
3367; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
3368; AVX512-NEXT:    vzeroupper
3369; AVX512-NEXT:    retq
3370  %1 = xor <8 x i64> %a0, %a1
3371  %2 = trunc <8 x i64> %1 to <8 x i16>
3372  ret <8 x i16> %2
3373}
3374
3375define <8 x i16> @trunc_xor_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
3376; SSE-LABEL: trunc_xor_v8i32_v8i16:
3377; SSE:       # %bb.0:
3378; SSE-NEXT:    pxor %xmm2, %xmm0
3379; SSE-NEXT:    pxor %xmm3, %xmm1
3380; SSE-NEXT:    pslld $16, %xmm1
3381; SSE-NEXT:    psrad $16, %xmm1
3382; SSE-NEXT:    pslld $16, %xmm0
3383; SSE-NEXT:    psrad $16, %xmm0
3384; SSE-NEXT:    packssdw %xmm1, %xmm0
3385; SSE-NEXT:    retq
3386;
3387; AVX1-LABEL: trunc_xor_v8i32_v8i16:
3388; AVX1:       # %bb.0:
3389; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
3390; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3391; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
3392; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
3393; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
3394; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3395; AVX1-NEXT:    vzeroupper
3396; AVX1-NEXT:    retq
3397;
3398; AVX2-LABEL: trunc_xor_v8i32_v8i16:
3399; AVX2:       # %bb.0:
3400; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3401; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
3402; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3403; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3404; AVX2-NEXT:    vzeroupper
3405; AVX2-NEXT:    retq
3406;
3407; AVX512-LABEL: trunc_xor_v8i32_v8i16:
3408; AVX512:       # %bb.0:
3409; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3410; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
3411; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3412; AVX512-NEXT:    vzeroupper
3413; AVX512-NEXT:    retq
3414  %1 = xor <8 x i32> %a0, %a1
3415  %2 = trunc <8 x i32> %1 to <8 x i16>
3416  ret <8 x i16> %2
3417}
3418
3419define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
3420; SSE-LABEL: trunc_xor_v16i64_v16i8:
3421; SSE:       # %bb.0:
3422; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm0
3423; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm1
3424; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm2
3425; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm3
3426; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm4
3427; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm5
3428; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm6
3429; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm7
3430; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3431; SSE-NEXT:    pand %xmm8, %xmm7
3432; SSE-NEXT:    pand %xmm8, %xmm6
3433; SSE-NEXT:    packuswb %xmm7, %xmm6
3434; SSE-NEXT:    pand %xmm8, %xmm5
3435; SSE-NEXT:    pand %xmm8, %xmm4
3436; SSE-NEXT:    packuswb %xmm5, %xmm4
3437; SSE-NEXT:    packuswb %xmm6, %xmm4
3438; SSE-NEXT:    pand %xmm8, %xmm3
3439; SSE-NEXT:    pand %xmm8, %xmm2
3440; SSE-NEXT:    packuswb %xmm3, %xmm2
3441; SSE-NEXT:    pand %xmm8, %xmm1
3442; SSE-NEXT:    pand %xmm8, %xmm0
3443; SSE-NEXT:    packuswb %xmm1, %xmm0
3444; SSE-NEXT:    packuswb %xmm2, %xmm0
3445; SSE-NEXT:    packuswb %xmm4, %xmm0
3446; SSE-NEXT:    retq
3447;
3448; AVX1-LABEL: trunc_xor_v16i64_v16i8:
3449; AVX1:       # %bb.0:
3450; AVX1-NEXT:    vxorps %ymm4, %ymm0, %ymm0
3451; AVX1-NEXT:    vxorps %ymm5, %ymm1, %ymm1
3452; AVX1-NEXT:    vxorps %ymm6, %ymm2, %ymm2
3453; AVX1-NEXT:    vxorps %ymm7, %ymm3, %ymm3
3454; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255]
3455; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
3456; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
3457; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
3458; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
3459; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
3460; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
3461; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
3462; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
3463; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3464; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
3465; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
3466; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
3467; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
3468; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3469; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
3470; AVX1-NEXT:    vzeroupper
3471; AVX1-NEXT:    retq
3472;
3473; AVX2-LABEL: trunc_xor_v16i64_v16i8:
3474; AVX2:       # %bb.0:
3475; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm0
3476; AVX2-NEXT:    vpxor %ymm5, %ymm1, %ymm1
3477; AVX2-NEXT:    vpxor %ymm6, %ymm2, %ymm2
3478; AVX2-NEXT:    vpxor %ymm7, %ymm3, %ymm3
3479; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
3480; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
3481; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
3482; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
3483; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
3484; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
3485; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
3486; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
3487; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
3488; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
3489; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3490; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3491; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3492; AVX2-NEXT:    vzeroupper
3493; AVX2-NEXT:    retq
3494;
3495; AVX512-LABEL: trunc_xor_v16i64_v16i8:
3496; AVX512:       # %bb.0:
3497; AVX512-NEXT:    vpxorq %zmm2, %zmm0, %zmm0
3498; AVX512-NEXT:    vpxorq %zmm3, %zmm1, %zmm1
3499; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
3500; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
3501; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3502; AVX512-NEXT:    vzeroupper
3503; AVX512-NEXT:    retq
3504  %1 = xor <16 x i64> %a0, %a1
3505  %2 = trunc <16 x i64> %1 to <16 x i8>
3506  ret <16 x i8> %2
3507}
3508
3509define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
3510; SSE-LABEL: trunc_xor_v16i32_v16i8:
3511; SSE:       # %bb.0:
3512; SSE-NEXT:    pxor %xmm4, %xmm0
3513; SSE-NEXT:    pxor %xmm5, %xmm1
3514; SSE-NEXT:    pxor %xmm6, %xmm2
3515; SSE-NEXT:    pxor %xmm7, %xmm3
3516; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3517; SSE-NEXT:    pand %xmm4, %xmm3
3518; SSE-NEXT:    pand %xmm4, %xmm2
3519; SSE-NEXT:    packuswb %xmm3, %xmm2
3520; SSE-NEXT:    pand %xmm4, %xmm1
3521; SSE-NEXT:    pand %xmm4, %xmm0
3522; SSE-NEXT:    packuswb %xmm1, %xmm0
3523; SSE-NEXT:    packuswb %xmm2, %xmm0
3524; SSE-NEXT:    retq
3525;
3526; AVX1-LABEL: trunc_xor_v16i32_v16i8:
3527; AVX1:       # %bb.0:
3528; AVX1-NEXT:    vxorps %ymm2, %ymm0, %ymm0
3529; AVX1-NEXT:    vxorps %ymm3, %ymm1, %ymm1
3530; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
3531; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
3532; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3533; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
3534; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
3535; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3536; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
3537; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3538; AVX1-NEXT:    vzeroupper
3539; AVX1-NEXT:    retq
3540;
3541; AVX2-LABEL: trunc_xor_v16i32_v16i8:
3542; AVX2:       # %bb.0:
3543; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
3544; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
3545; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
3546; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
3547; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
3548; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
3549; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3550; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3551; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3552; AVX2-NEXT:    vzeroupper
3553; AVX2-NEXT:    retq
3554;
3555; AVX512-LABEL: trunc_xor_v16i32_v16i8:
3556; AVX512:       # %bb.0:
3557; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
3558; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
3559; AVX512-NEXT:    vzeroupper
3560; AVX512-NEXT:    retq
3561  %1 = xor <16 x i32> %a0, %a1
3562  %2 = trunc <16 x i32> %1 to <16 x i8>
3563  ret <16 x i8> %2
3564}
3565
3566define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
3567; SSE-LABEL: trunc_xor_v16i16_v16i8:
3568; SSE:       # %bb.0:
3569; SSE-NEXT:    pxor %xmm2, %xmm0
3570; SSE-NEXT:    pxor %xmm3, %xmm1
3571; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
3572; SSE-NEXT:    pand %xmm2, %xmm1
3573; SSE-NEXT:    pand %xmm2, %xmm0
3574; SSE-NEXT:    packuswb %xmm1, %xmm0
3575; SSE-NEXT:    retq
3576;
3577; AVX1-LABEL: trunc_xor_v16i16_v16i8:
3578; AVX1:       # %bb.0:
3579; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
3580; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3581; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3582; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3583; AVX1-NEXT:    vzeroupper
3584; AVX1-NEXT:    retq
3585;
3586; AVX2-LABEL: trunc_xor_v16i16_v16i8:
3587; AVX2:       # %bb.0:
3588; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3589; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3590; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3591; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3592; AVX2-NEXT:    vzeroupper
3593; AVX2-NEXT:    retq
3594;
3595; AVX512F-LABEL: trunc_xor_v16i16_v16i8:
3596; AVX512F:       # %bb.0:
3597; AVX512F-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3598; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3599; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
3600; AVX512F-NEXT:    vzeroupper
3601; AVX512F-NEXT:    retq
3602;
3603; AVX512BW-LABEL: trunc_xor_v16i16_v16i8:
3604; AVX512BW:       # %bb.0:
3605; AVX512BW-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3606; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
3607; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3608; AVX512BW-NEXT:    vzeroupper
3609; AVX512BW-NEXT:    retq
3610;
3611; AVX512DQ-LABEL: trunc_xor_v16i16_v16i8:
3612; AVX512DQ:       # %bb.0:
3613; AVX512DQ-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3614; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3615; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
3616; AVX512DQ-NEXT:    vzeroupper
3617; AVX512DQ-NEXT:    retq
3618  %1 = xor <16 x i16> %a0, %a1
3619  %2 = trunc <16 x i16> %1 to <16 x i8>
3620  ret <16 x i8> %2
3621}
3622
3623;
3624; xor to constant
3625;
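; As with the constant 'and' tests, the xor against the constant is performed
; after the truncation, on the narrow type: each prefix finishes with an xor
; against a constant-pool (LCPI) operand.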
3626
3627define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
3628; SSE-LABEL: trunc_xor_const_v4i64_v4i32:
3629; SSE:       # %bb.0:
3630; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3631; SSE-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3632; SSE-NEXT:    retq
3633;
3634; AVX1-LABEL: trunc_xor_const_v4i64_v4i32:
3635; AVX1:       # %bb.0:
3636; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3637; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3638; AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3639; AVX1-NEXT:    vzeroupper
3640; AVX1-NEXT:    retq
3641;
3642; AVX2-SLOW-LABEL: trunc_xor_const_v4i64_v4i32:
3643; AVX2-SLOW:       # %bb.0:
3644; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
3645; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3646; AVX2-SLOW-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3647; AVX2-SLOW-NEXT:    vzeroupper
3648; AVX2-SLOW-NEXT:    retq
3649;
3650; AVX2-FAST-ALL-LABEL: trunc_xor_const_v4i64_v4i32:
3651; AVX2-FAST-ALL:       # %bb.0:
3652; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
3653; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
3654; AVX2-FAST-ALL-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3655; AVX2-FAST-ALL-NEXT:    vzeroupper
3656; AVX2-FAST-ALL-NEXT:    retq
3657;
3658; AVX2-FAST-PERLANE-LABEL: trunc_xor_const_v4i64_v4i32:
3659; AVX2-FAST-PERLANE:       # %bb.0:
3660; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
3661; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3662; AVX2-FAST-PERLANE-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3663; AVX2-FAST-PERLANE-NEXT:    vzeroupper
3664; AVX2-FAST-PERLANE-NEXT:    retq
3665;
3666; AVX512-LABEL: trunc_xor_const_v4i64_v4i32:
3667; AVX512:       # %bb.0:
3668; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
3669; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
3670; AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3671; AVX512-NEXT:    vzeroupper
3672; AVX512-NEXT:    retq
3673  %1 = xor <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
3674  %2 = trunc <4 x i64> %1 to <4 x i32>
3675  ret <4 x i32> %2
3676}
3677
3678define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
3679; SSE-LABEL: trunc_xor_const_v8i64_v8i16:
3680; SSE:       # %bb.0:
3681; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3682; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
3683; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3684; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
3685; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
3686; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
3687; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
3688; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
3689; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
3690; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3691; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
3692; SSE-NEXT:    xorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3693; SSE-NEXT:    retq
3694;
3695; AVX1-LABEL: trunc_xor_const_v8i64_v8i16:
3696; AVX1:       # %bb.0:
3697; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
3698; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
3699; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3700; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
3701; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
3702; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3703; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
3704; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3705; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3706; AVX1-NEXT:    vzeroupper
3707; AVX1-NEXT:    retq
3708;
3709; AVX2-LABEL: trunc_xor_const_v8i64_v8i16:
3710; AVX2:       # %bb.0:
3711; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
3712; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
3713; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
3714; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
3715; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3716; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3717; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3718; AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3719; AVX2-NEXT:    vzeroupper
3720; AVX2-NEXT:    retq
3721;
3722; AVX512-LABEL: trunc_xor_const_v8i64_v8i16:
3723; AVX512:       # %bb.0:
3724; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
3725; AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3726; AVX512-NEXT:    vzeroupper
3727; AVX512-NEXT:    retq
3728  %1 = xor <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
3729  %2 = trunc <8 x i64> %1 to <8 x i16>
3730  ret <8 x i16> %2
3731}
3732
3733define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
3734; SSE-LABEL: trunc_xor_const_v8i32_v8i16:
3735; SSE:       # %bb.0:
3736; SSE-NEXT:    pslld $16, %xmm1
3737; SSE-NEXT:    psrad $16, %xmm1
3738; SSE-NEXT:    pslld $16, %xmm0
3739; SSE-NEXT:    psrad $16, %xmm0
3740; SSE-NEXT:    packssdw %xmm1, %xmm0
3741; SSE-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3742; SSE-NEXT:    retq
3743;
3744; AVX1-LABEL: trunc_xor_const_v8i32_v8i16:
3745; AVX1:       # %bb.0:
3746; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3747; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
3748; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
3749; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
3750; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3751; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3752; AVX1-NEXT:    vzeroupper
3753; AVX1-NEXT:    retq
3754;
3755; AVX2-LABEL: trunc_xor_const_v8i32_v8i16:
3756; AVX2:       # %bb.0:
3757; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
3758; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3759; AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3760; AVX2-NEXT:    vzeroupper
3761; AVX2-NEXT:    retq
3762;
3763; AVX512-LABEL: trunc_xor_const_v8i32_v8i16:
3764; AVX512:       # %bb.0:
3765; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
3766; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
3767; AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3768; AVX512-NEXT:    vzeroupper
3769; AVX512-NEXT:    retq
3770  %1 = xor <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3771  %2 = trunc <8 x i32> %1 to <8 x i16>
3772  ret <8 x i16> %2
3773}
3774
3775define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
3776; SSE-LABEL: trunc_xor_const_v16i64_v16i8:
3777; SSE:       # %bb.0:
3778; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3779; SSE-NEXT:    pand %xmm8, %xmm7
3780; SSE-NEXT:    pand %xmm8, %xmm6
3781; SSE-NEXT:    packuswb %xmm7, %xmm6
3782; SSE-NEXT:    pand %xmm8, %xmm5
3783; SSE-NEXT:    pand %xmm8, %xmm4
3784; SSE-NEXT:    packuswb %xmm5, %xmm4
3785; SSE-NEXT:    packuswb %xmm6, %xmm4
3786; SSE-NEXT:    pand %xmm8, %xmm3
3787; SSE-NEXT:    pand %xmm8, %xmm2
3788; SSE-NEXT:    packuswb %xmm3, %xmm2
3789; SSE-NEXT:    pand %xmm8, %xmm1
3790; SSE-NEXT:    pand %xmm8, %xmm0
3791; SSE-NEXT:    packuswb %xmm1, %xmm0
3792; SSE-NEXT:    packuswb %xmm2, %xmm0
3793; SSE-NEXT:    packuswb %xmm4, %xmm0
3794; SSE-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3795; SSE-NEXT:    retq
3796;
3797; AVX1-LABEL: trunc_xor_const_v16i64_v16i8:
3798; AVX1:       # %bb.0:
3799; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255]
3800; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
3801; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
3802; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
3803; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
3804; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
3805; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
3806; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
3807; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
3808; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3809; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
3810; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
3811; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
3812; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
3813; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3814; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
3815; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3816; AVX1-NEXT:    vzeroupper
3817; AVX1-NEXT:    retq
3818;
3819; AVX2-LABEL: trunc_xor_const_v16i64_v16i8:
3820; AVX2:       # %bb.0:
3821; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
3822; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
3823; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
3824; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
3825; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
3826; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
3827; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
3828; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
3829; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
3830; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
3831; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3832; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3833; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3834; AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3835; AVX2-NEXT:    vzeroupper
3836; AVX2-NEXT:    retq
3837;
3838; AVX512-LABEL: trunc_xor_const_v16i64_v16i8:
3839; AVX512:       # %bb.0:
3840; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
3841; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
3842; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3843; AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3844; AVX512-NEXT:    vzeroupper
3845; AVX512-NEXT:    retq
3846  %1 = xor <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
3847  %2 = trunc <16 x i64> %1 to <16 x i8>
3848  ret <16 x i8> %2
3849}

define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_xor_const_v16i32_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT:    pand %xmm4, %xmm3
; SSE-NEXT:    pand %xmm4, %xmm2
; SSE-NEXT:    packuswb %xmm3, %xmm2
; SSE-NEXT:    pand %xmm4, %xmm1
; SSE-NEXT:    pand %xmm4, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    packuswb %xmm2, %xmm0
; SSE-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_xor_const_v16i32_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_xor_const_v16i32_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_xor_const_v16i32_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = xor <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_xor_const_v16i16_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_xor_const_v16i16_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_xor_const_v16i16_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_xor_const_v16i16_v16i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: trunc_xor_const_v16i16_v16i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
  %1 = xor <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

;
; or
;

define <4 x i32> @trunc_or_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-LABEL: trunc_or_v4i64_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    orps %xmm3, %xmm1
; SSE-NEXT:    orps %xmm2, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_or_v4i64_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc_or_v4i64_v4i32:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vorps %ymm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-ALL-LABEL: trunc_or_v4i64_v4i32:
; AVX2-FAST-ALL:       # %bb.0:
; AVX2-FAST-ALL-NEXT:    vorps %ymm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-ALL-NEXT:    vzeroupper
; AVX2-FAST-ALL-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: trunc_or_v4i64_v4i32:
; AVX2-FAST-PERLANE:       # %bb.0:
; AVX2-FAST-PERLANE-NEXT:    vorps %ymm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-FAST-PERLANE-NEXT:    vzeroupper
; AVX2-FAST-PERLANE-NEXT:    retq
;
; AVX512-LABEL: trunc_or_v4i64_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = or <4 x i64> %a0, %a1
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_or_v8i64_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    por %xmm6, %xmm2
; SSE-NEXT:    por %xmm7, %xmm3
; SSE-NEXT:    por %xmm4, %xmm0
; SSE-NEXT:    por %xmm5, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_or_v8i64_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm1
; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_or_v8i64_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_or_v8i64_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = or <8 x i64> %a0, %a1
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_or_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: trunc_or_v8i32_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    por %xmm2, %xmm0
; SSE-NEXT:    por %xmm3, %xmm1
; SSE-NEXT:    pslld $16, %xmm1
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    pslld $16, %xmm0
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_or_v8i32_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_or_v8i32_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_or_v8i32_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = or <8 x i32> %a0, %a1
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE-LABEL: trunc_or_v16i64_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT:    pand %xmm8, %xmm7
; SSE-NEXT:    pand %xmm8, %xmm6
; SSE-NEXT:    packuswb %xmm7, %xmm6
; SSE-NEXT:    pand %xmm8, %xmm5
; SSE-NEXT:    pand %xmm8, %xmm4
; SSE-NEXT:    packuswb %xmm5, %xmm4
; SSE-NEXT:    packuswb %xmm6, %xmm4
; SSE-NEXT:    pand %xmm8, %xmm3
; SSE-NEXT:    pand %xmm8, %xmm2
; SSE-NEXT:    packuswb %xmm3, %xmm2
; SSE-NEXT:    pand %xmm8, %xmm1
; SSE-NEXT:    pand %xmm8, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    packuswb %xmm2, %xmm0
; SSE-NEXT:    packuswb %xmm4, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_or_v16i64_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vorps %ymm4, %ymm0, %ymm0
; AVX1-NEXT:    vorps %ymm5, %ymm1, %ymm1
; AVX1-NEXT:    vorps %ymm6, %ymm2, %ymm2
; AVX1-NEXT:    vorps %ymm7, %ymm3, %ymm3
; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_or_v16i64_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpor %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm5, %ymm1, %ymm1
; AVX2-NEXT:    vpor %ymm6, %ymm2, %ymm2
; AVX2-NEXT:    vpor %ymm7, %ymm3, %ymm3
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_or_v16i64_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vporq %zmm2, %zmm0, %zmm0
; AVX512-NEXT:    vporq %zmm3, %zmm1, %zmm1
; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = or <16 x i64> %a0, %a1
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: trunc_or_v16i32_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    por %xmm4, %xmm0
; SSE-NEXT:    por %xmm5, %xmm1
; SSE-NEXT:    por %xmm6, %xmm2
; SSE-NEXT:    por %xmm7, %xmm3
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT:    pand %xmm4, %xmm3
; SSE-NEXT:    pand %xmm4, %xmm2
; SSE-NEXT:    packuswb %xmm3, %xmm2
; SSE-NEXT:    pand %xmm4, %xmm1
; SSE-NEXT:    pand %xmm4, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    packuswb %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_or_v16i32_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm1
; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_or_v16i32_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_or_v16i32_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = or <16 x i32> %a0, %a1
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: trunc_or_v16i16_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    por %xmm2, %xmm0
; SSE-NEXT:    por %xmm3, %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_or_v16i16_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_or_v16i16_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_or_v16i16_v16i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: trunc_or_v16i16_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: trunc_or_v16i16_v16i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
  %1 = or <16 x i16> %a0, %a1
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

;
; or to constant
;

define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v4i64_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_or_const_v4i64_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc_or_const_v4i64_v4i32:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-ALL-LABEL: trunc_or_const_v4i64_v4i32:
; AVX2-FAST-ALL:       # %bb.0:
; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT:    vzeroupper
; AVX2-FAST-ALL-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: trunc_or_const_v4i64_v4i32:
; AVX2-FAST-PERLANE:       # %bb.0:
; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-FAST-PERLANE-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT:    vzeroupper
; AVX2-FAST-PERLANE-NEXT:    retq
;
; AVX512-LABEL: trunc_or_const_v4i64_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = or <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v8i64_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT:    orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_or_const_v8i64_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_or_const_v8i64_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_or_const_v8i64_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = or <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v8i32_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $16, %xmm1
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    pslld $16, %xmm0
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_or_const_v8i32_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_or_const_v8i32_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_or_const_v8i32_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = or <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v16i64_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT:    pand %xmm8, %xmm7
; SSE-NEXT:    pand %xmm8, %xmm6
; SSE-NEXT:    packuswb %xmm7, %xmm6
; SSE-NEXT:    pand %xmm8, %xmm5
; SSE-NEXT:    pand %xmm8, %xmm4
; SSE-NEXT:    packuswb %xmm5, %xmm4
; SSE-NEXT:    packuswb %xmm6, %xmm4
; SSE-NEXT:    pand %xmm8, %xmm3
; SSE-NEXT:    pand %xmm8, %xmm2
; SSE-NEXT:    packuswb %xmm3, %xmm2
; SSE-NEXT:    pand %xmm8, %xmm1
; SSE-NEXT:    pand %xmm8, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    packuswb %xmm2, %xmm0
; SSE-NEXT:    packuswb %xmm4, %xmm0
; SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_or_const_v16i64_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_or_const_v16i64_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_or_const_v16i64_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = or <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v16i32_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT:    pand %xmm4, %xmm3
; SSE-NEXT:    pand %xmm4, %xmm2
; SSE-NEXT:    packuswb %xmm3, %xmm2
; SSE-NEXT:    pand %xmm4, %xmm1
; SSE-NEXT:    pand %xmm4, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    packuswb %xmm2, %xmm0
; SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_or_const_v16i32_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_or_const_v16i32_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_or_const_v16i32_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = or <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v16i16_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_or_const_v16i16_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_or_const_v16i16_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_or_const_v16i16_v16i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: trunc_or_const_v16i16_v16i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
  %1 = or <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

;
; complex patterns - often created by vectorizer
;

define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: mul_add_const_v4i64_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
; SSE-NEXT:    pmuludq %xmm2, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,3,3]
; SSE-NEXT:    pmuludq %xmm3, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: mul_add_const_v4i64_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sext <4 x i32> %a0 to <4 x i64>
  %2 = sext <4 x i32> %a1 to <4 x i64>
  %3 = mul <4 x i64> %1, %2
  %4 = add <4 x i64> %3, <i64 -3, i64 -1, i64 1, i64 3>
  %5 = trunc <4 x i64> %4 to <4 x i32>
  ret <4 x i32> %5
}

define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: mul_add_self_v4i64_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
; SSE-NEXT:    pmuludq %xmm2, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,3,3]
; SSE-NEXT:    pmuludq %xmm3, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT:    paddd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: mul_add_self_v4i64_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sext <4 x i32> %a0 to <4 x i64>
  %2 = sext <4 x i32> %a1 to <4 x i64>
  %3 = mul <4 x i64> %1, %2
  %4 = add <4 x i64> %3, %3
  %5 = trunc <4 x i64> %4 to <4 x i32>
  ret <4 x i32> %5
}

define <4 x i32> @mul_add_multiuse_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: mul_add_multiuse_v4i64_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3]
; SSE-NEXT:    pmuludq %xmm2, %xmm4
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,3,3]
; SSE-NEXT:    pmuludq %xmm3, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[0,2]
; SSE-NEXT:    paddd %xmm4, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: mul_add_multiuse_v4i64_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sext <4 x i32> %a0 to <4 x i64>
  %2 = sext <4 x i32> %a1 to <4 x i64>
  %3 = mul <4 x i64> %1, %2
  %4 = add <4 x i64> %1, %3
  %5 = trunc <4 x i64> %4 to <4 x i32>
  ret <4 x i32> %5
}
