1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
9;
10; Just two 32-bit runs to make sure we do reasonable things there.
11; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86-SSE,X86-SSE2
12; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=X86-SSE,X86-SSE41
13
; Sign-extend the low 8 bytes of a <16 x i8> argument to <8 x i16>.
; The shufflevector picks lanes 0-7 and the sext widens each lane.
; As recorded in the autogenerated assertions below, the SSE2 and SSSE3 runs
; expect an unpack (punpcklbw) followed by an arithmetic shift (psraw $8),
; whereas the SSE4.1 and AVX runs expect a single pmovsxbw / vpmovsxbw.
14define <8 x i16> @sext_16i8_to_8i16(<16 x i8> %A) nounwind uwtable readnone ssp {
15; SSE2-LABEL: sext_16i8_to_8i16:
16; SSE2:       # %bb.0: # %entry
17; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
18; SSE2-NEXT:    psraw $8, %xmm0
19; SSE2-NEXT:    retq
20;
21; SSSE3-LABEL: sext_16i8_to_8i16:
22; SSSE3:       # %bb.0: # %entry
23; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
24; SSSE3-NEXT:    psraw $8, %xmm0
25; SSSE3-NEXT:    retq
26;
27; SSE41-LABEL: sext_16i8_to_8i16:
28; SSE41:       # %bb.0: # %entry
29; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
30; SSE41-NEXT:    retq
31;
32; AVX-LABEL: sext_16i8_to_8i16:
33; AVX:       # %bb.0: # %entry
34; AVX-NEXT:    vpmovsxbw %xmm0, %xmm0
35; AVX-NEXT:    retq
36;
37; X86-SSE2-LABEL: sext_16i8_to_8i16:
38; X86-SSE2:       # %bb.0: # %entry
39; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
40; X86-SSE2-NEXT:    psraw $8, %xmm0
41; X86-SSE2-NEXT:    retl
42;
43; X86-SSE41-LABEL: sext_16i8_to_8i16:
44; X86-SSE41:       # %bb.0: # %entry
45; X86-SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
46; X86-SSE41-NEXT:    retl
47entry:
48  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
49  %C = sext <8 x i8> %B to <8 x i16>
50  ret <8 x i16> %C
51}
52
; Sign-extend every lane of a <16 x i8> argument to <16 x i16> (256-bit result).
; Per the assertions below, the SSE-level runs produce the result as two xmm
; registers (low half moved into xmm0, high half in xmm1), the AVX1 run joins
; two vpmovsxbw results with vinsertf128, and the AVX2 and AVX512 runs expect
; a single vpmovsxbw from xmm0 into ymm0.
53define <16 x i16> @sext_16i8_to_16i16(<16 x i8> %A) nounwind uwtable readnone ssp {
54; SSE2-LABEL: sext_16i8_to_16i16:
55; SSE2:       # %bb.0: # %entry
56; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
57; SSE2-NEXT:    psraw $8, %xmm2
58; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
59; SSE2-NEXT:    psraw $8, %xmm1
60; SSE2-NEXT:    movdqa %xmm2, %xmm0
61; SSE2-NEXT:    retq
62;
63; SSSE3-LABEL: sext_16i8_to_16i16:
64; SSSE3:       # %bb.0: # %entry
65; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
66; SSSE3-NEXT:    psraw $8, %xmm2
67; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
68; SSSE3-NEXT:    psraw $8, %xmm1
69; SSSE3-NEXT:    movdqa %xmm2, %xmm0
70; SSSE3-NEXT:    retq
71;
72; SSE41-LABEL: sext_16i8_to_16i16:
73; SSE41:       # %bb.0: # %entry
74; SSE41-NEXT:    pmovsxbw %xmm0, %xmm2
75; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
76; SSE41-NEXT:    pmovsxbw %xmm0, %xmm1
77; SSE41-NEXT:    movdqa %xmm2, %xmm0
78; SSE41-NEXT:    retq
79;
80; AVX1-LABEL: sext_16i8_to_16i16:
81; AVX1:       # %bb.0: # %entry
82; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm1
83; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
84; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
85; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
86; AVX1-NEXT:    retq
87;
88; AVX2-LABEL: sext_16i8_to_16i16:
89; AVX2:       # %bb.0: # %entry
90; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
91; AVX2-NEXT:    retq
92;
93; AVX512-LABEL: sext_16i8_to_16i16:
94; AVX512:       # %bb.0: # %entry
95; AVX512-NEXT:    vpmovsxbw %xmm0, %ymm0
96; AVX512-NEXT:    retq
97;
98; X86-SSE2-LABEL: sext_16i8_to_16i16:
99; X86-SSE2:       # %bb.0: # %entry
100; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
101; X86-SSE2-NEXT:    psraw $8, %xmm2
102; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
103; X86-SSE2-NEXT:    psraw $8, %xmm1
104; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
105; X86-SSE2-NEXT:    retl
106;
107; X86-SSE41-LABEL: sext_16i8_to_16i16:
108; X86-SSE41:       # %bb.0: # %entry
109; X86-SSE41-NEXT:    pmovsxbw %xmm0, %xmm2
110; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
111; X86-SSE41-NEXT:    pmovsxbw %xmm0, %xmm1
112; X86-SSE41-NEXT:    movdqa %xmm2, %xmm0
113; X86-SSE41-NEXT:    retl
114entry:
115  %B = sext <16 x i8> %A to <16 x i16>
116  ret <16 x i16> %B
117}
118
; Sign-extend every lane of a <32 x i8> argument to <32 x i16> (512-bit result).
; Per the assertions below, the SSE-level runs produce four xmm results
; (xmm0-xmm3), the AVX1 and AVX2 runs produce two ymm results (ymm0, ymm1),
; and the AVX512F run widens two halves separately and merges them with
; vinserti64x4. Only the AVX512BW run expects a single vpmovsxbw from ymm0
; into zmm0, which is why this function uses the separate AVX512F and
; AVX512BW check prefixes instead of the shared AVX512 one.
119define <32 x i16> @sext_32i8_to_32i16(<32 x i8> %A) nounwind uwtable readnone ssp {
120; SSE2-LABEL: sext_32i8_to_32i16:
121; SSE2:       # %bb.0: # %entry
122; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
123; SSE2-NEXT:    psraw $8, %xmm4
124; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
125; SSE2-NEXT:    psraw $8, %xmm5
126; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
127; SSE2-NEXT:    psraw $8, %xmm2
128; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
129; SSE2-NEXT:    psraw $8, %xmm3
130; SSE2-NEXT:    movdqa %xmm4, %xmm0
131; SSE2-NEXT:    movdqa %xmm5, %xmm1
132; SSE2-NEXT:    retq
133;
134; SSSE3-LABEL: sext_32i8_to_32i16:
135; SSSE3:       # %bb.0: # %entry
136; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
137; SSSE3-NEXT:    psraw $8, %xmm4
138; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
139; SSSE3-NEXT:    psraw $8, %xmm5
140; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
141; SSSE3-NEXT:    psraw $8, %xmm2
142; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
143; SSSE3-NEXT:    psraw $8, %xmm3
144; SSSE3-NEXT:    movdqa %xmm4, %xmm0
145; SSSE3-NEXT:    movdqa %xmm5, %xmm1
146; SSSE3-NEXT:    retq
147;
148; SSE41-LABEL: sext_32i8_to_32i16:
149; SSE41:       # %bb.0: # %entry
150; SSE41-NEXT:    pmovsxbw %xmm0, %xmm5
151; SSE41-NEXT:    pmovsxbw %xmm1, %xmm2
152; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
153; SSE41-NEXT:    pmovsxbw %xmm0, %xmm4
154; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
155; SSE41-NEXT:    pmovsxbw %xmm0, %xmm3
156; SSE41-NEXT:    movdqa %xmm5, %xmm0
157; SSE41-NEXT:    movdqa %xmm4, %xmm1
158; SSE41-NEXT:    retq
159;
160; AVX1-LABEL: sext_32i8_to_32i16:
161; AVX1:       # %bb.0: # %entry
162; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm1
163; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
164; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm2
165; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
166; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
167; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm1
168; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
169; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
170; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
171; AVX1-NEXT:    vmovaps %ymm2, %ymm0
172; AVX1-NEXT:    retq
173;
174; AVX2-LABEL: sext_32i8_to_32i16:
175; AVX2:       # %bb.0: # %entry
176; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm2
177; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
178; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm1
179; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
180; AVX2-NEXT:    retq
181;
182; AVX512F-LABEL: sext_32i8_to_32i16:
183; AVX512F:       # %bb.0: # %entry
184; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm1
185; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
186; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm0
187; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
188; AVX512F-NEXT:    retq
189;
190; AVX512BW-LABEL: sext_32i8_to_32i16:
191; AVX512BW:       # %bb.0: # %entry
192; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
193; AVX512BW-NEXT:    retq
194;
195; X86-SSE2-LABEL: sext_32i8_to_32i16:
196; X86-SSE2:       # %bb.0: # %entry
197; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
198; X86-SSE2-NEXT:    psraw $8, %xmm4
199; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
200; X86-SSE2-NEXT:    psraw $8, %xmm5
201; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
202; X86-SSE2-NEXT:    psraw $8, %xmm2
203; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
204; X86-SSE2-NEXT:    psraw $8, %xmm3
205; X86-SSE2-NEXT:    movdqa %xmm4, %xmm0
206; X86-SSE2-NEXT:    movdqa %xmm5, %xmm1
207; X86-SSE2-NEXT:    retl
208;
209; X86-SSE41-LABEL: sext_32i8_to_32i16:
210; X86-SSE41:       # %bb.0: # %entry
211; X86-SSE41-NEXT:    pmovsxbw %xmm0, %xmm5
212; X86-SSE41-NEXT:    pmovsxbw %xmm1, %xmm2
213; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
214; X86-SSE41-NEXT:    pmovsxbw %xmm0, %xmm4
215; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
216; X86-SSE41-NEXT:    pmovsxbw %xmm0, %xmm3
217; X86-SSE41-NEXT:    movdqa %xmm5, %xmm0
218; X86-SSE41-NEXT:    movdqa %xmm4, %xmm1
219; X86-SSE41-NEXT:    retl
220entry:
221  %B = sext <32 x i8> %A to <32 x i16>
222  ret <32 x i16> %B
223}
224
; Sign-extend the low 4 bytes of a <16 x i8> argument to <4 x i32>.
; The shufflevector picks lanes 0-3 and the sext widens each lane by 24 bits.
; Per the assertions below, the SSE2 and SSSE3 runs expect two unpacks
; (punpcklbw, punpcklwd) followed by psrad $24, whereas the SSE4.1 and AVX
; runs expect a single pmovsxbd / vpmovsxbd.
225define <4 x i32> @sext_16i8_to_4i32(<16 x i8> %A) nounwind uwtable readnone ssp {
226; SSE2-LABEL: sext_16i8_to_4i32:
227; SSE2:       # %bb.0: # %entry
228; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
229; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
230; SSE2-NEXT:    psrad $24, %xmm0
231; SSE2-NEXT:    retq
232;
233; SSSE3-LABEL: sext_16i8_to_4i32:
234; SSSE3:       # %bb.0: # %entry
235; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
236; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
237; SSSE3-NEXT:    psrad $24, %xmm0
238; SSSE3-NEXT:    retq
239;
240; SSE41-LABEL: sext_16i8_to_4i32:
241; SSE41:       # %bb.0: # %entry
242; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
243; SSE41-NEXT:    retq
244;
245; AVX-LABEL: sext_16i8_to_4i32:
246; AVX:       # %bb.0: # %entry
247; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
248; AVX-NEXT:    retq
249;
250; X86-SSE2-LABEL: sext_16i8_to_4i32:
251; X86-SSE2:       # %bb.0: # %entry
252; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
253; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
254; X86-SSE2-NEXT:    psrad $24, %xmm0
255; X86-SSE2-NEXT:    retl
256;
257; X86-SSE41-LABEL: sext_16i8_to_4i32:
258; X86-SSE41:       # %bb.0: # %entry
259; X86-SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
260; X86-SSE41-NEXT:    retl
261entry:
262  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
263  %C = sext <4 x i8> %B to <4 x i32>
264  ret <4 x i32> %C
265}
266
; Sign-extend the low 8 bytes of a <16 x i8> argument to <8 x i32> (256-bit result).
; The shufflevector picks lanes 0-7 before the sext.
; Per the assertions below, the SSE-level runs produce two xmm results
; (xmm0 and xmm1), the AVX1 run joins two vpmovsxbd results with vinsertf128,
; and the AVX2 and AVX512 runs expect a single vpmovsxbd from xmm0 into ymm0.
267define <8 x i32> @sext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp {
268; SSE2-LABEL: sext_16i8_to_8i32:
269; SSE2:       # %bb.0: # %entry
270; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
271; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
272; SSE2-NEXT:    psrad $24, %xmm0
273; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
274; SSE2-NEXT:    psrad $24, %xmm1
275; SSE2-NEXT:    retq
276;
277; SSSE3-LABEL: sext_16i8_to_8i32:
278; SSSE3:       # %bb.0: # %entry
279; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
280; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
281; SSSE3-NEXT:    psrad $24, %xmm0
282; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
283; SSSE3-NEXT:    psrad $24, %xmm1
284; SSSE3-NEXT:    retq
285;
286; SSE41-LABEL: sext_16i8_to_8i32:
287; SSE41:       # %bb.0: # %entry
288; SSE41-NEXT:    pmovsxbd %xmm0, %xmm2
289; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
290; SSE41-NEXT:    pmovsxbd %xmm0, %xmm1
291; SSE41-NEXT:    movdqa %xmm2, %xmm0
292; SSE41-NEXT:    retq
293;
294; AVX1-LABEL: sext_16i8_to_8i32:
295; AVX1:       # %bb.0: # %entry
296; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm1
297; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
298; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
299; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
300; AVX1-NEXT:    retq
301;
302; AVX2-LABEL: sext_16i8_to_8i32:
303; AVX2:       # %bb.0: # %entry
304; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm0
305; AVX2-NEXT:    retq
306;
307; AVX512-LABEL: sext_16i8_to_8i32:
308; AVX512:       # %bb.0: # %entry
309; AVX512-NEXT:    vpmovsxbd %xmm0, %ymm0
310; AVX512-NEXT:    retq
311;
312; X86-SSE2-LABEL: sext_16i8_to_8i32:
313; X86-SSE2:       # %bb.0: # %entry
314; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
315; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
316; X86-SSE2-NEXT:    psrad $24, %xmm0
317; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
318; X86-SSE2-NEXT:    psrad $24, %xmm1
319; X86-SSE2-NEXT:    retl
320;
321; X86-SSE41-LABEL: sext_16i8_to_8i32:
322; X86-SSE41:       # %bb.0: # %entry
323; X86-SSE41-NEXT:    pmovsxbd %xmm0, %xmm2
324; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
325; X86-SSE41-NEXT:    pmovsxbd %xmm0, %xmm1
326; X86-SSE41-NEXT:    movdqa %xmm2, %xmm0
327; X86-SSE41-NEXT:    retl
328entry:
329  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
330  %C = sext <8 x i8> %B to <8 x i32>
331  ret <8 x i32> %C
332}
333
; Sign-extend every lane of a <16 x i8> argument to <16 x i32> (512-bit result).
; Per the assertions below, the SSE-level runs produce four xmm results
; (xmm0-xmm3), the AVX1 and AVX2 runs produce two ymm results (ymm0, ymm1),
; and the AVX512 runs expect a single vpmovsxbd from xmm0 into zmm0.
334define <16 x i32> @sext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ssp {
335; SSE2-LABEL: sext_16i8_to_16i32:
336; SSE2:       # %bb.0: # %entry
337; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
338; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
339; SSE2-NEXT:    psrad $24, %xmm4
340; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
341; SSE2-NEXT:    psrad $24, %xmm1
342; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
343; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
344; SSE2-NEXT:    psrad $24, %xmm2
345; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
346; SSE2-NEXT:    psrad $24, %xmm3
347; SSE2-NEXT:    movdqa %xmm4, %xmm0
348; SSE2-NEXT:    retq
349;
350; SSSE3-LABEL: sext_16i8_to_16i32:
351; SSSE3:       # %bb.0: # %entry
352; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
353; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
354; SSSE3-NEXT:    psrad $24, %xmm4
355; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
356; SSSE3-NEXT:    psrad $24, %xmm1
357; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
358; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
359; SSSE3-NEXT:    psrad $24, %xmm2
360; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
361; SSSE3-NEXT:    psrad $24, %xmm3
362; SSSE3-NEXT:    movdqa %xmm4, %xmm0
363; SSSE3-NEXT:    retq
364;
365; SSE41-LABEL: sext_16i8_to_16i32:
366; SSE41:       # %bb.0: # %entry
367; SSE41-NEXT:    pmovsxbd %xmm0, %xmm4
368; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
369; SSE41-NEXT:    pmovsxbd %xmm1, %xmm1
370; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
371; SSE41-NEXT:    pmovsxbd %xmm2, %xmm2
372; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
373; SSE41-NEXT:    pmovsxbd %xmm0, %xmm3
374; SSE41-NEXT:    movdqa %xmm4, %xmm0
375; SSE41-NEXT:    retq
376;
377; AVX1-LABEL: sext_16i8_to_16i32:
378; AVX1:       # %bb.0: # %entry
379; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm1
380; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
381; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm2
382; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
383; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
384; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
385; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
386; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
387; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
388; AVX1-NEXT:    vmovaps %ymm2, %ymm0
389; AVX1-NEXT:    retq
390;
391; AVX2-LABEL: sext_16i8_to_16i32:
392; AVX2:       # %bb.0: # %entry
393; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm2
394; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
395; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm1
396; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
397; AVX2-NEXT:    retq
398;
399; AVX512-LABEL: sext_16i8_to_16i32:
400; AVX512:       # %bb.0: # %entry
401; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0
402; AVX512-NEXT:    retq
403;
404; X86-SSE2-LABEL: sext_16i8_to_16i32:
405; X86-SSE2:       # %bb.0: # %entry
406; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
407; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
408; X86-SSE2-NEXT:    psrad $24, %xmm4
409; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
410; X86-SSE2-NEXT:    psrad $24, %xmm1
411; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
412; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
413; X86-SSE2-NEXT:    psrad $24, %xmm2
414; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
415; X86-SSE2-NEXT:    psrad $24, %xmm3
416; X86-SSE2-NEXT:    movdqa %xmm4, %xmm0
417; X86-SSE2-NEXT:    retl
418;
419; X86-SSE41-LABEL: sext_16i8_to_16i32:
420; X86-SSE41:       # %bb.0: # %entry
421; X86-SSE41-NEXT:    pmovsxbd %xmm0, %xmm4
422; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
423; X86-SSE41-NEXT:    pmovsxbd %xmm1, %xmm1
424; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
425; X86-SSE41-NEXT:    pmovsxbd %xmm2, %xmm2
426; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
427; X86-SSE41-NEXT:    pmovsxbd %xmm0, %xmm3
428; X86-SSE41-NEXT:    movdqa %xmm4, %xmm0
429; X86-SSE41-NEXT:    retl
430entry:
431  %B = sext <16 x i8> %A to <16 x i32>
432  ret <16 x i32> %B
433}
434
; Sign-extend the low 2 bytes of a <16 x i8> argument to <2 x i64>.
; The shufflevector picks lanes 0-1 before the sext.
; Per the assertions below, the SSE2 and SSSE3 runs widen the bytes to dwords
; (punpcklbw, punpcklwd, psrad $24), build a per-lane sign mask with
; pxor + pcmpgtd, and interleave value and mask with punpckldq. The SSE4.1
; and AVX runs expect a single pmovsxbq / vpmovsxbq.
435define <2 x i64> @sext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp {
436; SSE2-LABEL: sext_16i8_to_2i64:
437; SSE2:       # %bb.0: # %entry
438; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
439; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
440; SSE2-NEXT:    pxor %xmm1, %xmm1
441; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
442; SSE2-NEXT:    psrad $24, %xmm0
443; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
444; SSE2-NEXT:    retq
445;
446; SSSE3-LABEL: sext_16i8_to_2i64:
447; SSSE3:       # %bb.0: # %entry
448; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
449; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
450; SSSE3-NEXT:    pxor %xmm1, %xmm1
451; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
452; SSSE3-NEXT:    psrad $24, %xmm0
453; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
454; SSSE3-NEXT:    retq
455;
456; SSE41-LABEL: sext_16i8_to_2i64:
457; SSE41:       # %bb.0: # %entry
458; SSE41-NEXT:    pmovsxbq %xmm0, %xmm0
459; SSE41-NEXT:    retq
460;
461; AVX-LABEL: sext_16i8_to_2i64:
462; AVX:       # %bb.0: # %entry
463; AVX-NEXT:    vpmovsxbq %xmm0, %xmm0
464; AVX-NEXT:    retq
465;
466; X86-SSE2-LABEL: sext_16i8_to_2i64:
467; X86-SSE2:       # %bb.0: # %entry
468; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
469; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
470; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
471; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
472; X86-SSE2-NEXT:    psrad $24, %xmm0
473; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
474; X86-SSE2-NEXT:    retl
475;
476; X86-SSE41-LABEL: sext_16i8_to_2i64:
477; X86-SSE41:       # %bb.0: # %entry
478; X86-SSE41-NEXT:    pmovsxbq %xmm0, %xmm0
479; X86-SSE41-NEXT:    retl
480entry:
481  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
482  %C = sext <2 x i8> %B to <2 x i64>
483  ret <2 x i64> %C
484}
485
; Sign-extend the low 4 bytes of a <16 x i8> argument to <4 x i64> (256-bit result).
; The shufflevector picks lanes 0-3 before the sext.
; Per the assertions below, the SSE2 and SSSE3 runs widen to dwords, build a
; sign mask with pxor + pcmpgtd, and split into two xmm results with
; punpckldq / punpckhdq. The SSE4.1 and AVX1 runs use two pmovsxbq-style
; extensions, with a 16-bit right shift (psrld $16) to reach bytes 2-3. The
; AVX2 and AVX512 runs expect a single vpmovsxbq from xmm0 into ymm0.
486define <4 x i64> @sext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp {
487; SSE2-LABEL: sext_16i8_to_4i64:
488; SSE2:       # %bb.0: # %entry
489; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
490; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
491; SSE2-NEXT:    psrad $24, %xmm1
492; SSE2-NEXT:    pxor %xmm2, %xmm2
493; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
494; SSE2-NEXT:    movdqa %xmm1, %xmm0
495; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
496; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
497; SSE2-NEXT:    retq
498;
499; SSSE3-LABEL: sext_16i8_to_4i64:
500; SSSE3:       # %bb.0: # %entry
501; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
502; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
503; SSSE3-NEXT:    psrad $24, %xmm1
504; SSSE3-NEXT:    pxor %xmm2, %xmm2
505; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
506; SSSE3-NEXT:    movdqa %xmm1, %xmm0
507; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
508; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
509; SSSE3-NEXT:    retq
510;
511; SSE41-LABEL: sext_16i8_to_4i64:
512; SSE41:       # %bb.0: # %entry
513; SSE41-NEXT:    pmovsxbq %xmm0, %xmm2
514; SSE41-NEXT:    psrld $16, %xmm0
515; SSE41-NEXT:    pmovsxbq %xmm0, %xmm1
516; SSE41-NEXT:    movdqa %xmm2, %xmm0
517; SSE41-NEXT:    retq
518;
519; AVX1-LABEL: sext_16i8_to_4i64:
520; AVX1:       # %bb.0: # %entry
521; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm1
522; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
523; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm0
524; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
525; AVX1-NEXT:    retq
526;
527; AVX2-LABEL: sext_16i8_to_4i64:
528; AVX2:       # %bb.0: # %entry
529; AVX2-NEXT:    vpmovsxbq %xmm0, %ymm0
530; AVX2-NEXT:    retq
531;
532; AVX512-LABEL: sext_16i8_to_4i64:
533; AVX512:       # %bb.0: # %entry
534; AVX512-NEXT:    vpmovsxbq %xmm0, %ymm0
535; AVX512-NEXT:    retq
536;
537; X86-SSE2-LABEL: sext_16i8_to_4i64:
538; X86-SSE2:       # %bb.0: # %entry
539; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
540; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
541; X86-SSE2-NEXT:    psrad $24, %xmm1
542; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
543; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
544; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
545; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
546; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
547; X86-SSE2-NEXT:    retl
548;
549; X86-SSE41-LABEL: sext_16i8_to_4i64:
550; X86-SSE41:       # %bb.0: # %entry
551; X86-SSE41-NEXT:    pmovsxbq %xmm0, %xmm2
552; X86-SSE41-NEXT:    psrld $16, %xmm0
553; X86-SSE41-NEXT:    pmovsxbq %xmm0, %xmm1
554; X86-SSE41-NEXT:    movdqa %xmm2, %xmm0
555; X86-SSE41-NEXT:    retl
556entry:
557  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
558  %C = sext <4 x i8> %B to <4 x i64>
559  ret <4 x i64> %C
560}
561
; Sign-extend the low 8 bytes of a <16 x i8> argument to <8 x i64> (512-bit result).
; The shufflevector picks lanes 0-7 before the sext.
; Per the assertions below, the SSE-level runs produce four xmm results
; (xmm0-xmm3). With SSE2 or SSSE3 the sign masks come from pxor + pcmpgtd and
; are interleaved with punpckldq / punpckhdq. With SSE4.1 four pmovsxbq are
; fed by shifted or shuffled copies of the input. The AVX1 and AVX2 runs
; produce two ymm results, and the AVX512 runs expect a single vpmovsxbq from
; xmm0 into zmm0.
562define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp {
563; SSE2-LABEL: sext_16i8_to_8i64:
564; SSE2:       # %bb.0: # %entry
565; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
566; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
567; SSE2-NEXT:    psrad $24, %xmm1
568; SSE2-NEXT:    pxor %xmm4, %xmm4
569; SSE2-NEXT:    pxor %xmm3, %xmm3
570; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
571; SSE2-NEXT:    movdqa %xmm1, %xmm0
572; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
573; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
574; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
575; SSE2-NEXT:    psrad $24, %xmm3
576; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
577; SSE2-NEXT:    movdqa %xmm3, %xmm2
578; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
579; SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
580; SSE2-NEXT:    retq
581;
582; SSSE3-LABEL: sext_16i8_to_8i64:
583; SSSE3:       # %bb.0: # %entry
584; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
585; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
586; SSSE3-NEXT:    psrad $24, %xmm1
587; SSSE3-NEXT:    pxor %xmm4, %xmm4
588; SSSE3-NEXT:    pxor %xmm3, %xmm3
589; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm3
590; SSSE3-NEXT:    movdqa %xmm1, %xmm0
591; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
592; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
593; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
594; SSSE3-NEXT:    psrad $24, %xmm3
595; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm4
596; SSSE3-NEXT:    movdqa %xmm3, %xmm2
597; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
598; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
599; SSSE3-NEXT:    retq
600;
601; SSE41-LABEL: sext_16i8_to_8i64:
602; SSE41:       # %bb.0: # %entry
603; SSE41-NEXT:    pmovsxbq %xmm0, %xmm4
604; SSE41-NEXT:    movdqa %xmm0, %xmm1
605; SSE41-NEXT:    psrld $16, %xmm1
606; SSE41-NEXT:    pmovsxbq %xmm1, %xmm1
607; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
608; SSE41-NEXT:    pmovsxbq %xmm2, %xmm2
609; SSE41-NEXT:    psrlq $48, %xmm0
610; SSE41-NEXT:    pmovsxbq %xmm0, %xmm3
611; SSE41-NEXT:    movdqa %xmm4, %xmm0
612; SSE41-NEXT:    retq
613;
614; AVX1-LABEL: sext_16i8_to_8i64:
615; AVX1:       # %bb.0: # %entry
616; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm1
617; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm2
618; AVX1-NEXT:    vpmovsxbq %xmm2, %xmm2
619; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
620; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
621; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm1
622; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
623; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm0
624; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
625; AVX1-NEXT:    vmovaps %ymm2, %ymm0
626; AVX1-NEXT:    retq
627;
628; AVX2-LABEL: sext_16i8_to_8i64:
629; AVX2:       # %bb.0: # %entry
630; AVX2-NEXT:    vpmovsxbq %xmm0, %ymm2
631; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
632; AVX2-NEXT:    vpmovsxbq %xmm0, %ymm1
633; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
634; AVX2-NEXT:    retq
635;
636; AVX512-LABEL: sext_16i8_to_8i64:
637; AVX512:       # %bb.0: # %entry
638; AVX512-NEXT:    vpmovsxbq %xmm0, %zmm0
639; AVX512-NEXT:    retq
640;
641; X86-SSE2-LABEL: sext_16i8_to_8i64:
642; X86-SSE2:       # %bb.0: # %entry
643; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
644; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
645; X86-SSE2-NEXT:    psrad $24, %xmm1
646; X86-SSE2-NEXT:    pxor %xmm4, %xmm4
647; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
648; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
649; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
650; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
651; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
652; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
653; X86-SSE2-NEXT:    psrad $24, %xmm3
654; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
655; X86-SSE2-NEXT:    movdqa %xmm3, %xmm2
656; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
657; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
658; X86-SSE2-NEXT:    retl
659;
660; X86-SSE41-LABEL: sext_16i8_to_8i64:
661; X86-SSE41:       # %bb.0: # %entry
662; X86-SSE41-NEXT:    pmovsxbq %xmm0, %xmm4
663; X86-SSE41-NEXT:    movdqa %xmm0, %xmm1
664; X86-SSE41-NEXT:    psrld $16, %xmm1
665; X86-SSE41-NEXT:    pmovsxbq %xmm1, %xmm1
666; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
667; X86-SSE41-NEXT:    pmovsxbq %xmm2, %xmm2
668; X86-SSE41-NEXT:    psrlq $48, %xmm0
669; X86-SSE41-NEXT:    pmovsxbq %xmm0, %xmm3
670; X86-SSE41-NEXT:    movdqa %xmm4, %xmm0
671; X86-SSE41-NEXT:    retl
672entry:
673  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
674  %C = sext <8 x i8> %B to <8 x i64>
675  ret <8 x i64> %C
676}
677
; Sign-extend the low 4 words of an <8 x i16> argument to <4 x i32>.
; The shufflevector picks lanes 0-3 and the sext widens each lane by 16 bits.
; Per the assertions below, the SSE2 and SSSE3 runs expect punpcklwd followed
; by psrad $16, whereas the SSE4.1 and AVX runs expect a single
; pmovsxwd / vpmovsxwd.
678define <4 x i32> @sext_8i16_to_4i32(<8 x i16> %A) nounwind uwtable readnone ssp {
679; SSE2-LABEL: sext_8i16_to_4i32:
680; SSE2:       # %bb.0: # %entry
681; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
682; SSE2-NEXT:    psrad $16, %xmm0
683; SSE2-NEXT:    retq
684;
685; SSSE3-LABEL: sext_8i16_to_4i32:
686; SSSE3:       # %bb.0: # %entry
687; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
688; SSSE3-NEXT:    psrad $16, %xmm0
689; SSSE3-NEXT:    retq
690;
691; SSE41-LABEL: sext_8i16_to_4i32:
692; SSE41:       # %bb.0: # %entry
693; SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
694; SSE41-NEXT:    retq
695;
696; AVX-LABEL: sext_8i16_to_4i32:
697; AVX:       # %bb.0: # %entry
698; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
699; AVX-NEXT:    retq
700;
701; X86-SSE2-LABEL: sext_8i16_to_4i32:
702; X86-SSE2:       # %bb.0: # %entry
703; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
704; X86-SSE2-NEXT:    psrad $16, %xmm0
705; X86-SSE2-NEXT:    retl
706;
707; X86-SSE41-LABEL: sext_8i16_to_4i32:
708; X86-SSE41:       # %bb.0: # %entry
709; X86-SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
710; X86-SSE41-NEXT:    retl
711entry:
712  %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
713  %C = sext <4 x i16> %B to <4 x i32>
714  ret <4 x i32> %C
715}
716
; Sign-extend all eight i16 lanes of %A to <8 x i32> (a 256-bit result).
; Without AVX the checks expect the result as two 128-bit halves, low half in
; xmm0 and high half in xmm1. AVX1 sign-extends each half separately and joins
; them with vinsertf128, while AVX2 and AVX512 use one vpmovsxwd into ymm0.
717define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
718; SSE2-LABEL: sext_8i16_to_8i32:
719; SSE2:       # %bb.0: # %entry
720; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
721; SSE2-NEXT:    psrad $16, %xmm2
722; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
723; SSE2-NEXT:    psrad $16, %xmm1
724; SSE2-NEXT:    movdqa %xmm2, %xmm0
725; SSE2-NEXT:    retq
726;
727; SSSE3-LABEL: sext_8i16_to_8i32:
728; SSSE3:       # %bb.0: # %entry
729; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
730; SSSE3-NEXT:    psrad $16, %xmm2
731; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
732; SSSE3-NEXT:    psrad $16, %xmm1
733; SSSE3-NEXT:    movdqa %xmm2, %xmm0
734; SSSE3-NEXT:    retq
735;
736; SSE41-LABEL: sext_8i16_to_8i32:
737; SSE41:       # %bb.0: # %entry
738; SSE41-NEXT:    pmovsxwd %xmm0, %xmm2
739; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
740; SSE41-NEXT:    pmovsxwd %xmm0, %xmm1
741; SSE41-NEXT:    movdqa %xmm2, %xmm0
742; SSE41-NEXT:    retq
743;
744; AVX1-LABEL: sext_8i16_to_8i32:
745; AVX1:       # %bb.0: # %entry
746; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
747; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
748; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
749; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
750; AVX1-NEXT:    retq
751;
752; AVX2-LABEL: sext_8i16_to_8i32:
753; AVX2:       # %bb.0: # %entry
754; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
755; AVX2-NEXT:    retq
756;
757; AVX512-LABEL: sext_8i16_to_8i32:
758; AVX512:       # %bb.0: # %entry
759; AVX512-NEXT:    vpmovsxwd %xmm0, %ymm0
760; AVX512-NEXT:    retq
761;
762; X86-SSE2-LABEL: sext_8i16_to_8i32:
763; X86-SSE2:       # %bb.0: # %entry
764; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
765; X86-SSE2-NEXT:    psrad $16, %xmm2
766; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
767; X86-SSE2-NEXT:    psrad $16, %xmm1
768; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
769; X86-SSE2-NEXT:    retl
770;
771; X86-SSE41-LABEL: sext_8i16_to_8i32:
772; X86-SSE41:       # %bb.0: # %entry
773; X86-SSE41-NEXT:    pmovsxwd %xmm0, %xmm2
774; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
775; X86-SSE41-NEXT:    pmovsxwd %xmm0, %xmm1
776; X86-SSE41-NEXT:    movdqa %xmm2, %xmm0
777; X86-SSE41-NEXT:    retl
778entry:
779  %B = sext <8 x i16> %A to <8 x i32>
780  ret <8 x i32> %B
781}
782
; Sign-extend sixteen i16 lanes to <16 x i32> (a 512-bit result).
; The checks expect four 128-bit pieces in xmm0-xmm3 without AVX, two 256-bit
; pieces in ymm0/ymm1 with AVX1 or AVX2, and a single vpmovsxwd from ymm0 into
; zmm0 with AVX512.
783define <16 x i32> @sext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone ssp {
784; SSE2-LABEL: sext_16i16_to_16i32:
785; SSE2:       # %bb.0: # %entry
786; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
787; SSE2-NEXT:    psrad $16, %xmm4
788; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
789; SSE2-NEXT:    psrad $16, %xmm5
790; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
791; SSE2-NEXT:    psrad $16, %xmm2
792; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
793; SSE2-NEXT:    psrad $16, %xmm3
794; SSE2-NEXT:    movdqa %xmm4, %xmm0
795; SSE2-NEXT:    movdqa %xmm5, %xmm1
796; SSE2-NEXT:    retq
797;
798; SSSE3-LABEL: sext_16i16_to_16i32:
799; SSSE3:       # %bb.0: # %entry
800; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
801; SSSE3-NEXT:    psrad $16, %xmm4
802; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
803; SSSE3-NEXT:    psrad $16, %xmm5
804; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
805; SSSE3-NEXT:    psrad $16, %xmm2
806; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
807; SSSE3-NEXT:    psrad $16, %xmm3
808; SSSE3-NEXT:    movdqa %xmm4, %xmm0
809; SSSE3-NEXT:    movdqa %xmm5, %xmm1
810; SSSE3-NEXT:    retq
811;
812; SSE41-LABEL: sext_16i16_to_16i32:
813; SSE41:       # %bb.0: # %entry
814; SSE41-NEXT:    pmovsxwd %xmm0, %xmm5
815; SSE41-NEXT:    pmovsxwd %xmm1, %xmm2
816; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
817; SSE41-NEXT:    pmovsxwd %xmm0, %xmm4
818; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
819; SSE41-NEXT:    pmovsxwd %xmm0, %xmm3
820; SSE41-NEXT:    movdqa %xmm5, %xmm0
821; SSE41-NEXT:    movdqa %xmm4, %xmm1
822; SSE41-NEXT:    retq
823;
824; AVX1-LABEL: sext_16i16_to_16i32:
825; AVX1:       # %bb.0: # %entry
826; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
827; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
828; AVX1-NEXT:    vpmovsxwd %xmm2, %xmm2
829; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
830; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
831; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
832; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
833; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
834; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
835; AVX1-NEXT:    vmovaps %ymm2, %ymm0
836; AVX1-NEXT:    retq
837;
838; AVX2-LABEL: sext_16i16_to_16i32:
839; AVX2:       # %bb.0: # %entry
840; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm2
841; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
842; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm1
843; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
844; AVX2-NEXT:    retq
845;
846; AVX512-LABEL: sext_16i16_to_16i32:
847; AVX512:       # %bb.0: # %entry
848; AVX512-NEXT:    vpmovsxwd %ymm0, %zmm0
849; AVX512-NEXT:    retq
850;
851; X86-SSE2-LABEL: sext_16i16_to_16i32:
852; X86-SSE2:       # %bb.0: # %entry
853; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
854; X86-SSE2-NEXT:    psrad $16, %xmm4
855; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
856; X86-SSE2-NEXT:    psrad $16, %xmm5
857; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
858; X86-SSE2-NEXT:    psrad $16, %xmm2
859; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
860; X86-SSE2-NEXT:    psrad $16, %xmm3
861; X86-SSE2-NEXT:    movdqa %xmm4, %xmm0
862; X86-SSE2-NEXT:    movdqa %xmm5, %xmm1
863; X86-SSE2-NEXT:    retl
864;
865; X86-SSE41-LABEL: sext_16i16_to_16i32:
866; X86-SSE41:       # %bb.0: # %entry
867; X86-SSE41-NEXT:    pmovsxwd %xmm0, %xmm5
868; X86-SSE41-NEXT:    pmovsxwd %xmm1, %xmm2
869; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
870; X86-SSE41-NEXT:    pmovsxwd %xmm0, %xmm4
871; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
872; X86-SSE41-NEXT:    pmovsxwd %xmm0, %xmm3
873; X86-SSE41-NEXT:    movdqa %xmm5, %xmm0
874; X86-SSE41-NEXT:    movdqa %xmm4, %xmm1
875; X86-SSE41-NEXT:    retl
876entry:
877  %B = sext <16 x i16> %A to <16 x i32>
878  ret <16 x i32> %B
879}
880
; Sign-extend the low two i16 lanes of %A (selected by the shufflevector) to
; <2 x i64>. With only SSE2 or SSSE3 the checks expect the sign bits to be
; materialised via pcmpgtd against a zeroed register and interleaved with
; punpckldq; from SSE4.1 upwards a single pmovsxwq/vpmovsxwq is expected.
881define <2 x i64> @sext_8i16_to_2i64(<8 x i16> %A) nounwind uwtable readnone ssp {
882; SSE2-LABEL: sext_8i16_to_2i64:
883; SSE2:       # %bb.0: # %entry
884; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
885; SSE2-NEXT:    pxor %xmm1, %xmm1
886; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
887; SSE2-NEXT:    psrad $16, %xmm0
888; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
889; SSE2-NEXT:    retq
890;
891; SSSE3-LABEL: sext_8i16_to_2i64:
892; SSSE3:       # %bb.0: # %entry
893; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
894; SSSE3-NEXT:    pxor %xmm1, %xmm1
895; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
896; SSSE3-NEXT:    psrad $16, %xmm0
897; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
898; SSSE3-NEXT:    retq
899;
900; SSE41-LABEL: sext_8i16_to_2i64:
901; SSE41:       # %bb.0: # %entry
902; SSE41-NEXT:    pmovsxwq %xmm0, %xmm0
903; SSE41-NEXT:    retq
904;
905; AVX-LABEL: sext_8i16_to_2i64:
906; AVX:       # %bb.0: # %entry
907; AVX-NEXT:    vpmovsxwq %xmm0, %xmm0
908; AVX-NEXT:    retq
909;
910; X86-SSE2-LABEL: sext_8i16_to_2i64:
911; X86-SSE2:       # %bb.0: # %entry
912; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
913; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
914; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
915; X86-SSE2-NEXT:    psrad $16, %xmm0
916; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
917; X86-SSE2-NEXT:    retl
918;
919; X86-SSE41-LABEL: sext_8i16_to_2i64:
920; X86-SSE41:       # %bb.0: # %entry
921; X86-SSE41-NEXT:    pmovsxwq %xmm0, %xmm0
922; X86-SSE41-NEXT:    retl
923entry:
924  %B = shufflevector <8 x i16> %A, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
925  %C = sext <2 x i16> %B to <2 x i64>
926  ret <2 x i64> %C
927}
928
; Sign-extend the low four i16 lanes of %A (selected by the shufflevector) to
; <4 x i64> (a 256-bit result). Without AVX the result is checked as two
; 128-bit halves in xmm0/xmm1; AVX1 joins two vpmovsxwq results with
; vinsertf128, and AVX2/AVX512 are expected to use one vpmovsxwq into ymm0.
929define <4 x i64> @sext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp {
930; SSE2-LABEL: sext_8i16_to_4i64:
931; SSE2:       # %bb.0: # %entry
932; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
933; SSE2-NEXT:    psrad $16, %xmm1
934; SSE2-NEXT:    pxor %xmm2, %xmm2
935; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
936; SSE2-NEXT:    movdqa %xmm1, %xmm0
937; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
938; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
939; SSE2-NEXT:    retq
940;
941; SSSE3-LABEL: sext_8i16_to_4i64:
942; SSSE3:       # %bb.0: # %entry
943; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
944; SSSE3-NEXT:    psrad $16, %xmm1
945; SSSE3-NEXT:    pxor %xmm2, %xmm2
946; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
947; SSSE3-NEXT:    movdqa %xmm1, %xmm0
948; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
949; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
950; SSSE3-NEXT:    retq
951;
952; SSE41-LABEL: sext_8i16_to_4i64:
953; SSE41:       # %bb.0: # %entry
954; SSE41-NEXT:    pmovsxwq %xmm0, %xmm2
955; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
956; SSE41-NEXT:    pmovsxwq %xmm0, %xmm1
957; SSE41-NEXT:    movdqa %xmm2, %xmm0
958; SSE41-NEXT:    retq
959;
960; AVX1-LABEL: sext_8i16_to_4i64:
961; AVX1:       # %bb.0: # %entry
962; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm1
963; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
964; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm0
965; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
966; AVX1-NEXT:    retq
967;
968; AVX2-LABEL: sext_8i16_to_4i64:
969; AVX2:       # %bb.0: # %entry
970; AVX2-NEXT:    vpmovsxwq %xmm0, %ymm0
971; AVX2-NEXT:    retq
972;
973; AVX512-LABEL: sext_8i16_to_4i64:
974; AVX512:       # %bb.0: # %entry
975; AVX512-NEXT:    vpmovsxwq %xmm0, %ymm0
976; AVX512-NEXT:    retq
977;
978; X86-SSE2-LABEL: sext_8i16_to_4i64:
979; X86-SSE2:       # %bb.0: # %entry
980; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
981; X86-SSE2-NEXT:    psrad $16, %xmm1
982; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
983; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
984; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
985; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
986; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
987; X86-SSE2-NEXT:    retl
988;
989; X86-SSE41-LABEL: sext_8i16_to_4i64:
990; X86-SSE41:       # %bb.0: # %entry
991; X86-SSE41-NEXT:    pmovsxwq %xmm0, %xmm2
992; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
993; X86-SSE41-NEXT:    pmovsxwq %xmm0, %xmm1
994; X86-SSE41-NEXT:    movdqa %xmm2, %xmm0
995; X86-SSE41-NEXT:    retl
996entry:
997  %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
998  %C = sext <4 x i16> %B to <4 x i64>
999  ret <4 x i64> %C
1000}
1001
; Sign-extend all eight i16 lanes of %A to <8 x i64> (a 512-bit result).
; The checks expect four 128-bit pieces in xmm0-xmm3 without AVX, two 256-bit
; pieces in ymm0/ymm1 with AVX1 or AVX2, and a single vpmovsxwq from xmm0 into
; zmm0 with AVX512.
1002define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp {
1003; SSE2-LABEL: sext_8i16_to_8i64:
1004; SSE2:       # %bb.0: # %entry
1005; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1006; SSE2-NEXT:    psrad $16, %xmm1
1007; SSE2-NEXT:    pxor %xmm5, %xmm5
1008; SSE2-NEXT:    pxor %xmm2, %xmm2
1009; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
1010; SSE2-NEXT:    movdqa %xmm1, %xmm4
1011; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
1012; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1013; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
1014; SSE2-NEXT:    psrad $16, %xmm3
1015; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
1016; SSE2-NEXT:    movdqa %xmm3, %xmm2
1017; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
1018; SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
1019; SSE2-NEXT:    movdqa %xmm4, %xmm0
1020; SSE2-NEXT:    retq
1021;
1022; SSSE3-LABEL: sext_8i16_to_8i64:
1023; SSSE3:       # %bb.0: # %entry
1024; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1025; SSSE3-NEXT:    psrad $16, %xmm1
1026; SSSE3-NEXT:    pxor %xmm5, %xmm5
1027; SSSE3-NEXT:    pxor %xmm2, %xmm2
1028; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
1029; SSSE3-NEXT:    movdqa %xmm1, %xmm4
1030; SSSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
1031; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1032; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
1033; SSSE3-NEXT:    psrad $16, %xmm3
1034; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm5
1035; SSSE3-NEXT:    movdqa %xmm3, %xmm2
1036; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
1037; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
1038; SSSE3-NEXT:    movdqa %xmm4, %xmm0
1039; SSSE3-NEXT:    retq
1040;
1041; SSE41-LABEL: sext_8i16_to_8i64:
1042; SSE41:       # %bb.0: # %entry
1043; SSE41-NEXT:    pmovsxwq %xmm0, %xmm4
1044; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1045; SSE41-NEXT:    pmovsxwq %xmm1, %xmm1
1046; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
1047; SSE41-NEXT:    pmovsxwq %xmm2, %xmm2
1048; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
1049; SSE41-NEXT:    pmovsxwq %xmm0, %xmm3
1050; SSE41-NEXT:    movdqa %xmm4, %xmm0
1051; SSE41-NEXT:    retq
1052;
1053; AVX1-LABEL: sext_8i16_to_8i64:
1054; AVX1:       # %bb.0: # %entry
1055; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm1
1056; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
1057; AVX1-NEXT:    vpmovsxwq %xmm2, %xmm2
1058; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
1059; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1060; AVX1-NEXT:    vpmovsxwq %xmm1, %xmm1
1061; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
1062; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm0
1063; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
1064; AVX1-NEXT:    vmovaps %ymm2, %ymm0
1065; AVX1-NEXT:    retq
1066;
1067; AVX2-LABEL: sext_8i16_to_8i64:
1068; AVX2:       # %bb.0: # %entry
1069; AVX2-NEXT:    vpmovsxwq %xmm0, %ymm2
1070; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1071; AVX2-NEXT:    vpmovsxwq %xmm0, %ymm1
1072; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
1073; AVX2-NEXT:    retq
1074;
1075; AVX512-LABEL: sext_8i16_to_8i64:
1076; AVX512:       # %bb.0: # %entry
1077; AVX512-NEXT:    vpmovsxwq %xmm0, %zmm0
1078; AVX512-NEXT:    retq
1079;
1080; X86-SSE2-LABEL: sext_8i16_to_8i64:
1081; X86-SSE2:       # %bb.0: # %entry
1082; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1083; X86-SSE2-NEXT:    psrad $16, %xmm1
1084; X86-SSE2-NEXT:    pxor %xmm5, %xmm5
1085; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
1086; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
1087; X86-SSE2-NEXT:    movdqa %xmm1, %xmm4
1088; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
1089; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1090; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
1091; X86-SSE2-NEXT:    psrad $16, %xmm3
1092; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
1093; X86-SSE2-NEXT:    movdqa %xmm3, %xmm2
1094; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
1095; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
1096; X86-SSE2-NEXT:    movdqa %xmm4, %xmm0
1097; X86-SSE2-NEXT:    retl
1098;
1099; X86-SSE41-LABEL: sext_8i16_to_8i64:
1100; X86-SSE41:       # %bb.0: # %entry
1101; X86-SSE41-NEXT:    pmovsxwq %xmm0, %xmm4
1102; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1103; X86-SSE41-NEXT:    pmovsxwq %xmm1, %xmm1
1104; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
1105; X86-SSE41-NEXT:    pmovsxwq %xmm2, %xmm2
1106; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
1107; X86-SSE41-NEXT:    pmovsxwq %xmm0, %xmm3
1108; X86-SSE41-NEXT:    movdqa %xmm4, %xmm0
1109; X86-SSE41-NEXT:    retl
1110entry:
1111  %B = sext <8 x i16> %A to <8 x i64>
1112  ret <8 x i64> %B
1113}
1114
; Sign-extend the low two i32 lanes of %A (selected by the shufflevector) to
; <2 x i64>. With only SSE2 or SSSE3 the checks expect a pcmpgtd against a
; zeroed register to build the sign dwords, interleaved by punpckldq; from
; SSE4.1 upwards a single pmovsxdq/vpmovsxdq is expected.
1115define <2 x i64> @sext_4i32_to_2i64(<4 x i32> %A) nounwind uwtable readnone ssp {
1116; SSE2-LABEL: sext_4i32_to_2i64:
1117; SSE2:       # %bb.0: # %entry
1118; SSE2-NEXT:    pxor %xmm1, %xmm1
1119; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
1120; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1121; SSE2-NEXT:    retq
1122;
1123; SSSE3-LABEL: sext_4i32_to_2i64:
1124; SSSE3:       # %bb.0: # %entry
1125; SSSE3-NEXT:    pxor %xmm1, %xmm1
1126; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
1127; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1128; SSSE3-NEXT:    retq
1129;
1130; SSE41-LABEL: sext_4i32_to_2i64:
1131; SSE41:       # %bb.0: # %entry
1132; SSE41-NEXT:    pmovsxdq %xmm0, %xmm0
1133; SSE41-NEXT:    retq
1134;
1135; AVX-LABEL: sext_4i32_to_2i64:
1136; AVX:       # %bb.0: # %entry
1137; AVX-NEXT:    vpmovsxdq %xmm0, %xmm0
1138; AVX-NEXT:    retq
1139;
1140; X86-SSE2-LABEL: sext_4i32_to_2i64:
1141; X86-SSE2:       # %bb.0: # %entry
1142; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
1143; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
1144; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1145; X86-SSE2-NEXT:    retl
1146;
1147; X86-SSE41-LABEL: sext_4i32_to_2i64:
1148; X86-SSE41:       # %bb.0: # %entry
1149; X86-SSE41-NEXT:    pmovsxdq %xmm0, %xmm0
1150; X86-SSE41-NEXT:    retl
1151entry:
1152  %B = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
1153  %C = sext <2 x i32> %B to <2 x i64>
1154  ret <2 x i64> %C
1155}
1156
; Sign-extend all four i32 lanes of %A to <4 x i64> (a 256-bit result).
; Without AVX the result is checked as two 128-bit halves in xmm0/xmm1; AVX1
; joins two vpmovsxdq results with vinsertf128, and AVX2/AVX512 are expected to
; use one vpmovsxdq from xmm0 into ymm0.
1157define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
1158; SSE2-LABEL: sext_4i32_to_4i64:
1159; SSE2:       # %bb.0: # %entry
1160; SSE2-NEXT:    pxor %xmm2, %xmm2
1161; SSE2-NEXT:    pxor %xmm3, %xmm3
1162; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
1163; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1164; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
1165; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
1166; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1167; SSE2-NEXT:    retq
1168;
1169; SSSE3-LABEL: sext_4i32_to_4i64:
1170; SSSE3:       # %bb.0: # %entry
1171; SSSE3-NEXT:    pxor %xmm2, %xmm2
1172; SSSE3-NEXT:    pxor %xmm3, %xmm3
1173; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm3
1174; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1175; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
1176; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
1177; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1178; SSSE3-NEXT:    retq
1179;
1180; SSE41-LABEL: sext_4i32_to_4i64:
1181; SSE41:       # %bb.0: # %entry
1182; SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
1183; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1184; SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
1185; SSE41-NEXT:    movdqa %xmm2, %xmm0
1186; SSE41-NEXT:    retq
1187;
1188; AVX1-LABEL: sext_4i32_to_4i64:
1189; AVX1:       # %bb.0: # %entry
1190; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
1191; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1192; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
1193; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1194; AVX1-NEXT:    retq
1195;
1196; AVX2-LABEL: sext_4i32_to_4i64:
1197; AVX2:       # %bb.0: # %entry
1198; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
1199; AVX2-NEXT:    retq
1200;
1201; AVX512-LABEL: sext_4i32_to_4i64:
1202; AVX512:       # %bb.0: # %entry
1203; AVX512-NEXT:    vpmovsxdq %xmm0, %ymm0
1204; AVX512-NEXT:    retq
1205;
1206; X86-SSE2-LABEL: sext_4i32_to_4i64:
1207; X86-SSE2:       # %bb.0: # %entry
1208; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
1209; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
1210; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
1211; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1212; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
1213; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
1214; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1215; X86-SSE2-NEXT:    retl
1216;
1217; X86-SSE41-LABEL: sext_4i32_to_4i64:
1218; X86-SSE41:       # %bb.0: # %entry
1219; X86-SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
1220; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1221; X86-SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
1222; X86-SSE41-NEXT:    movdqa %xmm2, %xmm0
1223; X86-SSE41-NEXT:    retl
1224entry:
1225  %B = sext <4 x i32> %A to <4 x i64>
1226  ret <4 x i64> %B
1227}
1228
; Sign-extend all eight i32 lanes of %A to <8 x i64> (a 512-bit result).
; The checks expect four 128-bit pieces in xmm0-xmm3 without AVX, two 256-bit
; pieces in ymm0/ymm1 with AVX1 or AVX2, and a single vpmovsxdq from ymm0 into
; zmm0 with AVX512.
1229define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp {
1230; SSE2-LABEL: sext_8i32_to_8i64:
1231; SSE2:       # %bb.0: # %entry
1232; SSE2-NEXT:    movdqa %xmm1, %xmm2
1233; SSE2-NEXT:    pxor %xmm4, %xmm4
1234; SSE2-NEXT:    pxor %xmm3, %xmm3
1235; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
1236; SSE2-NEXT:    pxor %xmm5, %xmm5
1237; SSE2-NEXT:    pcmpgtd %xmm1, %xmm5
1238; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1239; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
1240; SSE2-NEXT:    pxor %xmm3, %xmm3
1241; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
1242; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
1243; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
1244; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
1245; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
1246; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
1247; SSE2-NEXT:    retq
1248;
1249; SSSE3-LABEL: sext_8i32_to_8i64:
1250; SSSE3:       # %bb.0: # %entry
1251; SSSE3-NEXT:    movdqa %xmm1, %xmm2
1252; SSSE3-NEXT:    pxor %xmm4, %xmm4
1253; SSSE3-NEXT:    pxor %xmm3, %xmm3
1254; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm3
1255; SSSE3-NEXT:    pxor %xmm5, %xmm5
1256; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm5
1257; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1258; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
1259; SSSE3-NEXT:    pxor %xmm3, %xmm3
1260; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm3
1261; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
1262; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
1263; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
1264; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm4
1265; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
1266; SSSE3-NEXT:    retq
1267;
1268; SSE41-LABEL: sext_8i32_to_8i64:
1269; SSE41:       # %bb.0: # %entry
1270; SSE41-NEXT:    pmovsxdq %xmm0, %xmm5
1271; SSE41-NEXT:    pmovsxdq %xmm1, %xmm2
1272; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1273; SSE41-NEXT:    pmovsxdq %xmm0, %xmm4
1274; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
1275; SSE41-NEXT:    pmovsxdq %xmm0, %xmm3
1276; SSE41-NEXT:    movdqa %xmm5, %xmm0
1277; SSE41-NEXT:    movdqa %xmm4, %xmm1
1278; SSE41-NEXT:    retq
1279;
1280; AVX1-LABEL: sext_8i32_to_8i64:
1281; AVX1:       # %bb.0: # %entry
1282; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
1283; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
1284; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm2
1285; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
1286; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1287; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
1288; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1289; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
1290; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
1291; AVX1-NEXT:    vmovaps %ymm2, %ymm0
1292; AVX1-NEXT:    retq
1293;
1294; AVX2-LABEL: sext_8i32_to_8i64:
1295; AVX2:       # %bb.0: # %entry
1296; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm2
1297; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
1298; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm1
1299; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
1300; AVX2-NEXT:    retq
1301;
1302; AVX512-LABEL: sext_8i32_to_8i64:
1303; AVX512:       # %bb.0: # %entry
1304; AVX512-NEXT:    vpmovsxdq %ymm0, %zmm0
1305; AVX512-NEXT:    retq
1306;
1307; X86-SSE2-LABEL: sext_8i32_to_8i64:
1308; X86-SSE2:       # %bb.0: # %entry
1309; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
1310; X86-SSE2-NEXT:    pxor %xmm4, %xmm4
1311; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
1312; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
1313; X86-SSE2-NEXT:    pxor %xmm5, %xmm5
1314; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm5
1315; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1316; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
1317; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
1318; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
1319; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
1320; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
1321; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
1322; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
1323; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
1324; X86-SSE2-NEXT:    retl
1325;
1326; X86-SSE41-LABEL: sext_8i32_to_8i64:
1327; X86-SSE41:       # %bb.0: # %entry
1328; X86-SSE41-NEXT:    pmovsxdq %xmm0, %xmm5
1329; X86-SSE41-NEXT:    pmovsxdq %xmm1, %xmm2
1330; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1331; X86-SSE41-NEXT:    pmovsxdq %xmm0, %xmm4
1332; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
1333; X86-SSE41-NEXT:    pmovsxdq %xmm0, %xmm3
1334; X86-SSE41-NEXT:    movdqa %xmm5, %xmm0
1335; X86-SSE41-NEXT:    movdqa %xmm4, %xmm1
1336; X86-SSE41-NEXT:    retl
1337entry:
1338  %B = sext <8 x i32> %A to <8 x i64>
1339  ret <8 x i64> %B
1340}
1341
; Load a <2 x i1> from %ptr and sign-extend it to <2 x i64>.
; Without AVX512 the checks expect the two bits to be separated with scalar
; code (shrb / andl $1), negated, and then assembled into a vector. With AVX512
; the memory operand is loaded into mask register k1 by kmovw and expanded by a
; zero-masked vpternlogq $255.
1342define <2 x i64> @load_sext_2i1_to_2i64(<2 x i1> *%ptr) {
1343; SSE-LABEL: load_sext_2i1_to_2i64:
1344; SSE:       # %bb.0: # %entry
1345; SSE-NEXT:    movb (%rdi), %al
1346; SSE-NEXT:    movzbl %al, %ecx
1347; SSE-NEXT:    shrb %al
1348; SSE-NEXT:    movzbl %al, %eax
1349; SSE-NEXT:    negq %rax
1350; SSE-NEXT:    movq %rax, %xmm1
1351; SSE-NEXT:    andl $1, %ecx
1352; SSE-NEXT:    negq %rcx
1353; SSE-NEXT:    movq %rcx, %xmm0
1354; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1355; SSE-NEXT:    retq
1356;
1357; AVX1-LABEL: load_sext_2i1_to_2i64:
1358; AVX1:       # %bb.0: # %entry
1359; AVX1-NEXT:    movb (%rdi), %al
1360; AVX1-NEXT:    movzbl %al, %ecx
1361; AVX1-NEXT:    shrb %al
1362; AVX1-NEXT:    movzbl %al, %eax
1363; AVX1-NEXT:    negq %rax
1364; AVX1-NEXT:    vmovq %rax, %xmm0
1365; AVX1-NEXT:    andl $1, %ecx
1366; AVX1-NEXT:    negq %rcx
1367; AVX1-NEXT:    vmovq %rcx, %xmm1
1368; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1369; AVX1-NEXT:    retq
1370;
1371; AVX2-LABEL: load_sext_2i1_to_2i64:
1372; AVX2:       # %bb.0: # %entry
1373; AVX2-NEXT:    movb (%rdi), %al
1374; AVX2-NEXT:    movzbl %al, %ecx
1375; AVX2-NEXT:    shrb %al
1376; AVX2-NEXT:    movzbl %al, %eax
1377; AVX2-NEXT:    negq %rax
1378; AVX2-NEXT:    vmovq %rax, %xmm0
1379; AVX2-NEXT:    andl $1, %ecx
1380; AVX2-NEXT:    negq %rcx
1381; AVX2-NEXT:    vmovq %rcx, %xmm1
1382; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1383; AVX2-NEXT:    retq
1384;
1385; AVX512-LABEL: load_sext_2i1_to_2i64:
1386; AVX512:       # %bb.0: # %entry
1387; AVX512-NEXT:    kmovw (%rdi), %k1
1388; AVX512-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1389; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1390; AVX512-NEXT:    vzeroupper
1391; AVX512-NEXT:    retq
1392;
1393; X86-SSE2-LABEL: load_sext_2i1_to_2i64:
1394; X86-SSE2:       # %bb.0: # %entry
1395; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
1396; X86-SSE2-NEXT:    movb (%eax), %al
1397; X86-SSE2-NEXT:    movzbl %al, %ecx
1398; X86-SSE2-NEXT:    shrb %al
1399; X86-SSE2-NEXT:    movzbl %al, %eax
1400; X86-SSE2-NEXT:    negl %eax
1401; X86-SSE2-NEXT:    movd %eax, %xmm0
1402; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
1403; X86-SSE2-NEXT:    andl $1, %ecx
1404; X86-SSE2-NEXT:    negl %ecx
1405; X86-SSE2-NEXT:    movd %ecx, %xmm0
1406; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
1407; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1408; X86-SSE2-NEXT:    retl
1409;
1410; X86-SSE41-LABEL: load_sext_2i1_to_2i64:
1411; X86-SSE41:       # %bb.0: # %entry
1412; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1413; X86-SSE41-NEXT:    movb (%eax), %al
1414; X86-SSE41-NEXT:    movzbl %al, %ecx
1415; X86-SSE41-NEXT:    andl $1, %ecx
1416; X86-SSE41-NEXT:    negl %ecx
1417; X86-SSE41-NEXT:    movd %ecx, %xmm0
1418; X86-SSE41-NEXT:    pinsrd $1, %ecx, %xmm0
1419; X86-SSE41-NEXT:    shrb %al
1420; X86-SSE41-NEXT:    movzbl %al, %eax
1421; X86-SSE41-NEXT:    negl %eax
1422; X86-SSE41-NEXT:    pinsrd $2, %eax, %xmm0
1423; X86-SSE41-NEXT:    pinsrd $3, %eax, %xmm0
1424; X86-SSE41-NEXT:    retl
1425entry:
1426 %X = load <2 x i1>, <2 x i1>* %ptr
1427 %Y = sext <2 x i1> %X to <2 x i64>
1428 ret <2 x i64> %Y
1429}
1430
; Load a <2 x i8> from %ptr and sign-extend it to <2 x i64>.
; From SSE4.1 upwards the checks expect the load to be folded into the memory
; operand of pmovsxbq/vpmovsxbq. With only SSE2 or SSSE3 the two bytes are read
; with movzwl and widened through punpcklbw/punpcklwd, psrad $24 and pcmpgtd.
1431define <2 x i64> @load_sext_2i8_to_2i64(<2 x i8> *%ptr) {
1432; SSE2-LABEL: load_sext_2i8_to_2i64:
1433; SSE2:       # %bb.0: # %entry
1434; SSE2-NEXT:    movzwl (%rdi), %eax
1435; SSE2-NEXT:    movd %eax, %xmm0
1436; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1437; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1438; SSE2-NEXT:    pxor %xmm1, %xmm1
1439; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
1440; SSE2-NEXT:    psrad $24, %xmm0
1441; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1442; SSE2-NEXT:    retq
1443;
1444; SSSE3-LABEL: load_sext_2i8_to_2i64:
1445; SSSE3:       # %bb.0: # %entry
1446; SSSE3-NEXT:    movzwl (%rdi), %eax
1447; SSSE3-NEXT:    movd %eax, %xmm0
1448; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1449; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1450; SSSE3-NEXT:    pxor %xmm1, %xmm1
1451; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
1452; SSSE3-NEXT:    psrad $24, %xmm0
1453; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1454; SSSE3-NEXT:    retq
1455;
1456; SSE41-LABEL: load_sext_2i8_to_2i64:
1457; SSE41:       # %bb.0: # %entry
1458; SSE41-NEXT:    pmovsxbq (%rdi), %xmm0
1459; SSE41-NEXT:    retq
1460;
1461; AVX-LABEL: load_sext_2i8_to_2i64:
1462; AVX:       # %bb.0: # %entry
1463; AVX-NEXT:    vpmovsxbq (%rdi), %xmm0
1464; AVX-NEXT:    retq
1465;
1466; X86-SSE2-LABEL: load_sext_2i8_to_2i64:
1467; X86-SSE2:       # %bb.0: # %entry
1468; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
1469; X86-SSE2-NEXT:    movzwl (%eax), %eax
1470; X86-SSE2-NEXT:    movd %eax, %xmm0
1471; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1472; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1473; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
1474; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
1475; X86-SSE2-NEXT:    psrad $24, %xmm0
1476; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1477; X86-SSE2-NEXT:    retl
1478;
1479; X86-SSE41-LABEL: load_sext_2i8_to_2i64:
1480; X86-SSE41:       # %bb.0: # %entry
1481; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1482; X86-SSE41-NEXT:    pmovsxbq (%eax), %xmm0
1483; X86-SSE41-NEXT:    retl
1484entry:
1485 %X = load <2 x i8>, <2 x i8>* %ptr
1486 %Y = sext <2 x i8> %X to <2 x i64>
1487 ret <2 x i64> %Y
1488}
1489
; Loads a <4 x i1> vector from %ptr and sign-extends it to <4 x i32>.
; Expected code per the checks below: without AVX512 the byte is loaded into
; a GPR and each bit is isolated with scalar shrb/andl/negl, then the lanes
; are assembled with movd + punpckldq/punpcklqdq (SSE2 and SSSE3) or with
; pinsrd/vpinsrd (SSE4.1, AVX1 and AVX2). The AVX512 runs load the mask with
; kmovw and materialize the result with a zero-masked vpternlogd.
; Check lines are autogenerated (see the note at the top of this file);
; regenerate them rather than editing by hand.
1490define <4 x i32> @load_sext_4i1_to_4i32(<4 x i1> *%ptr) {
1491; SSE2-LABEL: load_sext_4i1_to_4i32:
1492; SSE2:       # %bb.0: # %entry
1493; SSE2-NEXT:    movb (%rdi), %al
1494; SSE2-NEXT:    movl %eax, %ecx
1495; SSE2-NEXT:    shrb $3, %cl
1496; SSE2-NEXT:    movzbl %cl, %ecx
1497; SSE2-NEXT:    negl %ecx
1498; SSE2-NEXT:    movd %ecx, %xmm0
1499; SSE2-NEXT:    movzbl %al, %ecx
1500; SSE2-NEXT:    shrb $2, %al
1501; SSE2-NEXT:    movzbl %al, %eax
1502; SSE2-NEXT:    andl $1, %eax
1503; SSE2-NEXT:    negl %eax
1504; SSE2-NEXT:    movd %eax, %xmm1
1505; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1506; SSE2-NEXT:    movl %ecx, %eax
1507; SSE2-NEXT:    andl $1, %eax
1508; SSE2-NEXT:    negl %eax
1509; SSE2-NEXT:    movd %eax, %xmm0
1510; SSE2-NEXT:    shrb %cl
1511; SSE2-NEXT:    movzbl %cl, %eax
1512; SSE2-NEXT:    andl $1, %eax
1513; SSE2-NEXT:    negl %eax
1514; SSE2-NEXT:    movd %eax, %xmm2
1515; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1516; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1517; SSE2-NEXT:    retq
1518;
1519; SSSE3-LABEL: load_sext_4i1_to_4i32:
1520; SSSE3:       # %bb.0: # %entry
1521; SSSE3-NEXT:    movb (%rdi), %al
1522; SSSE3-NEXT:    movl %eax, %ecx
1523; SSSE3-NEXT:    shrb $3, %cl
1524; SSSE3-NEXT:    movzbl %cl, %ecx
1525; SSSE3-NEXT:    negl %ecx
1526; SSSE3-NEXT:    movd %ecx, %xmm0
1527; SSSE3-NEXT:    movzbl %al, %ecx
1528; SSSE3-NEXT:    shrb $2, %al
1529; SSSE3-NEXT:    movzbl %al, %eax
1530; SSSE3-NEXT:    andl $1, %eax
1531; SSSE3-NEXT:    negl %eax
1532; SSSE3-NEXT:    movd %eax, %xmm1
1533; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1534; SSSE3-NEXT:    movl %ecx, %eax
1535; SSSE3-NEXT:    andl $1, %eax
1536; SSSE3-NEXT:    negl %eax
1537; SSSE3-NEXT:    movd %eax, %xmm0
1538; SSSE3-NEXT:    shrb %cl
1539; SSSE3-NEXT:    movzbl %cl, %eax
1540; SSSE3-NEXT:    andl $1, %eax
1541; SSSE3-NEXT:    negl %eax
1542; SSSE3-NEXT:    movd %eax, %xmm2
1543; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1544; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1545; SSSE3-NEXT:    retq
1546;
1547; SSE41-LABEL: load_sext_4i1_to_4i32:
1548; SSE41:       # %bb.0: # %entry
1549; SSE41-NEXT:    movb (%rdi), %al
1550; SSE41-NEXT:    movzbl %al, %ecx
1551; SSE41-NEXT:    shrb %al
1552; SSE41-NEXT:    movzbl %al, %eax
1553; SSE41-NEXT:    andl $1, %eax
1554; SSE41-NEXT:    negl %eax
1555; SSE41-NEXT:    movl %ecx, %edx
1556; SSE41-NEXT:    andl $1, %edx
1557; SSE41-NEXT:    negl %edx
1558; SSE41-NEXT:    movd %edx, %xmm0
1559; SSE41-NEXT:    pinsrd $1, %eax, %xmm0
1560; SSE41-NEXT:    movl %ecx, %eax
1561; SSE41-NEXT:    shrb $2, %al
1562; SSE41-NEXT:    movzbl %al, %eax
1563; SSE41-NEXT:    andl $1, %eax
1564; SSE41-NEXT:    negl %eax
1565; SSE41-NEXT:    pinsrd $2, %eax, %xmm0
1566; SSE41-NEXT:    shrb $3, %cl
1567; SSE41-NEXT:    movzbl %cl, %eax
1568; SSE41-NEXT:    negl %eax
1569; SSE41-NEXT:    pinsrd $3, %eax, %xmm0
1570; SSE41-NEXT:    retq
1571;
1572; AVX1-LABEL: load_sext_4i1_to_4i32:
1573; AVX1:       # %bb.0: # %entry
1574; AVX1-NEXT:    movb (%rdi), %al
1575; AVX1-NEXT:    movzbl %al, %ecx
1576; AVX1-NEXT:    shrb %al
1577; AVX1-NEXT:    movzbl %al, %eax
1578; AVX1-NEXT:    andl $1, %eax
1579; AVX1-NEXT:    negl %eax
1580; AVX1-NEXT:    movl %ecx, %edx
1581; AVX1-NEXT:    andl $1, %edx
1582; AVX1-NEXT:    negl %edx
1583; AVX1-NEXT:    vmovd %edx, %xmm0
1584; AVX1-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
1585; AVX1-NEXT:    movl %ecx, %eax
1586; AVX1-NEXT:    shrb $2, %al
1587; AVX1-NEXT:    movzbl %al, %eax
1588; AVX1-NEXT:    andl $1, %eax
1589; AVX1-NEXT:    negl %eax
1590; AVX1-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
1591; AVX1-NEXT:    shrb $3, %cl
1592; AVX1-NEXT:    movzbl %cl, %eax
1593; AVX1-NEXT:    negl %eax
1594; AVX1-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
1595; AVX1-NEXT:    retq
1596;
1597; AVX2-LABEL: load_sext_4i1_to_4i32:
1598; AVX2:       # %bb.0: # %entry
1599; AVX2-NEXT:    movb (%rdi), %al
1600; AVX2-NEXT:    movzbl %al, %ecx
1601; AVX2-NEXT:    shrb %al
1602; AVX2-NEXT:    movzbl %al, %eax
1603; AVX2-NEXT:    andl $1, %eax
1604; AVX2-NEXT:    negl %eax
1605; AVX2-NEXT:    movl %ecx, %edx
1606; AVX2-NEXT:    andl $1, %edx
1607; AVX2-NEXT:    negl %edx
1608; AVX2-NEXT:    vmovd %edx, %xmm0
1609; AVX2-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
1610; AVX2-NEXT:    movl %ecx, %eax
1611; AVX2-NEXT:    shrb $2, %al
1612; AVX2-NEXT:    movzbl %al, %eax
1613; AVX2-NEXT:    andl $1, %eax
1614; AVX2-NEXT:    negl %eax
1615; AVX2-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
1616; AVX2-NEXT:    shrb $3, %cl
1617; AVX2-NEXT:    movzbl %cl, %eax
1618; AVX2-NEXT:    negl %eax
1619; AVX2-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
1620; AVX2-NEXT:    retq
1621;
1622; AVX512-LABEL: load_sext_4i1_to_4i32:
1623; AVX512:       # %bb.0: # %entry
1624; AVX512-NEXT:    kmovw (%rdi), %k1
1625; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1626; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1627; AVX512-NEXT:    vzeroupper
1628; AVX512-NEXT:    retq
1629;
1630; X86-SSE2-LABEL: load_sext_4i1_to_4i32:
1631; X86-SSE2:       # %bb.0: # %entry
1632; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
1633; X86-SSE2-NEXT:    movb (%eax), %al
1634; X86-SSE2-NEXT:    movl %eax, %ecx
1635; X86-SSE2-NEXT:    shrb $3, %cl
1636; X86-SSE2-NEXT:    movzbl %cl, %ecx
1637; X86-SSE2-NEXT:    negl %ecx
1638; X86-SSE2-NEXT:    movd %ecx, %xmm0
1639; X86-SSE2-NEXT:    movl %eax, %ecx
1640; X86-SSE2-NEXT:    shrb $2, %cl
1641; X86-SSE2-NEXT:    movzbl %cl, %ecx
1642; X86-SSE2-NEXT:    andl $1, %ecx
1643; X86-SSE2-NEXT:    negl %ecx
1644; X86-SSE2-NEXT:    movd %ecx, %xmm1
1645; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1646; X86-SSE2-NEXT:    movzbl %al, %ecx
1647; X86-SSE2-NEXT:    andl $1, %ecx
1648; X86-SSE2-NEXT:    negl %ecx
1649; X86-SSE2-NEXT:    movd %ecx, %xmm0
1650; X86-SSE2-NEXT:    shrb %al
1651; X86-SSE2-NEXT:    movzbl %al, %eax
1652; X86-SSE2-NEXT:    andl $1, %eax
1653; X86-SSE2-NEXT:    negl %eax
1654; X86-SSE2-NEXT:    movd %eax, %xmm2
1655; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1656; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1657; X86-SSE2-NEXT:    retl
1658;
1659; X86-SSE41-LABEL: load_sext_4i1_to_4i32:
1660; X86-SSE41:       # %bb.0: # %entry
1661; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1662; X86-SSE41-NEXT:    movb (%eax), %al
1663; X86-SSE41-NEXT:    movl %eax, %ecx
1664; X86-SSE41-NEXT:    shrb %cl
1665; X86-SSE41-NEXT:    movzbl %cl, %ecx
1666; X86-SSE41-NEXT:    andl $1, %ecx
1667; X86-SSE41-NEXT:    negl %ecx
1668; X86-SSE41-NEXT:    movzbl %al, %edx
1669; X86-SSE41-NEXT:    andl $1, %edx
1670; X86-SSE41-NEXT:    negl %edx
1671; X86-SSE41-NEXT:    movd %edx, %xmm0
1672; X86-SSE41-NEXT:    pinsrd $1, %ecx, %xmm0
1673; X86-SSE41-NEXT:    movl %eax, %ecx
1674; X86-SSE41-NEXT:    shrb $2, %cl
1675; X86-SSE41-NEXT:    movzbl %cl, %ecx
1676; X86-SSE41-NEXT:    andl $1, %ecx
1677; X86-SSE41-NEXT:    negl %ecx
1678; X86-SSE41-NEXT:    pinsrd $2, %ecx, %xmm0
1679; X86-SSE41-NEXT:    shrb $3, %al
1680; X86-SSE41-NEXT:    movzbl %al, %eax
1681; X86-SSE41-NEXT:    negl %eax
1682; X86-SSE41-NEXT:    pinsrd $3, %eax, %xmm0
1683; X86-SSE41-NEXT:    retl
1684entry:
1685 %X = load <4 x i1>, <4 x i1>* %ptr
1686 %Y = sext <4 x i1> %X to <4 x i32>
1687 ret <4 x i32> %Y
1688}
1689
; Loads a <4 x i8> vector from %ptr and sign-extends it to <4 x i32>.
; Expected code per the checks below: the SSE4.1 and AVX runs fold the load
; into pmovsxbd/vpmovsxbd; the SSE2 and SSSE3 runs load 32 bits with movd,
; move each byte to the top of its dword with punpcklbw + punpcklwd, and
; shift it back down arithmetically with psrad $24.
; Check lines are autogenerated (see the note at the top of this file);
; regenerate them rather than editing by hand.
1690define <4 x i32> @load_sext_4i8_to_4i32(<4 x i8> *%ptr) {
1691; SSE2-LABEL: load_sext_4i8_to_4i32:
1692; SSE2:       # %bb.0: # %entry
1693; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1694; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1695; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1696; SSE2-NEXT:    psrad $24, %xmm0
1697; SSE2-NEXT:    retq
1698;
1699; SSSE3-LABEL: load_sext_4i8_to_4i32:
1700; SSSE3:       # %bb.0: # %entry
1701; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1702; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1703; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1704; SSSE3-NEXT:    psrad $24, %xmm0
1705; SSSE3-NEXT:    retq
1706;
1707; SSE41-LABEL: load_sext_4i8_to_4i32:
1708; SSE41:       # %bb.0: # %entry
1709; SSE41-NEXT:    pmovsxbd (%rdi), %xmm0
1710; SSE41-NEXT:    retq
1711;
1712; AVX-LABEL: load_sext_4i8_to_4i32:
1713; AVX:       # %bb.0: # %entry
1714; AVX-NEXT:    vpmovsxbd (%rdi), %xmm0
1715; AVX-NEXT:    retq
1716;
1717; X86-SSE2-LABEL: load_sext_4i8_to_4i32:
1718; X86-SSE2:       # %bb.0: # %entry
1719; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
1720; X86-SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1721; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1722; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1723; X86-SSE2-NEXT:    psrad $24, %xmm0
1724; X86-SSE2-NEXT:    retl
1725;
1726; X86-SSE41-LABEL: load_sext_4i8_to_4i32:
1727; X86-SSE41:       # %bb.0: # %entry
1728; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1729; X86-SSE41-NEXT:    pmovsxbd (%eax), %xmm0
1730; X86-SSE41-NEXT:    retl
1731entry:
1732 %X = load <4 x i8>, <4 x i8>* %ptr
1733 %Y = sext <4 x i8> %X to <4 x i32>
1734 ret <4 x i32> %Y
1735}
1736
; Loads a <4 x i1> vector from %ptr and sign-extends it to <4 x i64>.
; Expected code per the checks below:
;  * The SSE runs (64- and 32-bit) insert the four bits into dword lanes with
;    pinsrw (SSE2 and SSSE3) or pinsrb (SSE4.1), then sign-fill each 64-bit
;    lane via psllq $63 + psrad $31 + pshufd, leaving the two result halves
;    in xmm0 and xmm1.
;  * AVX1 builds a <4 x i32> mask with scalar negl + vpinsrd, widens both
;    halves with vpmovsxdq and joins them with vinsertf128.
;  * AVX2 negates each bit as a 64-bit scalar (negq) and joins the lanes
;    with vmovq + vpunpcklqdq + vinserti128.
;  * The AVX512 runs use kmovw plus a zero-masked vpternlogq.
; Check lines are autogenerated (see the note at the top of this file);
; regenerate them rather than editing by hand.
1737define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) {
1738; SSE2-LABEL: load_sext_4i1_to_4i64:
1739; SSE2:       # %bb.0: # %entry
1740; SSE2-NEXT:    movb (%rdi), %al
1741; SSE2-NEXT:    movl %eax, %ecx
1742; SSE2-NEXT:    shrb %cl
1743; SSE2-NEXT:    andb $1, %cl
1744; SSE2-NEXT:    movzbl %cl, %ecx
1745; SSE2-NEXT:    movl %eax, %edx
1746; SSE2-NEXT:    andb $1, %dl
1747; SSE2-NEXT:    movzbl %dl, %edx
1748; SSE2-NEXT:    movd %edx, %xmm1
1749; SSE2-NEXT:    pinsrw $2, %ecx, %xmm1
1750; SSE2-NEXT:    movl %eax, %ecx
1751; SSE2-NEXT:    shrb $2, %cl
1752; SSE2-NEXT:    andb $1, %cl
1753; SSE2-NEXT:    movzbl %cl, %ecx
1754; SSE2-NEXT:    pinsrw $4, %ecx, %xmm1
1755; SSE2-NEXT:    shrb $3, %al
1756; SSE2-NEXT:    movzbl %al, %eax
1757; SSE2-NEXT:    pinsrw $6, %eax, %xmm1
1758; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
1759; SSE2-NEXT:    psllq $63, %xmm0
1760; SSE2-NEXT:    psrad $31, %xmm0
1761; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1762; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
1763; SSE2-NEXT:    psllq $63, %xmm1
1764; SSE2-NEXT:    psrad $31, %xmm1
1765; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1766; SSE2-NEXT:    retq
1767;
1768; SSSE3-LABEL: load_sext_4i1_to_4i64:
1769; SSSE3:       # %bb.0: # %entry
1770; SSSE3-NEXT:    movb (%rdi), %al
1771; SSSE3-NEXT:    movl %eax, %ecx
1772; SSSE3-NEXT:    shrb %cl
1773; SSSE3-NEXT:    andb $1, %cl
1774; SSSE3-NEXT:    movzbl %cl, %ecx
1775; SSSE3-NEXT:    movl %eax, %edx
1776; SSSE3-NEXT:    andb $1, %dl
1777; SSSE3-NEXT:    movzbl %dl, %edx
1778; SSSE3-NEXT:    movd %edx, %xmm1
1779; SSSE3-NEXT:    pinsrw $2, %ecx, %xmm1
1780; SSSE3-NEXT:    movl %eax, %ecx
1781; SSSE3-NEXT:    shrb $2, %cl
1782; SSSE3-NEXT:    andb $1, %cl
1783; SSSE3-NEXT:    movzbl %cl, %ecx
1784; SSSE3-NEXT:    pinsrw $4, %ecx, %xmm1
1785; SSSE3-NEXT:    shrb $3, %al
1786; SSSE3-NEXT:    movzbl %al, %eax
1787; SSSE3-NEXT:    pinsrw $6, %eax, %xmm1
1788; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
1789; SSSE3-NEXT:    psllq $63, %xmm0
1790; SSSE3-NEXT:    psrad $31, %xmm0
1791; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1792; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
1793; SSSE3-NEXT:    psllq $63, %xmm1
1794; SSSE3-NEXT:    psrad $31, %xmm1
1795; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1796; SSSE3-NEXT:    retq
1797;
1798; SSE41-LABEL: load_sext_4i1_to_4i64:
1799; SSE41:       # %bb.0: # %entry
1800; SSE41-NEXT:    movb (%rdi), %al
1801; SSE41-NEXT:    movl %eax, %ecx
1802; SSE41-NEXT:    shrb %cl
1803; SSE41-NEXT:    andb $1, %cl
1804; SSE41-NEXT:    movzbl %cl, %ecx
1805; SSE41-NEXT:    movl %eax, %edx
1806; SSE41-NEXT:    andb $1, %dl
1807; SSE41-NEXT:    movzbl %dl, %edx
1808; SSE41-NEXT:    movd %edx, %xmm1
1809; SSE41-NEXT:    pinsrb $4, %ecx, %xmm1
1810; SSE41-NEXT:    movl %eax, %ecx
1811; SSE41-NEXT:    shrb $2, %cl
1812; SSE41-NEXT:    andb $1, %cl
1813; SSE41-NEXT:    movzbl %cl, %ecx
1814; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1815; SSE41-NEXT:    pinsrb $8, %ecx, %xmm1
1816; SSE41-NEXT:    shrb $3, %al
1817; SSE41-NEXT:    movzbl %al, %eax
1818; SSE41-NEXT:    pinsrb $12, %eax, %xmm1
1819; SSE41-NEXT:    psllq $63, %xmm0
1820; SSE41-NEXT:    psrad $31, %xmm0
1821; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1822; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
1823; SSE41-NEXT:    psllq $63, %xmm1
1824; SSE41-NEXT:    psrad $31, %xmm1
1825; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1826; SSE41-NEXT:    retq
1827;
1828; AVX1-LABEL: load_sext_4i1_to_4i64:
1829; AVX1:       # %bb.0: # %entry
1830; AVX1-NEXT:    movb (%rdi), %al
1831; AVX1-NEXT:    movzbl %al, %ecx
1832; AVX1-NEXT:    shrb %al
1833; AVX1-NEXT:    movzbl %al, %eax
1834; AVX1-NEXT:    andl $1, %eax
1835; AVX1-NEXT:    negl %eax
1836; AVX1-NEXT:    movl %ecx, %edx
1837; AVX1-NEXT:    andl $1, %edx
1838; AVX1-NEXT:    negl %edx
1839; AVX1-NEXT:    vmovd %edx, %xmm0
1840; AVX1-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
1841; AVX1-NEXT:    movl %ecx, %eax
1842; AVX1-NEXT:    shrb $2, %al
1843; AVX1-NEXT:    movzbl %al, %eax
1844; AVX1-NEXT:    andl $1, %eax
1845; AVX1-NEXT:    negl %eax
1846; AVX1-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
1847; AVX1-NEXT:    shrb $3, %cl
1848; AVX1-NEXT:    movzbl %cl, %eax
1849; AVX1-NEXT:    negl %eax
1850; AVX1-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
1851; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
1852; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1853; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
1854; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1855; AVX1-NEXT:    retq
1856;
1857; AVX2-LABEL: load_sext_4i1_to_4i64:
1858; AVX2:       # %bb.0: # %entry
1859; AVX2-NEXT:    movb (%rdi), %al
1860; AVX2-NEXT:    movl %eax, %ecx
1861; AVX2-NEXT:    shrb $3, %cl
1862; AVX2-NEXT:    movzbl %cl, %ecx
1863; AVX2-NEXT:    negq %rcx
1864; AVX2-NEXT:    vmovq %rcx, %xmm0
1865; AVX2-NEXT:    movzbl %al, %ecx
1866; AVX2-NEXT:    shrb $2, %al
1867; AVX2-NEXT:    movzbl %al, %eax
1868; AVX2-NEXT:    andl $1, %eax
1869; AVX2-NEXT:    negq %rax
1870; AVX2-NEXT:    vmovq %rax, %xmm1
1871; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1872; AVX2-NEXT:    movl %ecx, %eax
1873; AVX2-NEXT:    andl $1, %eax
1874; AVX2-NEXT:    negq %rax
1875; AVX2-NEXT:    vmovq %rax, %xmm1
1876; AVX2-NEXT:    shrb %cl
1877; AVX2-NEXT:    movzbl %cl, %eax
1878; AVX2-NEXT:    andl $1, %eax
1879; AVX2-NEXT:    negq %rax
1880; AVX2-NEXT:    vmovq %rax, %xmm2
1881; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1882; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
1883; AVX2-NEXT:    retq
1884;
1885; AVX512-LABEL: load_sext_4i1_to_4i64:
1886; AVX512:       # %bb.0: # %entry
1887; AVX512-NEXT:    kmovw (%rdi), %k1
1888; AVX512-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1889; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1890; AVX512-NEXT:    retq
1891;
1892; X86-SSE2-LABEL: load_sext_4i1_to_4i64:
1893; X86-SSE2:       # %bb.0: # %entry
1894; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
1895; X86-SSE2-NEXT:    movb (%eax), %al
1896; X86-SSE2-NEXT:    movl %eax, %ecx
1897; X86-SSE2-NEXT:    shrb %cl
1898; X86-SSE2-NEXT:    andb $1, %cl
1899; X86-SSE2-NEXT:    movzbl %cl, %ecx
1900; X86-SSE2-NEXT:    movl %eax, %edx
1901; X86-SSE2-NEXT:    andb $1, %dl
1902; X86-SSE2-NEXT:    movzbl %dl, %edx
1903; X86-SSE2-NEXT:    movd %edx, %xmm1
1904; X86-SSE2-NEXT:    pinsrw $2, %ecx, %xmm1
1905; X86-SSE2-NEXT:    movl %eax, %ecx
1906; X86-SSE2-NEXT:    shrb $2, %cl
1907; X86-SSE2-NEXT:    andb $1, %cl
1908; X86-SSE2-NEXT:    movzbl %cl, %ecx
1909; X86-SSE2-NEXT:    pinsrw $4, %ecx, %xmm1
1910; X86-SSE2-NEXT:    shrb $3, %al
1911; X86-SSE2-NEXT:    movzbl %al, %eax
1912; X86-SSE2-NEXT:    pinsrw $6, %eax, %xmm1
1913; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
1914; X86-SSE2-NEXT:    psllq $63, %xmm0
1915; X86-SSE2-NEXT:    psrad $31, %xmm0
1916; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1917; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
1918; X86-SSE2-NEXT:    psllq $63, %xmm1
1919; X86-SSE2-NEXT:    psrad $31, %xmm1
1920; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1921; X86-SSE2-NEXT:    retl
1922;
1923; X86-SSE41-LABEL: load_sext_4i1_to_4i64:
1924; X86-SSE41:       # %bb.0: # %entry
1925; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1926; X86-SSE41-NEXT:    movb (%eax), %al
1927; X86-SSE41-NEXT:    movl %eax, %ecx
1928; X86-SSE41-NEXT:    shrb %cl
1929; X86-SSE41-NEXT:    andb $1, %cl
1930; X86-SSE41-NEXT:    movzbl %cl, %ecx
1931; X86-SSE41-NEXT:    movl %eax, %edx
1932; X86-SSE41-NEXT:    andb $1, %dl
1933; X86-SSE41-NEXT:    movzbl %dl, %edx
1934; X86-SSE41-NEXT:    movd %edx, %xmm1
1935; X86-SSE41-NEXT:    pinsrb $4, %ecx, %xmm1
1936; X86-SSE41-NEXT:    movl %eax, %ecx
1937; X86-SSE41-NEXT:    shrb $2, %cl
1938; X86-SSE41-NEXT:    andb $1, %cl
1939; X86-SSE41-NEXT:    movzbl %cl, %ecx
1940; X86-SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1941; X86-SSE41-NEXT:    pinsrb $8, %ecx, %xmm1
1942; X86-SSE41-NEXT:    shrb $3, %al
1943; X86-SSE41-NEXT:    movzbl %al, %eax
1944; X86-SSE41-NEXT:    pinsrb $12, %eax, %xmm1
1945; X86-SSE41-NEXT:    psllq $63, %xmm0
1946; X86-SSE41-NEXT:    psrad $31, %xmm0
1947; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1948; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
1949; X86-SSE41-NEXT:    psllq $63, %xmm1
1950; X86-SSE41-NEXT:    psrad $31, %xmm1
1951; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1952; X86-SSE41-NEXT:    retl
1953entry:
1954 %X = load <4 x i1>, <4 x i1>* %ptr
1955 %Y = sext <4 x i1> %X to <4 x i64>
1956 ret <4 x i64> %Y
1957}
1958
; Loads a <4 x i8> vector from %ptr and sign-extends it to <4 x i64>.
; Expected code per the checks below: SSE4.1 uses two pmovsxbq loads at byte
; offsets 0 and 2; AVX1 does the same with vpmovsxbq and joins the halves
; with vinsertf128; AVX2 and AVX512 use a single vpmovsxbq into ymm0.
; SSE2 and SSSE3 widen to dwords with punpcklbw/punpcklwd + psrad $24, then
; split the result into xmm0/xmm1 by interleaving with a pcmpgtd sign mask.
; Check lines are autogenerated (see the note at the top of this file);
; regenerate them rather than editing by hand.
1959define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) {
1960; SSE2-LABEL: load_sext_4i8_to_4i64:
1961; SSE2:       # %bb.0: # %entry
1962; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1963; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1964; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1965; SSE2-NEXT:    psrad $24, %xmm1
1966; SSE2-NEXT:    pxor %xmm2, %xmm2
1967; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
1968; SSE2-NEXT:    movdqa %xmm1, %xmm0
1969; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1970; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1971; SSE2-NEXT:    retq
1972;
1973; SSSE3-LABEL: load_sext_4i8_to_4i64:
1974; SSSE3:       # %bb.0: # %entry
1975; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1976; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1977; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1978; SSSE3-NEXT:    psrad $24, %xmm1
1979; SSSE3-NEXT:    pxor %xmm2, %xmm2
1980; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
1981; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1982; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1983; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1984; SSSE3-NEXT:    retq
1985;
1986; SSE41-LABEL: load_sext_4i8_to_4i64:
1987; SSE41:       # %bb.0: # %entry
1988; SSE41-NEXT:    pmovsxbq (%rdi), %xmm0
1989; SSE41-NEXT:    pmovsxbq 2(%rdi), %xmm1
1990; SSE41-NEXT:    retq
1991;
1992; AVX1-LABEL: load_sext_4i8_to_4i64:
1993; AVX1:       # %bb.0: # %entry
1994; AVX1-NEXT:    vpmovsxbq 2(%rdi), %xmm0
1995; AVX1-NEXT:    vpmovsxbq (%rdi), %xmm1
1996; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1997; AVX1-NEXT:    retq
1998;
1999; AVX2-LABEL: load_sext_4i8_to_4i64:
2000; AVX2:       # %bb.0: # %entry
2001; AVX2-NEXT:    vpmovsxbq (%rdi), %ymm0
2002; AVX2-NEXT:    retq
2003;
2004; AVX512-LABEL: load_sext_4i8_to_4i64:
2005; AVX512:       # %bb.0: # %entry
2006; AVX512-NEXT:    vpmovsxbq (%rdi), %ymm0
2007; AVX512-NEXT:    retq
2008;
2009; X86-SSE2-LABEL: load_sext_4i8_to_4i64:
2010; X86-SSE2:       # %bb.0: # %entry
2011; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
2012; X86-SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2013; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2014; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2015; X86-SSE2-NEXT:    psrad $24, %xmm1
2016; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
2017; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
2018; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
2019; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2020; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2021; X86-SSE2-NEXT:    retl
2022;
2023; X86-SSE41-LABEL: load_sext_4i8_to_4i64:
2024; X86-SSE41:       # %bb.0: # %entry
2025; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2026; X86-SSE41-NEXT:    pmovsxbq (%eax), %xmm0
2027; X86-SSE41-NEXT:    pmovsxbq 2(%eax), %xmm1
2028; X86-SSE41-NEXT:    retl
2029entry:
2030 %X = load <4 x i8>, <4 x i8>* %ptr
2031 %Y = sext <4 x i8> %X to <4 x i64>
2032 ret <4 x i64> %Y
2033}
2034
; Loads a <4 x i8> vector, sign-extends it to <4 x i64>, and returns only
; elements 2 and 3 (selected by the shufflevector mask <2, 3>).
; Expected code per the checks below: SSE4.1 and AVX1 narrow the whole
; operation to a single pmovsxbq/vpmovsxbq at byte offset 2; AVX2 and AVX512
; extend the full vector into ymm0 and take the upper half with
; vextracti128 $1; SSE2 and SSSE3 widen to dwords and keep the high pair
; via punpckhdq with a pcmpgtd sign mask.
; This function has no named entry block, so its "%bb.0" check lines carry
; no "%entry" suffix.
; Check lines are autogenerated (see the note at the top of this file);
; regenerate them rather than editing by hand.
2035define <2 x i64> @load_sext_4i8_to_4i64_extract(<4 x i8> *%ptr) {
2036; SSE2-LABEL: load_sext_4i8_to_4i64_extract:
2037; SSE2:       # %bb.0:
2038; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2039; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2040; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2041; SSE2-NEXT:    psrad $24, %xmm0
2042; SSE2-NEXT:    pxor %xmm1, %xmm1
2043; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
2044; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2045; SSE2-NEXT:    retq
2046;
2047; SSSE3-LABEL: load_sext_4i8_to_4i64_extract:
2048; SSSE3:       # %bb.0:
2049; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2050; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2051; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2052; SSSE3-NEXT:    psrad $24, %xmm0
2053; SSSE3-NEXT:    pxor %xmm1, %xmm1
2054; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
2055; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2056; SSSE3-NEXT:    retq
2057;
2058; SSE41-LABEL: load_sext_4i8_to_4i64_extract:
2059; SSE41:       # %bb.0:
2060; SSE41-NEXT:    pmovsxbq 2(%rdi), %xmm0
2061; SSE41-NEXT:    retq
2062;
2063; AVX1-LABEL: load_sext_4i8_to_4i64_extract:
2064; AVX1:       # %bb.0:
2065; AVX1-NEXT:    vpmovsxbq 2(%rdi), %xmm0
2066; AVX1-NEXT:    retq
2067;
2068; AVX2-LABEL: load_sext_4i8_to_4i64_extract:
2069; AVX2:       # %bb.0:
2070; AVX2-NEXT:    vpmovsxbq (%rdi), %ymm0
2071; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
2072; AVX2-NEXT:    vzeroupper
2073; AVX2-NEXT:    retq
2074;
2075; AVX512-LABEL: load_sext_4i8_to_4i64_extract:
2076; AVX512:       # %bb.0:
2077; AVX512-NEXT:    vpmovsxbq (%rdi), %ymm0
2078; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
2079; AVX512-NEXT:    vzeroupper
2080; AVX512-NEXT:    retq
2081;
2082; X86-SSE2-LABEL: load_sext_4i8_to_4i64_extract:
2083; X86-SSE2:       # %bb.0:
2084; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
2085; X86-SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2086; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2087; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2088; X86-SSE2-NEXT:    psrad $24, %xmm0
2089; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
2090; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
2091; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2092; X86-SSE2-NEXT:    retl
2093;
2094; X86-SSE41-LABEL: load_sext_4i8_to_4i64_extract:
2095; X86-SSE41:       # %bb.0:
2096; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2097; X86-SSE41-NEXT:    pmovsxbq 2(%eax), %xmm0
2098; X86-SSE41-NEXT:    retl
2099 %ld = load <4 x i8>, <4 x i8>* %ptr
2100 %sext = sext <4 x i8> %ld to <4 x i64>
2101 %extract = shufflevector <4 x i64> %sext, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
2102 ret <2 x i64> %extract
2103}
2104
; Loads a <8 x i1> vector from %ptr and sign-extends it to <8 x i16>.
; Expected code per the checks below: the SSE runs and AVX1 replicate the
; loaded value across all word lanes (pshuflw + pshufd), AND it with the
; per-lane bit constants [1,2,4,8,16,32,64,128] and compare-equal against
; those same constants; AVX2 replaces the replicate step with vpbroadcastb.
; AVX512F expands a kmovw mask via zero-masked vpternlogd + vpmovdw, and
; AVX512BW converts the mask directly with vpmovm2w.
; Check lines are autogenerated (see the note at the top of this file);
; regenerate them rather than editing by hand.
2105define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) {
2106; SSE-LABEL: load_sext_8i1_to_8i16:
2107; SSE:       # %bb.0: # %entry
2108; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2109; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2110; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2111; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
2112; SSE-NEXT:    pand %xmm1, %xmm0
2113; SSE-NEXT:    pcmpeqw %xmm1, %xmm0
2114; SSE-NEXT:    retq
2115;
2116; AVX1-LABEL: load_sext_8i1_to_8i16:
2117; AVX1:       # %bb.0: # %entry
2118; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2119; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2120; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2121; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
2122; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
2123; AVX1-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
2124; AVX1-NEXT:    retq
2125;
2126; AVX2-LABEL: load_sext_8i1_to_8i16:
2127; AVX2:       # %bb.0: # %entry
2128; AVX2-NEXT:    vpbroadcastb (%rdi), %xmm0
2129; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
2130; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
2131; AVX2-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
2132; AVX2-NEXT:    retq
2133;
2134; AVX512F-LABEL: load_sext_8i1_to_8i16:
2135; AVX512F:       # %bb.0: # %entry
2136; AVX512F-NEXT:    kmovw (%rdi), %k1
2137; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
2138; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
2139; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2140; AVX512F-NEXT:    vzeroupper
2141; AVX512F-NEXT:    retq
2142;
2143; AVX512BW-LABEL: load_sext_8i1_to_8i16:
2144; AVX512BW:       # %bb.0: # %entry
2145; AVX512BW-NEXT:    kmovw (%rdi), %k0
2146; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
2147; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
2148; AVX512BW-NEXT:    vzeroupper
2149; AVX512BW-NEXT:    retq
2150;
2151; X86-SSE-LABEL: load_sext_8i1_to_8i16:
2152; X86-SSE:       # %bb.0: # %entry
2153; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
2154; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2155; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2156; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2157; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
2158; X86-SSE-NEXT:    pand %xmm1, %xmm0
2159; X86-SSE-NEXT:    pcmpeqw %xmm1, %xmm0
2160; X86-SSE-NEXT:    retl
2161entry:
2162 %X = load <8 x i1>, <8 x i1>* %ptr
2163 %Y = sext <8 x i1> %X to <8 x i16>
2164 ret <8 x i16> %Y
2165}
2166
; Loads a <8 x i8> vector from %ptr and sign-extends it to <8 x i16>.
; Expected code per the checks below: the SSE4.1 and AVX runs fold the load
; into pmovsxbw/vpmovsxbw; the SSE2 and SSSE3 runs load 64 bits with movq,
; move each byte into the high half of its word with punpcklbw, and shift it
; back down arithmetically with psraw $8.
; Check lines are autogenerated (see the note at the top of this file);
; regenerate them rather than editing by hand.
2167define <8 x i16> @load_sext_8i8_to_8i16(<8 x i8> *%ptr) {
2168; SSE2-LABEL: load_sext_8i8_to_8i16:
2169; SSE2:       # %bb.0: # %entry
2170; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2171; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2172; SSE2-NEXT:    psraw $8, %xmm0
2173; SSE2-NEXT:    retq
2174;
2175; SSSE3-LABEL: load_sext_8i8_to_8i16:
2176; SSSE3:       # %bb.0: # %entry
2177; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2178; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2179; SSSE3-NEXT:    psraw $8, %xmm0
2180; SSSE3-NEXT:    retq
2181;
2182; SSE41-LABEL: load_sext_8i8_to_8i16:
2183; SSE41:       # %bb.0: # %entry
2184; SSE41-NEXT:    pmovsxbw (%rdi), %xmm0
2185; SSE41-NEXT:    retq
2186;
2187; AVX-LABEL: load_sext_8i8_to_8i16:
2188; AVX:       # %bb.0: # %entry
2189; AVX-NEXT:    vpmovsxbw (%rdi), %xmm0
2190; AVX-NEXT:    retq
2191;
2192; X86-SSE2-LABEL: load_sext_8i8_to_8i16:
2193; X86-SSE2:       # %bb.0: # %entry
2194; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
2195; X86-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2196; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2197; X86-SSE2-NEXT:    psraw $8, %xmm0
2198; X86-SSE2-NEXT:    retl
2199;
2200; X86-SSE41-LABEL: load_sext_8i8_to_8i16:
2201; X86-SSE41:       # %bb.0: # %entry
2202; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2203; X86-SSE41-NEXT:    pmovsxbw (%eax), %xmm0
2204; X86-SSE41-NEXT:    retl
2205entry:
2206 %X = load <8 x i8>, <8 x i8>* %ptr
2207 %Y = sext <8 x i8> %X to <8 x i16>
2208 ret <8 x i16> %Y
2209}
2210
; Test: load an <8 x i8> vector from %ptr and sign-extend every element to i64.
; Targets without a packed sign-extend (SSE2, SSSE3) are expected to unpack the
; bytes, arithmetic-shift them into place and build the upper halves with a
; compare against zero; SSE4.1 and AVX targets are expected to fold the load
; into (v)pmovsxbq, splitting the result across as many registers as needed.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script instead of editing them by hand.
2211define <8 x i64> @load_sext_8i8_to_8i64(<8 x i8> *%ptr) {
2212; SSE2-LABEL: load_sext_8i8_to_8i64:
2213; SSE2:       # %bb.0: # %entry
2214; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2215; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2216; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2217; SSE2-NEXT:    psrad $24, %xmm1
2218; SSE2-NEXT:    pxor %xmm4, %xmm4
2219; SSE2-NEXT:    pxor %xmm3, %xmm3
2220; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
2221; SSE2-NEXT:    movdqa %xmm1, %xmm0
2222; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
2223; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
2224; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2225; SSE2-NEXT:    psrad $24, %xmm3
2226; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
2227; SSE2-NEXT:    movdqa %xmm3, %xmm2
2228; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
2229; SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
2230; SSE2-NEXT:    retq
2231;
2232; SSSE3-LABEL: load_sext_8i8_to_8i64:
2233; SSSE3:       # %bb.0: # %entry
2234; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2235; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2236; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2237; SSSE3-NEXT:    psrad $24, %xmm1
2238; SSSE3-NEXT:    pxor %xmm4, %xmm4
2239; SSSE3-NEXT:    pxor %xmm3, %xmm3
2240; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm3
2241; SSSE3-NEXT:    movdqa %xmm1, %xmm0
2242; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
2243; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
2244; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2245; SSSE3-NEXT:    psrad $24, %xmm3
2246; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm4
2247; SSSE3-NEXT:    movdqa %xmm3, %xmm2
2248; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
2249; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
2250; SSSE3-NEXT:    retq
2251;
2252; SSE41-LABEL: load_sext_8i8_to_8i64:
2253; SSE41:       # %bb.0: # %entry
2254; SSE41-NEXT:    pmovsxbq (%rdi), %xmm0
2255; SSE41-NEXT:    pmovsxbq 2(%rdi), %xmm1
2256; SSE41-NEXT:    pmovsxbq 4(%rdi), %xmm2
2257; SSE41-NEXT:    pmovsxbq 6(%rdi), %xmm3
2258; SSE41-NEXT:    retq
2259;
2260; AVX1-LABEL: load_sext_8i8_to_8i64:
2261; AVX1:       # %bb.0: # %entry
2262; AVX1-NEXT:    vpmovsxbq 6(%rdi), %xmm1
2263; AVX1-NEXT:    vpmovsxbq 4(%rdi), %xmm2
2264; AVX1-NEXT:    vpmovsxbq 2(%rdi), %xmm0
2265; AVX1-NEXT:    vpmovsxbq (%rdi), %xmm3
2266; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
2267; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
2268; AVX1-NEXT:    retq
2269;
2270; AVX2-LABEL: load_sext_8i8_to_8i64:
2271; AVX2:       # %bb.0: # %entry
2272; AVX2-NEXT:    vpmovsxbq (%rdi), %ymm0
2273; AVX2-NEXT:    vpmovsxbq 4(%rdi), %ymm1
2274; AVX2-NEXT:    retq
2275;
2276; AVX512-LABEL: load_sext_8i8_to_8i64:
2277; AVX512:       # %bb.0: # %entry
2278; AVX512-NEXT:    vpmovsxbq (%rdi), %zmm0
2279; AVX512-NEXT:    retq
2280;
2281; X86-SSE2-LABEL: load_sext_8i8_to_8i64:
2282; X86-SSE2:       # %bb.0: # %entry
2283; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
2284; X86-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2285; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2286; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2287; X86-SSE2-NEXT:    psrad $24, %xmm1
2288; X86-SSE2-NEXT:    pxor %xmm4, %xmm4
2289; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
2290; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
2291; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
2292; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
2293; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
2294; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2295; X86-SSE2-NEXT:    psrad $24, %xmm3
2296; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
2297; X86-SSE2-NEXT:    movdqa %xmm3, %xmm2
2298; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
2299; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
2300; X86-SSE2-NEXT:    retl
2301;
2302; X86-SSE41-LABEL: load_sext_8i8_to_8i64:
2303; X86-SSE41:       # %bb.0: # %entry
2304; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2305; X86-SSE41-NEXT:    pmovsxbq (%eax), %xmm0
2306; X86-SSE41-NEXT:    pmovsxbq 2(%eax), %xmm1
2307; X86-SSE41-NEXT:    pmovsxbq 4(%eax), %xmm2
2308; X86-SSE41-NEXT:    pmovsxbq 6(%eax), %xmm3
2309; X86-SSE41-NEXT:    retl
2310entry:
2311 %X = load <8 x i8>, <8 x i8>* %ptr
2312 %Y = sext <8 x i8> %X to <8 x i64>
2313 ret <8 x i64> %Y
2314}
2315
; Test: load an <8 x i1> mask from %ptr and sign-extend it to <8 x i32>, so each
; result lane is all-ones or all-zeros depending on its mask bit.
; Non-AVX512 targets are expected to broadcast the loaded bits, AND them with
; per-lane single-bit constants and compare for equality; AVX512 targets are
; expected to load the mask into a k-register and materialise the lanes with a
; zero-masked vpternlogd.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script instead of editing them by hand.
2316define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) {
2317; SSE-LABEL: load_sext_8i1_to_8i32:
2318; SSE:       # %bb.0: # %entry
2319; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2320; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
2321; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8]
2322; SSE-NEXT:    movdqa %xmm1, %xmm0
2323; SSE-NEXT:    pand %xmm2, %xmm0
2324; SSE-NEXT:    pcmpeqd %xmm2, %xmm0
2325; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [16,32,64,128]
2326; SSE-NEXT:    pand %xmm2, %xmm1
2327; SSE-NEXT:    pcmpeqd %xmm2, %xmm1
2328; SSE-NEXT:    retq
2329;
2330; AVX1-LABEL: load_sext_8i1_to_8i32:
2331; AVX1:       # %bb.0: # %entry
2332; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2333; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
2334; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
2335; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2336; AVX1-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
2337; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2338; AVX1-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2339; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2340; AVX1-NEXT:    retq
2341;
2342; AVX2-LABEL: load_sext_8i1_to_8i32:
2343; AVX2:       # %bb.0: # %entry
2344; AVX2-NEXT:    vpbroadcastb (%rdi), %ymm0
2345; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128]
2346; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
2347; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0
2348; AVX2-NEXT:    retq
2349;
2350; AVX512-LABEL: load_sext_8i1_to_8i32:
2351; AVX512:       # %bb.0: # %entry
2352; AVX512-NEXT:    kmovw (%rdi), %k1
2353; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
2354; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2355; AVX512-NEXT:    retq
2356;
2357; X86-SSE-LABEL: load_sext_8i1_to_8i32:
2358; X86-SSE:       # %bb.0: # %entry
2359; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
2360; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2361; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
2362; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8]
2363; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
2364; X86-SSE-NEXT:    pand %xmm2, %xmm0
2365; X86-SSE-NEXT:    pcmpeqd %xmm2, %xmm0
2366; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [16,32,64,128]
2367; X86-SSE-NEXT:    pand %xmm2, %xmm1
2368; X86-SSE-NEXT:    pcmpeqd %xmm2, %xmm1
2369; X86-SSE-NEXT:    retl
2370entry:
2371 %X = load <8 x i1>, <8 x i1>* %ptr
2372 %Y = sext <8 x i1> %X to <8 x i32>
2373 ret <8 x i32> %Y
2374}
2375
; Test: load an <8 x i8> vector from %ptr and sign-extend every element to i32.
; SSE2 and SSSE3 are expected to unpack the bytes into the top of each dword
; and arithmetic-shift right by 24; SSE4.1 and AVX targets are expected to fold
; the load into (v)pmovsxbd, using one or two instructions depending on the
; available register width.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script instead of editing them by hand.
2376define <8 x i32> @load_sext_8i8_to_8i32(<8 x i8> *%ptr) {
2377; SSE2-LABEL: load_sext_8i8_to_8i32:
2378; SSE2:       # %bb.0: # %entry
2379; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2380; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2381; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2382; SSE2-NEXT:    psrad $24, %xmm0
2383; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
2384; SSE2-NEXT:    psrad $24, %xmm1
2385; SSE2-NEXT:    retq
2386;
2387; SSSE3-LABEL: load_sext_8i8_to_8i32:
2388; SSSE3:       # %bb.0: # %entry
2389; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2390; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2391; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2392; SSSE3-NEXT:    psrad $24, %xmm0
2393; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
2394; SSSE3-NEXT:    psrad $24, %xmm1
2395; SSSE3-NEXT:    retq
2396;
2397; SSE41-LABEL: load_sext_8i8_to_8i32:
2398; SSE41:       # %bb.0: # %entry
2399; SSE41-NEXT:    pmovsxbd (%rdi), %xmm0
2400; SSE41-NEXT:    pmovsxbd 4(%rdi), %xmm1
2401; SSE41-NEXT:    retq
2402;
2403; AVX1-LABEL: load_sext_8i8_to_8i32:
2404; AVX1:       # %bb.0: # %entry
2405; AVX1-NEXT:    vpmovsxbd 4(%rdi), %xmm0
2406; AVX1-NEXT:    vpmovsxbd (%rdi), %xmm1
2407; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2408; AVX1-NEXT:    retq
2409;
2410; AVX2-LABEL: load_sext_8i8_to_8i32:
2411; AVX2:       # %bb.0: # %entry
2412; AVX2-NEXT:    vpmovsxbd (%rdi), %ymm0
2413; AVX2-NEXT:    retq
2414;
2415; AVX512-LABEL: load_sext_8i8_to_8i32:
2416; AVX512:       # %bb.0: # %entry
2417; AVX512-NEXT:    vpmovsxbd (%rdi), %ymm0
2418; AVX512-NEXT:    retq
2419;
2420; X86-SSE2-LABEL: load_sext_8i8_to_8i32:
2421; X86-SSE2:       # %bb.0: # %entry
2422; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
2423; X86-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2424; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2425; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2426; X86-SSE2-NEXT:    psrad $24, %xmm0
2427; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
2428; X86-SSE2-NEXT:    psrad $24, %xmm1
2429; X86-SSE2-NEXT:    retl
2430;
2431; X86-SSE41-LABEL: load_sext_8i8_to_8i32:
2432; X86-SSE41:       # %bb.0: # %entry
2433; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2434; X86-SSE41-NEXT:    pmovsxbd (%eax), %xmm0
2435; X86-SSE41-NEXT:    pmovsxbd 4(%eax), %xmm1
2436; X86-SSE41-NEXT:    retl
2437entry:
2438 %X = load <8 x i8>, <8 x i8>* %ptr
2439 %Y = sext <8 x i8> %X to <8 x i32>
2440 ret <8 x i32> %Y
2441}
2442
; Test: load a <16 x i1> mask from %ptr and sign-extend it to <16 x i8>, so each
; result byte is all-ones or all-zeros depending on its mask bit.
; Non-AVX512 targets are expected to replicate the two mask bytes across the
; vector, AND with a repeating 1,2,4,...,128 bit pattern and compare for
; equality; AVX512F is expected to go through a k-register, vpternlogd and
; vpmovdb, and AVX512BW through vpmovm2b.
; The function reads memory through %ptr, so it carries readonly rather than
; readnone (readnone promises that no pointer argument is dereferenced).
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script instead of editing them by hand.
2443define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readonly {
2444; SSE2-LABEL: load_sext_16i1_to_16i8:
2445; SSE2:       # %bb.0: # %entry
2446; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2447; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2448; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7]
2449; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
2450; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
2451; SSE2-NEXT:    pand %xmm1, %xmm0
2452; SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
2453; SSE2-NEXT:    retq
2454;
2455; SSSE3-LABEL: load_sext_16i1_to_16i8:
2456; SSSE3:       # %bb.0: # %entry
2457; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2458; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
2459; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
2460; SSSE3-NEXT:    pand %xmm1, %xmm0
2461; SSSE3-NEXT:    pcmpeqb %xmm1, %xmm0
2462; SSSE3-NEXT:    retq
2463;
2464; SSE41-LABEL: load_sext_16i1_to_16i8:
2465; SSE41:       # %bb.0: # %entry
2466; SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2467; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
2468; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
2469; SSE41-NEXT:    pand %xmm1, %xmm0
2470; SSE41-NEXT:    pcmpeqb %xmm1, %xmm0
2471; SSE41-NEXT:    retq
2472;
2473; AVX1-LABEL: load_sext_16i1_to_16i8:
2474; AVX1:       # %bb.0: # %entry
2475; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2476; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
2477; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = [9241421688590303745,9241421688590303745]
2478; AVX1-NEXT:    # xmm1 = mem[0,0]
2479; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
2480; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
2481; AVX1-NEXT:    retq
2482;
2483; AVX2-LABEL: load_sext_16i1_to_16i8:
2484; AVX2:       # %bb.0: # %entry
2485; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2486; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
2487; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [9241421688590303745,9241421688590303745]
2488; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
2489; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
2490; AVX2-NEXT:    retq
2491;
2492; AVX512F-LABEL: load_sext_16i1_to_16i8:
2493; AVX512F:       # %bb.0: # %entry
2494; AVX512F-NEXT:    kmovw (%rdi), %k1
2495; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
2496; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
2497; AVX512F-NEXT:    vzeroupper
2498; AVX512F-NEXT:    retq
2499;
2500; AVX512BW-LABEL: load_sext_16i1_to_16i8:
2501; AVX512BW:       # %bb.0: # %entry
2502; AVX512BW-NEXT:    kmovw (%rdi), %k0
2503; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
2504; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
2505; AVX512BW-NEXT:    vzeroupper
2506; AVX512BW-NEXT:    retq
2507;
2508; X86-SSE2-LABEL: load_sext_16i1_to_16i8:
2509; X86-SSE2:       # %bb.0: # %entry
2510; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
2511; X86-SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2512; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2513; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7]
2514; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
2515; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
2516; X86-SSE2-NEXT:    pand %xmm1, %xmm0
2517; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
2518; X86-SSE2-NEXT:    retl
2519;
2520; X86-SSE41-LABEL: load_sext_16i1_to_16i8:
2521; X86-SSE41:       # %bb.0: # %entry
2522; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2523; X86-SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2524; X86-SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
2525; X86-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
2526; X86-SSE41-NEXT:    pand %xmm1, %xmm0
2527; X86-SSE41-NEXT:    pcmpeqb %xmm1, %xmm0
2528; X86-SSE41-NEXT:    retl
2529entry:
2530 %X = load <16 x i1>, <16 x i1>* %ptr
2531 %Y = sext <16 x i1> %X to <16 x i8>
2532 ret <16 x i8> %Y
2533}
2534
; Test: load a <16 x i1> mask from %ptr and sign-extend it to <16 x i16>, so
; each result word is all-ones or all-zeros depending on its mask bit.
; Non-AVX512 targets are expected to broadcast the 16 mask bits to every word,
; AND with per-lane single-bit constants (1 through 32768) and compare for
; equality; AVX512F is expected to go through a k-register, vpternlogd and
; vpmovdw, and AVX512BW through vpmovm2w.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script instead of editing them by hand.
2535define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) {
2536; SSE-LABEL: load_sext_16i1_to_16i16:
2537; SSE:       # %bb.0: # %entry
2538; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2539; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2540; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
2541; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
2542; SSE-NEXT:    movdqa %xmm1, %xmm0
2543; SSE-NEXT:    pand %xmm2, %xmm0
2544; SSE-NEXT:    pcmpeqw %xmm2, %xmm0
2545; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768]
2546; SSE-NEXT:    pand %xmm2, %xmm1
2547; SSE-NEXT:    pcmpeqw %xmm2, %xmm1
2548; SSE-NEXT:    retq
2549;
2550; AVX1-LABEL: load_sext_16i1_to_16i16:
2551; AVX1:       # %bb.0: # %entry
2552; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2553; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2554; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2555; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
2556; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2557; AVX1-NEXT:    vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
2558; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2559; AVX1-NEXT:    vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2560; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2561; AVX1-NEXT:    retq
2562;
2563; AVX2-LABEL: load_sext_16i1_to_16i16:
2564; AVX2:       # %bb.0: # %entry
2565; AVX2-NEXT:    vpbroadcastw (%rdi), %ymm0
2566; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
2567; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
2568; AVX2-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
2569; AVX2-NEXT:    retq
2570;
2571; AVX512F-LABEL: load_sext_16i1_to_16i16:
2572; AVX512F:       # %bb.0: # %entry
2573; AVX512F-NEXT:    kmovw (%rdi), %k1
2574; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
2575; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
2576; AVX512F-NEXT:    retq
2577;
2578; AVX512BW-LABEL: load_sext_16i1_to_16i16:
2579; AVX512BW:       # %bb.0: # %entry
2580; AVX512BW-NEXT:    kmovw (%rdi), %k0
2581; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
2582; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2583; AVX512BW-NEXT:    retq
2584;
2585; X86-SSE-LABEL: load_sext_16i1_to_16i16:
2586; X86-SSE:       # %bb.0: # %entry
2587; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
2588; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2589; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2590; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
2591; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
2592; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
2593; X86-SSE-NEXT:    pand %xmm2, %xmm0
2594; X86-SSE-NEXT:    pcmpeqw %xmm2, %xmm0
2595; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768]
2596; X86-SSE-NEXT:    pand %xmm2, %xmm1
2597; X86-SSE-NEXT:    pcmpeqw %xmm2, %xmm1
2598; X86-SSE-NEXT:    retl
2599entry:
2600 %X = load <16 x i1>, <16 x i1>* %ptr
2601 %Y = sext <16 x i1> %X to <16 x i16>
2602 ret <16 x i16> %Y
2603}
2604
; Test: load a <32 x i1> mask from %ptr and sign-extend it to <32 x i8>, so each
; result byte is all-ones or all-zeros depending on its mask bit.
; Non-AVX512 targets are expected to replicate each of the four mask bytes
; across eight result bytes, AND with a repeating 1,2,4,...,128 bit pattern and
; compare for equality; AVX512F is expected to handle the mask as two 16-bit
; k-register halves, and AVX512BW to use a single kmovd plus vpmovm2b.
; The function reads memory through %ptr, so it carries readonly rather than
; readnone (readnone promises that no pointer argument is dereferenced).
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script instead of editing them by hand.
2605define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readonly {
2606; SSE-LABEL: load_sext_32i1_to_32i8:
2607; SSE:       # %bb.0: # %entry
2608; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2609; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2610; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7]
2611; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
2612; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
2613; SSE-NEXT:    pand %xmm2, %xmm0
2614; SSE-NEXT:    pcmpeqb %xmm2, %xmm0
2615; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,2,3,3,4,5,6,7]
2616; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
2617; SSE-NEXT:    pand %xmm2, %xmm1
2618; SSE-NEXT:    pcmpeqb %xmm2, %xmm1
2619; SSE-NEXT:    retq
2620;
2621; AVX1-LABEL: load_sext_32i1_to_32i8:
2622; AVX1:       # %bb.0: # %entry
2623; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2624; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2625; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7]
2626; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7]
2627; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2628; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
2629; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2630; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2631; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745]
2632; AVX1-NEXT:    # xmm2 = mem[0,0]
2633; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm1
2634; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
2635; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2636; AVX1-NEXT:    retq
2637;
2638; AVX2-LABEL: load_sext_32i1_to_32i8:
2639; AVX2:       # %bb.0: # %entry
2640; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2641; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
2642; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19]
2643; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
2644; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
2645; AVX2-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
2646; AVX2-NEXT:    retq
2647;
2648; AVX512F-LABEL: load_sext_32i1_to_32i8:
2649; AVX512F:       # %bb.0: # %entry
2650; AVX512F-NEXT:    kmovw (%rdi), %k1
2651; AVX512F-NEXT:    kmovw 2(%rdi), %k2
2652; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
2653; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
2654; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
2655; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
2656; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
2657; AVX512F-NEXT:    retq
2658;
2659; AVX512BW-LABEL: load_sext_32i1_to_32i8:
2660; AVX512BW:       # %bb.0: # %entry
2661; AVX512BW-NEXT:    kmovd (%rdi), %k0
2662; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
2663; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2664; AVX512BW-NEXT:    retq
2665;
2666; X86-SSE-LABEL: load_sext_32i1_to_32i8:
2667; X86-SSE:       # %bb.0: # %entry
2668; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
2669; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2670; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2671; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7]
2672; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
2673; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
2674; X86-SSE-NEXT:    pand %xmm2, %xmm0
2675; X86-SSE-NEXT:    pcmpeqb %xmm2, %xmm0
2676; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,2,3,3,4,5,6,7]
2677; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
2678; X86-SSE-NEXT:    pand %xmm2, %xmm1
2679; X86-SSE-NEXT:    pcmpeqb %xmm2, %xmm1
2680; X86-SSE-NEXT:    retl
2681entry:
2682 %X = load <32 x i1>, <32 x i1>* %ptr
2683 %Y = sext <32 x i1> %X to <32 x i8>
2684 ret <32 x i8> %Y
2685}
2686
; Test: load a <16 x i8> vector from %ptr and sign-extend every element to i16.
; SSE2 and SSSE3 are expected to load the full vector, unpack the bytes into
; the high half of each word and arithmetic-shift right by 8; SSE4.1 and AVX
; targets are expected to fold the load into (v)pmovsxbw, using one or two
; instructions depending on the available register width.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script instead of editing them by hand.
2687define <16 x i16> @load_sext_16i8_to_16i16(<16 x i8> *%ptr) {
2688; SSE2-LABEL: load_sext_16i8_to_16i16:
2689; SSE2:       # %bb.0: # %entry
2690; SSE2-NEXT:    movdqa (%rdi), %xmm1
2691; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2692; SSE2-NEXT:    psraw $8, %xmm0
2693; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2694; SSE2-NEXT:    psraw $8, %xmm1
2695; SSE2-NEXT:    retq
2696;
2697; SSSE3-LABEL: load_sext_16i8_to_16i16:
2698; SSSE3:       # %bb.0: # %entry
2699; SSSE3-NEXT:    movdqa (%rdi), %xmm1
2700; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2701; SSSE3-NEXT:    psraw $8, %xmm0
2702; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2703; SSSE3-NEXT:    psraw $8, %xmm1
2704; SSSE3-NEXT:    retq
2705;
2706; SSE41-LABEL: load_sext_16i8_to_16i16:
2707; SSE41:       # %bb.0: # %entry
2708; SSE41-NEXT:    pmovsxbw (%rdi), %xmm0
2709; SSE41-NEXT:    pmovsxbw 8(%rdi), %xmm1
2710; SSE41-NEXT:    retq
2711;
2712; AVX1-LABEL: load_sext_16i8_to_16i16:
2713; AVX1:       # %bb.0: # %entry
2714; AVX1-NEXT:    vpmovsxbw 8(%rdi), %xmm0
2715; AVX1-NEXT:    vpmovsxbw (%rdi), %xmm1
2716; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2717; AVX1-NEXT:    retq
2718;
2719; AVX2-LABEL: load_sext_16i8_to_16i16:
2720; AVX2:       # %bb.0: # %entry
2721; AVX2-NEXT:    vpmovsxbw (%rdi), %ymm0
2722; AVX2-NEXT:    retq
2723;
2724; AVX512-LABEL: load_sext_16i8_to_16i16:
2725; AVX512:       # %bb.0: # %entry
2726; AVX512-NEXT:    vpmovsxbw (%rdi), %ymm0
2727; AVX512-NEXT:    retq
2728;
2729; X86-SSE2-LABEL: load_sext_16i8_to_16i16:
2730; X86-SSE2:       # %bb.0: # %entry
2731; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
2732; X86-SSE2-NEXT:    movdqa (%eax), %xmm1
2733; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2734; X86-SSE2-NEXT:    psraw $8, %xmm0
2735; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2736; X86-SSE2-NEXT:    psraw $8, %xmm1
2737; X86-SSE2-NEXT:    retl
2738;
2739; X86-SSE41-LABEL: load_sext_16i8_to_16i16:
2740; X86-SSE41:       # %bb.0: # %entry
2741; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2742; X86-SSE41-NEXT:    pmovsxbw (%eax), %xmm0
2743; X86-SSE41-NEXT:    pmovsxbw 8(%eax), %xmm1
2744; X86-SSE41-NEXT:    retl
2745entry:
2746 %X = load <16 x i8>, <16 x i8>* %ptr
2747 %Y = sext <16 x i8> %X to <16 x i16>
2748 ret <16 x i16> %Y
2749}
2750
; Test: load a <2 x i16> vector from %ptr and sign-extend both elements to i64.
; SSE2 and SSSE3 are expected to shuffle the words into place, shift right
; arithmetically by 16 and interleave with a sign mask produced by pcmpgtd;
; SSE4.1 and all AVX targets are expected to fold the load into a single
; (v)pmovsxwq.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script instead of editing them by hand.
2751define <2 x i64> @load_sext_2i16_to_2i64(<2 x i16> *%ptr) {
2752; SSE2-LABEL: load_sext_2i16_to_2i64:
2753; SSE2:       # %bb.0: # %entry
2754; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2755; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
2756; SSE2-NEXT:    pxor %xmm1, %xmm1
2757; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
2758; SSE2-NEXT:    psrad $16, %xmm0
2759; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2760; SSE2-NEXT:    retq
2761;
2762; SSSE3-LABEL: load_sext_2i16_to_2i64:
2763; SSSE3:       # %bb.0: # %entry
2764; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2765; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
2766; SSSE3-NEXT:    pxor %xmm1, %xmm1
2767; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
2768; SSSE3-NEXT:    psrad $16, %xmm0
2769; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2770; SSSE3-NEXT:    retq
2771;
2772; SSE41-LABEL: load_sext_2i16_to_2i64:
2773; SSE41:       # %bb.0: # %entry
2774; SSE41-NEXT:    pmovsxwq (%rdi), %xmm0
2775; SSE41-NEXT:    retq
2776;
2777; AVX-LABEL: load_sext_2i16_to_2i64:
2778; AVX:       # %bb.0: # %entry
2779; AVX-NEXT:    vpmovsxwq (%rdi), %xmm0
2780; AVX-NEXT:    retq
2781;
2782; X86-SSE2-LABEL: load_sext_2i16_to_2i64:
2783; X86-SSE2:       # %bb.0: # %entry
2784; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
2785; X86-SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2786; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
2787; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
2788; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
2789; X86-SSE2-NEXT:    psrad $16, %xmm0
2790; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2791; X86-SSE2-NEXT:    retl
2792;
2793; X86-SSE41-LABEL: load_sext_2i16_to_2i64:
2794; X86-SSE41:       # %bb.0: # %entry
2795; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2796; X86-SSE41-NEXT:    pmovsxwq (%eax), %xmm0
2797; X86-SSE41-NEXT:    retl
2798entry:
2799 %X = load <2 x i16>, <2 x i16>* %ptr
2800 %Y = sext <2 x i16> %X to <2 x i64>
2801 ret <2 x i64> %Y
2802}
2803
; Test: load a <4 x i16> vector from %ptr and sign-extend every element to i32.
; SSE2 and SSSE3 are expected to load 64 bits, duplicate each word with
; punpcklwd and arithmetic-shift right by 16; SSE4.1 and all AVX targets are
; expected to fold the load into a single (v)pmovsxwd.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script instead of editing them by hand.
2804define <4 x i32> @load_sext_4i16_to_4i32(<4 x i16> *%ptr) {
2805; SSE2-LABEL: load_sext_4i16_to_4i32:
2806; SSE2:       # %bb.0: # %entry
2807; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2808; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2809; SSE2-NEXT:    psrad $16, %xmm0
2810; SSE2-NEXT:    retq
2811;
2812; SSSE3-LABEL: load_sext_4i16_to_4i32:
2813; SSSE3:       # %bb.0: # %entry
2814; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2815; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2816; SSSE3-NEXT:    psrad $16, %xmm0
2817; SSSE3-NEXT:    retq
2818;
2819; SSE41-LABEL: load_sext_4i16_to_4i32:
2820; SSE41:       # %bb.0: # %entry
2821; SSE41-NEXT:    pmovsxwd (%rdi), %xmm0
2822; SSE41-NEXT:    retq
2823;
2824; AVX-LABEL: load_sext_4i16_to_4i32:
2825; AVX:       # %bb.0: # %entry
2826; AVX-NEXT:    vpmovsxwd (%rdi), %xmm0
2827; AVX-NEXT:    retq
2828;
2829; X86-SSE2-LABEL: load_sext_4i16_to_4i32:
2830; X86-SSE2:       # %bb.0: # %entry
2831; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
2832; X86-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2833; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2834; X86-SSE2-NEXT:    psrad $16, %xmm0
2835; X86-SSE2-NEXT:    retl
2836;
2837; X86-SSE41-LABEL: load_sext_4i16_to_4i32:
2838; X86-SSE41:       # %bb.0: # %entry
2839; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2840; X86-SSE41-NEXT:    pmovsxwd (%eax), %xmm0
2841; X86-SSE41-NEXT:    retl
2842entry:
2843 %X = load <4 x i16>, <4 x i16>* %ptr
2844 %Y = sext <4 x i16> %X to <4 x i32>
2845 ret <4 x i32> %Y
2846}
2847
; Test: load a <4 x i16> vector from %ptr and sign-extend every element to i64.
; SSE2 and SSSE3 are expected to first widen to i32 (punpcklwd + psrad 16) and
; then interleave with a pcmpgtd sign mask to form the i64 lanes; SSE4.1 and
; AVX targets are expected to fold the load into (v)pmovsxwq, using one or two
; instructions depending on the available register width.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script instead of editing them by hand.
2848define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) {
2849; SSE2-LABEL: load_sext_4i16_to_4i64:
2850; SSE2:       # %bb.0: # %entry
2851; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2852; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2853; SSE2-NEXT:    psrad $16, %xmm1
2854; SSE2-NEXT:    pxor %xmm2, %xmm2
2855; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
2856; SSE2-NEXT:    movdqa %xmm1, %xmm0
2857; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2858; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2859; SSE2-NEXT:    retq
2860;
2861; SSSE3-LABEL: load_sext_4i16_to_4i64:
2862; SSSE3:       # %bb.0: # %entry
2863; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2864; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2865; SSSE3-NEXT:    psrad $16, %xmm1
2866; SSSE3-NEXT:    pxor %xmm2, %xmm2
2867; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
2868; SSSE3-NEXT:    movdqa %xmm1, %xmm0
2869; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2870; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2871; SSSE3-NEXT:    retq
2872;
2873; SSE41-LABEL: load_sext_4i16_to_4i64:
2874; SSE41:       # %bb.0: # %entry
2875; SSE41-NEXT:    pmovsxwq (%rdi), %xmm0
2876; SSE41-NEXT:    pmovsxwq 4(%rdi), %xmm1
2877; SSE41-NEXT:    retq
2878;
2879; AVX1-LABEL: load_sext_4i16_to_4i64:
2880; AVX1:       # %bb.0: # %entry
2881; AVX1-NEXT:    vpmovsxwq 4(%rdi), %xmm0
2882; AVX1-NEXT:    vpmovsxwq (%rdi), %xmm1
2883; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2884; AVX1-NEXT:    retq
2885;
2886; AVX2-LABEL: load_sext_4i16_to_4i64:
2887; AVX2:       # %bb.0: # %entry
2888; AVX2-NEXT:    vpmovsxwq (%rdi), %ymm0
2889; AVX2-NEXT:    retq
2890;
2891; AVX512-LABEL: load_sext_4i16_to_4i64:
2892; AVX512:       # %bb.0: # %entry
2893; AVX512-NEXT:    vpmovsxwq (%rdi), %ymm0
2894; AVX512-NEXT:    retq
2895;
2896; X86-SSE2-LABEL: load_sext_4i16_to_4i64:
2897; X86-SSE2:       # %bb.0: # %entry
2898; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
2899; X86-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2900; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2901; X86-SSE2-NEXT:    psrad $16, %xmm1
2902; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
2903; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
2904; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
2905; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2906; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2907; X86-SSE2-NEXT:    retl
2908;
2909; X86-SSE41-LABEL: load_sext_4i16_to_4i64:
2910; X86-SSE41:       # %bb.0: # %entry
2911; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2912; X86-SSE41-NEXT:    pmovsxwq (%eax), %xmm0
2913; X86-SSE41-NEXT:    pmovsxwq 4(%eax), %xmm1
2914; X86-SSE41-NEXT:    retl
2915entry:
2916 %X = load <4 x i16>, <4 x i16>* %ptr
2917 %Y = sext <4 x i16> %X to <4 x i64>
2918 ret <4 x i64> %Y
2919}
2920
; Test: load an <8 x i16> vector from %ptr and sign-extend every element to i32.
; SSE2 and SSSE3 are expected to load the full vector, unpack the low and high
; words into the top of each dword and arithmetic-shift right by 16; SSE4.1 and
; AVX targets are expected to fold the load into (v)pmovsxwd, using one or two
; instructions depending on the available register width.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script instead of editing them by hand.
2921define <8 x i32> @load_sext_8i16_to_8i32(<8 x i16> *%ptr) {
2922; SSE2-LABEL: load_sext_8i16_to_8i32:
2923; SSE2:       # %bb.0: # %entry
2924; SSE2-NEXT:    movdqa (%rdi), %xmm1
2925; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2926; SSE2-NEXT:    psrad $16, %xmm0
2927; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
2928; SSE2-NEXT:    psrad $16, %xmm1
2929; SSE2-NEXT:    retq
2930;
2931; SSSE3-LABEL: load_sext_8i16_to_8i32:
2932; SSSE3:       # %bb.0: # %entry
2933; SSSE3-NEXT:    movdqa (%rdi), %xmm1
2934; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2935; SSSE3-NEXT:    psrad $16, %xmm0
2936; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
2937; SSSE3-NEXT:    psrad $16, %xmm1
2938; SSSE3-NEXT:    retq
2939;
2940; SSE41-LABEL: load_sext_8i16_to_8i32:
2941; SSE41:       # %bb.0: # %entry
2942; SSE41-NEXT:    pmovsxwd (%rdi), %xmm0
2943; SSE41-NEXT:    pmovsxwd 8(%rdi), %xmm1
2944; SSE41-NEXT:    retq
2945;
2946; AVX1-LABEL: load_sext_8i16_to_8i32:
2947; AVX1:       # %bb.0: # %entry
2948; AVX1-NEXT:    vpmovsxwd 8(%rdi), %xmm0
2949; AVX1-NEXT:    vpmovsxwd (%rdi), %xmm1
2950; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2951; AVX1-NEXT:    retq
2952;
2953; AVX2-LABEL: load_sext_8i16_to_8i32:
2954; AVX2:       # %bb.0: # %entry
2955; AVX2-NEXT:    vpmovsxwd (%rdi), %ymm0
2956; AVX2-NEXT:    retq
2957;
2958; AVX512-LABEL: load_sext_8i16_to_8i32:
2959; AVX512:       # %bb.0: # %entry
2960; AVX512-NEXT:    vpmovsxwd (%rdi), %ymm0
2961; AVX512-NEXT:    retq
2962;
2963; X86-SSE2-LABEL: load_sext_8i16_to_8i32:
2964; X86-SSE2:       # %bb.0: # %entry
2965; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
2966; X86-SSE2-NEXT:    movdqa (%eax), %xmm1
2967; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2968; X86-SSE2-NEXT:    psrad $16, %xmm0
2969; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
2970; X86-SSE2-NEXT:    psrad $16, %xmm1
2971; X86-SSE2-NEXT:    retl
2972;
2973; X86-SSE41-LABEL: load_sext_8i16_to_8i32:
2974; X86-SSE41:       # %bb.0: # %entry
2975; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2976; X86-SSE41-NEXT:    pmovsxwd (%eax), %xmm0
2977; X86-SSE41-NEXT:    pmovsxwd 8(%eax), %xmm1
2978; X86-SSE41-NEXT:    retl
2979entry:
2980 %X = load <8 x i16>, <8 x i16>* %ptr
2981 %Y = sext <8 x i16> %X to <8 x i32>
2982 ret <8 x i32> %Y
2983}
2984
; Test: load a <2 x i32> vector from %ptr and sign-extend it to <2 x i64>.
; The assertion comments inside the body are autogenerated (see the NOTE at
; the top of the file) and hold the expected code for each RUN configuration.
2985define <2 x i64> @load_sext_2i32_to_2i64(<2 x i32> *%ptr) {
2986; SSE2-LABEL: load_sext_2i32_to_2i64:
2987; SSE2:       # %bb.0: # %entry
2988; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2989; SSE2-NEXT:    pxor %xmm1, %xmm1
2990; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
2991; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2992; SSE2-NEXT:    retq
2993;
2994; SSSE3-LABEL: load_sext_2i32_to_2i64:
2995; SSSE3:       # %bb.0: # %entry
2996; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2997; SSSE3-NEXT:    pxor %xmm1, %xmm1
2998; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
2999; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3000; SSSE3-NEXT:    retq
3001;
3002; SSE41-LABEL: load_sext_2i32_to_2i64:
3003; SSE41:       # %bb.0: # %entry
3004; SSE41-NEXT:    pmovsxdq (%rdi), %xmm0
3005; SSE41-NEXT:    retq
3006;
3007; AVX-LABEL: load_sext_2i32_to_2i64:
3008; AVX:       # %bb.0: # %entry
3009; AVX-NEXT:    vpmovsxdq (%rdi), %xmm0
3010; AVX-NEXT:    retq
3011;
3012; X86-SSE2-LABEL: load_sext_2i32_to_2i64:
3013; X86-SSE2:       # %bb.0: # %entry
3014; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
3015; X86-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
3016; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
3017; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
3018; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3019; X86-SSE2-NEXT:    retl
3020;
3021; X86-SSE41-LABEL: load_sext_2i32_to_2i64:
3022; X86-SSE41:       # %bb.0: # %entry
3023; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
3024; X86-SSE41-NEXT:    pmovsxdq (%eax), %xmm0
3025; X86-SSE41-NEXT:    retl
3026entry:
3027 %X = load <2 x i32>, <2 x i32>* %ptr
3028 %Y = sext <2 x i32> %X to <2 x i64>
3029 ret <2 x i64> %Y
3030}
3031
; Test: load a <4 x i32> vector from %ptr and sign-extend it to <4 x i64>.
; The assertion comments inside the body are autogenerated (see the NOTE at
; the top of the file) and hold the expected code for each RUN configuration.
3032define <4 x i64> @load_sext_4i32_to_4i64(<4 x i32> *%ptr) {
3033; SSE2-LABEL: load_sext_4i32_to_4i64:
3034; SSE2:       # %bb.0: # %entry
3035; SSE2-NEXT:    movdqa (%rdi), %xmm0
3036; SSE2-NEXT:    pxor %xmm2, %xmm2
3037; SSE2-NEXT:    pxor %xmm3, %xmm3
3038; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
3039; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3040; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
3041; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
3042; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3043; SSE2-NEXT:    retq
3044;
3045; SSSE3-LABEL: load_sext_4i32_to_4i64:
3046; SSSE3:       # %bb.0: # %entry
3047; SSSE3-NEXT:    movdqa (%rdi), %xmm0
3048; SSSE3-NEXT:    pxor %xmm2, %xmm2
3049; SSSE3-NEXT:    pxor %xmm3, %xmm3
3050; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm3
3051; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3052; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
3053; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
3054; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3055; SSSE3-NEXT:    retq
3056;
3057; SSE41-LABEL: load_sext_4i32_to_4i64:
3058; SSE41:       # %bb.0: # %entry
3059; SSE41-NEXT:    pmovsxdq (%rdi), %xmm0
3060; SSE41-NEXT:    pmovsxdq 8(%rdi), %xmm1
3061; SSE41-NEXT:    retq
3062;
3063; AVX1-LABEL: load_sext_4i32_to_4i64:
3064; AVX1:       # %bb.0: # %entry
3065; AVX1-NEXT:    vpmovsxdq 8(%rdi), %xmm0
3066; AVX1-NEXT:    vpmovsxdq (%rdi), %xmm1
3067; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
3068; AVX1-NEXT:    retq
3069;
3070; AVX2-LABEL: load_sext_4i32_to_4i64:
3071; AVX2:       # %bb.0: # %entry
3072; AVX2-NEXT:    vpmovsxdq (%rdi), %ymm0
3073; AVX2-NEXT:    retq
3074;
3075; AVX512-LABEL: load_sext_4i32_to_4i64:
3076; AVX512:       # %bb.0: # %entry
3077; AVX512-NEXT:    vpmovsxdq (%rdi), %ymm0
3078; AVX512-NEXT:    retq
3079;
3080; X86-SSE2-LABEL: load_sext_4i32_to_4i64:
3081; X86-SSE2:       # %bb.0: # %entry
3082; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
3083; X86-SSE2-NEXT:    movdqa (%eax), %xmm0
3084; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
3085; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
3086; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
3087; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3088; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
3089; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
3090; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3091; X86-SSE2-NEXT:    retl
3092;
3093; X86-SSE41-LABEL: load_sext_4i32_to_4i64:
3094; X86-SSE41:       # %bb.0: # %entry
3095; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
3096; X86-SSE41-NEXT:    pmovsxdq (%eax), %xmm0
3097; X86-SSE41-NEXT:    pmovsxdq 8(%eax), %xmm1
3098; X86-SSE41-NEXT:    retl
3099entry:
3100 %X = load <4 x i32>, <4 x i32>* %ptr
3101 %Y = sext <4 x i32> %X to <4 x i64>
3102 ret <4 x i64> %Y
3103}
3104
; Test: take the first two i8 lanes of %A (shufflevector indices 0 and 1),
; sign-extend them to <2 x i16>, and return the pair bitcast to a single i32.
; The assertion comments inside the body are autogenerated (see the NOTE at
; the top of the file) and hold the expected code for each RUN configuration.
3105define i32 @sext_2i8_to_i32(<16 x i8> %A) nounwind uwtable readnone ssp {
3106; SSE2-LABEL: sext_2i8_to_i32:
3107; SSE2:       # %bb.0: # %entry
3108; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3109; SSE2-NEXT:    psraw $8, %xmm0
3110; SSE2-NEXT:    movd %xmm0, %eax
3111; SSE2-NEXT:    retq
3112;
3113; SSSE3-LABEL: sext_2i8_to_i32:
3114; SSSE3:       # %bb.0: # %entry
3115; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3116; SSSE3-NEXT:    psraw $8, %xmm0
3117; SSSE3-NEXT:    movd %xmm0, %eax
3118; SSSE3-NEXT:    retq
3119;
3120; SSE41-LABEL: sext_2i8_to_i32:
3121; SSE41:       # %bb.0: # %entry
3122; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
3123; SSE41-NEXT:    movd %xmm0, %eax
3124; SSE41-NEXT:    retq
3125;
3126; AVX-LABEL: sext_2i8_to_i32:
3127; AVX:       # %bb.0: # %entry
3128; AVX-NEXT:    vpmovsxbw %xmm0, %xmm0
3129; AVX-NEXT:    vmovd %xmm0, %eax
3130; AVX-NEXT:    retq
3131;
3132; X86-SSE2-LABEL: sext_2i8_to_i32:
3133; X86-SSE2:       # %bb.0: # %entry
3134; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3135; X86-SSE2-NEXT:    psraw $8, %xmm0
3136; X86-SSE2-NEXT:    movd %xmm0, %eax
3137; X86-SSE2-NEXT:    retl
3138;
3139; X86-SSE41-LABEL: sext_2i8_to_i32:
3140; X86-SSE41:       # %bb.0: # %entry
3141; X86-SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
3142; X86-SSE41-NEXT:    movd %xmm0, %eax
3143; X86-SSE41-NEXT:    retl
3144entry:
3145  %Shuf = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
3146  %Ex = sext <2 x i8> %Shuf to <2 x i16>
3147  %Bc = bitcast <2 x i16> %Ex to i32
3148  ret i32 %Bc
3149}
3150
; Test: sign-extend a <4 x i1> mask argument to <4 x i64>.
; Every expected sequence below begins with a shift-left-by-31 followed by an
; arithmetic shift-right-by-31 on the 32-bit lanes before the widening step.
; The assertion comments inside the body are autogenerated (see the NOTE at
; the top of the file) and hold the expected code for each RUN configuration.
3151define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
3152; SSE2-LABEL: sext_4i1_to_4i64:
3153; SSE2:       # %bb.0:
3154; SSE2-NEXT:    pslld $31, %xmm0
3155; SSE2-NEXT:    psrad $31, %xmm0
3156; SSE2-NEXT:    pxor %xmm2, %xmm2
3157; SSE2-NEXT:    pxor %xmm3, %xmm3
3158; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
3159; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3160; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
3161; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
3162; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3163; SSE2-NEXT:    retq
3164;
3165; SSSE3-LABEL: sext_4i1_to_4i64:
3166; SSSE3:       # %bb.0:
3167; SSSE3-NEXT:    pslld $31, %xmm0
3168; SSSE3-NEXT:    psrad $31, %xmm0
3169; SSSE3-NEXT:    pxor %xmm2, %xmm2
3170; SSSE3-NEXT:    pxor %xmm3, %xmm3
3171; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm3
3172; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3173; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
3174; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
3175; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3176; SSSE3-NEXT:    retq
3177;
3178; SSE41-LABEL: sext_4i1_to_4i64:
3179; SSE41:       # %bb.0:
3180; SSE41-NEXT:    pslld $31, %xmm0
3181; SSE41-NEXT:    psrad $31, %xmm0
3182; SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
3183; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
3184; SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
3185; SSE41-NEXT:    movdqa %xmm2, %xmm0
3186; SSE41-NEXT:    retq
3187;
3188; AVX1-LABEL: sext_4i1_to_4i64:
3189; AVX1:       # %bb.0:
3190; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
3191; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
3192; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
3193; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
3194; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
3195; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
3196; AVX1-NEXT:    retq
3197;
3198; AVX2-LABEL: sext_4i1_to_4i64:
3199; AVX2:       # %bb.0:
3200; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
3201; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
3202; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
3203; AVX2-NEXT:    retq
3204;
3205; AVX512-LABEL: sext_4i1_to_4i64:
3206; AVX512:       # %bb.0:
3207; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
3208; AVX512-NEXT:    vpsrad $31, %xmm0, %xmm0
3209; AVX512-NEXT:    vpmovsxdq %xmm0, %ymm0
3210; AVX512-NEXT:    retq
3211;
3212; X86-SSE2-LABEL: sext_4i1_to_4i64:
3213; X86-SSE2:       # %bb.0:
3214; X86-SSE2-NEXT:    pslld $31, %xmm0
3215; X86-SSE2-NEXT:    psrad $31, %xmm0
3216; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
3217; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
3218; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
3219; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3220; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
3221; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
3222; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3223; X86-SSE2-NEXT:    retl
3224;
3225; X86-SSE41-LABEL: sext_4i1_to_4i64:
3226; X86-SSE41:       # %bb.0:
3227; X86-SSE41-NEXT:    pslld $31, %xmm0
3228; X86-SSE41-NEXT:    psrad $31, %xmm0
3229; X86-SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
3230; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
3231; X86-SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
3232; X86-SSE41-NEXT:    movdqa %xmm2, %xmm0
3233; X86-SSE41-NEXT:    retl
3234  %extmask = sext <4 x i1> %mask to <4 x i64>
3235  ret <4 x i64> %extmask
3236}
3237
; Test: sign-extend a <4 x i8> vector argument directly to <4 x i64>.
; The assertion comments inside the body are autogenerated (see the NOTE at
; the top of the file) and hold the expected code for each RUN configuration.
3238define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) {
3239; SSE2-LABEL: sext_4i8_to_4i64:
3240; SSE2:       # %bb.0:
3241; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3242; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3243; SSE2-NEXT:    psrad $24, %xmm1
3244; SSE2-NEXT:    pxor %xmm2, %xmm2
3245; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
3246; SSE2-NEXT:    movdqa %xmm1, %xmm0
3247; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
3248; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
3249; SSE2-NEXT:    retq
3250;
3251; SSSE3-LABEL: sext_4i8_to_4i64:
3252; SSSE3:       # %bb.0:
3253; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3254; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3255; SSSE3-NEXT:    psrad $24, %xmm1
3256; SSSE3-NEXT:    pxor %xmm2, %xmm2
3257; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
3258; SSSE3-NEXT:    movdqa %xmm1, %xmm0
3259; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
3260; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
3261; SSSE3-NEXT:    retq
3262;
3263; SSE41-LABEL: sext_4i8_to_4i64:
3264; SSE41:       # %bb.0:
3265; SSE41-NEXT:    pmovsxbq %xmm0, %xmm2
3266; SSE41-NEXT:    psrld $16, %xmm0
3267; SSE41-NEXT:    pmovsxbq %xmm0, %xmm1
3268; SSE41-NEXT:    movdqa %xmm2, %xmm0
3269; SSE41-NEXT:    retq
3270;
3271; AVX1-LABEL: sext_4i8_to_4i64:
3272; AVX1:       # %bb.0:
3273; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm1
3274; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
3275; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm0
3276; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
3277; AVX1-NEXT:    retq
3278;
3279; AVX2-LABEL: sext_4i8_to_4i64:
3280; AVX2:       # %bb.0:
3281; AVX2-NEXT:    vpmovsxbq %xmm0, %ymm0
3282; AVX2-NEXT:    retq
3283;
3284; AVX512-LABEL: sext_4i8_to_4i64:
3285; AVX512:       # %bb.0:
3286; AVX512-NEXT:    vpmovsxbq %xmm0, %ymm0
3287; AVX512-NEXT:    retq
3288;
3289; X86-SSE2-LABEL: sext_4i8_to_4i64:
3290; X86-SSE2:       # %bb.0:
3291; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3292; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3293; X86-SSE2-NEXT:    psrad $24, %xmm1
3294; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
3295; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
3296; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
3297; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
3298; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
3299; X86-SSE2-NEXT:    retl
3300;
3301; X86-SSE41-LABEL: sext_4i8_to_4i64:
3302; X86-SSE41:       # %bb.0:
3303; X86-SSE41-NEXT:    pmovsxbq %xmm0, %xmm2
3304; X86-SSE41-NEXT:    psrld $16, %xmm0
3305; X86-SSE41-NEXT:    pmovsxbq %xmm0, %xmm1
3306; X86-SSE41-NEXT:    movdqa %xmm2, %xmm0
3307; X86-SSE41-NEXT:    retl
3308  %extmask = sext <4 x i8> %mask to <4 x i64>
3309  ret <4 x i64> %extmask
3310}
3311
; Test: compare two <32 x i16> arguments for equality and sign-extend the
; resulting <32 x i1> to <32 x i8>.
; The assertion comments inside the body are autogenerated (see the NOTE at
; the top of the file) and hold the expected code for each RUN configuration.
3312define <32 x i8> @sext_32xi1_to_32xi8(<32 x i16> %c1, <32 x i16> %c2)nounwind {
3313; SSE-LABEL: sext_32xi1_to_32xi8:
3314; SSE:       # %bb.0:
3315; SSE-NEXT:    pcmpeqw %xmm5, %xmm1
3316; SSE-NEXT:    pcmpeqw %xmm4, %xmm0
3317; SSE-NEXT:    packsswb %xmm1, %xmm0
3318; SSE-NEXT:    pcmpeqw %xmm7, %xmm3
3319; SSE-NEXT:    pcmpeqw %xmm6, %xmm2
3320; SSE-NEXT:    packsswb %xmm3, %xmm2
3321; SSE-NEXT:    movdqa %xmm2, %xmm1
3322; SSE-NEXT:    retq
3323;
3324; AVX1-LABEL: sext_32xi1_to_32xi8:
3325; AVX1:       # %bb.0:
3326; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
3327; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
3328; AVX1-NEXT:    vpcmpeqw %xmm4, %xmm5, %xmm4
3329; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm1, %xmm1
3330; AVX1-NEXT:    vpacksswb %xmm4, %xmm1, %xmm1
3331; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
3332; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
3333; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm4, %xmm3
3334; AVX1-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm0
3335; AVX1-NEXT:    vpacksswb %xmm3, %xmm0, %xmm0
3336; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
3337; AVX1-NEXT:    retq
3338;
3339; AVX2-LABEL: sext_32xi1_to_32xi8:
3340; AVX2:       # %bb.0:
3341; AVX2-NEXT:    vpcmpeqw %ymm3, %ymm1, %ymm1
3342; AVX2-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm0
3343; AVX2-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
3344; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
3345; AVX2-NEXT:    retq
3346;
3347; AVX512F-LABEL: sext_32xi1_to_32xi8:
3348; AVX512F:       # %bb.0:
3349; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
3350; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
3351; AVX512F-NEXT:    vpcmpeqw %ymm2, %ymm3, %ymm2
3352; AVX512F-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
3353; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3354; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
3355; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
3356; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
3357; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
3358; AVX512F-NEXT:    retq
3359;
3360; AVX512BW-LABEL: sext_32xi1_to_32xi8:
3361; AVX512BW:       # %bb.0:
3362; AVX512BW-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0
3363; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
3364; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
3365; AVX512BW-NEXT:    retq
3366;
3367; X86-SSE-LABEL: sext_32xi1_to_32xi8:
3368; X86-SSE:       # %bb.0:
3369; X86-SSE-NEXT:    pushl %ebp
3370; X86-SSE-NEXT:    movl %esp, %ebp
3371; X86-SSE-NEXT:    andl $-16, %esp
3372; X86-SSE-NEXT:    subl $16, %esp
3373; X86-SSE-NEXT:    movdqa 8(%ebp), %xmm3
3374; X86-SSE-NEXT:    pcmpeqw 40(%ebp), %xmm1
3375; X86-SSE-NEXT:    pcmpeqw 24(%ebp), %xmm0
3376; X86-SSE-NEXT:    packsswb %xmm1, %xmm0
3377; X86-SSE-NEXT:    pcmpeqw 72(%ebp), %xmm3
3378; X86-SSE-NEXT:    pcmpeqw 56(%ebp), %xmm2
3379; X86-SSE-NEXT:    packsswb %xmm3, %xmm2
3380; X86-SSE-NEXT:    movdqa %xmm2, %xmm1
3381; X86-SSE-NEXT:    movl %ebp, %esp
3382; X86-SSE-NEXT:    popl %ebp
3383; X86-SSE-NEXT:    retl
3384  %a = icmp eq <32 x i16> %c1, %c2
3385  %b = sext <32 x i1> %a to <32 x i8>
3386  ret <32 x i8> %b
3387}
3388
; Test: load a <2 x i8> vector from %addr (align 1), sign-extend it to
; <2 x i32>, and return the extended value added to itself.
; The assertion comments inside the body are autogenerated (see the NOTE at
; the top of the file) and hold the expected code for each RUN configuration.
3389define <2 x i32> @sext_2i8_to_2i32(<2 x i8>* %addr) {
3390; SSE2-LABEL: sext_2i8_to_2i32:
3391; SSE2:       # %bb.0:
3392; SSE2-NEXT:    movzwl (%rdi), %eax
3393; SSE2-NEXT:    movd %eax, %xmm0
3394; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3395; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
3396; SSE2-NEXT:    psrad $24, %xmm0
3397; SSE2-NEXT:    paddd %xmm0, %xmm0
3398; SSE2-NEXT:    retq
3399;
3400; SSSE3-LABEL: sext_2i8_to_2i32:
3401; SSSE3:       # %bb.0:
3402; SSSE3-NEXT:    movzwl (%rdi), %eax
3403; SSSE3-NEXT:    movd %eax, %xmm0
3404; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3405; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
3406; SSSE3-NEXT:    psrad $24, %xmm0
3407; SSSE3-NEXT:    paddd %xmm0, %xmm0
3408; SSSE3-NEXT:    retq
3409;
3410; SSE41-LABEL: sext_2i8_to_2i32:
3411; SSE41:       # %bb.0:
3412; SSE41-NEXT:    movzwl (%rdi), %eax
3413; SSE41-NEXT:    movd %eax, %xmm0
3414; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
3415; SSE41-NEXT:    paddd %xmm0, %xmm0
3416; SSE41-NEXT:    retq
3417;
3418; AVX-LABEL: sext_2i8_to_2i32:
3419; AVX:       # %bb.0:
3420; AVX-NEXT:    movzwl (%rdi), %eax
3421; AVX-NEXT:    vmovd %eax, %xmm0
3422; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
3423; AVX-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
3424; AVX-NEXT:    retq
3425;
3426; X86-SSE2-LABEL: sext_2i8_to_2i32:
3427; X86-SSE2:       # %bb.0:
3428; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
3429; X86-SSE2-NEXT:    movzwl (%eax), %eax
3430; X86-SSE2-NEXT:    movd %eax, %xmm0
3431; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3432; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
3433; X86-SSE2-NEXT:    psrad $24, %xmm0
3434; X86-SSE2-NEXT:    paddd %xmm0, %xmm0
3435; X86-SSE2-NEXT:    retl
3436;
3437; X86-SSE41-LABEL: sext_2i8_to_2i32:
3438; X86-SSE41:       # %bb.0:
3439; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
3440; X86-SSE41-NEXT:    movzwl (%eax), %eax
3441; X86-SSE41-NEXT:    movd %eax, %xmm0
3442; X86-SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
3443; X86-SSE41-NEXT:    paddd %xmm0, %xmm0
3444; X86-SSE41-NEXT:    retl
3445  %x = load <2 x i8>, <2 x i8>* %addr, align 1
3446  %y = sext <2 x i8> %x to <2 x i32>
3447  %z = add <2 x i32>%y, %y
3448  ret <2 x i32>%z
3449}
3450
; Test: load a <4 x i17> vector (element width is not a power of two) from
; %ptr and sign-extend it to <4 x i32>.
; The assertion comments inside the body are autogenerated (see the NOTE at
; the top of the file) and hold the expected code for each RUN configuration.
3451define <4 x i32> @sext_4i17_to_4i32(<4 x i17>* %ptr) {
3452; SSE2-LABEL: sext_4i17_to_4i32:
3453; SSE2:       # %bb.0:
3454; SSE2-NEXT:    movq (%rdi), %rax
3455; SSE2-NEXT:    movl %eax, %ecx
3456; SSE2-NEXT:    shll $15, %ecx
3457; SSE2-NEXT:    sarl $15, %ecx
3458; SSE2-NEXT:    movd %ecx, %xmm0
3459; SSE2-NEXT:    movq %rax, %rcx
3460; SSE2-NEXT:    shrq $17, %rcx
3461; SSE2-NEXT:    shll $15, %ecx
3462; SSE2-NEXT:    sarl $15, %ecx
3463; SSE2-NEXT:    movd %ecx, %xmm1
3464; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3465; SSE2-NEXT:    movl 8(%rdi), %ecx
3466; SSE2-NEXT:    shll $13, %ecx
3467; SSE2-NEXT:    movq %rax, %rdx
3468; SSE2-NEXT:    shrq $51, %rdx
3469; SSE2-NEXT:    orl %ecx, %edx
3470; SSE2-NEXT:    shll $15, %edx
3471; SSE2-NEXT:    sarl $15, %edx
3472; SSE2-NEXT:    movd %edx, %xmm1
3473; SSE2-NEXT:    shrq $34, %rax
3474; SSE2-NEXT:    shll $15, %eax
3475; SSE2-NEXT:    sarl $15, %eax
3476; SSE2-NEXT:    movd %eax, %xmm2
3477; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
3478; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
3479; SSE2-NEXT:    retq
3480;
3481; SSSE3-LABEL: sext_4i17_to_4i32:
3482; SSSE3:       # %bb.0:
3483; SSSE3-NEXT:    movq (%rdi), %rax
3484; SSSE3-NEXT:    movl %eax, %ecx
3485; SSSE3-NEXT:    shll $15, %ecx
3486; SSSE3-NEXT:    sarl $15, %ecx
3487; SSSE3-NEXT:    movd %ecx, %xmm0
3488; SSSE3-NEXT:    movq %rax, %rcx
3489; SSSE3-NEXT:    shrq $17, %rcx
3490; SSSE3-NEXT:    shll $15, %ecx
3491; SSSE3-NEXT:    sarl $15, %ecx
3492; SSSE3-NEXT:    movd %ecx, %xmm1
3493; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3494; SSSE3-NEXT:    movl 8(%rdi), %ecx
3495; SSSE3-NEXT:    shll $13, %ecx
3496; SSSE3-NEXT:    movq %rax, %rdx
3497; SSSE3-NEXT:    shrq $51, %rdx
3498; SSSE3-NEXT:    orl %ecx, %edx
3499; SSSE3-NEXT:    shll $15, %edx
3500; SSSE3-NEXT:    sarl $15, %edx
3501; SSSE3-NEXT:    movd %edx, %xmm1
3502; SSSE3-NEXT:    shrq $34, %rax
3503; SSSE3-NEXT:    shll $15, %eax
3504; SSSE3-NEXT:    sarl $15, %eax
3505; SSSE3-NEXT:    movd %eax, %xmm2
3506; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
3507; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
3508; SSSE3-NEXT:    retq
3509;
3510; SSE41-LABEL: sext_4i17_to_4i32:
3511; SSE41:       # %bb.0:
3512; SSE41-NEXT:    movq (%rdi), %rax
3513; SSE41-NEXT:    movq %rax, %rcx
3514; SSE41-NEXT:    shrq $17, %rcx
3515; SSE41-NEXT:    shll $15, %ecx
3516; SSE41-NEXT:    sarl $15, %ecx
3517; SSE41-NEXT:    movl %eax, %edx
3518; SSE41-NEXT:    shll $15, %edx
3519; SSE41-NEXT:    sarl $15, %edx
3520; SSE41-NEXT:    movd %edx, %xmm0
3521; SSE41-NEXT:    pinsrd $1, %ecx, %xmm0
3522; SSE41-NEXT:    movq %rax, %rcx
3523; SSE41-NEXT:    shrq $34, %rcx
3524; SSE41-NEXT:    shll $15, %ecx
3525; SSE41-NEXT:    sarl $15, %ecx
3526; SSE41-NEXT:    pinsrd $2, %ecx, %xmm0
3527; SSE41-NEXT:    movl 8(%rdi), %ecx
3528; SSE41-NEXT:    shll $13, %ecx
3529; SSE41-NEXT:    shrq $51, %rax
3530; SSE41-NEXT:    orl %ecx, %eax
3531; SSE41-NEXT:    shll $15, %eax
3532; SSE41-NEXT:    sarl $15, %eax
3533; SSE41-NEXT:    pinsrd $3, %eax, %xmm0
3534; SSE41-NEXT:    retq
3535;
3536; AVX-LABEL: sext_4i17_to_4i32:
3537; AVX:       # %bb.0:
3538; AVX-NEXT:    movq (%rdi), %rax
3539; AVX-NEXT:    movq %rax, %rcx
3540; AVX-NEXT:    shrq $17, %rcx
3541; AVX-NEXT:    shll $15, %ecx
3542; AVX-NEXT:    sarl $15, %ecx
3543; AVX-NEXT:    movl %eax, %edx
3544; AVX-NEXT:    shll $15, %edx
3545; AVX-NEXT:    sarl $15, %edx
3546; AVX-NEXT:    vmovd %edx, %xmm0
3547; AVX-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
3548; AVX-NEXT:    movq %rax, %rcx
3549; AVX-NEXT:    shrq $34, %rcx
3550; AVX-NEXT:    shll $15, %ecx
3551; AVX-NEXT:    sarl $15, %ecx
3552; AVX-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
3553; AVX-NEXT:    movl 8(%rdi), %ecx
3554; AVX-NEXT:    shll $13, %ecx
3555; AVX-NEXT:    shrq $51, %rax
3556; AVX-NEXT:    orl %ecx, %eax
3557; AVX-NEXT:    shll $15, %eax
3558; AVX-NEXT:    sarl $15, %eax
3559; AVX-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
3560; AVX-NEXT:    retq
3561;
3562; X86-SSE2-LABEL: sext_4i17_to_4i32:
3563; X86-SSE2:       # %bb.0:
3564; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
3565; X86-SSE2-NEXT:    movl (%eax), %ecx
3566; X86-SSE2-NEXT:    movl 4(%eax), %edx
3567; X86-SSE2-NEXT:    movl 8(%eax), %eax
3568; X86-SSE2-NEXT:    shldl $13, %edx, %eax
3569; X86-SSE2-NEXT:    shll $15, %eax
3570; X86-SSE2-NEXT:    sarl $15, %eax
3571; X86-SSE2-NEXT:    movd %eax, %xmm0
3572; X86-SSE2-NEXT:    movl %edx, %eax
3573; X86-SSE2-NEXT:    shll $13, %eax
3574; X86-SSE2-NEXT:    sarl $15, %eax
3575; X86-SSE2-NEXT:    movd %eax, %xmm1
3576; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3577; X86-SSE2-NEXT:    shldl $15, %ecx, %edx
3578; X86-SSE2-NEXT:    shll $15, %ecx
3579; X86-SSE2-NEXT:    sarl $15, %ecx
3580; X86-SSE2-NEXT:    movd %ecx, %xmm0
3581; X86-SSE2-NEXT:    shll $15, %edx
3582; X86-SSE2-NEXT:    sarl $15, %edx
3583; X86-SSE2-NEXT:    movd %edx, %xmm2
3584; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
3585; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3586; X86-SSE2-NEXT:    retl
3587;
3588; X86-SSE41-LABEL: sext_4i17_to_4i32:
3589; X86-SSE41:       # %bb.0:
3590; X86-SSE41-NEXT:    pushl %esi
3591; X86-SSE41-NEXT:    .cfi_def_cfa_offset 8
3592; X86-SSE41-NEXT:    .cfi_offset %esi, -8
3593; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
3594; X86-SSE41-NEXT:    movl (%eax), %ecx
3595; X86-SSE41-NEXT:    movl 4(%eax), %edx
3596; X86-SSE41-NEXT:    movl %edx, %esi
3597; X86-SSE41-NEXT:    movl 8(%eax), %eax
3598; X86-SSE41-NEXT:    shldl $13, %edx, %eax
3599; X86-SSE41-NEXT:    shldl $15, %ecx, %edx
3600; X86-SSE41-NEXT:    shll $15, %edx
3601; X86-SSE41-NEXT:    sarl $15, %edx
3602; X86-SSE41-NEXT:    shll $15, %ecx
3603; X86-SSE41-NEXT:    sarl $15, %ecx
3604; X86-SSE41-NEXT:    movd %ecx, %xmm0
3605; X86-SSE41-NEXT:    pinsrd $1, %edx, %xmm0
3606; X86-SSE41-NEXT:    shll $13, %esi
3607; X86-SSE41-NEXT:    sarl $15, %esi
3608; X86-SSE41-NEXT:    pinsrd $2, %esi, %xmm0
3609; X86-SSE41-NEXT:    shll $15, %eax
3610; X86-SSE41-NEXT:    sarl $15, %eax
3611; X86-SSE41-NEXT:    pinsrd $3, %eax, %xmm0
3612; X86-SSE41-NEXT:    popl %esi
3613; X86-SSE41-NEXT:    .cfi_def_cfa_offset 4
3614; X86-SSE41-NEXT:    retl
3615  %a = load <4 x i17>, <4 x i17>* %ptr
3616  %b = sext <4 x i17> %a to <4 x i32>
3617  ret <4 x i32> %b
3618}
3619
3620define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp {
3621; SSE2-LABEL: sext_8i6_to_8i64:
3622; SSE2:       # %bb.0: # %entry
3623; SSE2-NEXT:    movd %edi, %xmm0
3624; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
3625; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
3626; SSE2-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
3627; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0]
3628; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
3629; SSE2-NEXT:    psllq $58, %xmm0
3630; SSE2-NEXT:    movdqa %xmm0, %xmm1
3631; SSE2-NEXT:    psrad $31, %xmm1
3632; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
3633; SSE2-NEXT:    psrad $26, %xmm0
3634; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
3635; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3636; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
3637; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
3638; SSE2-NEXT:    psllq $58, %xmm1
3639; SSE2-NEXT:    movdqa %xmm1, %xmm2
3640; SSE2-NEXT:    psrad $31, %xmm2
3641; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
3642; SSE2-NEXT:    psrad $26, %xmm1
3643; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
3644; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3645; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
3646; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
3647; SSE2-NEXT:    psllq $58, %xmm2
3648; SSE2-NEXT:    movdqa %xmm2, %xmm4
3649; SSE2-NEXT:    psrad $31, %xmm4
3650; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
3651; SSE2-NEXT:    psrad $26, %xmm2
3652; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
3653; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
3654; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
3655; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
3656; SSE2-NEXT:    psllq $58, %xmm3
3657; SSE2-NEXT:    movdqa %xmm3, %xmm4
3658; SSE2-NEXT:    psrad $31, %xmm4
3659; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
3660; SSE2-NEXT:    psrad $26, %xmm3
3661; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
3662; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
3663; SSE2-NEXT:    retq
3664;
3665; SSSE3-LABEL: sext_8i6_to_8i64:
3666; SSSE3:       # %bb.0: # %entry
3667; SSSE3-NEXT:    movd %edi, %xmm0
3668; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
3669; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
3670; SSSE3-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
3671; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0]
3672; SSSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
3673; SSSE3-NEXT:    psllq $58, %xmm0
3674; SSSE3-NEXT:    movdqa %xmm0, %xmm1
3675; SSSE3-NEXT:    psrad $31, %xmm1
3676; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
3677; SSSE3-NEXT:    psrad $26, %xmm0
3678; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
3679; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3680; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
3681; SSSE3-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
3682; SSSE3-NEXT:    psllq $58, %xmm1
3683; SSSE3-NEXT:    movdqa %xmm1, %xmm2
3684; SSSE3-NEXT:    psrad $31, %xmm2
3685; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
3686; SSSE3-NEXT:    psrad $26, %xmm1
3687; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
3688; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3689; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
3690; SSSE3-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
3691; SSSE3-NEXT:    psllq $58, %xmm2
3692; SSSE3-NEXT:    movdqa %xmm2, %xmm4
3693; SSSE3-NEXT:    psrad $31, %xmm4
3694; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
3695; SSSE3-NEXT:    psrad $26, %xmm2
3696; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
3697; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
3698; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
3699; SSSE3-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
3700; SSSE3-NEXT:    psllq $58, %xmm3
3701; SSSE3-NEXT:    movdqa %xmm3, %xmm4
3702; SSSE3-NEXT:    psrad $31, %xmm4
3703; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
3704; SSSE3-NEXT:    psrad $26, %xmm3
3705; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
3706; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
3707; SSSE3-NEXT:    retq
3708;
3709; SSE41-LABEL: sext_8i6_to_8i64:
3710; SSE41:       # %bb.0: # %entry
3711; SSE41-NEXT:    movd %edi, %xmm0
3712; SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
3713; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
3714; SSE41-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
3715; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
3716; SSE41-NEXT:    psllq $58, %xmm0
3717; SSE41-NEXT:    movdqa %xmm0, %xmm1
3718; SSE41-NEXT:    psrad $31, %xmm1
3719; SSE41-NEXT:    psrad $26, %xmm0
3720; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
3721; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
3722; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
3723; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
3724; SSE41-NEXT:    psllq $58, %xmm1
3725; SSE41-NEXT:    movdqa %xmm1, %xmm2
3726; SSE41-NEXT:    psrad $31, %xmm2
3727; SSE41-NEXT:    psrad $26, %xmm1
3728; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
3729; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
3730; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
3731; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
3732; SSE41-NEXT:    psllq $58, %xmm2
3733; SSE41-NEXT:    movdqa %xmm2, %xmm4
3734; SSE41-NEXT:    psrad $31, %xmm4
3735; SSE41-NEXT:    psrad $26, %xmm2
3736; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
3737; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
3738; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
3739; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
3740; SSE41-NEXT:    psllq $58, %xmm3
3741; SSE41-NEXT:    movdqa %xmm3, %xmm4
3742; SSE41-NEXT:    psrad $31, %xmm4
3743; SSE41-NEXT:    psrad $26, %xmm3
3744; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
3745; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
3746; SSE41-NEXT:    retq
3747;
3748; AVX1-LABEL: sext_8i6_to_8i64:
3749; AVX1:       # %bb.0: # %entry
3750; AVX1-NEXT:    vmovd %edi, %xmm0
3751; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
3752; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3753; AVX1-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3754; AVX1-NEXT:    vpsllw $10, %xmm0, %xmm0
3755; AVX1-NEXT:    vpsraw $10, %xmm0, %xmm1
3756; AVX1-NEXT:    vpmovsxwq %xmm1, %xmm0
3757; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
3758; AVX1-NEXT:    vpmovsxwq %xmm2, %xmm2
3759; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
3760; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
3761; AVX1-NEXT:    vpmovsxwq %xmm2, %xmm2
3762; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
3763; AVX1-NEXT:    vpmovsxwq %xmm1, %xmm1
3764; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
3765; AVX1-NEXT:    retq
3766;
3767; AVX2-LABEL: sext_8i6_to_8i64:
3768; AVX2:       # %bb.0: # %entry
3769; AVX2-NEXT:    vmovd %edi, %xmm0
3770; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
3771; AVX2-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3772; AVX2-NEXT:    vpsllw $10, %xmm0, %xmm0
3773; AVX2-NEXT:    vpsraw $10, %xmm0, %xmm1
3774; AVX2-NEXT:    vpmovsxwq %xmm1, %ymm0
3775; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
3776; AVX2-NEXT:    vpmovsxwq %xmm1, %ymm1
3777; AVX2-NEXT:    retq
3778;
3779; AVX512-LABEL: sext_8i6_to_8i64:
3780; AVX512:       # %bb.0: # %entry
3781; AVX512-NEXT:    vmovd %edi, %xmm0
3782; AVX512-NEXT:    vpbroadcastw %xmm0, %xmm0
3783; AVX512-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3784; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
3785; AVX512-NEXT:    vpsllq $58, %zmm0, %zmm0
3786; AVX512-NEXT:    vpsraq $58, %zmm0, %zmm0
3787; AVX512-NEXT:    retq
3788;
3789; X86-SSE2-LABEL: sext_8i6_to_8i64:
3790; X86-SSE2:       # %bb.0: # %entry
3791; X86-SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
3792; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
3793; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
3794; X86-SSE2-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
3795; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0]
3796; X86-SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
3797; X86-SSE2-NEXT:    psllq $58, %xmm0
3798; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
3799; X86-SSE2-NEXT:    psrad $31, %xmm1
3800; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
3801; X86-SSE2-NEXT:    psrad $26, %xmm0
3802; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
3803; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3804; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
3805; X86-SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
3806; X86-SSE2-NEXT:    psllq $58, %xmm1
3807; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
3808; X86-SSE2-NEXT:    psrad $31, %xmm2
3809; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
3810; X86-SSE2-NEXT:    psrad $26, %xmm1
3811; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
3812; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3813; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
3814; X86-SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
3815; X86-SSE2-NEXT:    psllq $58, %xmm2
3816; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
3817; X86-SSE2-NEXT:    psrad $31, %xmm4
3818; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
3819; X86-SSE2-NEXT:    psrad $26, %xmm2
3820; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
3821; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
3822; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
3823; X86-SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
3824; X86-SSE2-NEXT:    psllq $58, %xmm3
3825; X86-SSE2-NEXT:    movdqa %xmm3, %xmm4
3826; X86-SSE2-NEXT:    psrad $31, %xmm4
3827; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
3828; X86-SSE2-NEXT:    psrad $26, %xmm3
3829; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
3830; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
3831; X86-SSE2-NEXT:    retl
3832;
3833; X86-SSE41-LABEL: sext_8i6_to_8i64:
3834; X86-SSE41:       # %bb.0: # %entry
3835; X86-SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
3836; X86-SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
3837; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
3838; X86-SSE41-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
3839; X86-SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
3840; X86-SSE41-NEXT:    psllq $58, %xmm0
3841; X86-SSE41-NEXT:    movdqa %xmm0, %xmm1
3842; X86-SSE41-NEXT:    psrad $31, %xmm1
3843; X86-SSE41-NEXT:    psrad $26, %xmm0
3844; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
3845; X86-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
3846; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
3847; X86-SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
3848; X86-SSE41-NEXT:    psllq $58, %xmm1
3849; X86-SSE41-NEXT:    movdqa %xmm1, %xmm2
3850; X86-SSE41-NEXT:    psrad $31, %xmm2
3851; X86-SSE41-NEXT:    psrad $26, %xmm1
3852; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
3853; X86-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
3854; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
3855; X86-SSE41-NEXT:    pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
3856; X86-SSE41-NEXT:    psllq $58, %xmm2
3857; X86-SSE41-NEXT:    movdqa %xmm2, %xmm4
3858; X86-SSE41-NEXT:    psrad $31, %xmm4
3859; X86-SSE41-NEXT:    psrad $26, %xmm2
3860; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
3861; X86-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
3862; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
3863; X86-SSE41-NEXT:    pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
3864; X86-SSE41-NEXT:    psllq $58, %xmm3
3865; X86-SSE41-NEXT:    movdqa %xmm3, %xmm4
3866; X86-SSE41-NEXT:    psrad $31, %xmm4
3867; X86-SSE41-NEXT:    psrad $26, %xmm3
3868; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
3869; X86-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
3870; X86-SSE41-NEXT:    retl
3871entry:
3872  %a = trunc i32 %x to i6
3873  %b = insertelement <8 x i6> undef, i6 %a, i32 0
3874  %c = shufflevector <8 x i6> %b, <8 x i6> undef, <8 x i32> zeroinitializer
3875  %d = add <8 x i6> %c, <i6 0, i6 1, i6 2, i6 3, i6 4, i6 5, i6 6, i6 7>
3876  %e = sext <8 x i6> %d to <8 x i64>
3877  ret <8 x i64> %e
3878}
3879
; Test zext_negate_sext - zero-extend <8 x i8> to <8 x i16>, negate with an
; nsw subtract from zero, then sign-extend the result to <8 x i32>.
;
; After the zero-extension every lane lies in [0, 255], so the 16-bit negate
; yields a value in [-255, 0] and cannot wrap; sign-extending it therefore
; equals negating the input zero-extended directly to 32 bits. The expected
; output below contains both shapes: the SSE-level runs (64-bit and 32-bit
; triples) negate at 16 bits with psubw and sign-extend afterwards, while the
; AVX-level runs zero-extend straight to 32 bits (vpmovzxbd) and negate with
; vpsubd.
;
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing them by hand.
3880define <8 x i32> @zext_negate_sext(<8 x i8> %x) {
3881; SSE2-LABEL: zext_negate_sext:
3882; SSE2:       # %bb.0:
3883; SSE2-NEXT:    pxor %xmm1, %xmm1
3884; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3885; SSE2-NEXT:    psubw %xmm0, %xmm1
3886; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3887; SSE2-NEXT:    psrad $16, %xmm0
3888; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
3889; SSE2-NEXT:    psrad $16, %xmm1
3890; SSE2-NEXT:    retq
3891;
3892; SSSE3-LABEL: zext_negate_sext:
3893; SSSE3:       # %bb.0:
3894; SSSE3-NEXT:    pxor %xmm1, %xmm1
3895; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3896; SSSE3-NEXT:    psubw %xmm0, %xmm1
3897; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3898; SSSE3-NEXT:    psrad $16, %xmm0
3899; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
3900; SSSE3-NEXT:    psrad $16, %xmm1
3901; SSSE3-NEXT:    retq
3902;
3903; SSE41-LABEL: zext_negate_sext:
3904; SSE41:       # %bb.0:
3905; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
3906; SSE41-NEXT:    pxor %xmm1, %xmm1
3907; SSE41-NEXT:    psubw %xmm0, %xmm1
3908; SSE41-NEXT:    pmovsxwd %xmm1, %xmm0
3909; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
3910; SSE41-NEXT:    pmovsxwd %xmm1, %xmm1
3911; SSE41-NEXT:    retq
3912;
3913; AVX1-LABEL: zext_negate_sext:
3914; AVX1:       # %bb.0:
3915; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
3916; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
3917; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
3918; AVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
3919; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
3920; AVX1-NEXT:    vpsubd %xmm0, %xmm2, %xmm0
3921; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
3922; AVX1-NEXT:    retq
3923;
3924; AVX2-LABEL: zext_negate_sext:
3925; AVX2:       # %bb.0:
3926; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
3927; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3928; AVX2-NEXT:    vpsubd %ymm0, %ymm1, %ymm0
3929; AVX2-NEXT:    retq
3930;
3931; AVX512-LABEL: zext_negate_sext:
3932; AVX512:       # %bb.0:
3933; AVX512-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
3934; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3935; AVX512-NEXT:    vpsubd %ymm0, %ymm1, %ymm0
3936; AVX512-NEXT:    retq
3937;
3938; X86-SSE2-LABEL: zext_negate_sext:
3939; X86-SSE2:       # %bb.0:
3940; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
3941; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3942; X86-SSE2-NEXT:    psubw %xmm0, %xmm1
3943; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3944; X86-SSE2-NEXT:    psrad $16, %xmm0
3945; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
3946; X86-SSE2-NEXT:    psrad $16, %xmm1
3947; X86-SSE2-NEXT:    retl
3948;
3949; X86-SSE41-LABEL: zext_negate_sext:
3950; X86-SSE41:       # %bb.0:
3951; X86-SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
3952; X86-SSE41-NEXT:    pxor %xmm1, %xmm1
3953; X86-SSE41-NEXT:    psubw %xmm0, %xmm1
3954; X86-SSE41-NEXT:    pmovsxwd %xmm1, %xmm0
3955; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
3956; X86-SSE41-NEXT:    pmovsxwd %xmm1, %xmm1
3957; X86-SSE41-NEXT:    retl
  ; Widen each byte lane to 16 bits without sign; lanes are now in [0, 255].
3958  %z = zext <8 x i8> %x to <8 x i16>
  ; Negate every lane (0 - %z); nsw marks the subtract as not wrapping signed.
3959  %neg = sub nsw <8 x i16> zeroinitializer, %z
  ; Sign-extend the non-positive 16-bit lanes to 32 bits.
3960  %r = sext <8 x i16> %neg to <8 x i32>
3961  ret <8 x i32> %r
3962}
3963
; Test zext_decremenet_sext - zero-extend <8 x i8> to <8 x i16>, add -1 to
; every lane (a decrement; the add carries no nsw/nuw flags), then sign-extend
; the result to <8 x i32>.
;
; After the zero-extension every lane lies in [0, 255], so the decremented
; value lies in [-1, 254] and still fits in 16 bits; sign-extending it
; therefore equals decrementing the input zero-extended directly to 32 bits.
; The expected output below contains both shapes: the SSE-level runs (64-bit
; and 32-bit triples) build an all-ones vector with pcmpeqd, add it at 16 bits
; with paddw and sign-extend afterwards, while the AVX-level runs zero-extend
; straight to 32 bits (vpmovzxbd) and add the all-ones vector with vpaddd.
;
; NOTE(review): "decremenet" in the function name looks like a typo of
; "decrement"; it is left untouched here because the label checks below are
; keyed on the exact symbol name - confirm before renaming.
;
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing them by hand.
3964define <8 x i32> @zext_decremenet_sext(<8 x i8> %x) {
3965; SSE2-LABEL: zext_decremenet_sext:
3966; SSE2:       # %bb.0:
3967; SSE2-NEXT:    pxor %xmm1, %xmm1
3968; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3969; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
3970; SSE2-NEXT:    paddw %xmm0, %xmm1
3971; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3972; SSE2-NEXT:    psrad $16, %xmm0
3973; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
3974; SSE2-NEXT:    psrad $16, %xmm1
3975; SSE2-NEXT:    retq
3976;
3977; SSSE3-LABEL: zext_decremenet_sext:
3978; SSSE3:       # %bb.0:
3979; SSSE3-NEXT:    pxor %xmm1, %xmm1
3980; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3981; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
3982; SSSE3-NEXT:    paddw %xmm0, %xmm1
3983; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3984; SSSE3-NEXT:    psrad $16, %xmm0
3985; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
3986; SSSE3-NEXT:    psrad $16, %xmm1
3987; SSSE3-NEXT:    retq
3988;
3989; SSE41-LABEL: zext_decremenet_sext:
3990; SSE41:       # %bb.0:
3991; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
3992; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
3993; SSE41-NEXT:    paddw %xmm0, %xmm1
3994; SSE41-NEXT:    pmovsxwd %xmm1, %xmm0
3995; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
3996; SSE41-NEXT:    pmovsxwd %xmm1, %xmm1
3997; SSE41-NEXT:    retq
3998;
3999; AVX1-LABEL: zext_decremenet_sext:
4000; AVX1:       # %bb.0:
4001; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
4002; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
4003; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
4004; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
4005; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
4006; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
4007; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
4008; AVX1-NEXT:    retq
4009;
4010; AVX2-LABEL: zext_decremenet_sext:
4011; AVX2:       # %bb.0:
4012; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
4013; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
4014; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
4015; AVX2-NEXT:    retq
4016;
4017; AVX512-LABEL: zext_decremenet_sext:
4018; AVX512:       # %bb.0:
4019; AVX512-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
4020; AVX512-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
4021; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
4022; AVX512-NEXT:    retq
4023;
4024; X86-SSE2-LABEL: zext_decremenet_sext:
4025; X86-SSE2:       # %bb.0:
4026; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
4027; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
4028; X86-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
4029; X86-SSE2-NEXT:    paddw %xmm0, %xmm1
4030; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4031; X86-SSE2-NEXT:    psrad $16, %xmm0
4032; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
4033; X86-SSE2-NEXT:    psrad $16, %xmm1
4034; X86-SSE2-NEXT:    retl
4035;
4036; X86-SSE41-LABEL: zext_decremenet_sext:
4037; X86-SSE41:       # %bb.0:
4038; X86-SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
4039; X86-SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
4040; X86-SSE41-NEXT:    paddw %xmm0, %xmm1
4041; X86-SSE41-NEXT:    pmovsxwd %xmm1, %xmm0
4042; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
4043; X86-SSE41-NEXT:    pmovsxwd %xmm1, %xmm1
4044; X86-SSE41-NEXT:    retl
  ; Widen each byte lane to 16 bits without sign; lanes are now in [0, 255].
4045  %z = zext <8 x i8> %x to <8 x i16>
  ; Decrement every lane by adding a splat of -1 (no wrap flags on this add).
4046  %dec = add <8 x i16> %z, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
  ; Sign-extend to 32 bits; only an input lane of 0 yields a negative lane (-1).
4047  %r = sext <8 x i16> %dec to <8 x i32>
4048  ret <8 x i32> %r
4049}
4050