1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=+sse4.1 -show-mc-encoding | FileCheck %s --check-prefixes=SSE,X86-SSE
3; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=AVX,AVX1,X86-AVX1
4; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=AVX,AVX512,X86-AVX512
5; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+sse4.1 -show-mc-encoding | FileCheck %s --check-prefixes=SSE,X64-SSE
6; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=AVX,X64-AVX,AVX1,X64-AVX1
7; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=AVX,X64-AVX,AVX512,X64-AVX512
8
9@g16 = external global i16
10
11define <4 x i32> @pinsrd_1(i32 %s, <4 x i32> %tmp) nounwind {
12; X86-SSE-LABEL: pinsrd_1:
13; X86-SSE:       ## %bb.0:
14; X86-SSE-NEXT:    pinsrd $1, {{[0-9]+}}(%esp), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x22,0x44,0x24,0x04,0x01]
15; X86-SSE-NEXT:    retl ## encoding: [0xc3]
16;
17; X86-AVX1-LABEL: pinsrd_1:
18; X86-AVX1:       ## %bb.0:
19; X86-AVX1-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x04,0x01]
20; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
21;
22; X86-AVX512-LABEL: pinsrd_1:
23; X86-AVX512:       ## %bb.0:
24; X86-AVX512-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x04,0x01]
25; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
26;
27; X64-SSE-LABEL: pinsrd_1:
28; X64-SSE:       ## %bb.0:
29; X64-SSE-NEXT:    pinsrd $1, %edi, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x22,0xc7,0x01]
30; X64-SSE-NEXT:    retq ## encoding: [0xc3]
31;
32; X64-AVX1-LABEL: pinsrd_1:
33; X64-AVX1:       ## %bb.0:
34; X64-AVX1-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x01]
35; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
36;
37; X64-AVX512-LABEL: pinsrd_1:
38; X64-AVX512:       ## %bb.0:
39; X64-AVX512-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x01]
40; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
41  %tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 1
42  ret <4 x i32> %tmp1
43}
44
45define <16 x i8> @pinsrb_1(i8 %s, <16 x i8> %tmp) nounwind {
46; X86-SSE-LABEL: pinsrb_1:
47; X86-SSE:       ## %bb.0:
48; X86-SSE-NEXT:    pinsrb $1, {{[0-9]+}}(%esp), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x20,0x44,0x24,0x04,0x01]
49; X86-SSE-NEXT:    retl ## encoding: [0xc3]
50;
51; X86-AVX1-LABEL: pinsrb_1:
52; X86-AVX1:       ## %bb.0:
53; X86-AVX1-NEXT:    vpinsrb $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0x44,0x24,0x04,0x01]
54; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
55;
56; X86-AVX512-LABEL: pinsrb_1:
57; X86-AVX512:       ## %bb.0:
58; X86-AVX512-NEXT:    vpinsrb $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0x44,0x24,0x04,0x01]
59; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
60;
61; X64-SSE-LABEL: pinsrb_1:
62; X64-SSE:       ## %bb.0:
63; X64-SSE-NEXT:    pinsrb $1, %edi, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x20,0xc7,0x01]
64; X64-SSE-NEXT:    retq ## encoding: [0xc3]
65;
66; X64-AVX1-LABEL: pinsrb_1:
67; X64-AVX1:       ## %bb.0:
68; X64-AVX1-NEXT:    vpinsrb $1, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc7,0x01]
69; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
70;
71; X64-AVX512-LABEL: pinsrb_1:
72; X64-AVX512:       ## %bb.0:
73; X64-AVX512-NEXT:    vpinsrb $1, %edi, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc7,0x01]
74; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
75  %tmp1 = insertelement <16 x i8> %tmp, i8 %s, i32 1
76  ret <16 x i8> %tmp1
77}
78
79define <2 x i64> @pmovzxbq_1() nounwind {
80; X86-SSE-LABEL: pmovzxbq_1:
81; X86-SSE:       ## %bb.0: ## %entry
82; X86-SSE-NEXT:    movl L_g16$non_lazy_ptr, %eax ## encoding: [0xa1,A,A,A,A]
83; X86-SSE-NEXT:    ## fixup A - offset: 1, value: L_g16$non_lazy_ptr, kind: FK_Data_4
84; X86-SSE-NEXT:    pmovzxbq (%eax), %xmm0 ## encoding: [0x66,0x0f,0x38,0x32,0x00]
85; X86-SSE-NEXT:    ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
86; X86-SSE-NEXT:    retl ## encoding: [0xc3]
87;
88; X86-AVX1-LABEL: pmovzxbq_1:
89; X86-AVX1:       ## %bb.0: ## %entry
90; X86-AVX1-NEXT:    movl L_g16$non_lazy_ptr, %eax ## encoding: [0xa1,A,A,A,A]
91; X86-AVX1-NEXT:    ## fixup A - offset: 1, value: L_g16$non_lazy_ptr, kind: FK_Data_4
92; X86-AVX1-NEXT:    vpmovzxbq (%eax), %xmm0 ## encoding: [0xc4,0xe2,0x79,0x32,0x00]
93; X86-AVX1-NEXT:    ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
94; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
95;
96; X86-AVX512-LABEL: pmovzxbq_1:
97; X86-AVX512:       ## %bb.0: ## %entry
98; X86-AVX512-NEXT:    movl L_g16$non_lazy_ptr, %eax ## encoding: [0xa1,A,A,A,A]
99; X86-AVX512-NEXT:    ## fixup A - offset: 1, value: L_g16$non_lazy_ptr, kind: FK_Data_4
100; X86-AVX512-NEXT:    vpmovzxbq (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0x00]
101; X86-AVX512-NEXT:    ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
102; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
103;
104; X64-SSE-LABEL: pmovzxbq_1:
105; X64-SSE:       ## %bb.0: ## %entry
106; X64-SSE-NEXT:    movq _g16@GOTPCREL(%rip), %rax ## encoding: [0x48,0x8b,0x05,A,A,A,A]
107; X64-SSE-NEXT:    ## fixup A - offset: 3, value: _g16@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load
108; X64-SSE-NEXT:    pmovzxbq (%rax), %xmm0 ## encoding: [0x66,0x0f,0x38,0x32,0x00]
109; X64-SSE-NEXT:    ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
110; X64-SSE-NEXT:    retq ## encoding: [0xc3]
111;
112; X64-AVX1-LABEL: pmovzxbq_1:
113; X64-AVX1:       ## %bb.0: ## %entry
114; X64-AVX1-NEXT:    movq _g16@GOTPCREL(%rip), %rax ## encoding: [0x48,0x8b,0x05,A,A,A,A]
115; X64-AVX1-NEXT:    ## fixup A - offset: 3, value: _g16@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load
116; X64-AVX1-NEXT:    vpmovzxbq (%rax), %xmm0 ## encoding: [0xc4,0xe2,0x79,0x32,0x00]
117; X64-AVX1-NEXT:    ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
118; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
119;
120; X64-AVX512-LABEL: pmovzxbq_1:
121; X64-AVX512:       ## %bb.0: ## %entry
122; X64-AVX512-NEXT:    movq _g16@GOTPCREL(%rip), %rax ## encoding: [0x48,0x8b,0x05,A,A,A,A]
123; X64-AVX512-NEXT:    ## fixup A - offset: 3, value: _g16@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load
124; X64-AVX512-NEXT:    vpmovzxbq (%rax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0x00]
125; X64-AVX512-NEXT:    ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
126; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
127entry:
128	%0 = load i16, i16* @g16, align 2		; <i16> [#uses=1]
129	%1 = insertelement <8 x i16> undef, i16 %0, i32 0		; <<8 x i16>> [#uses=1]
130	%2 = bitcast <8 x i16> %1 to <16 x i8>		; <<16 x i8>> [#uses=1]
131	%3 = tail call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %2) nounwind readnone		; <<2 x i64>> [#uses=1]
132	ret <2 x i64> %3
133}
134
135declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
136
137define i32 @extractps_1(<4 x float> %v) nounwind {
138; SSE-LABEL: extractps_1:
139; SSE:       ## %bb.0:
140; SSE-NEXT:    extractps $3, %xmm0, %eax ## encoding: [0x66,0x0f,0x3a,0x17,0xc0,0x03]
141; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
142;
143; AVX1-LABEL: extractps_1:
144; AVX1:       ## %bb.0:
145; AVX1-NEXT:    vextractps $3, %xmm0, %eax ## encoding: [0xc4,0xe3,0x79,0x17,0xc0,0x03]
146; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
147;
148; AVX512-LABEL: extractps_1:
149; AVX512:       ## %bb.0:
150; AVX512-NEXT:    vextractps $3, %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x17,0xc0,0x03]
151; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
152  %s = extractelement <4 x float> %v, i32 3
153  %i = bitcast float %s to i32
154  ret i32 %i
155}
156define i32 @extractps_2(<4 x float> %v) nounwind {
157; SSE-LABEL: extractps_2:
158; SSE:       ## %bb.0:
159; SSE-NEXT:    extractps $3, %xmm0, %eax ## encoding: [0x66,0x0f,0x3a,0x17,0xc0,0x03]
160; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
161;
162; AVX1-LABEL: extractps_2:
163; AVX1:       ## %bb.0:
164; AVX1-NEXT:    vextractps $3, %xmm0, %eax ## encoding: [0xc4,0xe3,0x79,0x17,0xc0,0x03]
165; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
166;
167; AVX512-LABEL: extractps_2:
168; AVX512:       ## %bb.0:
169; AVX512-NEXT:    vextractps $3, %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x17,0xc0,0x03]
170; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
171  %t = bitcast <4 x float> %v to <4 x i32>
172  %s = extractelement <4 x i32> %t, i32 3
173  ret i32 %s
174}
175
176
177; The non-store form of extractps puts its result into a GPR.
178; This makes it suitable for an extract from a <4 x float> that
179; is bitcasted to i32, but unsuitable for much of anything else.
180
181define float @ext_1(<4 x float> %v) nounwind {
182; X86-SSE-LABEL: ext_1:
183; X86-SSE:       ## %bb.0:
184; X86-SSE-NEXT:    pushl %eax ## encoding: [0x50]
185; X86-SSE-NEXT:    shufps $255, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xff]
186; X86-SSE-NEXT:    ## xmm0 = xmm0[3,3,3,3]
187; X86-SSE-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ## encoding: [0xf3,0x0f,0x58,0x05,A,A,A,A]
188; X86-SSE-NEXT:    ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
189; X86-SSE-NEXT:    movss %xmm0, (%esp) ## encoding: [0xf3,0x0f,0x11,0x04,0x24]
190; X86-SSE-NEXT:    flds (%esp) ## encoding: [0xd9,0x04,0x24]
191; X86-SSE-NEXT:    popl %eax ## encoding: [0x58]
192; X86-SSE-NEXT:    retl ## encoding: [0xc3]
193;
194; X86-AVX1-LABEL: ext_1:
195; X86-AVX1:       ## %bb.0:
196; X86-AVX1-NEXT:    pushl %eax ## encoding: [0x50]
197; X86-AVX1-NEXT:    vpermilps $255, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff]
198; X86-AVX1-NEXT:    ## xmm0 = xmm0[3,3,3,3]
199; X86-AVX1-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x58,0x05,A,A,A,A]
200; X86-AVX1-NEXT:    ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
201; X86-AVX1-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
202; X86-AVX1-NEXT:    flds (%esp) ## encoding: [0xd9,0x04,0x24]
203; X86-AVX1-NEXT:    popl %eax ## encoding: [0x58]
204; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
205;
206; X86-AVX512-LABEL: ext_1:
207; X86-AVX512:       ## %bb.0:
208; X86-AVX512-NEXT:    pushl %eax ## encoding: [0x50]
209; X86-AVX512-NEXT:    vpermilps $255, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff]
210; X86-AVX512-NEXT:    ## xmm0 = xmm0[3,3,3,3]
211; X86-AVX512-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x58,0x05,A,A,A,A]
212; X86-AVX512-NEXT:    ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
213; X86-AVX512-NEXT:    vmovss %xmm0, (%esp) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x04,0x24]
214; X86-AVX512-NEXT:    flds (%esp) ## encoding: [0xd9,0x04,0x24]
215; X86-AVX512-NEXT:    popl %eax ## encoding: [0x58]
216; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
217;
218; X64-SSE-LABEL: ext_1:
219; X64-SSE:       ## %bb.0:
220; X64-SSE-NEXT:    shufps $255, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xff]
221; X64-SSE-NEXT:    ## xmm0 = xmm0[3,3,3,3]
222; X64-SSE-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ## encoding: [0xf3,0x0f,0x58,0x05,A,A,A,A]
223; X64-SSE-NEXT:    ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
224; X64-SSE-NEXT:    retq ## encoding: [0xc3]
225;
226; X64-AVX1-LABEL: ext_1:
227; X64-AVX1:       ## %bb.0:
228; X64-AVX1-NEXT:    vpermilps $255, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff]
229; X64-AVX1-NEXT:    ## xmm0 = xmm0[3,3,3,3]
230; X64-AVX1-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x58,0x05,A,A,A,A]
231; X64-AVX1-NEXT:    ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
232; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
233;
234; X64-AVX512-LABEL: ext_1:
235; X64-AVX512:       ## %bb.0:
236; X64-AVX512-NEXT:    vpermilps $255, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff]
237; X64-AVX512-NEXT:    ## xmm0 = xmm0[3,3,3,3]
238; X64-AVX512-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x58,0x05,A,A,A,A]
239; X64-AVX512-NEXT:    ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
240; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
241  %s = extractelement <4 x float> %v, i32 3
242  %t = fadd float %s, 1.0
243  ret float %t
244}
245
246define float @ext_2(<4 x float> %v) nounwind {
247; X86-SSE-LABEL: ext_2:
248; X86-SSE:       ## %bb.0:
249; X86-SSE-NEXT:    pushl %eax ## encoding: [0x50]
250; X86-SSE-NEXT:    shufps $255, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xff]
251; X86-SSE-NEXT:    ## xmm0 = xmm0[3,3,3,3]
252; X86-SSE-NEXT:    movss %xmm0, (%esp) ## encoding: [0xf3,0x0f,0x11,0x04,0x24]
253; X86-SSE-NEXT:    flds (%esp) ## encoding: [0xd9,0x04,0x24]
254; X86-SSE-NEXT:    popl %eax ## encoding: [0x58]
255; X86-SSE-NEXT:    retl ## encoding: [0xc3]
256;
257; X86-AVX1-LABEL: ext_2:
258; X86-AVX1:       ## %bb.0:
259; X86-AVX1-NEXT:    pushl %eax ## encoding: [0x50]
260; X86-AVX1-NEXT:    vpermilps $255, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff]
261; X86-AVX1-NEXT:    ## xmm0 = xmm0[3,3,3,3]
262; X86-AVX1-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
263; X86-AVX1-NEXT:    flds (%esp) ## encoding: [0xd9,0x04,0x24]
264; X86-AVX1-NEXT:    popl %eax ## encoding: [0x58]
265; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
266;
267; X86-AVX512-LABEL: ext_2:
268; X86-AVX512:       ## %bb.0:
269; X86-AVX512-NEXT:    pushl %eax ## encoding: [0x50]
270; X86-AVX512-NEXT:    vpermilps $255, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff]
271; X86-AVX512-NEXT:    ## xmm0 = xmm0[3,3,3,3]
272; X86-AVX512-NEXT:    vmovss %xmm0, (%esp) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x04,0x24]
273; X86-AVX512-NEXT:    flds (%esp) ## encoding: [0xd9,0x04,0x24]
274; X86-AVX512-NEXT:    popl %eax ## encoding: [0x58]
275; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
276;
277; X64-SSE-LABEL: ext_2:
278; X64-SSE:       ## %bb.0:
279; X64-SSE-NEXT:    shufps $255, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xff]
280; X64-SSE-NEXT:    ## xmm0 = xmm0[3,3,3,3]
281; X64-SSE-NEXT:    retq ## encoding: [0xc3]
282;
283; X64-AVX1-LABEL: ext_2:
284; X64-AVX1:       ## %bb.0:
285; X64-AVX1-NEXT:    vpermilps $255, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff]
286; X64-AVX1-NEXT:    ## xmm0 = xmm0[3,3,3,3]
287; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
288;
289; X64-AVX512-LABEL: ext_2:
290; X64-AVX512:       ## %bb.0:
291; X64-AVX512-NEXT:    vpermilps $255, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff]
292; X64-AVX512-NEXT:    ## xmm0 = xmm0[3,3,3,3]
293; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
294  %s = extractelement <4 x float> %v, i32 3
295  ret float %s
296}
297
298define i32 @ext_3(<4 x i32> %v) nounwind {
299; SSE-LABEL: ext_3:
300; SSE:       ## %bb.0:
301; SSE-NEXT:    extractps $3, %xmm0, %eax ## encoding: [0x66,0x0f,0x3a,0x17,0xc0,0x03]
302; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
303;
304; AVX1-LABEL: ext_3:
305; AVX1:       ## %bb.0:
306; AVX1-NEXT:    vextractps $3, %xmm0, %eax ## encoding: [0xc4,0xe3,0x79,0x17,0xc0,0x03]
307; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
308;
309; AVX512-LABEL: ext_3:
310; AVX512:       ## %bb.0:
311; AVX512-NEXT:    vextractps $3, %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x17,0xc0,0x03]
312; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
313  %i = extractelement <4 x i32> %v, i32 3
314  ret i32 %i
315}
316
317define <4 x float> @insertps_1(<4 x float> %t1, <4 x float> %t2) nounwind {
318; SSE-LABEL: insertps_1:
319; SSE:       ## %bb.0:
320; SSE-NEXT:    insertps $21, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x15]
321; SSE-NEXT:    ## xmm0 = zero,xmm1[0],zero,xmm0[3]
322; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
323;
324; AVX1-LABEL: insertps_1:
325; AVX1:       ## %bb.0:
326; AVX1-NEXT:    vinsertps $21, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x15]
327; AVX1-NEXT:    ## xmm0 = zero,xmm1[0],zero,xmm0[3]
328; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
329;
330; AVX512-LABEL: insertps_1:
331; AVX512:       ## %bb.0:
332; AVX512-NEXT:    vinsertps $21, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x15]
333; AVX512-NEXT:    ## xmm0 = zero,xmm1[0],zero,xmm0[3]
334; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
335  %tmp1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %t1, <4 x float> %t2, i32 21) nounwind readnone
336  ret <4 x float> %tmp1
337}
338
339declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
340
341; When optimizing for speed, prefer blendps over insertps even if it means we have to
342; generate a separate movss to load the scalar operand.
343define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind {
344; X86-SSE-LABEL: blendps_not_insertps_1:
345; X86-SSE:       ## %bb.0:
346; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x04]
347; X86-SSE-NEXT:    ## xmm1 = mem[0],zero,zero,zero
348; X86-SSE-NEXT:    blendps $1, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x01]
349; X86-SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
350; X86-SSE-NEXT:    retl ## encoding: [0xc3]
351;
352; X86-AVX1-LABEL: blendps_not_insertps_1:
353; X86-AVX1:       ## %bb.0:
354; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
355; X86-AVX1-NEXT:    ## xmm1 = mem[0],zero,zero,zero
356; X86-AVX1-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
357; X86-AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
358; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
359;
360; X86-AVX512-LABEL: blendps_not_insertps_1:
361; X86-AVX512:       ## %bb.0:
362; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
363; X86-AVX512-NEXT:    ## xmm1 = mem[0],zero,zero,zero
364; X86-AVX512-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
365; X86-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
366; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
367;
368; X64-SSE-LABEL: blendps_not_insertps_1:
369; X64-SSE:       ## %bb.0:
370; X64-SSE-NEXT:    blendps $1, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x01]
371; X64-SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
372; X64-SSE-NEXT:    retq ## encoding: [0xc3]
373;
374; X64-AVX-LABEL: blendps_not_insertps_1:
375; X64-AVX:       ## %bb.0:
376; X64-AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
377; X64-AVX-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
378; X64-AVX-NEXT:    retq ## encoding: [0xc3]
379  %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
380  ret <4 x float> %tmp1
381}
382
383; When optimizing for size, generate an insertps if there's a load fold opportunity.
384; The difference between i386 and x86-64 ABIs for the float operand means we should
385; generate an insertps for X86 but not for X64!
386define <4 x float> @insertps_or_blendps(<4 x float> %t1, float %t2) minsize nounwind {
387; X86-SSE-LABEL: insertps_or_blendps:
388; X86-SSE:       ## %bb.0:
389; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x04]
390; X86-SSE-NEXT:    ## xmm1 = mem[0],zero,zero,zero
391; X86-SSE-NEXT:    movss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x10,0xc1]
392; X86-SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
393; X86-SSE-NEXT:    retl ## encoding: [0xc3]
394;
395; X86-AVX1-LABEL: insertps_or_blendps:
396; X86-AVX1:       ## %bb.0:
397; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
398; X86-AVX1-NEXT:    ## xmm1 = mem[0],zero,zero,zero
399; X86-AVX1-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
400; X86-AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
401; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
402;
403; X86-AVX512-LABEL: insertps_or_blendps:
404; X86-AVX512:       ## %bb.0:
405; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
406; X86-AVX512-NEXT:    ## xmm1 = mem[0],zero,zero,zero
407; X86-AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1]
408; X86-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
409; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
410;
411; X64-SSE-LABEL: insertps_or_blendps:
412; X64-SSE:       ## %bb.0:
413; X64-SSE-NEXT:    movss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x10,0xc1]
414; X64-SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
415; X64-SSE-NEXT:    retq ## encoding: [0xc3]
416;
417; X64-AVX1-LABEL: insertps_or_blendps:
418; X64-AVX1:       ## %bb.0:
419; X64-AVX1-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
420; X64-AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
421; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
422;
423; X64-AVX512-LABEL: insertps_or_blendps:
424; X64-AVX512:       ## %bb.0:
425; X64-AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1]
426; X64-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
427; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
428  %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
429  ret <4 x float> %tmp1
430}
431
432; An insert into the low 32-bits of a vector from the low 32-bits of another vector
433; is always just a blendps because blendps is never more expensive than insertps.
434define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nounwind {
435; SSE-LABEL: blendps_not_insertps_2:
436; SSE:       ## %bb.0:
437; SSE-NEXT:    blendps $1, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x01]
438; SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
439; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
440;
441; AVX-LABEL: blendps_not_insertps_2:
442; AVX:       ## %bb.0:
443; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
444; AVX-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
445; AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
446  %tmp2 = extractelement <4 x float> %t2, i32 0
447  %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0
448  ret <4 x float> %tmp1
449}
450
451define i32 @ptestz_1(<2 x i64> %t1, <2 x i64> %t2) nounwind {
452; SSE-LABEL: ptestz_1:
453; SSE:       ## %bb.0:
454; SSE-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
455; SSE-NEXT:    ptest %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x17,0xc1]
456; SSE-NEXT:    sete %al ## encoding: [0x0f,0x94,0xc0]
457; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
458;
459; AVX-LABEL: ptestz_1:
460; AVX:       ## %bb.0:
461; AVX-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
462; AVX-NEXT:    vptest %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x17,0xc1]
463; AVX-NEXT:    sete %al ## encoding: [0x0f,0x94,0xc0]
464; AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
465  %tmp1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
466  ret i32 %tmp1
467}
468
469define i32 @ptestz_2(<2 x i64> %t1, <2 x i64> %t2) nounwind {
470; SSE-LABEL: ptestz_2:
471; SSE:       ## %bb.0:
472; SSE-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
473; SSE-NEXT:    ptest %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x17,0xc1]
474; SSE-NEXT:    setb %al ## encoding: [0x0f,0x92,0xc0]
475; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
476;
477; AVX-LABEL: ptestz_2:
478; AVX:       ## %bb.0:
479; AVX-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
480; AVX-NEXT:    vptest %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x17,0xc1]
481; AVX-NEXT:    setb %al ## encoding: [0x0f,0x92,0xc0]
482; AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
483  %tmp1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
484  ret i32 %tmp1
485}
486
487define i32 @ptestz_3(<2 x i64> %t1, <2 x i64> %t2) nounwind {
488; SSE-LABEL: ptestz_3:
489; SSE:       ## %bb.0:
490; SSE-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
491; SSE-NEXT:    ptest %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x17,0xc1]
492; SSE-NEXT:    seta %al ## encoding: [0x0f,0x97,0xc0]
493; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
494;
495; AVX-LABEL: ptestz_3:
496; AVX:       ## %bb.0:
497; AVX-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
498; AVX-NEXT:    vptest %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x17,0xc1]
499; AVX-NEXT:    seta %al ## encoding: [0x0f,0x97,0xc0]
500; AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
501  %tmp1 = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
502  ret i32 %tmp1
503}
504
505declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
506declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
507declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone
508
509; This used to compile to insertps $0  + insertps $16.  insertps $0 is always
510; pointless.
511define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind  {
512; SSE-LABEL: buildvector:
513; SSE:       ## %bb.0: ## %entry
514; SSE-NEXT:    movshdup %xmm0, %xmm2 ## encoding: [0xf3,0x0f,0x16,0xd0]
515; SSE-NEXT:    ## xmm2 = xmm0[1,1,3,3]
516; SSE-NEXT:    movshdup %xmm1, %xmm3 ## encoding: [0xf3,0x0f,0x16,0xd9]
517; SSE-NEXT:    ## xmm3 = xmm1[1,1,3,3]
518; SSE-NEXT:    addss %xmm2, %xmm3 ## encoding: [0xf3,0x0f,0x58,0xda]
519; SSE-NEXT:    addss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x58,0xc1]
520; SSE-NEXT:    insertps $16, %xmm3, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc3,0x10]
521; SSE-NEXT:    ## xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
522; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
523;
524; AVX1-LABEL: buildvector:
525; AVX1:       ## %bb.0: ## %entry
526; AVX1-NEXT:    vmovshdup %xmm0, %xmm2 ## encoding: [0xc5,0xfa,0x16,0xd0]
527; AVX1-NEXT:    ## xmm2 = xmm0[1,1,3,3]
528; AVX1-NEXT:    vmovshdup %xmm1, %xmm3 ## encoding: [0xc5,0xfa,0x16,0xd9]
529; AVX1-NEXT:    ## xmm3 = xmm1[1,1,3,3]
530; AVX1-NEXT:    vaddss %xmm3, %xmm2, %xmm2 ## encoding: [0xc5,0xea,0x58,0xd3]
531; AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x58,0xc1]
532; AVX1-NEXT:    vinsertps $16, %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc2,0x10]
533; AVX1-NEXT:    ## xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
534; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
535;
536; AVX512-LABEL: buildvector:
537; AVX512:       ## %bb.0: ## %entry
538; AVX512-NEXT:    vmovshdup %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x16,0xd0]
539; AVX512-NEXT:    ## xmm2 = xmm0[1,1,3,3]
540; AVX512-NEXT:    vmovshdup %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x16,0xd9]
541; AVX512-NEXT:    ## xmm3 = xmm1[1,1,3,3]
542; AVX512-NEXT:    vaddss %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xea,0x58,0xd3]
543; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x58,0xc1]
544; AVX512-NEXT:    vinsertps $16, %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc2,0x10]
545; AVX512-NEXT:    ## xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
546; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
547entry:
548  %tmp7 = extractelement <2 x float> %A, i32 0
549  %tmp5 = extractelement <2 x float> %A, i32 1
550  %tmp3 = extractelement <2 x float> %B, i32 0
551  %tmp1 = extractelement <2 x float> %B, i32 1
552  %add.r = fadd float %tmp7, %tmp3
553  %add.i = fadd float %tmp5, %tmp1
554  %tmp11 = insertelement <2 x float> undef, float %add.r, i32 0
555  %tmp9 = insertelement <2 x float> %tmp11, float %add.i, i32 1
556  ret <2 x float> %tmp9
557}
558
559define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
560; X86-SSE-LABEL: insertps_from_shufflevector_1:
561; X86-SSE:       ## %bb.0: ## %entry
562; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
563; X86-SSE-NEXT:    movaps (%eax), %xmm1 ## encoding: [0x0f,0x28,0x08]
564; X86-SSE-NEXT:    insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
565; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
566; X86-SSE-NEXT:    retl ## encoding: [0xc3]
567;
568; X86-AVX1-LABEL: insertps_from_shufflevector_1:
569; X86-AVX1:       ## %bb.0: ## %entry
570; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
571; X86-AVX1-NEXT:    vmovaps (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x08]
572; X86-AVX1-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
573; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
574; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
575;
576; X86-AVX512-LABEL: insertps_from_shufflevector_1:
577; X86-AVX512:       ## %bb.0: ## %entry
578; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
579; X86-AVX512-NEXT:    vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08]
580; X86-AVX512-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
581; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
582; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
583;
584; X64-SSE-LABEL: insertps_from_shufflevector_1:
585; X64-SSE:       ## %bb.0: ## %entry
586; X64-SSE-NEXT:    movaps (%rdi), %xmm1 ## encoding: [0x0f,0x28,0x0f]
587; X64-SSE-NEXT:    insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
588; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
589; X64-SSE-NEXT:    retq ## encoding: [0xc3]
590;
591; X64-AVX1-LABEL: insertps_from_shufflevector_1:
592; X64-AVX1:       ## %bb.0: ## %entry
593; X64-AVX1-NEXT:    vmovaps (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0f]
594; X64-AVX1-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
595; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
596; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
597;
598; X64-AVX512-LABEL: insertps_from_shufflevector_1:
599; X64-AVX512:       ## %bb.0: ## %entry
600; X64-AVX512-NEXT:    vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f]
601; X64-AVX512-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
602; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
603; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
604entry:
605  %0 = load <4 x float>, <4 x float>* %pb, align 16
606  %vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
607  ret <4 x float> %vecinit6
608}
609
610define <4 x float> @insertps_from_shufflevector_2(<4 x float> %a, <4 x float> %b) {
611; SSE-LABEL: insertps_from_shufflevector_2:
612; SSE:       ## %bb.0: ## %entry
613; SSE-NEXT:    insertps $96, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x60]
614; SSE-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
615; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
616;
617; AVX1-LABEL: insertps_from_shufflevector_2:
618; AVX1:       ## %bb.0: ## %entry
619; AVX1-NEXT:    vinsertps $96, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60]
620; AVX1-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
621; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
622;
623; AVX512-LABEL: insertps_from_shufflevector_2:
624; AVX512:       ## %bb.0: ## %entry
625; AVX512-NEXT:    vinsertps $96, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60]
626; AVX512-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
627; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
628entry:
629  %vecinit6 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
630  ret <4 x float> %vecinit6
631}
632
633; For loading an i32 from memory into an xmm register we use pinsrd
634; instead of insertps
635define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocapture readonly %pb) {
636; X86-SSE-LABEL: pinsrd_from_shufflevector_i32:
637; X86-SSE:       ## %bb.0: ## %entry
638; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
639; X86-SSE-NEXT:    pshufd $0, (%eax), %xmm1 ## encoding: [0x66,0x0f,0x70,0x08,0x00]
640; X86-SSE-NEXT:    ## xmm1 = mem[0,0,0,0]
641; X86-SSE-NEXT:    pblendw $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0xc0]
642; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
643; X86-SSE-NEXT:    retl ## encoding: [0xc3]
644;
645; X86-AVX1-LABEL: pinsrd_from_shufflevector_i32:
646; X86-AVX1:       ## %bb.0: ## %entry
647; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
648; X86-AVX1-NEXT:    vpermilps $0, (%eax), %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0x08,0x00]
649; X86-AVX1-NEXT:    ## xmm1 = mem[0,0,0,0]
650; X86-AVX1-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
651; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
652; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
653;
654; X86-AVX512-LABEL: pinsrd_from_shufflevector_i32:
655; X86-AVX512:       ## %bb.0: ## %entry
656; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
657; X86-AVX512-NEXT:    vbroadcastss (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x08]
658; X86-AVX512-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
659; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
660; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
661;
662; X64-SSE-LABEL: pinsrd_from_shufflevector_i32:
663; X64-SSE:       ## %bb.0: ## %entry
664; X64-SSE-NEXT:    pshufd $0, (%rdi), %xmm1 ## encoding: [0x66,0x0f,0x70,0x0f,0x00]
665; X64-SSE-NEXT:    ## xmm1 = mem[0,0,0,0]
666; X64-SSE-NEXT:    pblendw $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0xc0]
667; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
668; X64-SSE-NEXT:    retq ## encoding: [0xc3]
669;
670; X64-AVX1-LABEL: pinsrd_from_shufflevector_i32:
671; X64-AVX1:       ## %bb.0: ## %entry
672; X64-AVX1-NEXT:    vpermilps $0, (%rdi), %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0x0f,0x00]
673; X64-AVX1-NEXT:    ## xmm1 = mem[0,0,0,0]
674; X64-AVX1-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
675; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
676; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
677;
678; X64-AVX512-LABEL: pinsrd_from_shufflevector_i32:
679; X64-AVX512:       ## %bb.0: ## %entry
680; X64-AVX512-NEXT:    vbroadcastss (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x0f]
681; X64-AVX512-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
682; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
683; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
684entry:
685  %0 = load <4 x i32>, <4 x i32>* %pb, align 16
686  %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
687  ret <4 x i32> %vecinit6
688}
689
690define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) {
691; SSE-LABEL: insertps_from_shufflevector_i32_2:
692; SSE:       ## %bb.0: ## %entry
693; SSE-NEXT:    pshufd $238, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0xee]
694; SSE-NEXT:    ## xmm1 = xmm1[2,3,2,3]
695; SSE-NEXT:    pblendw $12, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0x0c]
696; SSE-NEXT:    ## xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
697; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
698;
699; AVX1-LABEL: insertps_from_shufflevector_i32_2:
700; AVX1:       ## %bb.0: ## %entry
701; AVX1-NEXT:    vpermilps $238, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee]
702; AVX1-NEXT:    ## xmm1 = xmm1[2,3,2,3]
703; AVX1-NEXT:    vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
704; AVX1-NEXT:    ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
705; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
706;
707; AVX512-LABEL: insertps_from_shufflevector_i32_2:
708; AVX512:       ## %bb.0: ## %entry
709; AVX512-NEXT:    vpermilps $238, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee]
710; AVX512-NEXT:    ## xmm1 = xmm1[2,3,2,3]
711; AVX512-NEXT:    vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
712; AVX512-NEXT:    ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
713; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
714entry:
715  %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
716  ret <4 x i32> %vecinit6
717}
718
719define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, float* %b) {
720; X86-SSE-LABEL: insertps_from_load_ins_elt_undef:
721; X86-SSE:       ## %bb.0:
722; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
723; X86-SSE-NEXT:    insertps $16, (%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x00,0x10]
724; X86-SSE-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
725; X86-SSE-NEXT:    retl ## encoding: [0xc3]
726;
727; X86-AVX1-LABEL: insertps_from_load_ins_elt_undef:
728; X86-AVX1:       ## %bb.0:
729; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
730; X86-AVX1-NEXT:    vinsertps $16, (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x00,0x10]
731; X86-AVX1-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
732; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
733;
734; X86-AVX512-LABEL: insertps_from_load_ins_elt_undef:
735; X86-AVX512:       ## %bb.0:
736; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
737; X86-AVX512-NEXT:    vinsertps $16, (%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x00,0x10]
738; X86-AVX512-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
739; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
740;
741; X64-SSE-LABEL: insertps_from_load_ins_elt_undef:
742; X64-SSE:       ## %bb.0:
743; X64-SSE-NEXT:    insertps $16, (%rdi), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x07,0x10]
744; X64-SSE-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
745; X64-SSE-NEXT:    retq ## encoding: [0xc3]
746;
747; X64-AVX1-LABEL: insertps_from_load_ins_elt_undef:
748; X64-AVX1:       ## %bb.0:
749; X64-AVX1-NEXT:    vinsertps $16, (%rdi), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x07,0x10]
750; X64-AVX1-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
751; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
752;
753; X64-AVX512-LABEL: insertps_from_load_ins_elt_undef:
754; X64-AVX512:       ## %bb.0:
755; X64-AVX512-NEXT:    vinsertps $16, (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x07,0x10]
756; X64-AVX512-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
757; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
758  %1 = load float, float* %b, align 4
759  %2 = insertelement <4 x float> undef, float %1, i32 0
760  %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
761  ret <4 x float> %result
762}
763
764; TODO: Like on pinsrd_from_shufflevector_i32, remove this mov instr
765define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) {
766; X86-SSE-LABEL: insertps_from_load_ins_elt_undef_i32:
767; X86-SSE:       ## %bb.0:
768; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
769; X86-SSE-NEXT:    pinsrd $2, (%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x22,0x00,0x02]
770; X86-SSE-NEXT:    retl ## encoding: [0xc3]
771;
772; X86-AVX1-LABEL: insertps_from_load_ins_elt_undef_i32:
773; X86-AVX1:       ## %bb.0:
774; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
775; X86-AVX1-NEXT:    vpinsrd $2, (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0x00,0x02]
776; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
777;
778; X86-AVX512-LABEL: insertps_from_load_ins_elt_undef_i32:
779; X86-AVX512:       ## %bb.0:
780; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
781; X86-AVX512-NEXT:    vpinsrd $2, (%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0x00,0x02]
782; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
783;
784; X64-SSE-LABEL: insertps_from_load_ins_elt_undef_i32:
785; X64-SSE:       ## %bb.0:
786; X64-SSE-NEXT:    pinsrd $2, (%rdi), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x22,0x07,0x02]
787; X64-SSE-NEXT:    retq ## encoding: [0xc3]
788;
789; X64-AVX1-LABEL: insertps_from_load_ins_elt_undef_i32:
790; X64-AVX1:       ## %bb.0:
791; X64-AVX1-NEXT:    vpinsrd $2, (%rdi), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0x07,0x02]
792; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
793;
794; X64-AVX512-LABEL: insertps_from_load_ins_elt_undef_i32:
795; X64-AVX512:       ## %bb.0:
796; X64-AVX512-NEXT:    vpinsrd $2, (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0x07,0x02]
797; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
798  %1 = load i32, i32* %b, align 4
799  %2 = insertelement <4 x i32> undef, i32 %1, i32 0
800  %result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
801  ret <4 x i32> %result
802}
803
804;;;;;; Shuffles optimizable with a single insertps or blend instruction
805define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) {
806; SSE-LABEL: shuf_XYZ0:
807; SSE:       ## %bb.0:
808; SSE-NEXT:    xorps %xmm1, %xmm1 ## encoding: [0x0f,0x57,0xc9]
809; SSE-NEXT:    blendps $8, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x08]
810; SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
811; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
812;
813; AVX1-LABEL: shuf_XYZ0:
814; AVX1:       ## %bb.0:
815; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
816; AVX1-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
817; AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
818; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
819;
820; AVX512-LABEL: shuf_XYZ0:
821; AVX512:       ## %bb.0:
822; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
823; AVX512-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
824; AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
825; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
826  %vecext = extractelement <4 x float> %x, i32 0
827  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
828  %vecext1 = extractelement <4 x float> %x, i32 1
829  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
830  %vecext3 = extractelement <4 x float> %x, i32 2
831  %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
832  %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
833  ret <4 x float> %vecinit5
834}
835
836define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) {
837; SSE-LABEL: shuf_XY00:
838; SSE:       ## %bb.0:
839; SSE-NEXT:    movq %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x7e,0xc0]
840; SSE-NEXT:    ## xmm0 = xmm0[0],zero
841; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
842;
843; AVX1-LABEL: shuf_XY00:
844; AVX1:       ## %bb.0:
845; AVX1-NEXT:    vmovq %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x7e,0xc0]
846; AVX1-NEXT:    ## xmm0 = xmm0[0],zero
847; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
848;
849; AVX512-LABEL: shuf_XY00:
850; AVX512:       ## %bb.0:
851; AVX512-NEXT:    vmovq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc0]
852; AVX512-NEXT:    ## xmm0 = xmm0[0],zero
853; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
854  %vecext = extractelement <4 x float> %x, i32 0
855  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
856  %vecext1 = extractelement <4 x float> %x, i32 1
857  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
858  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
859  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
860  ret <4 x float> %vecinit4
861}
862
863define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) {
864; SSE-LABEL: shuf_XYY0:
865; SSE:       ## %bb.0:
866; SSE-NEXT:    insertps $104, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc0,0x68]
867; SSE-NEXT:    ## xmm0 = xmm0[0,1,1],zero
868; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
869;
870; AVX1-LABEL: shuf_XYY0:
871; AVX1:       ## %bb.0:
872; AVX1-NEXT:    vinsertps $104, %xmm0, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc0,0x68]
873; AVX1-NEXT:    ## xmm0 = xmm0[0,1,1],zero
874; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
875;
876; AVX512-LABEL: shuf_XYY0:
877; AVX512:       ## %bb.0:
878; AVX512-NEXT:    vinsertps $104, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc0,0x68]
879; AVX512-NEXT:    ## xmm0 = xmm0[0,1,1],zero
880; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
881  %vecext = extractelement <4 x float> %x, i32 0
882  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
883  %vecext1 = extractelement <4 x float> %x, i32 1
884  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
885  %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext1, i32 2
886  %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
887  ret <4 x float> %vecinit5
888}
889
890define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) {
891; SSE-LABEL: shuf_XYW0:
892; SSE:       ## %bb.0:
893; SSE-NEXT:    insertps $232, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc0,0xe8]
894; SSE-NEXT:    ## xmm0 = xmm0[0,1,3],zero
895; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
896;
897; AVX1-LABEL: shuf_XYW0:
898; AVX1:       ## %bb.0:
899; AVX1-NEXT:    vinsertps $232, %xmm0, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc0,0xe8]
900; AVX1-NEXT:    ## xmm0 = xmm0[0,1,3],zero
901; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
902;
903; AVX512-LABEL: shuf_XYW0:
904; AVX512:       ## %bb.0:
905; AVX512-NEXT:    vinsertps $232, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc0,0xe8]
906; AVX512-NEXT:    ## xmm0 = xmm0[0,1,3],zero
907; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
908  %vecext = extractelement <4 x float> %x, i32 0
909  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
910  %vecext1 = extractelement <4 x float> %x, i32 1
911  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
912  %vecext2 = extractelement <4 x float> %x, i32 3
913  %vecinit3 = insertelement <4 x float> %vecinit2, float %vecext2, i32 2
914  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
915  ret <4 x float> %vecinit4
916}
917
918define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) {
919; SSE-LABEL: shuf_W00W:
920; SSE:       ## %bb.0:
921; SSE-NEXT:    insertps $198, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc0,0xc6]
922; SSE-NEXT:    ## xmm0 = xmm0[3],zero,zero,xmm0[3]
923; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
924;
925; AVX1-LABEL: shuf_W00W:
926; AVX1:       ## %bb.0:
927; AVX1-NEXT:    vinsertps $198, %xmm0, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc0,0xc6]
928; AVX1-NEXT:    ## xmm0 = xmm0[3],zero,zero,xmm0[3]
929; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
930;
931; AVX512-LABEL: shuf_W00W:
932; AVX512:       ## %bb.0:
933; AVX512-NEXT:    vinsertps $198, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc0,0xc6]
934; AVX512-NEXT:    ## xmm0 = xmm0[3],zero,zero,xmm0[3]
935; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
936  %vecext = extractelement <4 x float> %x, i32 3
937  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
938  %vecinit2 = insertelement <4 x float> %vecinit, float 0.0, i32 1
939  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
940  %vecinit4 = insertelement <4 x float> %vecinit3, float %vecext, i32 3
941  ret <4 x float> %vecinit4
942}
943
944define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
945; SSE-LABEL: shuf_X00A:
946; SSE:       ## %bb.0:
947; SSE-NEXT:    insertps $54, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x36]
948; SSE-NEXT:    ## xmm0 = xmm0[0],zero,zero,xmm1[0]
949; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
950;
951; AVX1-LABEL: shuf_X00A:
952; AVX1:       ## %bb.0:
953; AVX1-NEXT:    vinsertps $54, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x36]
954; AVX1-NEXT:    ## xmm0 = xmm0[0],zero,zero,xmm1[0]
955; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
956;
957; AVX512-LABEL: shuf_X00A:
958; AVX512:       ## %bb.0:
959; AVX512-NEXT:    vinsertps $54, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x36]
960; AVX512-NEXT:    ## xmm0 = xmm0[0],zero,zero,xmm1[0]
961; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
962  %vecext = extractelement <4 x float> %x, i32 0
963  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
964  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
965  %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
966  %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
967  ret <4 x float> %vecinit4
968}
969
970define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) {
971; SSE-LABEL: shuf_X00X:
972; SSE:       ## %bb.0:
973; SSE-NEXT:    insertps $54, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc0,0x36]
974; SSE-NEXT:    ## xmm0 = xmm0[0],zero,zero,xmm0[0]
975; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
976;
977; AVX1-LABEL: shuf_X00X:
978; AVX1:       ## %bb.0:
979; AVX1-NEXT:    vinsertps $54, %xmm0, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc0,0x36]
980; AVX1-NEXT:    ## xmm0 = xmm0[0],zero,zero,xmm0[0]
981; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
982;
983; AVX512-LABEL: shuf_X00X:
984; AVX512:       ## %bb.0:
985; AVX512-NEXT:    vinsertps $54, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc0,0x36]
986; AVX512-NEXT:    ## xmm0 = xmm0[0],zero,zero,xmm0[0]
987; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
988  %vecext = extractelement <4 x float> %x, i32 0
989  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
990  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
991  %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
992  %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
993  ret <4 x float> %vecinit4
994}
995
996define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
997; SSE-LABEL: shuf_X0YC:
998; SSE:       ## %bb.0:
999; SSE-NEXT:    xorps %xmm2, %xmm2 ## encoding: [0x0f,0x57,0xd2]
1000; SSE-NEXT:    unpcklps %xmm2, %xmm0 ## encoding: [0x0f,0x14,0xc2]
1001; SSE-NEXT:    ## xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1002; SSE-NEXT:    insertps $176, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xb0]
1003; SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[2]
1004; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1005;
1006; AVX1-LABEL: shuf_X0YC:
1007; AVX1:       ## %bb.0:
1008; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2]
1009; AVX1-NEXT:    vunpcklps %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x14,0xc2]
1010; AVX1-NEXT:    ## xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1011; AVX1-NEXT:    vinsertps $176, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb0]
1012; AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[2]
1013; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1014;
1015; AVX512-LABEL: shuf_X0YC:
1016; AVX512:       ## %bb.0:
1017; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x57,0xd2]
1018; AVX512-NEXT:    vunpcklps %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xc2]
1019; AVX512-NEXT:    ## xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1020; AVX512-NEXT:    vinsertps $176, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb0]
1021; AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[2]
1022; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1023  %vecext = extractelement <4 x float> %x, i32 0
1024  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
1025  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
1026  %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
1027  %vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
1028  ret <4 x float> %vecinit5
1029}
1030
1031define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) {
1032; SSE-LABEL: i32_shuf_XYZ0:
1033; SSE:       ## %bb.0:
1034; SSE-NEXT:    xorps %xmm1, %xmm1 ## encoding: [0x0f,0x57,0xc9]
1035; SSE-NEXT:    blendps $8, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x08]
1036; SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
1037; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1038;
1039; AVX1-LABEL: i32_shuf_XYZ0:
1040; AVX1:       ## %bb.0:
1041; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
1042; AVX1-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
1043; AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
1044; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1045;
1046; AVX512-LABEL: i32_shuf_XYZ0:
1047; AVX512:       ## %bb.0:
1048; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
1049; AVX512-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
1050; AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
1051; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1052  %vecext = extractelement <4 x i32> %x, i32 0
1053  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
1054  %vecext1 = extractelement <4 x i32> %x, i32 1
1055  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
1056  %vecext3 = extractelement <4 x i32> %x, i32 2
1057  %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext3, i32 2
1058  %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
1059  ret <4 x i32> %vecinit5
1060}
1061
1062define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) {
1063; SSE-LABEL: i32_shuf_XY00:
1064; SSE:       ## %bb.0:
1065; SSE-NEXT:    movq %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x7e,0xc0]
1066; SSE-NEXT:    ## xmm0 = xmm0[0],zero
1067; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1068;
1069; AVX1-LABEL: i32_shuf_XY00:
1070; AVX1:       ## %bb.0:
1071; AVX1-NEXT:    vmovq %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x7e,0xc0]
1072; AVX1-NEXT:    ## xmm0 = xmm0[0],zero
1073; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1074;
1075; AVX512-LABEL: i32_shuf_XY00:
1076; AVX512:       ## %bb.0:
1077; AVX512-NEXT:    vmovq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc0]
1078; AVX512-NEXT:    ## xmm0 = xmm0[0],zero
1079; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1080  %vecext = extractelement <4 x i32> %x, i32 0
1081  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
1082  %vecext1 = extractelement <4 x i32> %x, i32 1
1083  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
1084  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
1085  %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
1086  ret <4 x i32> %vecinit4
1087}
1088
1089define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) {
1090; SSE-LABEL: i32_shuf_XYY0:
1091; SSE:       ## %bb.0:
1092; SSE-NEXT:    pshufd $212, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc8,0xd4]
1093; SSE-NEXT:    ## xmm1 = xmm0[0,1,1,3]
1094; SSE-NEXT:    pxor %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xef,0xc0]
1095; SSE-NEXT:    pblendw $63, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0x3f]
1096; SSE-NEXT:    ## xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
1097; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1098;
1099; AVX1-LABEL: i32_shuf_XYY0:
1100; AVX1:       ## %bb.0:
1101; AVX1-NEXT:    vpermilps $212, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xd4]
1102; AVX1-NEXT:    ## xmm0 = xmm0[0,1,1,3]
1103; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
1104; AVX1-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
1105; AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
1106; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1107;
1108; AVX512-LABEL: i32_shuf_XYY0:
1109; AVX512:       ## %bb.0:
1110; AVX512-NEXT:    vpermilps $212, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xd4]
1111; AVX512-NEXT:    ## xmm0 = xmm0[0,1,1,3]
1112; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
1113; AVX512-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
1114; AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
1115; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1116  %vecext = extractelement <4 x i32> %x, i32 0
1117  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
1118  %vecext1 = extractelement <4 x i32> %x, i32 1
1119  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
1120  %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext1, i32 2
1121  %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
1122  ret <4 x i32> %vecinit5
1123}
1124
1125define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) {
1126; SSE-LABEL: i32_shuf_XYW0:
1127; SSE:       ## %bb.0:
1128; SSE-NEXT:    pshufd $244, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc8,0xf4]
1129; SSE-NEXT:    ## xmm1 = xmm0[0,1,3,3]
1130; SSE-NEXT:    pxor %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xef,0xc0]
1131; SSE-NEXT:    pblendw $63, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0x3f]
1132; SSE-NEXT:    ## xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
1133; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1134;
1135; AVX1-LABEL: i32_shuf_XYW0:
1136; AVX1:       ## %bb.0:
1137; AVX1-NEXT:    vpermilps $244, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xf4]
1138; AVX1-NEXT:    ## xmm0 = xmm0[0,1,3,3]
1139; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
1140; AVX1-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
1141; AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
1142; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1143;
1144; AVX512-LABEL: i32_shuf_XYW0:
1145; AVX512:       ## %bb.0:
1146; AVX512-NEXT:    vpermilps $244, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xf4]
1147; AVX512-NEXT:    ## xmm0 = xmm0[0,1,3,3]
1148; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
1149; AVX512-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
1150; AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
1151; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1152  %vecext = extractelement <4 x i32> %x, i32 0
1153  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
1154  %vecext1 = extractelement <4 x i32> %x, i32 1
1155  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
1156  %vecext2 = extractelement <4 x i32> %x, i32 3
1157  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %vecext2, i32 2
1158  %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
1159  ret <4 x i32> %vecinit4
1160}
1161
1162define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) {
1163; SSE-LABEL: i32_shuf_W00W:
1164; SSE:       ## %bb.0:
1165; SSE-NEXT:    pshufd $255, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc8,0xff]
1166; SSE-NEXT:    ## xmm1 = xmm0[3,3,3,3]
1167; SSE-NEXT:    pxor %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xef,0xc0]
1168; SSE-NEXT:    pblendw $195, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0xc3]
1169; SSE-NEXT:    ## xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
1170; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1171;
1172; AVX1-LABEL: i32_shuf_W00W:
1173; AVX1:       ## %bb.0:
1174; AVX1-NEXT:    vpermilps $255, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff]
1175; AVX1-NEXT:    ## xmm0 = xmm0[3,3,3,3]
1176; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
1177; AVX1-NEXT:    vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06]
1178; AVX1-NEXT:    ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
1179; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1180;
1181; AVX512-LABEL: i32_shuf_W00W:
1182; AVX512:       ## %bb.0:
1183; AVX512-NEXT:    vpermilps $255, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff]
1184; AVX512-NEXT:    ## xmm0 = xmm0[3,3,3,3]
1185; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
1186; AVX512-NEXT:    vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06]
1187; AVX512-NEXT:    ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
1188; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1189  %vecext = extractelement <4 x i32> %x, i32 3
1190  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
1191  %vecinit2 = insertelement <4 x i32> %vecinit, i32 0, i32 1
1192  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
1193  %vecinit4 = insertelement <4 x i32> %vecinit3, i32 %vecext, i32 3
1194  ret <4 x i32> %vecinit4
1195}
1196
1197define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
1198; SSE-LABEL: i32_shuf_X00A:
1199; SSE:       ## %bb.0:
1200; SSE-NEXT:    pxor %xmm2, %xmm2 ## encoding: [0x66,0x0f,0xef,0xd2]
1201; SSE-NEXT:    pblendw $252, %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc2,0xfc]
1202; SSE-NEXT:    ## xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
1203; SSE-NEXT:    pshufd $0, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0x00]
1204; SSE-NEXT:    ## xmm1 = xmm1[0,0,0,0]
1205; SSE-NEXT:    pblendw $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0xc0]
1206; SSE-NEXT:    ## xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
1207; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1208;
1209; AVX1-LABEL: i32_shuf_X00A:
1210; AVX1:       ## %bb.0:
1211; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2]
1212; AVX1-NEXT:    vblendps $1, %xmm0, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x0c,0xc0,0x01]
1213; AVX1-NEXT:    ## xmm0 = xmm0[0],xmm2[1,2,3]
1214; AVX1-NEXT:    vpermilps $0, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0x00]
1215; AVX1-NEXT:    ## xmm1 = xmm1[0,0,0,0]
1216; AVX1-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
1217; AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
1218; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1219;
1220; AVX512-LABEL: i32_shuf_X00A:
1221; AVX512:       ## %bb.0:
1222; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2]
1223; AVX512-NEXT:    vblendps $1, %xmm0, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x0c,0xc0,0x01]
1224; AVX512-NEXT:    ## xmm0 = xmm0[0],xmm2[1,2,3]
1225; AVX512-NEXT:    vbroadcastss %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc9]
1226; AVX512-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
1227; AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
1228; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1229  %vecext = extractelement <4 x i32> %x, i32 0
1230  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
1231  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
1232  %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
1233  %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
1234  ret <4 x i32> %vecinit4
1235}
1236
1237define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
1238; SSE-LABEL: i32_shuf_X00X:
1239; SSE:       ## %bb.0:
1240; SSE-NEXT:    pxor %xmm1, %xmm1 ## encoding: [0x66,0x0f,0xef,0xc9]
1241; SSE-NEXT:    pshufd $0, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x70,0xc0,0x00]
1242; SSE-NEXT:    ## xmm0 = xmm0[0,0,0,0]
1243; SSE-NEXT:    pblendw $60, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0x3c]
1244; SSE-NEXT:    ## xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
1245; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1246;
1247; AVX1-LABEL: i32_shuf_X00X:
1248; AVX1:       ## %bb.0:
1249; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
1250; AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
1251; AVX1-NEXT:    ## xmm0 = xmm0[0,0,0,0]
1252; AVX1-NEXT:    vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06]
1253; AVX1-NEXT:    ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
1254; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1255;
1256; AVX512-LABEL: i32_shuf_X00X:
1257; AVX512:       ## %bb.0:
1258; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
1259; AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
1260; AVX512-NEXT:    vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06]
1261; AVX512-NEXT:    ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
1262; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1263  %vecext = extractelement <4 x i32> %x, i32 0
1264  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
1265  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
1266  %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
1267  %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
1268  ret <4 x i32> %vecinit4
1269}
1270
1271define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
1272; SSE-LABEL: i32_shuf_X0YC:
1273; SSE:       ## %bb.0:
1274; SSE-NEXT:    pmovzxdq %xmm0, %xmm2 ## encoding: [0x66,0x0f,0x38,0x35,0xd0]
1275; SSE-NEXT:    ## xmm2 = xmm0[0],zero,xmm0[1],zero
1276; SSE-NEXT:    pshufd $170, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x70,0xc1,0xaa]
1277; SSE-NEXT:    ## xmm0 = xmm1[2,2,2,2]
1278; SSE-NEXT:    pblendw $63, %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc2,0x3f]
1279; SSE-NEXT:    ## xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
1280; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1281;
1282; AVX1-LABEL: i32_shuf_X0YC:
1283; AVX1:       ## %bb.0:
1284; AVX1-NEXT:    vpmovzxdq %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x35,0xc0]
1285; AVX1-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[1],zero
1286; AVX1-NEXT:    vpshufd $170, %xmm1, %xmm1 ## encoding: [0xc5,0xf9,0x70,0xc9,0xaa]
1287; AVX1-NEXT:    ## xmm1 = xmm1[2,2,2,2]
1288; AVX1-NEXT:    vpblendw $192, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0xc0]
1289; AVX1-NEXT:    ## xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
1290; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1291;
1292; AVX512-LABEL: i32_shuf_X0YC:
1293; AVX512:       ## %bb.0:
1294; AVX512-NEXT:    vpmovzxdq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x35,0xc0]
1295; AVX512-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[1],zero
1296; AVX512-NEXT:    vpshufd $170, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x70,0xc9,0xaa]
1297; AVX512-NEXT:    ## xmm1 = xmm1[2,2,2,2]
1298; AVX512-NEXT:    vpblendd $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x08]
1299; AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
1300; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1301  %vecext = extractelement <4 x i32> %x, i32 0
1302  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
1303  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
1304  %vecinit3 = shufflevector <4 x i32> %vecinit1, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
1305  %vecinit5 = shufflevector <4 x i32> %vecinit3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
1306  ret <4 x i32> %vecinit5
1307}
1308
1309;; Test for a bug in the first implementation of LowerBuildVectorv4X86
1310define < 4 x float> @test_insertps_no_undef(<4 x float> %x) {
1311; SSE-LABEL: test_insertps_no_undef:
1312; SSE:       ## %bb.0:
1313; SSE-NEXT:    xorps %xmm1, %xmm1 ## encoding: [0x0f,0x57,0xc9]
1314; SSE-NEXT:    blendps $7, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc8,0x07]
1315; SSE-NEXT:    ## xmm1 = xmm0[0,1,2],xmm1[3]
1316; SSE-NEXT:    maxps %xmm1, %xmm0 ## encoding: [0x0f,0x5f,0xc1]
1317; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1318;
1319; AVX1-LABEL: test_insertps_no_undef:
1320; AVX1:       ## %bb.0:
1321; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
1322; AVX1-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc9,0x08]
1323; AVX1-NEXT:    ## xmm1 = xmm0[0,1,2],xmm1[3]
1324; AVX1-NEXT:    vmaxps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x5f,0xc1]
1325; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1326;
1327; AVX512-LABEL: test_insertps_no_undef:
1328; AVX512:       ## %bb.0:
1329; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
1330; AVX512-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc9,0x08]
1331; AVX512-NEXT:    ## xmm1 = xmm0[0,1,2],xmm1[3]
1332; AVX512-NEXT:    vmaxps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5f,0xc1]
1333; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1334  %vecext = extractelement <4 x float> %x, i32 0
1335  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
1336  %vecext1 = extractelement <4 x float> %x, i32 1
1337  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
1338  %vecext3 = extractelement <4 x float> %x, i32 2
1339  %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
1340  %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
1341  %mask = fcmp olt <4 x float> %vecinit5, %x
1342  %res = select  <4 x i1> %mask, <4 x float> %x, <4 x float>%vecinit5
1343  ret <4 x float> %res
1344}
1345
1346define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
1347; SSE-LABEL: blendvb_fallback:
1348; SSE:       ## %bb.0:
1349; SSE-NEXT:    psllw $15, %xmm0 ## encoding: [0x66,0x0f,0x71,0xf0,0x0f]
1350; SSE-NEXT:    psraw $15, %xmm0 ## encoding: [0x66,0x0f,0x71,0xe0,0x0f]
1351; SSE-NEXT:    pblendvb %xmm0, %xmm1, %xmm2 ## encoding: [0x66,0x0f,0x38,0x10,0xd1]
1352; SSE-NEXT:    movdqa %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x6f,0xc2]
1353; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1354;
1355; AVX1-LABEL: blendvb_fallback:
1356; AVX1:       ## %bb.0:
1357; AVX1-NEXT:    vpsllw $15, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x71,0xf0,0x0f]
1358; AVX1-NEXT:    vpsraw $15, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x71,0xe0,0x0f]
1359; AVX1-NEXT:    vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x4c,0xc1,0x00]
1360; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1361;
1362; AVX512-LABEL: blendvb_fallback:
1363; AVX512:       ## %bb.0:
1364; AVX512-NEXT:    vpsllw $15, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x71,0xf0,0x0f]
1365; AVX512-NEXT:    vpmovw2m %xmm0, %k1 ## encoding: [0x62,0xf2,0xfe,0x08,0x29,0xc8]
1366; AVX512-NEXT:    vpblendmw %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x66,0xc1]
1367; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1368  %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
1369  ret <8 x i16> %ret
1370}
1371
1372; On X86, account for the argument's move to registers
1373define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
1374; X86-SSE-LABEL: insertps_from_vector_load:
1375; X86-SSE:       ## %bb.0:
1376; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
1377; X86-SSE-NEXT:    movaps (%eax), %xmm1 ## encoding: [0x0f,0x28,0x08]
1378; X86-SSE-NEXT:    insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
1379; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
1380; X86-SSE-NEXT:    retl ## encoding: [0xc3]
1381;
1382; X86-AVX1-LABEL: insertps_from_vector_load:
1383; X86-AVX1:       ## %bb.0:
1384; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
1385; X86-AVX1-NEXT:    vmovaps (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x08]
1386; X86-AVX1-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
1387; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
1388; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
1389;
1390; X86-AVX512-LABEL: insertps_from_vector_load:
1391; X86-AVX512:       ## %bb.0:
1392; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
1393; X86-AVX512-NEXT:    vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08]
1394; X86-AVX512-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
1395; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
1396; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
1397;
1398; X64-SSE-LABEL: insertps_from_vector_load:
1399; X64-SSE:       ## %bb.0:
1400; X64-SSE-NEXT:    movaps (%rdi), %xmm1 ## encoding: [0x0f,0x28,0x0f]
1401; X64-SSE-NEXT:    insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
1402; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
1403; X64-SSE-NEXT:    retq ## encoding: [0xc3]
1404;
1405; X64-AVX1-LABEL: insertps_from_vector_load:
1406; X64-AVX1:       ## %bb.0:
1407; X64-AVX1-NEXT:    vmovaps (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0f]
1408; X64-AVX1-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
1409; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
1410; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
1411;
1412; X64-AVX512-LABEL: insertps_from_vector_load:
1413; X64-AVX512:       ## %bb.0:
1414; X64-AVX512-NEXT:    vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f]
1415; X64-AVX512-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
1416; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
1417; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
1418  %1 = load <4 x float>, <4 x float>* %pb, align 16
1419  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
1420  ret <4 x float> %2
1421}
1422
1423;; Use a non-zero CountS for insertps
1424;; Try to match a bit more of the instr, since we need the load's offset.
1425define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
1426; X86-SSE-LABEL: insertps_from_vector_load_offset:
1427; X86-SSE:       ## %bb.0:
1428; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
1429; X86-SSE-NEXT:    movaps (%eax), %xmm1 ## encoding: [0x0f,0x28,0x08]
1430; X86-SSE-NEXT:    insertps $96, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x60]
1431; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
1432; X86-SSE-NEXT:    retl ## encoding: [0xc3]
1433;
1434; X86-AVX1-LABEL: insertps_from_vector_load_offset:
1435; X86-AVX1:       ## %bb.0:
1436; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
1437; X86-AVX1-NEXT:    vmovaps (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x08]
1438; X86-AVX1-NEXT:    vinsertps $96, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60]
1439; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
1440; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
1441;
1442; X86-AVX512-LABEL: insertps_from_vector_load_offset:
1443; X86-AVX512:       ## %bb.0:
1444; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
1445; X86-AVX512-NEXT:    vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08]
1446; X86-AVX512-NEXT:    vinsertps $96, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60]
1447; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
1448; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
1449;
1450; X64-SSE-LABEL: insertps_from_vector_load_offset:
1451; X64-SSE:       ## %bb.0:
1452; X64-SSE-NEXT:    movaps (%rdi), %xmm1 ## encoding: [0x0f,0x28,0x0f]
1453; X64-SSE-NEXT:    insertps $96, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x60]
1454; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
1455; X64-SSE-NEXT:    retq ## encoding: [0xc3]
1456;
1457; X64-AVX1-LABEL: insertps_from_vector_load_offset:
1458; X64-AVX1:       ## %bb.0:
1459; X64-AVX1-NEXT:    vmovaps (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0f]
1460; X64-AVX1-NEXT:    vinsertps $96, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60]
1461; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
1462; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
1463;
1464; X64-AVX512-LABEL: insertps_from_vector_load_offset:
1465; X64-AVX512:       ## %bb.0:
1466; X64-AVX512-NEXT:    vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f]
1467; X64-AVX512-NEXT:    vinsertps $96, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60]
1468; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
1469; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
1470  %1 = load <4 x float>, <4 x float>* %pb, align 16
1471  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
1472  ret <4 x float> %2
1473}
1474
1475;; Try to match a bit more of the instr, since we need the load's offset.
1476define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
1477; X86-SSE-LABEL: insertps_from_vector_load_offset_2:
1478; X86-SSE:       ## %bb.0:
1479; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
1480; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
1481; X86-SSE-NEXT:    shll $4, %ecx ## encoding: [0xc1,0xe1,0x04]
1482; X86-SSE-NEXT:    movaps (%eax,%ecx), %xmm1 ## encoding: [0x0f,0x28,0x0c,0x08]
1483; X86-SSE-NEXT:    insertps $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xc0]
1484; X86-SSE-NEXT:    ## xmm0 = xmm1[3],xmm0[1,2,3]
1485; X86-SSE-NEXT:    retl ## encoding: [0xc3]
1486;
1487; X86-AVX1-LABEL: insertps_from_vector_load_offset_2:
1488; X86-AVX1:       ## %bb.0:
1489; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
1490; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
1491; X86-AVX1-NEXT:    shll $4, %ecx ## encoding: [0xc1,0xe1,0x04]
1492; X86-AVX1-NEXT:    vmovaps (%eax,%ecx), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0c,0x08]
1493; X86-AVX1-NEXT:    vinsertps $192, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0]
1494; X86-AVX1-NEXT:    ## xmm0 = xmm1[3],xmm0[1,2,3]
1495; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
1496;
1497; X86-AVX512-LABEL: insertps_from_vector_load_offset_2:
1498; X86-AVX512:       ## %bb.0:
1499; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
1500; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
1501; X86-AVX512-NEXT:    shll $4, %ecx ## encoding: [0xc1,0xe1,0x04]
1502; X86-AVX512-NEXT:    vmovaps (%eax,%ecx), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0c,0x08]
1503; X86-AVX512-NEXT:    vinsertps $192, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0]
1504; X86-AVX512-NEXT:    ## xmm0 = xmm1[3],xmm0[1,2,3]
1505; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
1506;
1507; X64-SSE-LABEL: insertps_from_vector_load_offset_2:
1508; X64-SSE:       ## %bb.0:
1509; X64-SSE-NEXT:    shlq $4, %rsi ## encoding: [0x48,0xc1,0xe6,0x04]
1510; X64-SSE-NEXT:    movaps (%rdi,%rsi), %xmm1 ## encoding: [0x0f,0x28,0x0c,0x37]
1511; X64-SSE-NEXT:    insertps $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xc0]
1512; X64-SSE-NEXT:    ## xmm0 = xmm1[3],xmm0[1,2,3]
1513; X64-SSE-NEXT:    retq ## encoding: [0xc3]
1514;
1515; X64-AVX1-LABEL: insertps_from_vector_load_offset_2:
1516; X64-AVX1:       ## %bb.0:
1517; X64-AVX1-NEXT:    shlq $4, %rsi ## encoding: [0x48,0xc1,0xe6,0x04]
1518; X64-AVX1-NEXT:    vmovaps (%rdi,%rsi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0c,0x37]
1519; X64-AVX1-NEXT:    vinsertps $192, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0]
1520; X64-AVX1-NEXT:    ## xmm0 = xmm1[3],xmm0[1,2,3]
1521; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
1522;
1523; X64-AVX512-LABEL: insertps_from_vector_load_offset_2:
1524; X64-AVX512:       ## %bb.0:
1525; X64-AVX512-NEXT:    shlq $4, %rsi ## encoding: [0x48,0xc1,0xe6,0x04]
1526; X64-AVX512-NEXT:    vmovaps (%rdi,%rsi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0c,0x37]
1527; X64-AVX512-NEXT:    vinsertps $192, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0]
1528; X64-AVX512-NEXT:    ## xmm0 = xmm1[3],xmm0[1,2,3]
1529; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
1530  %1 = getelementptr inbounds <4 x float>, <4 x float>* %pb, i64 %index
1531  %2 = load <4 x float>, <4 x float>* %1, align 16
1532  %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
1533  ret <4 x float> %3
1534}
1535
1536define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
1537; X86-SSE-LABEL: insertps_from_broadcast_loadf32:
1538; X86-SSE:       ## %bb.0:
1539; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
1540; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
1541; X86-SSE-NEXT:    insertps $48, (%ecx,%eax,4), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x04,0x81,0x30]
1542; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
1543; X86-SSE-NEXT:    retl ## encoding: [0xc3]
1544;
1545; X86-AVX1-LABEL: insertps_from_broadcast_loadf32:
1546; X86-AVX1:       ## %bb.0:
1547; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
1548; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
1549; X86-AVX1-NEXT:    vinsertps $48, (%ecx,%eax,4), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x04,0x81,0x30]
1550; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
1551; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
1552;
1553; X86-AVX512-LABEL: insertps_from_broadcast_loadf32:
1554; X86-AVX512:       ## %bb.0:
1555; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
1556; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
1557; X86-AVX512-NEXT:    vinsertps $48, (%ecx,%eax,4), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x04,0x81,0x30]
1558; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
1559; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
1560;
1561; X64-SSE-LABEL: insertps_from_broadcast_loadf32:
1562; X64-SSE:       ## %bb.0:
1563; X64-SSE-NEXT:    insertps $48, (%rdi,%rsi,4), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x04,0xb7,0x30]
1564; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
1565; X64-SSE-NEXT:    retq ## encoding: [0xc3]
1566;
1567; X64-AVX1-LABEL: insertps_from_broadcast_loadf32:
1568; X64-AVX1:       ## %bb.0:
1569; X64-AVX1-NEXT:    vinsertps $48, (%rdi,%rsi,4), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x04,0xb7,0x30]
1570; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
1571; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
1572;
1573; X64-AVX512-LABEL: insertps_from_broadcast_loadf32:
1574; X64-AVX512:       ## %bb.0:
1575; X64-AVX512-NEXT:    vinsertps $48, (%rdi,%rsi,4), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x04,0xb7,0x30]
1576; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
1577; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
1578  %1 = getelementptr inbounds float, float* %fb, i64 %index
1579  %2 = load float, float* %1, align 4
1580  %3 = insertelement <4 x float> undef, float %2, i32 0
1581  %4 = insertelement <4 x float> %3, float %2, i32 1
1582  %5 = insertelement <4 x float> %4, float %2, i32 2
1583  %6 = insertelement <4 x float> %5, float %2, i32 3
1584  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
1585  ret <4 x float> %7
1586}
1587
1588define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
1589; X86-SSE-LABEL: insertps_from_broadcast_loadv4f32:
1590; X86-SSE:       ## %bb.0:
1591; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
1592; X86-SSE-NEXT:    movups (%eax), %xmm1 ## encoding: [0x0f,0x10,0x08]
1593; X86-SSE-NEXT:    insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
1594; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
1595; X86-SSE-NEXT:    retl ## encoding: [0xc3]
1596;
1597; X86-AVX1-LABEL: insertps_from_broadcast_loadv4f32:
1598; X86-AVX1:       ## %bb.0:
1599; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
1600; X86-AVX1-NEXT:    vinsertps $48, (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x00,0x30]
1601; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
1602; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
1603;
1604; X86-AVX512-LABEL: insertps_from_broadcast_loadv4f32:
1605; X86-AVX512:       ## %bb.0:
1606; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
1607; X86-AVX512-NEXT:    vinsertps $48, (%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x00,0x30]
1608; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
1609; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
1610;
1611; X64-SSE-LABEL: insertps_from_broadcast_loadv4f32:
1612; X64-SSE:       ## %bb.0:
1613; X64-SSE-NEXT:    movups (%rdi), %xmm1 ## encoding: [0x0f,0x10,0x0f]
1614; X64-SSE-NEXT:    insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
1615; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
1616; X64-SSE-NEXT:    retq ## encoding: [0xc3]
1617;
1618; X64-AVX1-LABEL: insertps_from_broadcast_loadv4f32:
1619; X64-AVX1:       ## %bb.0:
1620; X64-AVX1-NEXT:    vinsertps $48, (%rdi), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x07,0x30]
1621; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
1622; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
1623;
1624; X64-AVX512-LABEL: insertps_from_broadcast_loadv4f32:
1625; X64-AVX512:       ## %bb.0:
1626; X64-AVX512-NEXT:    vinsertps $48, (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x07,0x30]
1627; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
1628; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
1629  %1 = load <4 x float>, <4 x float>* %b, align 4
1630  %2 = extractelement <4 x float> %1, i32 0
1631  %3 = insertelement <4 x float> undef, float %2, i32 0
1632  %4 = insertelement <4 x float> %3, float %2, i32 1
1633  %5 = insertelement <4 x float> %4, float %2, i32 2
1634  %6 = insertelement <4 x float> %5, float %2, i32 3
1635  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
1636  ret <4 x float> %7
1637}
1638
1639define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
1640; X86-SSE-LABEL: insertps_from_broadcast_multiple_use:
1641; X86-SSE:       ## %bb.0:
1642; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
1643; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
1644; X86-SSE-NEXT:    movss (%ecx,%eax,4), %xmm4 ## encoding: [0xf3,0x0f,0x10,0x24,0x81]
1645; X86-SSE-NEXT:    ## xmm4 = mem[0],zero,zero,zero
1646; X86-SSE-NEXT:    insertps $48, %xmm4, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc4,0x30]
1647; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
1648; X86-SSE-NEXT:    insertps $48, %xmm4, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x21,0xcc,0x30]
1649; X86-SSE-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
1650; X86-SSE-NEXT:    addps %xmm1, %xmm0 ## encoding: [0x0f,0x58,0xc1]
1651; X86-SSE-NEXT:    insertps $48, %xmm4, %xmm2 ## encoding: [0x66,0x0f,0x3a,0x21,0xd4,0x30]
1652; X86-SSE-NEXT:    ## xmm2 = xmm2[0,1,2],xmm4[0]
1653; X86-SSE-NEXT:    insertps $48, %xmm4, %xmm3 ## encoding: [0x66,0x0f,0x3a,0x21,0xdc,0x30]
1654; X86-SSE-NEXT:    ## xmm3 = xmm3[0,1,2],xmm4[0]
1655; X86-SSE-NEXT:    addps %xmm2, %xmm3 ## encoding: [0x0f,0x58,0xda]
1656; X86-SSE-NEXT:    addps %xmm3, %xmm0 ## encoding: [0x0f,0x58,0xc3]
1657; X86-SSE-NEXT:    retl ## encoding: [0xc3]
1658;
1659; X86-AVX1-LABEL: insertps_from_broadcast_multiple_use:
1660; X86-AVX1:       ## %bb.0:
1661; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
1662; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
1663; X86-AVX1-NEXT:    vbroadcastss (%ecx,%eax,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81]
1664; X86-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
1665; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
1666; X86-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
1667; X86-AVX1-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
1668; X86-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
1669; X86-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
1670; X86-AVX1-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[0]
1671; X86-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
1672; X86-AVX1-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[0]
1673; X86-AVX1-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca]
1674; X86-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
1675; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
1676;
1677; X86-AVX512-LABEL: insertps_from_broadcast_multiple_use:
1678; X86-AVX512:       ## %bb.0:
1679; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
1680; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
1681; X86-AVX512-NEXT:    vbroadcastss (%ecx,%eax,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81]
1682; X86-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
1683; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
1684; X86-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
1685; X86-AVX512-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
1686; X86-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
1687; X86-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
1688; X86-AVX512-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[0]
1689; X86-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm3, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
1690; X86-AVX512-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[0]
1691; X86-AVX512-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
1692; X86-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
1693; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
1694;
1695; X64-SSE-LABEL: insertps_from_broadcast_multiple_use:
1696; X64-SSE:       ## %bb.0:
1697; X64-SSE-NEXT:    movss (%rdi,%rsi,4), %xmm4 ## encoding: [0xf3,0x0f,0x10,0x24,0xb7]
1698; X64-SSE-NEXT:    ## xmm4 = mem[0],zero,zero,zero
1699; X64-SSE-NEXT:    insertps $48, %xmm4, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc4,0x30]
1700; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
1701; X64-SSE-NEXT:    insertps $48, %xmm4, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x21,0xcc,0x30]
1702; X64-SSE-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
1703; X64-SSE-NEXT:    addps %xmm1, %xmm0 ## encoding: [0x0f,0x58,0xc1]
1704; X64-SSE-NEXT:    insertps $48, %xmm4, %xmm2 ## encoding: [0x66,0x0f,0x3a,0x21,0xd4,0x30]
1705; X64-SSE-NEXT:    ## xmm2 = xmm2[0,1,2],xmm4[0]
1706; X64-SSE-NEXT:    insertps $48, %xmm4, %xmm3 ## encoding: [0x66,0x0f,0x3a,0x21,0xdc,0x30]
1707; X64-SSE-NEXT:    ## xmm3 = xmm3[0,1,2],xmm4[0]
1708; X64-SSE-NEXT:    addps %xmm2, %xmm3 ## encoding: [0x0f,0x58,0xda]
1709; X64-SSE-NEXT:    addps %xmm3, %xmm0 ## encoding: [0x0f,0x58,0xc3]
1710; X64-SSE-NEXT:    retq ## encoding: [0xc3]
1711;
1712; X64-AVX1-LABEL: insertps_from_broadcast_multiple_use:
1713; X64-AVX1:       ## %bb.0:
1714; X64-AVX1-NEXT:    vbroadcastss (%rdi,%rsi,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0xb7]
1715; X64-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
1716; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
1717; X64-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
1718; X64-AVX1-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
1719; X64-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
1720; X64-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
1721; X64-AVX1-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[0]
1722; X64-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
1723; X64-AVX1-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[0]
1724; X64-AVX1-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca]
1725; X64-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
1726; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
1727;
1728; X64-AVX512-LABEL: insertps_from_broadcast_multiple_use:
1729; X64-AVX512:       ## %bb.0:
1730; X64-AVX512-NEXT:    vbroadcastss (%rdi,%rsi,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0xb7]
1731; X64-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
1732; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
1733; X64-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
1734; X64-AVX512-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
1735; X64-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
1736; X64-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
1737; X64-AVX512-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[0]
1738; X64-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm3, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
1739; X64-AVX512-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[0]
1740; X64-AVX512-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
1741; X64-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
1742; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
1743  %1 = getelementptr inbounds float, float* %fb, i64 %index
1744  %2 = load float, float* %1, align 4
1745  %3 = insertelement <4 x float> undef, float %2, i32 0
1746  %4 = insertelement <4 x float> %3, float %2, i32 1
1747  %5 = insertelement <4 x float> %4, float %2, i32 2
1748  %6 = insertelement <4 x float> %5, float %2, i32 3
1749  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
1750  %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
1751  %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
1752  %10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
1753  %11 = fadd <4 x float> %7, %8
1754  %12 = fadd <4 x float> %9, %10
1755  %13 = fadd <4 x float> %11, %12
1756  ret <4 x float> %13
1757}
1758
1759define <4 x float> @insertps_with_undefs(<4 x float> %a, float* %b) {
1760; X86-SSE-LABEL: insertps_with_undefs:
1761; X86-SSE:       ## %bb.0:
1762; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
1763; X86-SSE-NEXT:    movss (%eax), %xmm1 ## encoding: [0xf3,0x0f,0x10,0x08]
1764; X86-SSE-NEXT:    ## xmm1 = mem[0],zero,zero,zero
1765; X86-SSE-NEXT:    movlhps %xmm0, %xmm1 ## encoding: [0x0f,0x16,0xc8]
1766; X86-SSE-NEXT:    ## xmm1 = xmm1[0],xmm0[0]
1767; X86-SSE-NEXT:    movaps %xmm1, %xmm0 ## encoding: [0x0f,0x28,0xc1]
1768; X86-SSE-NEXT:    retl ## encoding: [0xc3]
1769;
1770; X86-AVX1-LABEL: insertps_with_undefs:
1771; X86-AVX1:       ## %bb.0:
1772; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
1773; X86-AVX1-NEXT:    vmovss (%eax), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x08]
1774; X86-AVX1-NEXT:    ## xmm1 = mem[0],zero,zero,zero
1775; X86-AVX1-NEXT:    vmovlhps %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf0,0x16,0xc0]
1776; X86-AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[0]
1777; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
1778;
1779; X86-AVX512-LABEL: insertps_with_undefs:
1780; X86-AVX512:       ## %bb.0:
1781; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
1782; X86-AVX512-NEXT:    vmovss (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x08]
1783; X86-AVX512-NEXT:    ## xmm1 = mem[0],zero,zero,zero
1784; X86-AVX512-NEXT:    vmovlhps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x16,0xc0]
1785; X86-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[0]
1786; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
1787;
1788; X64-SSE-LABEL: insertps_with_undefs:
1789; X64-SSE:       ## %bb.0:
1790; X64-SSE-NEXT:    movss (%rdi), %xmm1 ## encoding: [0xf3,0x0f,0x10,0x0f]
1791; X64-SSE-NEXT:    ## xmm1 = mem[0],zero,zero,zero
1792; X64-SSE-NEXT:    movlhps %xmm0, %xmm1 ## encoding: [0x0f,0x16,0xc8]
1793; X64-SSE-NEXT:    ## xmm1 = xmm1[0],xmm0[0]
1794; X64-SSE-NEXT:    movaps %xmm1, %xmm0 ## encoding: [0x0f,0x28,0xc1]
1795; X64-SSE-NEXT:    retq ## encoding: [0xc3]
1796;
1797; X64-AVX1-LABEL: insertps_with_undefs:
1798; X64-AVX1:       ## %bb.0:
1799; X64-AVX1-NEXT:    vmovss (%rdi), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x0f]
1800; X64-AVX1-NEXT:    ## xmm1 = mem[0],zero,zero,zero
1801; X64-AVX1-NEXT:    vmovlhps %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf0,0x16,0xc0]
1802; X64-AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[0]
1803; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
1804;
1805; X64-AVX512-LABEL: insertps_with_undefs:
1806; X64-AVX512:       ## %bb.0:
1807; X64-AVX512-NEXT:    vmovss (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x0f]
1808; X64-AVX512-NEXT:    ## xmm1 = mem[0],zero,zero,zero
1809; X64-AVX512-NEXT:    vmovlhps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x16,0xc0]
1810; X64-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[0]
1811; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
1812  %1 = load float, float* %b, align 4
1813  %2 = insertelement <4 x float> undef, float %1, i32 0
1814  %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 4, i32 undef, i32 0, i32 7>
1815  ret <4 x float> %result
1816}
1817
1818; Test for a bug in X86ISelLowering.cpp:getINSERTPS where we were using
1819; the destination index to change the load, instead of the source index.
1820define <4 x float> @pr20087(<4 x float> %a, <4 x float> *%ptr) {
1821; X86-SSE-LABEL: pr20087:
1822; X86-SSE:       ## %bb.0:
1823; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
1824; X86-SSE-NEXT:    movaps (%eax), %xmm1 ## encoding: [0x0f,0x28,0x08]
1825; X86-SSE-NEXT:    insertps $178, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xb2]
1826; X86-SSE-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
1827; X86-SSE-NEXT:    retl ## encoding: [0xc3]
1828;
1829; X86-AVX1-LABEL: pr20087:
1830; X86-AVX1:       ## %bb.0:
1831; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
1832; X86-AVX1-NEXT:    vmovaps (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x08]
1833; X86-AVX1-NEXT:    vinsertps $178, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb2]
1834; X86-AVX1-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
1835; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
1836;
1837; X86-AVX512-LABEL: pr20087:
1838; X86-AVX512:       ## %bb.0:
1839; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
1840; X86-AVX512-NEXT:    vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08]
1841; X86-AVX512-NEXT:    vinsertps $178, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb2]
1842; X86-AVX512-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
1843; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
1844;
1845; X64-SSE-LABEL: pr20087:
1846; X64-SSE:       ## %bb.0:
1847; X64-SSE-NEXT:    movaps (%rdi), %xmm1 ## encoding: [0x0f,0x28,0x0f]
1848; X64-SSE-NEXT:    insertps $178, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xb2]
1849; X64-SSE-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
1850; X64-SSE-NEXT:    retq ## encoding: [0xc3]
1851;
1852; X64-AVX1-LABEL: pr20087:
1853; X64-AVX1:       ## %bb.0:
1854; X64-AVX1-NEXT:    vmovaps (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0f]
1855; X64-AVX1-NEXT:    vinsertps $178, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb2]
1856; X64-AVX1-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
1857; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
1858;
1859; X64-AVX512-LABEL: pr20087:
1860; X64-AVX512:       ## %bb.0:
1861; X64-AVX512-NEXT:    vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f]
1862; X64-AVX512-NEXT:    vinsertps $178, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb2]
1863; X64-AVX512-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
1864; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
1865  %load = load <4 x float> , <4 x float> *%ptr
1866  %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32> <i32 4, i32 undef, i32 6, i32 2>
1867  ret <4 x float> %ret
1868}
1869
1870; Edge case for insertps where we end up with a shuffle with mask=<0, 7, -1, -1>
1871define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32* noalias nocapture %RET) #1 {
1872; X86-SSE-LABEL: insertps_pr20411:
1873; X86-SSE:       ## %bb.0:
1874; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
1875; X86-SSE-NEXT:    pshufd $238, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0xee]
1876; X86-SSE-NEXT:    ## xmm1 = xmm1[2,3,2,3]
1877; X86-SSE-NEXT:    pblendw $243, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc8,0xf3]
1878; X86-SSE-NEXT:    ## xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
1879; X86-SSE-NEXT:    movdqu %xmm1, (%eax) ## encoding: [0xf3,0x0f,0x7f,0x08]
1880; X86-SSE-NEXT:    retl ## encoding: [0xc3]
1881;
1882; X86-AVX1-LABEL: insertps_pr20411:
1883; X86-AVX1:       ## %bb.0:
1884; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
1885; X86-AVX1-NEXT:    vpermilps $238, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee]
1886; X86-AVX1-NEXT:    ## xmm1 = xmm1[2,3,2,3]
1887; X86-AVX1-NEXT:    vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
1888; X86-AVX1-NEXT:    ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1889; X86-AVX1-NEXT:    vmovups %xmm0, (%eax) ## encoding: [0xc5,0xf8,0x11,0x00]
1890; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
1891;
1892; X86-AVX512-LABEL: insertps_pr20411:
1893; X86-AVX512:       ## %bb.0:
1894; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
1895; X86-AVX512-NEXT:    vpermilps $238, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee]
1896; X86-AVX512-NEXT:    ## xmm1 = xmm1[2,3,2,3]
1897; X86-AVX512-NEXT:    vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
1898; X86-AVX512-NEXT:    ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1899; X86-AVX512-NEXT:    vmovups %xmm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x00]
1900; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
1901;
1902; X64-SSE-LABEL: insertps_pr20411:
1903; X64-SSE:       ## %bb.0:
1904; X64-SSE-NEXT:    pshufd $238, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0xee]
1905; X64-SSE-NEXT:    ## xmm1 = xmm1[2,3,2,3]
1906; X64-SSE-NEXT:    pblendw $243, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc8,0xf3]
1907; X64-SSE-NEXT:    ## xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
1908; X64-SSE-NEXT:    movdqu %xmm1, (%rdi) ## encoding: [0xf3,0x0f,0x7f,0x0f]
1909; X64-SSE-NEXT:    retq ## encoding: [0xc3]
1910;
1911; X64-AVX1-LABEL: insertps_pr20411:
1912; X64-AVX1:       ## %bb.0:
1913; X64-AVX1-NEXT:    vpermilps $238, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee]
1914; X64-AVX1-NEXT:    ## xmm1 = xmm1[2,3,2,3]
1915; X64-AVX1-NEXT:    vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
1916; X64-AVX1-NEXT:    ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1917; X64-AVX1-NEXT:    vmovups %xmm0, (%rdi) ## encoding: [0xc5,0xf8,0x11,0x07]
1918; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
1919;
1920; X64-AVX512-LABEL: insertps_pr20411:
1921; X64-AVX512:       ## %bb.0:
1922; X64-AVX512-NEXT:    vpermilps $238, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee]
1923; X64-AVX512-NEXT:    ## xmm1 = xmm1[2,3,2,3]
1924; X64-AVX512-NEXT:    vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
1925; X64-AVX512-NEXT:    ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1926; X64-AVX512-NEXT:    vmovups %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07]
1927; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
1928  %shuffle117 = shufflevector <4 x i32> %shuffle109, <4 x i32> %shuffle116, <4 x i32> <i32 0, i32 7, i32 undef, i32 undef>
1929  %ptrcast = bitcast i32* %RET to <4 x i32>*
1930  store <4 x i32> %shuffle117, <4 x i32>* %ptrcast, align 4
1931  ret void
1932}
1933
1934define <4 x float> @insertps_4(<4 x float> %A, <4 x float> %B) {
1935; SSE-LABEL: insertps_4:
1936; SSE:       ## %bb.0:
1937; SSE-NEXT:    insertps $170, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xaa]
1938; SSE-NEXT:    ## xmm0 = xmm0[0],zero,xmm1[2],zero
1939; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1940;
1941; AVX1-LABEL: insertps_4:
1942; AVX1:       ## %bb.0:
1943; AVX1-NEXT:    vinsertps $170, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xaa]
1944; AVX1-NEXT:    ## xmm0 = xmm0[0],zero,xmm1[2],zero
1945; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1946;
1947; AVX512-LABEL: insertps_4:
1948; AVX512:       ## %bb.0:
1949; AVX512-NEXT:    vinsertps $170, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xaa]
1950; AVX512-NEXT:    ## xmm0 = xmm0[0],zero,xmm1[2],zero
1951; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1952  %vecext = extractelement <4 x float> %A, i32 0
1953  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
1954  %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
1955  %vecext2 = extractelement <4 x float> %B, i32 2
1956  %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
1957  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
1958  ret <4 x float> %vecinit4
1959}
1960
1961define <4 x float> @insertps_5(<4 x float> %A, <4 x float> %B) {
1962; SSE-LABEL: insertps_5:
1963; SSE:       ## %bb.0:
1964; SSE-NEXT:    insertps $92, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x5c]
1965; SSE-NEXT:    ## xmm0 = xmm0[0],xmm1[1],zero,zero
1966; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1967;
1968; AVX1-LABEL: insertps_5:
1969; AVX1:       ## %bb.0:
1970; AVX1-NEXT:    vinsertps $92, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x5c]
1971; AVX1-NEXT:    ## xmm0 = xmm0[0],xmm1[1],zero,zero
1972; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1973;
1974; AVX512-LABEL: insertps_5:
1975; AVX512:       ## %bb.0:
1976; AVX512-NEXT:    vinsertps $92, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x5c]
1977; AVX512-NEXT:    ## xmm0 = xmm0[0],xmm1[1],zero,zero
1978; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1979  %vecext = extractelement <4 x float> %A, i32 0
1980  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
1981  %vecext1 = extractelement <4 x float> %B, i32 1
1982  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
1983  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2
1984  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
1985  ret <4 x float> %vecinit4
1986}
1987
1988define <4 x float> @insertps_6(<4 x float> %A, <4 x float> %B) {
1989; SSE-LABEL: insertps_6:
1990; SSE:       ## %bb.0:
1991; SSE-NEXT:    insertps $169, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xa9]
1992; SSE-NEXT:    ## xmm0 = zero,xmm0[1],xmm1[2],zero
1993; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1994;
1995; AVX1-LABEL: insertps_6:
1996; AVX1:       ## %bb.0:
1997; AVX1-NEXT:    vinsertps $169, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xa9]
1998; AVX1-NEXT:    ## xmm0 = zero,xmm0[1],xmm1[2],zero
1999; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
2000;
2001; AVX512-LABEL: insertps_6:
2002; AVX512:       ## %bb.0:
2003; AVX512-NEXT:    vinsertps $169, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xa9]
2004; AVX512-NEXT:    ## xmm0 = zero,xmm0[1],xmm1[2],zero
2005; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
2006  %vecext = extractelement <4 x float> %A, i32 1
2007  %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
2008  %vecext1 = extractelement <4 x float> %B, i32 2
2009  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2
2010  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
2011  ret <4 x float> %vecinit3
2012}
2013
2014define <4 x float> @insertps_7(<4 x float> %A, <4 x float> %B) {
2015; SSE-LABEL: insertps_7:
2016; SSE:       ## %bb.0:
2017; SSE-NEXT:    insertps $106, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x6a]
2018; SSE-NEXT:    ## xmm0 = xmm0[0],zero,xmm1[1],zero
2019; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
2020;
2021; AVX1-LABEL: insertps_7:
2022; AVX1:       ## %bb.0:
2023; AVX1-NEXT:    vinsertps $106, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x6a]
2024; AVX1-NEXT:    ## xmm0 = xmm0[0],zero,xmm1[1],zero
2025; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
2026;
2027; AVX512-LABEL: insertps_7:
2028; AVX512:       ## %bb.0:
2029; AVX512-NEXT:    vinsertps $106, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x6a]
2030; AVX512-NEXT:    ## xmm0 = xmm0[0],zero,xmm1[1],zero
2031; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
2032  %vecext = extractelement <4 x float> %A, i32 0
2033  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
2034  %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
2035  %vecext2 = extractelement <4 x float> %B, i32 1
2036  %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
2037  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
2038  ret <4 x float> %vecinit4
2039}
2040
2041define <4 x float> @insertps_8(<4 x float> %A, <4 x float> %B) {
2042; SSE-LABEL: insertps_8:
2043; SSE:       ## %bb.0:
2044; SSE-NEXT:    insertps $28, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x1c]
2045; SSE-NEXT:    ## xmm0 = xmm0[0],xmm1[0],zero,zero
2046; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
2047;
2048; AVX1-LABEL: insertps_8:
2049; AVX1:       ## %bb.0:
2050; AVX1-NEXT:    vinsertps $28, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x1c]
2051; AVX1-NEXT:    ## xmm0 = xmm0[0],xmm1[0],zero,zero
2052; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
2053;
2054; AVX512-LABEL: insertps_8:
2055; AVX512:       ## %bb.0:
2056; AVX512-NEXT:    vinsertps $28, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x1c]
2057; AVX512-NEXT:    ## xmm0 = xmm0[0],xmm1[0],zero,zero
2058; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
2059  %vecext = extractelement <4 x float> %A, i32 0
2060  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
2061  %vecext1 = extractelement <4 x float> %B, i32 0
2062  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
2063  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2
2064  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
2065  ret <4 x float> %vecinit4
2066}
2067
2068define <4 x float> @insertps_9(<4 x float> %A, <4 x float> %B) {
2069; SSE-LABEL: insertps_9:
2070; SSE:       ## %bb.0:
2071; SSE-NEXT:    insertps $25, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x21,0xc8,0x19]
2072; SSE-NEXT:    ## xmm1 = zero,xmm0[0],xmm1[2],zero
2073; SSE-NEXT:    movaps %xmm1, %xmm0 ## encoding: [0x0f,0x28,0xc1]
2074; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
2075;
2076; AVX1-LABEL: insertps_9:
2077; AVX1:       ## %bb.0:
2078; AVX1-NEXT:    vinsertps $25, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x21,0xc0,0x19]
2079; AVX1-NEXT:    ## xmm0 = zero,xmm0[0],xmm1[2],zero
2080; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
2081;
2082; AVX512-LABEL: insertps_9:
2083; AVX512:       ## %bb.0:
2084; AVX512-NEXT:    vinsertps $25, %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xc0,0x19]
2085; AVX512-NEXT:    ## xmm0 = zero,xmm0[0],xmm1[2],zero
2086; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
2087  %vecext = extractelement <4 x float> %A, i32 0
2088  %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
2089  %vecext1 = extractelement <4 x float> %B, i32 2
2090  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2
2091  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
2092  ret <4 x float> %vecinit3
2093}
2094
2095define <4 x float> @insertps_10(<4 x float> %A) {
2096; SSE-LABEL: insertps_10:
2097; SSE:       ## %bb.0:
2098; SSE-NEXT:    insertps $42, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc0,0x2a]
2099; SSE-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[0],zero
2100; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
2101;
2102; AVX1-LABEL: insertps_10:
2103; AVX1:       ## %bb.0:
2104; AVX1-NEXT:    vinsertps $42, %xmm0, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc0,0x2a]
2105; AVX1-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[0],zero
2106; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
2107;
2108; AVX512-LABEL: insertps_10:
2109; AVX512:       ## %bb.0:
2110; AVX512-NEXT:    vinsertps $42, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc0,0x2a]
2111; AVX512-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[0],zero
2112; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
2113  %vecext = extractelement <4 x float> %A, i32 0
2114  %vecbuild1 = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %vecext, i32 0
2115  %vecbuild2 = insertelement <4 x float> %vecbuild1, float %vecext, i32 2
2116  ret <4 x float> %vecbuild2
2117}
2118
2119define <4 x float> @build_vector_to_shuffle_1(<4 x float> %A) {
2120; SSE-LABEL: build_vector_to_shuffle_1:
2121; SSE:       ## %bb.0:
2122; SSE-NEXT:    xorps %xmm1, %xmm1 ## encoding: [0x0f,0x57,0xc9]
2123; SSE-NEXT:    blendps $5, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x05]
2124; SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
2125; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
2126;
2127; AVX1-LABEL: build_vector_to_shuffle_1:
2128; AVX1:       ## %bb.0:
2129; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
2130; AVX1-NEXT:    vblendps $10, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x0a]
2131; AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
2132; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
2133;
2134; AVX512-LABEL: build_vector_to_shuffle_1:
2135; AVX512:       ## %bb.0:
2136; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
2137; AVX512-NEXT:    vblendps $10, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x0a]
2138; AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
2139; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
2140  %vecext = extractelement <4 x float> %A, i32 1
2141  %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1
2142  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2
2143  %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
2144  ret <4 x float> %vecinit3
2145}
2146
2147define <4 x float> @build_vector_to_shuffle_2(<4 x float> %A) {
2148; SSE-LABEL: build_vector_to_shuffle_2:
2149; SSE:       ## %bb.0:
2150; SSE-NEXT:    xorps %xmm1, %xmm1 ## encoding: [0x0f,0x57,0xc9]
2151; SSE-NEXT:    blendps $13, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x0d]
2152; SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
2153; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
2154;
2155; AVX1-LABEL: build_vector_to_shuffle_2:
2156; AVX1:       ## %bb.0:
2157; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
2158; AVX1-NEXT:    vblendps $2, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x02]
2159; AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
2160; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
2161;
2162; AVX512-LABEL: build_vector_to_shuffle_2:
2163; AVX512:       ## %bb.0:
2164; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
2165; AVX512-NEXT:    vblendps $2, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x02]
2166; AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
2167; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
2168  %vecext = extractelement <4 x float> %A, i32 1
2169  %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1
2170  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2
2171  ret <4 x float> %vecinit1
2172}
2173