1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -O3 -verify-machineinstrs -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c < %s | FileCheck %s
3
4target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
5target triple = "x86_64-unknown-unknown"
6
7; Stack reload folding tests.
8;
9; By including a nop call with sideeffects we can force a partial register spill of the
10; relevant registers and check that the reload is correctly folded into the instruction.
11
12define <2 x double> @stack_fold_addpd(<2 x double> %a0, <2 x double> %a1) {
13; CHECK-LABEL: stack_fold_addpd:
14; CHECK:       # %bb.0:
15; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16; CHECK-NEXT:    #APP
17; CHECK-NEXT:    nop
18; CHECK-NEXT:    #NO_APP
19; CHECK-NEXT:    vaddpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
20; CHECK-NEXT:    retq
21  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
22  %2 = fadd <2 x double> %a0, %a1
23  ret <2 x double> %2
24}
25
26define <4 x double> @stack_fold_addpd_ymm(<4 x double> %a0, <4 x double> %a1) {
27; CHECK-LABEL: stack_fold_addpd_ymm:
28; CHECK:       # %bb.0:
29; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
30; CHECK-NEXT:    #APP
31; CHECK-NEXT:    nop
32; CHECK-NEXT:    #NO_APP
33; CHECK-NEXT:    vaddpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
34; CHECK-NEXT:    retq
35  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
36  %2 = fadd <4 x double> %a0, %a1
37  ret <4 x double> %2
38}
39
40define <4 x float> @stack_fold_addps(<4 x float> %a0, <4 x float> %a1) {
41; CHECK-LABEL: stack_fold_addps:
42; CHECK:       # %bb.0:
43; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
44; CHECK-NEXT:    #APP
45; CHECK-NEXT:    nop
46; CHECK-NEXT:    #NO_APP
47; CHECK-NEXT:    vaddps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
48; CHECK-NEXT:    retq
49  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
50  %2 = fadd <4 x float> %a0, %a1
51  ret <4 x float> %2
52}
53
54define <8 x float> @stack_fold_addps_ymm(<8 x float> %a0, <8 x float> %a1) {
55; CHECK-LABEL: stack_fold_addps_ymm:
56; CHECK:       # %bb.0:
57; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
58; CHECK-NEXT:    #APP
59; CHECK-NEXT:    nop
60; CHECK-NEXT:    #NO_APP
61; CHECK-NEXT:    vaddps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
62; CHECK-NEXT:    retq
63  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
64  %2 = fadd <8 x float> %a0, %a1
65  ret <8 x float> %2
66}
67
68define double @stack_fold_addsd(double %a0, double %a1) {
69; CHECK-LABEL: stack_fold_addsd:
70; CHECK:       # %bb.0:
71; CHECK-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
72; CHECK-NEXT:    #APP
73; CHECK-NEXT:    nop
74; CHECK-NEXT:    #NO_APP
75; CHECK-NEXT:    vaddsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
76; CHECK-NEXT:    retq
77  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
78  %2 = fadd double %a0, %a1
79  ret double %2
80}
81
82define <2 x double> @stack_fold_addsd_int(<2 x double> %a0, <2 x double> %a1) {
83; CHECK-LABEL: stack_fold_addsd_int:
84; CHECK:       # %bb.0:
85; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
86; CHECK-NEXT:    #APP
87; CHECK-NEXT:    nop
88; CHECK-NEXT:    #NO_APP
89; CHECK-NEXT:    vaddsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
90; CHECK-NEXT:    retq
91  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
92  %2 = extractelement <2 x double> %a0, i32 0
93  %3 = extractelement <2 x double> %a1, i32 0
94  %4 = fadd double %2, %3
95  %5 = insertelement <2 x double> %a0, double %4, i32 0
96  ret <2 x double> %5
97}
98declare <2 x double> @llvm.x86.sse2.add.sd(<2 x double>, <2 x double>) nounwind readnone
99
100define float @stack_fold_addss(float %a0, float %a1) {
101; CHECK-LABEL: stack_fold_addss:
102; CHECK:       # %bb.0:
103; CHECK-NEXT:    vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
104; CHECK-NEXT:    #APP
105; CHECK-NEXT:    nop
106; CHECK-NEXT:    #NO_APP
107; CHECK-NEXT:    vaddss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
108; CHECK-NEXT:    retq
109  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
110  %2 = fadd float %a0, %a1
111  ret float %2
112}
113
114define <4 x float> @stack_fold_addss_int(<4 x float> %a0, <4 x float> %a1) {
115; CHECK-LABEL: stack_fold_addss_int:
116; CHECK:       # %bb.0:
117; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
118; CHECK-NEXT:    #APP
119; CHECK-NEXT:    nop
120; CHECK-NEXT:    #NO_APP
121; CHECK-NEXT:    vaddss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
122; CHECK-NEXT:    retq
123  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
124  %2 = extractelement <4 x float> %a0, i32 0
125  %3 = extractelement <4 x float> %a1, i32 0
126  %4 = fadd float %2, %3
127  %5 = insertelement <4 x float> %a0, float %4, i32 0
128  ret <4 x float> %5
129}
130declare <4 x float> @llvm.x86.sse.add.ss(<4 x float>, <4 x float>) nounwind readnone
131
132define <2 x double> @stack_fold_addsubpd(<2 x double> %a0, <2 x double> %a1) {
133; CHECK-LABEL: stack_fold_addsubpd:
134; CHECK:       # %bb.0:
135; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
136; CHECK-NEXT:    #APP
137; CHECK-NEXT:    nop
138; CHECK-NEXT:    #NO_APP
139; CHECK-NEXT:    vaddsubpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
140; CHECK-NEXT:    retq
141  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
142  %2 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %a0, <2 x double> %a1)
143  ret <2 x double> %2
144}
145declare <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double>, <2 x double>) nounwind readnone
146
147define <4 x double> @stack_fold_addsubpd_ymm(<4 x double> %a0, <4 x double> %a1) {
148; CHECK-LABEL: stack_fold_addsubpd_ymm:
149; CHECK:       # %bb.0:
150; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
151; CHECK-NEXT:    #APP
152; CHECK-NEXT:    nop
153; CHECK-NEXT:    #NO_APP
154; CHECK-NEXT:    vaddsubpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
155; CHECK-NEXT:    retq
156  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
157  %2 = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1)
158  ret <4 x double> %2
159}
160declare <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double>, <4 x double>) nounwind readnone
161
162define <4 x float> @stack_fold_addsubps(<4 x float> %a0, <4 x float> %a1) {
163; CHECK-LABEL: stack_fold_addsubps:
164; CHECK:       # %bb.0:
165; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
166; CHECK-NEXT:    #APP
167; CHECK-NEXT:    nop
168; CHECK-NEXT:    #NO_APP
169; CHECK-NEXT:    vaddsubps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
170; CHECK-NEXT:    retq
171  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
172  %2 = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %a0, <4 x float> %a1)
173  ret <4 x float> %2
174}
175declare <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float>, <4 x float>) nounwind readnone
176
177define <8 x float> @stack_fold_addsubps_ymm(<8 x float> %a0, <8 x float> %a1) {
178; CHECK-LABEL: stack_fold_addsubps_ymm:
179; CHECK:       # %bb.0:
180; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
181; CHECK-NEXT:    #APP
182; CHECK-NEXT:    nop
183; CHECK-NEXT:    #NO_APP
184; CHECK-NEXT:    vaddsubps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
185; CHECK-NEXT:    retq
186  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
187  %2 = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1)
188  ret <8 x float> %2
189}
190declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwind readnone
191
192define <2 x double> @stack_fold_andnpd(<2 x double> %a0, <2 x double> %a1) {
193; CHECK-LABEL: stack_fold_andnpd:
194; CHECK:       # %bb.0:
195; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
196; CHECK-NEXT:    #APP
197; CHECK-NEXT:    nop
198; CHECK-NEXT:    #NO_APP
199; CHECK-NEXT:    vandnpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
200; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
201; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
202; CHECK-NEXT:    retq
203  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
204  %2 = bitcast <2 x double> %a0 to <2 x i64>
205  %3 = bitcast <2 x double> %a1 to <2 x i64>
206  %4 = xor <2 x i64> %2, <i64 -1, i64 -1>
207  %5 = and <2 x i64> %4, %3
208  %6 = bitcast <2 x i64> %5 to <2 x double>
209  ; fadd forces execution domain
210  %7 = fadd <2 x double> %6, <double 0x0, double 0x0>
211  ret <2 x double> %7
212}
213
214define <4 x double> @stack_fold_andnpd_ymm(<4 x double> %a0, <4 x double> %a1) {
215; CHECK-LABEL: stack_fold_andnpd_ymm:
216; CHECK:       # %bb.0:
217; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
218; CHECK-NEXT:    #APP
219; CHECK-NEXT:    nop
220; CHECK-NEXT:    #NO_APP
221; CHECK-NEXT:    vandnpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
222; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
223; CHECK-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
224; CHECK-NEXT:    retq
225  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
226  %2 = bitcast <4 x double> %a0 to <4 x i64>
227  %3 = bitcast <4 x double> %a1 to <4 x i64>
228  %4 = xor <4 x i64> %2, <i64 -1, i64 -1, i64 -1, i64 -1>
229  %5 = and <4 x i64> %4, %3
230  %6 = bitcast <4 x i64> %5 to <4 x double>
231  ; fadd forces execution domain
232  %7 = fadd <4 x double> %6, <double 0x0, double 0x0, double 0x0, double 0x0>
233  ret <4 x double> %7
234}
235
236define <4 x float> @stack_fold_andnps(<4 x float> %a0, <4 x float> %a1) {
237; CHECK-LABEL: stack_fold_andnps:
238; CHECK:       # %bb.0:
239; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
240; CHECK-NEXT:    #APP
241; CHECK-NEXT:    nop
242; CHECK-NEXT:    #NO_APP
243; CHECK-NEXT:    vandnps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
244; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
245; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
246; CHECK-NEXT:    retq
247  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
248  %2 = bitcast <4 x float> %a0 to <2 x i64>
249  %3 = bitcast <4 x float> %a1 to <2 x i64>
250  %4 = xor <2 x i64> %2, <i64 -1, i64 -1>
251  %5 = and <2 x i64> %4, %3
252  %6 = bitcast <2 x i64> %5 to <4 x float>
253  ; fadd forces execution domain
254  %7 = fadd <4 x float> %6, <float 0x0, float 0x0, float 0x0, float 0x0>
255  ret <4 x float> %7
256}
257
258define <8 x float> @stack_fold_andnps_ymm(<8 x float> %a0, <8 x float> %a1) {
259; CHECK-LABEL: stack_fold_andnps_ymm:
260; CHECK:       # %bb.0:
261; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
262; CHECK-NEXT:    #APP
263; CHECK-NEXT:    nop
264; CHECK-NEXT:    #NO_APP
265; CHECK-NEXT:    vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
266; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
267; CHECK-NEXT:    vaddps %ymm1, %ymm0, %ymm0
268; CHECK-NEXT:    retq
269  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
270  %2 = bitcast <8 x float> %a0 to <4 x i64>
271  %3 = bitcast <8 x float> %a1 to <4 x i64>
272  %4 = xor <4 x i64> %2, <i64 -1, i64 -1, i64 -1, i64 -1>
273  %5 = and <4 x i64> %4, %3
274  %6 = bitcast <4 x i64> %5 to <8 x float>
275  ; fadd forces execution domain
276  %7 = fadd <8 x float> %6, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
277  ret <8 x float> %7
278}
279
280define <2 x double> @stack_fold_andpd(<2 x double> %a0, <2 x double> %a1) {
281; CHECK-LABEL: stack_fold_andpd:
282; CHECK:       # %bb.0:
283; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
284; CHECK-NEXT:    #APP
285; CHECK-NEXT:    nop
286; CHECK-NEXT:    #NO_APP
287; CHECK-NEXT:    vandpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
288; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
289; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
290; CHECK-NEXT:    retq
291  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
292  %2 = bitcast <2 x double> %a0 to <2 x i64>
293  %3 = bitcast <2 x double> %a1 to <2 x i64>
294  %4 = and <2 x i64> %2, %3
295  %5 = bitcast <2 x i64> %4 to <2 x double>
296  ; fadd forces execution domain
297  %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
298  ret <2 x double> %6
299}
300
301define <4 x double> @stack_fold_andpd_ymm(<4 x double> %a0, <4 x double> %a1) {
302; CHECK-LABEL: stack_fold_andpd_ymm:
303; CHECK:       # %bb.0:
304; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
305; CHECK-NEXT:    #APP
306; CHECK-NEXT:    nop
307; CHECK-NEXT:    #NO_APP
308; CHECK-NEXT:    vandpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
309; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
310; CHECK-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
311; CHECK-NEXT:    retq
312  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
313  %2 = bitcast <4 x double> %a0 to <4 x i64>
314  %3 = bitcast <4 x double> %a1 to <4 x i64>
315  %4 = and <4 x i64> %2, %3
316  %5 = bitcast <4 x i64> %4 to <4 x double>
317  ; fadd forces execution domain
318  %6 = fadd <4 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0>
319  ret <4 x double> %6
320}
321
322define <4 x float> @stack_fold_andps(<4 x float> %a0, <4 x float> %a1) {
323; CHECK-LABEL: stack_fold_andps:
324; CHECK:       # %bb.0:
325; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
326; CHECK-NEXT:    #APP
327; CHECK-NEXT:    nop
328; CHECK-NEXT:    #NO_APP
329; CHECK-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
330; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
331; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
332; CHECK-NEXT:    retq
333  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
334  %2 = bitcast <4 x float> %a0 to <2 x i64>
335  %3 = bitcast <4 x float> %a1 to <2 x i64>
336  %4 = and <2 x i64> %2, %3
337  %5 = bitcast <2 x i64> %4 to <4 x float>
338  ; fadd forces execution domain
339  %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
340  ret <4 x float> %6
341}
342
343define <8 x float> @stack_fold_andps_ymm(<8 x float> %a0, <8 x float> %a1) {
344; CHECK-LABEL: stack_fold_andps_ymm:
345; CHECK:       # %bb.0:
346; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
347; CHECK-NEXT:    #APP
348; CHECK-NEXT:    nop
349; CHECK-NEXT:    #NO_APP
350; CHECK-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
351; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
352; CHECK-NEXT:    vaddps %ymm1, %ymm0, %ymm0
353; CHECK-NEXT:    retq
354  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
355  %2 = bitcast <8 x float> %a0 to <4 x i64>
356  %3 = bitcast <8 x float> %a1 to <4 x i64>
357  %4 = and <4 x i64> %2, %3
358  %5 = bitcast <4 x i64> %4 to <8 x float>
359  ; fadd forces execution domain
360  %6 = fadd <8 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
361  ret <8 x float> %6
362}
363
364define <2 x double> @stack_fold_blendpd(<2 x double> %a0, <2 x double> %a1) {
365; CHECK-LABEL: stack_fold_blendpd:
366; CHECK:       # %bb.0:
367; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
368; CHECK-NEXT:    #APP
369; CHECK-NEXT:    nop
370; CHECK-NEXT:    #NO_APP
371; CHECK-NEXT:    vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
372; CHECK-NEXT:    # xmm0 = xmm0[0],mem[1]
373; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
374; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
375; CHECK-NEXT:    retq
376  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
377  %2 = select <2 x i1> <i1 1, i1 0>, <2 x double> %a0, <2 x double> %a1
378  ; fadd forces execution domain
379  %3 = fadd <2 x double> %2, <double 0x0, double 0x0>
380  ret <2 x double> %3
381}
382
383define <4 x double> @stack_fold_blendpd_ymm(<4 x double> %a0, <4 x double> %a1) {
384; CHECK-LABEL: stack_fold_blendpd_ymm:
385; CHECK:       # %bb.0:
386; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
387; CHECK-NEXT:    #APP
388; CHECK-NEXT:    nop
389; CHECK-NEXT:    #NO_APP
390; CHECK-NEXT:    vblendpd $6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
391; CHECK-NEXT:    # ymm0 = ymm0[0],mem[1,2],ymm0[3]
392; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
393; CHECK-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
394; CHECK-NEXT:    retq
395  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
396  %2 = select <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x double> %a0, <4 x double> %a1
397  ; fadd forces execution domain
398  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
399  ret <4 x double> %3}
400
401define <4 x float> @stack_fold_blendps(<4 x float> %a0, <4 x float> %a1) {
402  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
403  %2 = select <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x float> %a0, <4 x float> %a1
404  ; fadd forces execution domain
405  %3 = fadd <4 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0>
406  ret <4 x float> %3
407}
408
409define <8 x float> @stack_fold_blendps_ymm(<8 x float> %a0, <8 x float> %a1) {
410; CHECK-LABEL: stack_fold_blendps_ymm:
411; CHECK:       # %bb.0:
412; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
413; CHECK-NEXT:    #APP
414; CHECK-NEXT:    nop
415; CHECK-NEXT:    #NO_APP
416; CHECK-NEXT:    vblendps $102, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
417; CHECK-NEXT:    # ymm0 = ymm0[0],mem[1,2],ymm0[3,4],mem[5,6],ymm0[7]
418; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
419; CHECK-NEXT:    vaddps %ymm1, %ymm0, %ymm0
420; CHECK-NEXT:    retq
421  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
422  %2 = select <8 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x float> %a0, <8 x float> %a1
423  ; fadd forces execution domain
424  %3 = fadd <8 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
425  ret <8 x float> %3
426}
427
428define <2 x double> @stack_fold_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %c) {
429; CHECK-LABEL: stack_fold_blendvpd:
430; CHECK:       # %bb.0:
431; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
432; CHECK-NEXT:    #APP
433; CHECK-NEXT:    nop
434; CHECK-NEXT:    #NO_APP
435; CHECK-NEXT:    vblendvpd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
436; CHECK-NEXT:    retq
437  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
438  %2 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a1, <2 x double> %c, <2 x double> %a0)
439  ret <2 x double> %2
440}
441declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
442
443define <4 x double> @stack_fold_blendvpd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> %c) {
444; CHECK-LABEL: stack_fold_blendvpd_ymm:
445; CHECK:       # %bb.0:
446; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
447; CHECK-NEXT:    #APP
448; CHECK-NEXT:    nop
449; CHECK-NEXT:    #NO_APP
450; CHECK-NEXT:    vblendvpd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
451; CHECK-NEXT:    retq
452  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
453  %2 = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a1, <4 x double> %c, <4 x double> %a0)
454  ret <4 x double> %2
455}
456declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
457
458define <4 x float> @stack_fold_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %c) {
459; CHECK-LABEL: stack_fold_blendvps:
460; CHECK:       # %bb.0:
461; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
462; CHECK-NEXT:    #APP
463; CHECK-NEXT:    nop
464; CHECK-NEXT:    #NO_APP
465; CHECK-NEXT:    vblendvps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
466; CHECK-NEXT:    retq
467  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
468  %2 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a1, <4 x float> %c, <4 x float> %a0)
469  ret <4 x float> %2
470}
471declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
472
473define <8 x float> @stack_fold_blendvps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %c) {
474; CHECK-LABEL: stack_fold_blendvps_ymm:
475; CHECK:       # %bb.0:
476; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
477; CHECK-NEXT:    #APP
478; CHECK-NEXT:    nop
479; CHECK-NEXT:    #NO_APP
480; CHECK-NEXT:    vblendvps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
481; CHECK-NEXT:    retq
482  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
483  %2 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a1, <8 x float> %c, <8 x float> %a0)
484  ret <8 x float> %2
485}
486declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
487
488define <2 x double> @stack_fold_cmppd(<2 x double> %a0, <2 x double> %a1) {
489; CHECK-LABEL: stack_fold_cmppd:
490; CHECK:       # %bb.0:
491; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
492; CHECK-NEXT:    #APP
493; CHECK-NEXT:    nop
494; CHECK-NEXT:    #NO_APP
495; CHECK-NEXT:    vcmpeqpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
496; CHECK-NEXT:    retq
497  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
498  %2 = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 0)
499  ret <2 x double> %2
500}
501declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounwind readnone
502
503define <4 x double> @stack_fold_cmppd_ymm(<4 x double> %a0, <4 x double> %a1) {
504; CHECK-LABEL: stack_fold_cmppd_ymm:
505; CHECK:       # %bb.0:
506; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
507; CHECK-NEXT:    #APP
508; CHECK-NEXT:    nop
509; CHECK-NEXT:    #NO_APP
510; CHECK-NEXT:    vcmpeqpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
511; CHECK-NEXT:    retq
512  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
513  %2 = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> %a0, <4 x double> %a1, i8 0)
514  ret <4 x double> %2
515}
516declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
517
518define <4 x float> @stack_fold_cmpps(<4 x float> %a0, <4 x float> %a1) {
519; CHECK-LABEL: stack_fold_cmpps:
520; CHECK:       # %bb.0:
521; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
522; CHECK-NEXT:    #APP
523; CHECK-NEXT:    nop
524; CHECK-NEXT:    #NO_APP
525; CHECK-NEXT:    vcmpeqps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
526; CHECK-NEXT:    retq
527  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
528  %2 = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 0)
529  ret <4 x float> %2
530}
531declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone
532
533define <8 x float> @stack_fold_cmpps_ymm(<8 x float> %a0, <8 x float> %a1) {
534; CHECK-LABEL: stack_fold_cmpps_ymm:
535; CHECK:       # %bb.0:
536; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
537; CHECK-NEXT:    #APP
538; CHECK-NEXT:    nop
539; CHECK-NEXT:    #NO_APP
540; CHECK-NEXT:    vcmpeqps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
541; CHECK-NEXT:    retq
542  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
543  %2 = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 0)
544  ret <8 x float> %2
545}
546declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
547
548define i32 @stack_fold_cmpsd(double %a0, double %a1) {
549; CHECK-LABEL: stack_fold_cmpsd:
550; CHECK:       # %bb.0:
551; CHECK-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
552; CHECK-NEXT:    #APP
553; CHECK-NEXT:    nop
554; CHECK-NEXT:    #NO_APP
555; CHECK-NEXT:    vcmpeqsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
556; CHECK-NEXT:    vmovq %xmm0, %rax
557; CHECK-NEXT:    andl $1, %eax
558; CHECK-NEXT:    # kill: def $eax killed $eax killed $rax
559; CHECK-NEXT:    retq
560  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
561  %2 = fcmp oeq double %a0, %a1
562  %3 = zext i1 %2 to i32
563  ret i32 %3
564}
565
566define <2 x double> @stack_fold_cmpsd_int(<2 x double> %a0, <2 x double> %a1) {
567; CHECK-LABEL: stack_fold_cmpsd_int:
568; CHECK:       # %bb.0:
569; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
570; CHECK-NEXT:    #APP
571; CHECK-NEXT:    nop
572; CHECK-NEXT:    #NO_APP
573; CHECK-NEXT:    vcmpeqsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
574; CHECK-NEXT:    retq
575  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
576  %2 = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 0)
577  ret <2 x double> %2
578}
579declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone
580
581define i32 @stack_fold_cmpss(float %a0, float %a1) {
582; CHECK-LABEL: stack_fold_cmpss:
583; CHECK:       # %bb.0:
584; CHECK-NEXT:    vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
585; CHECK-NEXT:    #APP
586; CHECK-NEXT:    nop
587; CHECK-NEXT:    #NO_APP
588; CHECK-NEXT:    vcmpeqss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
589; CHECK-NEXT:    vmovd %xmm0, %eax
590; CHECK-NEXT:    andl $1, %eax
591; CHECK-NEXT:    retq
592  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
593  %2 = fcmp oeq float %a0, %a1
594  %3 = zext i1 %2 to i32
595  ret i32 %3
596}
597
598define <4 x float> @stack_fold_cmpss_int(<4 x float> %a0, <4 x float> %a1) {
599; CHECK-LABEL: stack_fold_cmpss_int:
600; CHECK:       # %bb.0:
601; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
602; CHECK-NEXT:    #APP
603; CHECK-NEXT:    nop
604; CHECK-NEXT:    #NO_APP
605; CHECK-NEXT:    vcmpeqss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
606; CHECK-NEXT:    retq
607  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
608  %2 = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 0)
609  ret <4 x float> %2
610}
611declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone
612
613; TODO stack_fold_comisd
614
615define i32 @stack_fold_comisd_int(<2 x double> %a0, <2 x double> %a1) {
616; CHECK-LABEL: stack_fold_comisd_int:
617; CHECK:       # %bb.0:
618; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
619; CHECK-NEXT:    #APP
620; CHECK-NEXT:    nop
621; CHECK-NEXT:    #NO_APP
622; CHECK-NEXT:    vcomisd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
623; CHECK-NEXT:    setnp %al
624; CHECK-NEXT:    sete %cl
625; CHECK-NEXT:    andb %al, %cl
626; CHECK-NEXT:    movzbl %cl, %eax
627; CHECK-NEXT:    retq
628  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
629  %2 = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1)
630  ret i32 %2
631}
632declare i32 @llvm.x86.sse2.comieq.sd(<2 x double>, <2 x double>) nounwind readnone
633
634; TODO stack_fold_comiss
635
636define i32 @stack_fold_comiss_int(<4 x float> %a0, <4 x float> %a1) {
637; CHECK-LABEL: stack_fold_comiss_int:
638; CHECK:       # %bb.0:
639; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
640; CHECK-NEXT:    #APP
641; CHECK-NEXT:    nop
642; CHECK-NEXT:    #NO_APP
643; CHECK-NEXT:    vcomiss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
644; CHECK-NEXT:    setnp %al
645; CHECK-NEXT:    sete %cl
646; CHECK-NEXT:    andb %al, %cl
647; CHECK-NEXT:    movzbl %cl, %eax
648; CHECK-NEXT:    retq
649  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
650  %2 = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1)
651  ret i32 %2
652}
653declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone
654
655define <2 x double> @stack_fold_cvtdq2pd(<4 x i32> %a0) {
656; CHECK-LABEL: stack_fold_cvtdq2pd:
657; CHECK:       # %bb.0:
658; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
659; CHECK-NEXT:    #APP
660; CHECK-NEXT:    nop
661; CHECK-NEXT:    #NO_APP
662; CHECK-NEXT:    vcvtdq2pd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
663; CHECK-NEXT:    retq
664  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
665  %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
666  %3 = sitofp <2 x i32> %2 to <2 x double>
667  ret <2 x double> %3
668}
669define <2 x double> @stack_fold_cvtdq2pd_int(<4 x i32> %a0) {
670; CHECK-LABEL: stack_fold_cvtdq2pd_int:
671; CHECK:       # %bb.0:
672; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
673; CHECK-NEXT:    #APP
674; CHECK-NEXT:    nop
675; CHECK-NEXT:    #NO_APP
676; CHECK-NEXT:    vcvtdq2pd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
677; CHECK-NEXT:    retq
678  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
679  %2 = shufflevector <4 x i32> %a0, <4 x i32> %a0, <2 x i32> <i32 0, i32 1>
680  %cvt = sitofp <2 x i32> %2 to <2 x double>
681  ret <2 x double> %cvt
682}
683
684define <4 x double> @stack_fold_cvtdq2pd_ymm(<4 x i32> %a0) {
685; CHECK-LABEL: stack_fold_cvtdq2pd_ymm:
686; CHECK:       # %bb.0:
687; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
688; CHECK-NEXT:    #APP
689; CHECK-NEXT:    nop
690; CHECK-NEXT:    #NO_APP
691; CHECK-NEXT:    vcvtdq2pd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
692; CHECK-NEXT:    retq
693  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
694  %2 = sitofp <4 x i32> %a0 to <4 x double>
695  ret <4 x double> %2
696}
697
698define <4 x double> @stack_fold_cvtdq2pd_ymm_int(<4 x i32> %a0) {
699; CHECK-LABEL: stack_fold_cvtdq2pd_ymm_int:
700; CHECK:       # %bb.0:
701; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
702; CHECK-NEXT:    #APP
703; CHECK-NEXT:    nop
704; CHECK-NEXT:    #NO_APP
705; CHECK-NEXT:    vcvtdq2pd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
706; CHECK-NEXT:    retq
707  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
708  %cvt = sitofp <4 x i32> %a0 to <4 x double>
709  ret <4 x double> %cvt
710}
711
712define <4 x float> @stack_fold_cvtdq2ps(<4 x i32> %a0) {
713; CHECK-LABEL: stack_fold_cvtdq2ps:
714; CHECK:       # %bb.0:
715; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
716; CHECK-NEXT:    #APP
717; CHECK-NEXT:    nop
718; CHECK-NEXT:    #NO_APP
719; CHECK-NEXT:    vcvtdq2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
720; CHECK-NEXT:    retq
721  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
722  %2 = sitofp <4 x i32> %a0 to <4 x float>
723  ret <4 x float> %2
724}
725
726define <8 x float> @stack_fold_cvtdq2ps_ymm(<8 x i32> %a0) {
727; CHECK-LABEL: stack_fold_cvtdq2ps_ymm:
728; CHECK:       # %bb.0:
729; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
730; CHECK-NEXT:    #APP
731; CHECK-NEXT:    nop
732; CHECK-NEXT:    #NO_APP
733; CHECK-NEXT:    vcvtdq2ps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
734; CHECK-NEXT:    retq
735  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
736  %2 = sitofp <8 x i32> %a0 to <8 x float>
737  ret <8 x float> %2
738}
739
740define <4 x i32> @stack_fold_cvtpd2dq(<2 x double> %a0) {
741; CHECK-LABEL: stack_fold_cvtpd2dq:
742; CHECK:       # %bb.0:
743; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
744; CHECK-NEXT:    #APP
745; CHECK-NEXT:    nop
746; CHECK-NEXT:    #NO_APP
747; CHECK-NEXT:    vcvtpd2dqx {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
748; CHECK-NEXT:    retq
749  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
750  %2 = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0)
751  ret <4 x i32> %2
752}
753declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>) nounwind readnone
754
755define <4 x i32> @stack_fold_cvtpd2dq_ymm(<4 x double> %a0) {
756; CHECK-LABEL: stack_fold_cvtpd2dq_ymm:
757; CHECK:       # %bb.0:
758; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
759; CHECK-NEXT:    #APP
760; CHECK-NEXT:    nop
761; CHECK-NEXT:    #NO_APP
762; CHECK-NEXT:    vcvtpd2dqy {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Folded Reload
763; CHECK-NEXT:    vzeroupper
764; CHECK-NEXT:    retq
765  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
766  %2 = call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %a0)
767  ret <4 x i32> %2
768}
769declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>) nounwind readnone
770
771define <2 x float> @stack_fold_cvtpd2ps(<2 x double> %a0) {
772; CHECK-LABEL: stack_fold_cvtpd2ps:
773; CHECK:       # %bb.0:
774; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
775; CHECK-NEXT:    #APP
776; CHECK-NEXT:    nop
777; CHECK-NEXT:    #NO_APP
778; CHECK-NEXT:    vcvtpd2psx {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
779; CHECK-NEXT:    retq
780  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
781  %2 = fptrunc <2 x double> %a0 to <2 x float>
782  ret <2 x float> %2
783}
784
785define <4 x float> @stack_fold_cvtpd2ps_ymm(<4 x double> %a0) {
786; CHECK-LABEL: stack_fold_cvtpd2ps_ymm:
787; CHECK:       # %bb.0:
788; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
789; CHECK-NEXT:    #APP
790; CHECK-NEXT:    nop
791; CHECK-NEXT:    #NO_APP
792; CHECK-NEXT:    vcvtpd2psy {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Folded Reload
793; CHECK-NEXT:    vzeroupper
794; CHECK-NEXT:    retq
795  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
796  %2 = fptrunc <4 x double> %a0 to <4 x float>
797  ret <4 x float> %2
798}
799
800define <4 x float> @stack_fold_cvtph2ps(<8 x i16> %a0) {
801; CHECK-LABEL: stack_fold_cvtph2ps:
802; CHECK:       # %bb.0:
803; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
804; CHECK-NEXT:    #APP
805; CHECK-NEXT:    nop
806; CHECK-NEXT:    #NO_APP
807; CHECK-NEXT:    vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
808; CHECK-NEXT:    retq
809  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
810  %2 = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %a0)
811  ret <4 x float> %2
812}
813declare <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>) nounwind readonly
814
815define <8 x float> @stack_fold_cvtph2ps_ymm(<8 x i16> %a0) {
816; CHECK-LABEL: stack_fold_cvtph2ps_ymm:
817; CHECK:       # %bb.0:
818; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
819; CHECK-NEXT:    #APP
820; CHECK-NEXT:    nop
821; CHECK-NEXT:    #NO_APP
822; CHECK-NEXT:    vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
823; CHECK-NEXT:    retq
824  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
825  %2 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %a0)
826  ret <8 x float> %2
827}
828declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readonly
829
830define <4 x i32> @stack_fold_cvtps2dq(<4 x float> %a0) {
831; CHECK-LABEL: stack_fold_cvtps2dq:
832; CHECK:       # %bb.0:
833; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
834; CHECK-NEXT:    #APP
835; CHECK-NEXT:    nop
836; CHECK-NEXT:    #NO_APP
837; CHECK-NEXT:    vcvtps2dq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
838; CHECK-NEXT:    retq
839  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
840  %2 = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0)
841  ret <4 x i32> %2
842}
843declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>) nounwind readnone
844
845define <8 x i32> @stack_fold_cvtps2dq_ymm(<8 x float> %a0) {
846; CHECK-LABEL: stack_fold_cvtps2dq_ymm:
847; CHECK:       # %bb.0:
848; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
849; CHECK-NEXT:    #APP
850; CHECK-NEXT:    nop
851; CHECK-NEXT:    #NO_APP
852; CHECK-NEXT:    vcvtps2dq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
853; CHECK-NEXT:    retq
854  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
855  %2 = call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %a0)
856  ret <8 x i32> %2
857}
858declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>) nounwind readnone
859
860define <2 x double> @stack_fold_cvtps2pd(<4 x float> %a0) {
861; CHECK-LABEL: stack_fold_cvtps2pd:
862; CHECK:       # %bb.0:
863; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
864; CHECK-NEXT:    #APP
865; CHECK-NEXT:    nop
866; CHECK-NEXT:    #NO_APP
867; CHECK-NEXT:    vcvtps2pd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
868; CHECK-NEXT:    retq
869  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
870  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <2 x i32> <i32 0, i32 1>
871  %3 = fpext <2 x float> %2 to <2 x double>
872  ret <2 x double> %3
873}
874
875define <2 x double> @stack_fold_cvtps2pd_int(<4 x float> %a0) {
876; CHECK-LABEL: stack_fold_cvtps2pd_int:
877; CHECK:       # %bb.0:
878; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
879; CHECK-NEXT:    #APP
880; CHECK-NEXT:    nop
881; CHECK-NEXT:    #NO_APP
882; CHECK-NEXT:    vcvtps2pd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
883; CHECK-NEXT:    retq
884  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
885  %2 = shufflevector <4 x float> %a0, <4 x float> %a0, <2 x i32> <i32 0, i32 1>
886  %cvtps2pd = fpext <2 x float> %2 to <2 x double>
887  ret <2 x double> %cvtps2pd
888}
889
890define <4 x double> @stack_fold_cvtps2pd_ymm(<4 x float> %a0) {
891; CHECK-LABEL: stack_fold_cvtps2pd_ymm:
892; CHECK:       # %bb.0:
893; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
894; CHECK-NEXT:    #APP
895; CHECK-NEXT:    nop
896; CHECK-NEXT:    #NO_APP
897; CHECK-NEXT:    vcvtps2pd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
898; CHECK-NEXT:    retq
899  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
900  %2 = fpext <4 x float> %a0 to <4 x double>
901  ret <4 x double> %2
902}
903
904define <4 x double> @stack_fold_cvtps2pd_ymm_int(<4 x float> %a0) {
905; CHECK-LABEL: stack_fold_cvtps2pd_ymm_int:
906; CHECK:       # %bb.0:
907; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
908; CHECK-NEXT:    #APP
909; CHECK-NEXT:    nop
910; CHECK-NEXT:    #NO_APP
911; CHECK-NEXT:    vcvtps2pd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
912; CHECK-NEXT:    retq
913  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
914  %cvtps2pd = fpext <4 x float> %a0 to <4 x double>
915  ret <4 x double> %cvtps2pd
916}
917
918define <8 x i16> @stack_fold_cvtps2ph_ymm(<8 x float> %a0) {
919; CHECK-LABEL: stack_fold_cvtps2ph_ymm:
920; CHECK:       # %bb.0:
921; CHECK-NEXT:    vcvtps2ph $0, %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
922; CHECK-NEXT:    #APP
923; CHECK-NEXT:    nop
924; CHECK-NEXT:    #NO_APP
925; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
926; CHECK-NEXT:    vzeroupper
927; CHECK-NEXT:    retq
928  %1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a0, i32 0)
929  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
930  ret <8 x i16> %1
931}
932declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readonly
933
934; TODO stack_fold_cvtsd2si
935
936define i32 @stack_fold_cvtsd2si_int(<2 x double> %a0) {
937; CHECK-LABEL: stack_fold_cvtsd2si_int:
938; CHECK:       # %bb.0:
939; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
940; CHECK-NEXT:    #APP
941; CHECK-NEXT:    nop
942; CHECK-NEXT:    #NO_APP
943; CHECK-NEXT:    vcvtsd2si {{[-0-9]+}}(%r{{[sb]}}p), %eax # 16-byte Folded Reload
944; CHECK-NEXT:    retq
945  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
946  %2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a0)
947  ret i32 %2
948}
949declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
950
951; TODO stack_fold_cvtsd2si64
952
953define i64 @stack_fold_cvtsd2si64_int(<2 x double> %a0) {
954; CHECK-LABEL: stack_fold_cvtsd2si64_int:
955; CHECK:       # %bb.0:
956; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
957; CHECK-NEXT:    #APP
958; CHECK-NEXT:    nop
959; CHECK-NEXT:    #NO_APP
960; CHECK-NEXT:    vcvtsd2si {{[-0-9]+}}(%r{{[sb]}}p), %rax # 16-byte Folded Reload
961; CHECK-NEXT:    retq
962  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
963  %2 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %a0)
964  ret i64 %2
965}
966declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone
967
968define double @stack_fold_cvtsi2sd(i32 %a0) {
969; CHECK-LABEL: stack_fold_cvtsi2sd:
970; CHECK:       # %bb.0:
971; CHECK-NEXT:    pushq %rbp
972; CHECK-NEXT:    .cfi_def_cfa_offset 16
973; CHECK-NEXT:    pushq %r15
974; CHECK-NEXT:    .cfi_def_cfa_offset 24
975; CHECK-NEXT:    pushq %r14
976; CHECK-NEXT:    .cfi_def_cfa_offset 32
977; CHECK-NEXT:    pushq %r13
978; CHECK-NEXT:    .cfi_def_cfa_offset 40
979; CHECK-NEXT:    pushq %r12
980; CHECK-NEXT:    .cfi_def_cfa_offset 48
981; CHECK-NEXT:    pushq %rbx
982; CHECK-NEXT:    .cfi_def_cfa_offset 56
983; CHECK-NEXT:    .cfi_offset %rbx, -56
984; CHECK-NEXT:    .cfi_offset %r12, -48
985; CHECK-NEXT:    .cfi_offset %r13, -40
986; CHECK-NEXT:    .cfi_offset %r14, -32
987; CHECK-NEXT:    .cfi_offset %r15, -24
988; CHECK-NEXT:    .cfi_offset %rbp, -16
989; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
990; CHECK-NEXT:    #APP
991; CHECK-NEXT:    nop
992; CHECK-NEXT:    #NO_APP
993; CHECK-NEXT:    vcvtsi2sdl {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
994; CHECK-NEXT:    popq %rbx
995; CHECK-NEXT:    .cfi_def_cfa_offset 48
996; CHECK-NEXT:    popq %r12
997; CHECK-NEXT:    .cfi_def_cfa_offset 40
998; CHECK-NEXT:    popq %r13
999; CHECK-NEXT:    .cfi_def_cfa_offset 32
1000; CHECK-NEXT:    popq %r14
1001; CHECK-NEXT:    .cfi_def_cfa_offset 24
1002; CHECK-NEXT:    popq %r15
1003; CHECK-NEXT:    .cfi_def_cfa_offset 16
1004; CHECK-NEXT:    popq %rbp
1005; CHECK-NEXT:    .cfi_def_cfa_offset 8
1006; CHECK-NEXT:    retq
1007  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1008  %2 = sitofp i32 %a0 to double
1009  ret double %2
1010}
1011
1012define <2 x double> @stack_fold_cvtsi2sd_int(i32 %a0) {
1013; CHECK-LABEL: stack_fold_cvtsi2sd_int:
1014; CHECK:       # %bb.0:
1015; CHECK-NEXT:    pushq %rbp
1016; CHECK-NEXT:    .cfi_def_cfa_offset 16
1017; CHECK-NEXT:    pushq %r15
1018; CHECK-NEXT:    .cfi_def_cfa_offset 24
1019; CHECK-NEXT:    pushq %r14
1020; CHECK-NEXT:    .cfi_def_cfa_offset 32
1021; CHECK-NEXT:    pushq %r13
1022; CHECK-NEXT:    .cfi_def_cfa_offset 40
1023; CHECK-NEXT:    pushq %r12
1024; CHECK-NEXT:    .cfi_def_cfa_offset 48
1025; CHECK-NEXT:    pushq %rbx
1026; CHECK-NEXT:    .cfi_def_cfa_offset 56
1027; CHECK-NEXT:    .cfi_offset %rbx, -56
1028; CHECK-NEXT:    .cfi_offset %r12, -48
1029; CHECK-NEXT:    .cfi_offset %r13, -40
1030; CHECK-NEXT:    .cfi_offset %r14, -32
1031; CHECK-NEXT:    .cfi_offset %r15, -24
1032; CHECK-NEXT:    .cfi_offset %rbp, -16
1033; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1034; CHECK-NEXT:    #APP
1035; CHECK-NEXT:    nop
1036; CHECK-NEXT:    #NO_APP
1037; CHECK-NEXT:    vcvtsi2sdl {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
1038; CHECK-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
1039; CHECK-NEXT:    popq %rbx
1040; CHECK-NEXT:    .cfi_def_cfa_offset 48
1041; CHECK-NEXT:    popq %r12
1042; CHECK-NEXT:    .cfi_def_cfa_offset 40
1043; CHECK-NEXT:    popq %r13
1044; CHECK-NEXT:    .cfi_def_cfa_offset 32
1045; CHECK-NEXT:    popq %r14
1046; CHECK-NEXT:    .cfi_def_cfa_offset 24
1047; CHECK-NEXT:    popq %r15
1048; CHECK-NEXT:    .cfi_def_cfa_offset 16
1049; CHECK-NEXT:    popq %rbp
1050; CHECK-NEXT:    .cfi_def_cfa_offset 8
1051; CHECK-NEXT:    retq
1052  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1053  %2 = sitofp i32 %a0 to double
1054  %3 = insertelement <2 x double> zeroinitializer, double %2, i64 0
1055  ret <2 x double> %3
1056}
1057
1058define double @stack_fold_cvtsi642sd(i64 %a0) {
1059; CHECK-LABEL: stack_fold_cvtsi642sd:
1060; CHECK:       # %bb.0:
1061; CHECK-NEXT:    pushq %rbp
1062; CHECK-NEXT:    .cfi_def_cfa_offset 16
1063; CHECK-NEXT:    pushq %r15
1064; CHECK-NEXT:    .cfi_def_cfa_offset 24
1065; CHECK-NEXT:    pushq %r14
1066; CHECK-NEXT:    .cfi_def_cfa_offset 32
1067; CHECK-NEXT:    pushq %r13
1068; CHECK-NEXT:    .cfi_def_cfa_offset 40
1069; CHECK-NEXT:    pushq %r12
1070; CHECK-NEXT:    .cfi_def_cfa_offset 48
1071; CHECK-NEXT:    pushq %rbx
1072; CHECK-NEXT:    .cfi_def_cfa_offset 56
1073; CHECK-NEXT:    .cfi_offset %rbx, -56
1074; CHECK-NEXT:    .cfi_offset %r12, -48
1075; CHECK-NEXT:    .cfi_offset %r13, -40
1076; CHECK-NEXT:    .cfi_offset %r14, -32
1077; CHECK-NEXT:    .cfi_offset %r15, -24
1078; CHECK-NEXT:    .cfi_offset %rbp, -16
1079; CHECK-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1080; CHECK-NEXT:    #APP
1081; CHECK-NEXT:    nop
1082; CHECK-NEXT:    #NO_APP
1083; CHECK-NEXT:    vcvtsi2sdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 8-byte Folded Reload
1084; CHECK-NEXT:    popq %rbx
1085; CHECK-NEXT:    .cfi_def_cfa_offset 48
1086; CHECK-NEXT:    popq %r12
1087; CHECK-NEXT:    .cfi_def_cfa_offset 40
1088; CHECK-NEXT:    popq %r13
1089; CHECK-NEXT:    .cfi_def_cfa_offset 32
1090; CHECK-NEXT:    popq %r14
1091; CHECK-NEXT:    .cfi_def_cfa_offset 24
1092; CHECK-NEXT:    popq %r15
1093; CHECK-NEXT:    .cfi_def_cfa_offset 16
1094; CHECK-NEXT:    popq %rbp
1095; CHECK-NEXT:    .cfi_def_cfa_offset 8
1096; CHECK-NEXT:    retq
1097  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1098  %2 = sitofp i64 %a0 to double
1099  ret double %2
1100}
1101
1102define <2 x double> @stack_fold_cvtsi642sd_int(i64 %a0) {
1103; CHECK-LABEL: stack_fold_cvtsi642sd_int:
1104; CHECK:       # %bb.0:
1105; CHECK-NEXT:    pushq %rbp
1106; CHECK-NEXT:    .cfi_def_cfa_offset 16
1107; CHECK-NEXT:    pushq %r15
1108; CHECK-NEXT:    .cfi_def_cfa_offset 24
1109; CHECK-NEXT:    pushq %r14
1110; CHECK-NEXT:    .cfi_def_cfa_offset 32
1111; CHECK-NEXT:    pushq %r13
1112; CHECK-NEXT:    .cfi_def_cfa_offset 40
1113; CHECK-NEXT:    pushq %r12
1114; CHECK-NEXT:    .cfi_def_cfa_offset 48
1115; CHECK-NEXT:    pushq %rbx
1116; CHECK-NEXT:    .cfi_def_cfa_offset 56
1117; CHECK-NEXT:    .cfi_offset %rbx, -56
1118; CHECK-NEXT:    .cfi_offset %r12, -48
1119; CHECK-NEXT:    .cfi_offset %r13, -40
1120; CHECK-NEXT:    .cfi_offset %r14, -32
1121; CHECK-NEXT:    .cfi_offset %r15, -24
1122; CHECK-NEXT:    .cfi_offset %rbp, -16
1123; CHECK-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1124; CHECK-NEXT:    #APP
1125; CHECK-NEXT:    nop
1126; CHECK-NEXT:    #NO_APP
1127; CHECK-NEXT:    vcvtsi2sdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 8-byte Folded Reload
1128; CHECK-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
1129; CHECK-NEXT:    popq %rbx
1130; CHECK-NEXT:    .cfi_def_cfa_offset 48
1131; CHECK-NEXT:    popq %r12
1132; CHECK-NEXT:    .cfi_def_cfa_offset 40
1133; CHECK-NEXT:    popq %r13
1134; CHECK-NEXT:    .cfi_def_cfa_offset 32
1135; CHECK-NEXT:    popq %r14
1136; CHECK-NEXT:    .cfi_def_cfa_offset 24
1137; CHECK-NEXT:    popq %r15
1138; CHECK-NEXT:    .cfi_def_cfa_offset 16
1139; CHECK-NEXT:    popq %rbp
1140; CHECK-NEXT:    .cfi_def_cfa_offset 8
1141; CHECK-NEXT:    retq
1142  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1143  %2 = sitofp i64 %a0 to double
1144  %3 = insertelement <2 x double> zeroinitializer, double %2, i64 0
1145  ret <2 x double> %3
1146}
1147
1148define float @stack_fold_cvtsi2ss(i32 %a0) {
1149; CHECK-LABEL: stack_fold_cvtsi2ss:
1150; CHECK:       # %bb.0:
1151; CHECK-NEXT:    pushq %rbp
1152; CHECK-NEXT:    .cfi_def_cfa_offset 16
1153; CHECK-NEXT:    pushq %r15
1154; CHECK-NEXT:    .cfi_def_cfa_offset 24
1155; CHECK-NEXT:    pushq %r14
1156; CHECK-NEXT:    .cfi_def_cfa_offset 32
1157; CHECK-NEXT:    pushq %r13
1158; CHECK-NEXT:    .cfi_def_cfa_offset 40
1159; CHECK-NEXT:    pushq %r12
1160; CHECK-NEXT:    .cfi_def_cfa_offset 48
1161; CHECK-NEXT:    pushq %rbx
1162; CHECK-NEXT:    .cfi_def_cfa_offset 56
1163; CHECK-NEXT:    .cfi_offset %rbx, -56
1164; CHECK-NEXT:    .cfi_offset %r12, -48
1165; CHECK-NEXT:    .cfi_offset %r13, -40
1166; CHECK-NEXT:    .cfi_offset %r14, -32
1167; CHECK-NEXT:    .cfi_offset %r15, -24
1168; CHECK-NEXT:    .cfi_offset %rbp, -16
1169; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1170; CHECK-NEXT:    #APP
1171; CHECK-NEXT:    nop
1172; CHECK-NEXT:    #NO_APP
1173; CHECK-NEXT:    vcvtsi2ssl {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
1174; CHECK-NEXT:    popq %rbx
1175; CHECK-NEXT:    .cfi_def_cfa_offset 48
1176; CHECK-NEXT:    popq %r12
1177; CHECK-NEXT:    .cfi_def_cfa_offset 40
1178; CHECK-NEXT:    popq %r13
1179; CHECK-NEXT:    .cfi_def_cfa_offset 32
1180; CHECK-NEXT:    popq %r14
1181; CHECK-NEXT:    .cfi_def_cfa_offset 24
1182; CHECK-NEXT:    popq %r15
1183; CHECK-NEXT:    .cfi_def_cfa_offset 16
1184; CHECK-NEXT:    popq %rbp
1185; CHECK-NEXT:    .cfi_def_cfa_offset 8
1186; CHECK-NEXT:    retq
1187  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1188  %2 = sitofp i32 %a0 to float
1189  ret float %2
1190}
1191
1192define <4 x float> @stack_fold_cvtsi2ss_int(i32 %a0) {
1193; CHECK-LABEL: stack_fold_cvtsi2ss_int:
1194; CHECK:       # %bb.0:
1195; CHECK-NEXT:    pushq %rbp
1196; CHECK-NEXT:    .cfi_def_cfa_offset 16
1197; CHECK-NEXT:    pushq %r15
1198; CHECK-NEXT:    .cfi_def_cfa_offset 24
1199; CHECK-NEXT:    pushq %r14
1200; CHECK-NEXT:    .cfi_def_cfa_offset 32
1201; CHECK-NEXT:    pushq %r13
1202; CHECK-NEXT:    .cfi_def_cfa_offset 40
1203; CHECK-NEXT:    pushq %r12
1204; CHECK-NEXT:    .cfi_def_cfa_offset 48
1205; CHECK-NEXT:    pushq %rbx
1206; CHECK-NEXT:    .cfi_def_cfa_offset 56
1207; CHECK-NEXT:    .cfi_offset %rbx, -56
1208; CHECK-NEXT:    .cfi_offset %r12, -48
1209; CHECK-NEXT:    .cfi_offset %r13, -40
1210; CHECK-NEXT:    .cfi_offset %r14, -32
1211; CHECK-NEXT:    .cfi_offset %r15, -24
1212; CHECK-NEXT:    .cfi_offset %rbp, -16
1213; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1214; CHECK-NEXT:    #APP
1215; CHECK-NEXT:    nop
1216; CHECK-NEXT:    #NO_APP
1217; CHECK-NEXT:    vcvtsi2ssl {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
1218; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1219; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1220; CHECK-NEXT:    popq %rbx
1221; CHECK-NEXT:    .cfi_def_cfa_offset 48
1222; CHECK-NEXT:    popq %r12
1223; CHECK-NEXT:    .cfi_def_cfa_offset 40
1224; CHECK-NEXT:    popq %r13
1225; CHECK-NEXT:    .cfi_def_cfa_offset 32
1226; CHECK-NEXT:    popq %r14
1227; CHECK-NEXT:    .cfi_def_cfa_offset 24
1228; CHECK-NEXT:    popq %r15
1229; CHECK-NEXT:    .cfi_def_cfa_offset 16
1230; CHECK-NEXT:    popq %rbp
1231; CHECK-NEXT:    .cfi_def_cfa_offset 8
1232; CHECK-NEXT:    retq
1233  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1234  %2 = sitofp i32 %a0 to float
1235  %3 = insertelement <4 x float> zeroinitializer, float %2, i64 0
1236  ret <4 x float> %3
1237}
1238
1239define float @stack_fold_cvtsi642ss(i64 %a0) {
1240; CHECK-LABEL: stack_fold_cvtsi642ss:
1241; CHECK:       # %bb.0:
1242; CHECK-NEXT:    pushq %rbp
1243; CHECK-NEXT:    .cfi_def_cfa_offset 16
1244; CHECK-NEXT:    pushq %r15
1245; CHECK-NEXT:    .cfi_def_cfa_offset 24
1246; CHECK-NEXT:    pushq %r14
1247; CHECK-NEXT:    .cfi_def_cfa_offset 32
1248; CHECK-NEXT:    pushq %r13
1249; CHECK-NEXT:    .cfi_def_cfa_offset 40
1250; CHECK-NEXT:    pushq %r12
1251; CHECK-NEXT:    .cfi_def_cfa_offset 48
1252; CHECK-NEXT:    pushq %rbx
1253; CHECK-NEXT:    .cfi_def_cfa_offset 56
1254; CHECK-NEXT:    .cfi_offset %rbx, -56
1255; CHECK-NEXT:    .cfi_offset %r12, -48
1256; CHECK-NEXT:    .cfi_offset %r13, -40
1257; CHECK-NEXT:    .cfi_offset %r14, -32
1258; CHECK-NEXT:    .cfi_offset %r15, -24
1259; CHECK-NEXT:    .cfi_offset %rbp, -16
1260; CHECK-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1261; CHECK-NEXT:    #APP
1262; CHECK-NEXT:    nop
1263; CHECK-NEXT:    #NO_APP
1264; CHECK-NEXT:    vcvtsi2ssq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 8-byte Folded Reload
1265; CHECK-NEXT:    popq %rbx
1266; CHECK-NEXT:    .cfi_def_cfa_offset 48
1267; CHECK-NEXT:    popq %r12
1268; CHECK-NEXT:    .cfi_def_cfa_offset 40
1269; CHECK-NEXT:    popq %r13
1270; CHECK-NEXT:    .cfi_def_cfa_offset 32
1271; CHECK-NEXT:    popq %r14
1272; CHECK-NEXT:    .cfi_def_cfa_offset 24
1273; CHECK-NEXT:    popq %r15
1274; CHECK-NEXT:    .cfi_def_cfa_offset 16
1275; CHECK-NEXT:    popq %rbp
1276; CHECK-NEXT:    .cfi_def_cfa_offset 8
1277; CHECK-NEXT:    retq
1278  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1279  %2 = sitofp i64 %a0 to float
1280  ret float %2
1281}
1282
1283define <4 x float> @stack_fold_cvtsi642ss_int(i64 %a0) {
1284; CHECK-LABEL: stack_fold_cvtsi642ss_int:
1285; CHECK:       # %bb.0:
1286; CHECK-NEXT:    pushq %rbp
1287; CHECK-NEXT:    .cfi_def_cfa_offset 16
1288; CHECK-NEXT:    pushq %r15
1289; CHECK-NEXT:    .cfi_def_cfa_offset 24
1290; CHECK-NEXT:    pushq %r14
1291; CHECK-NEXT:    .cfi_def_cfa_offset 32
1292; CHECK-NEXT:    pushq %r13
1293; CHECK-NEXT:    .cfi_def_cfa_offset 40
1294; CHECK-NEXT:    pushq %r12
1295; CHECK-NEXT:    .cfi_def_cfa_offset 48
1296; CHECK-NEXT:    pushq %rbx
1297; CHECK-NEXT:    .cfi_def_cfa_offset 56
1298; CHECK-NEXT:    .cfi_offset %rbx, -56
1299; CHECK-NEXT:    .cfi_offset %r12, -48
1300; CHECK-NEXT:    .cfi_offset %r13, -40
1301; CHECK-NEXT:    .cfi_offset %r14, -32
1302; CHECK-NEXT:    .cfi_offset %r15, -24
1303; CHECK-NEXT:    .cfi_offset %rbp, -16
1304; CHECK-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1305; CHECK-NEXT:    #APP
1306; CHECK-NEXT:    nop
1307; CHECK-NEXT:    #NO_APP
1308; CHECK-NEXT:    vcvtsi2ssq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 8-byte Folded Reload
1309; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1310; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1311; CHECK-NEXT:    popq %rbx
1312; CHECK-NEXT:    .cfi_def_cfa_offset 48
1313; CHECK-NEXT:    popq %r12
1314; CHECK-NEXT:    .cfi_def_cfa_offset 40
1315; CHECK-NEXT:    popq %r13
1316; CHECK-NEXT:    .cfi_def_cfa_offset 32
1317; CHECK-NEXT:    popq %r14
1318; CHECK-NEXT:    .cfi_def_cfa_offset 24
1319; CHECK-NEXT:    popq %r15
1320; CHECK-NEXT:    .cfi_def_cfa_offset 16
1321; CHECK-NEXT:    popq %rbp
1322; CHECK-NEXT:    .cfi_def_cfa_offset 8
1323; CHECK-NEXT:    retq
1324  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1325  %2 = sitofp i64 %a0 to float
1326  %3 = insertelement <4 x float> zeroinitializer, float %2, i64 0
1327  ret <4 x float> %3
1328}
1329
1330; TODO stack_fold_cvtss2si
1331
1332define i32 @stack_fold_cvtss2si_int(<4 x float> %a0) {
1333; CHECK-LABEL: stack_fold_cvtss2si_int:
1334; CHECK:       # %bb.0:
1335; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1336; CHECK-NEXT:    #APP
1337; CHECK-NEXT:    nop
1338; CHECK-NEXT:    #NO_APP
1339; CHECK-NEXT:    vcvtss2si {{[-0-9]+}}(%r{{[sb]}}p), %eax # 16-byte Folded Reload
1340; CHECK-NEXT:    retq
1341  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1342  %2 = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
1343  ret i32 %2
1344}
1345declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone
1346
1347; TODO stack_fold_cvtss2si64
1348
1349define i64 @stack_fold_cvtss2si64_int(<4 x float> %a0) {
1350; CHECK-LABEL: stack_fold_cvtss2si64_int:
1351; CHECK:       # %bb.0:
1352; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1353; CHECK-NEXT:    #APP
1354; CHECK-NEXT:    nop
1355; CHECK-NEXT:    #NO_APP
1356; CHECK-NEXT:    vcvtss2si {{[-0-9]+}}(%r{{[sb]}}p), %rax # 16-byte Folded Reload
1357; CHECK-NEXT:    retq
1358  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1359  %2 = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0)
1360  ret i64 %2
1361}
1362declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
1363
1364define <4 x i32> @stack_fold_cvttpd2dq(<2 x double> %a0) {
1365; CHECK-LABEL: stack_fold_cvttpd2dq:
1366; CHECK:       # %bb.0:
1367; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1368; CHECK-NEXT:    #APP
1369; CHECK-NEXT:    nop
1370; CHECK-NEXT:    #NO_APP
1371; CHECK-NEXT:    vcvttpd2dqx {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1372; CHECK-NEXT:    retq
1373  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1374  %2 = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0)
1375  ret <4 x i32> %2
1376}
1377declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone
1378
1379define <4 x i32> @stack_fold_cvttpd2dq_ymm(<4 x double> %a0) {
1380; CHECK-LABEL: stack_fold_cvttpd2dq_ymm:
1381; CHECK:       # %bb.0:
1382; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1383; CHECK-NEXT:    #APP
1384; CHECK-NEXT:    nop
1385; CHECK-NEXT:    #NO_APP
1386; CHECK-NEXT:    vcvttpd2dqy {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Folded Reload
1387; CHECK-NEXT:    vzeroupper
1388; CHECK-NEXT:    retq
1389  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1390  %2 = fptosi <4 x double> %a0 to <4 x i32>
1391  ret <4 x i32> %2
1392}
1393
1394define <4 x i32> @stack_fold_cvttps2dq(<4 x float> %a0) {
1395; CHECK-LABEL: stack_fold_cvttps2dq:
1396; CHECK:       # %bb.0:
1397; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1398; CHECK-NEXT:    #APP
1399; CHECK-NEXT:    nop
1400; CHECK-NEXT:    #NO_APP
1401; CHECK-NEXT:    vcvttps2dq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1402; CHECK-NEXT:    retq
1403  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1404  %2 = fptosi <4 x float> %a0 to <4 x i32>
1405  ret <4 x i32> %2
1406}
1407
1408define <8 x i32> @stack_fold_cvttps2dq_ymm(<8 x float> %a0) {
1409; CHECK-LABEL: stack_fold_cvttps2dq_ymm:
1410; CHECK:       # %bb.0:
1411; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1412; CHECK-NEXT:    #APP
1413; CHECK-NEXT:    nop
1414; CHECK-NEXT:    #NO_APP
1415; CHECK-NEXT:    vcvttps2dq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
1416; CHECK-NEXT:    retq
1417  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1418  %2 = fptosi <8 x float> %a0 to <8 x i32>
1419  ret <8 x i32> %2
1420}
1421
1422define i32 @stack_fold_cvttsd2si(double %a0) {
1423; CHECK-LABEL: stack_fold_cvttsd2si:
1424; CHECK:       # %bb.0:
1425; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1426; CHECK-NEXT:    #APP
1427; CHECK-NEXT:    nop
1428; CHECK-NEXT:    #NO_APP
1429; CHECK-NEXT:    vcvttsd2si {{[-0-9]+}}(%r{{[sb]}}p), %eax # 8-byte Folded Reload
1430; CHECK-NEXT:    retq
1431  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1432  %2 = fptosi double %a0 to i32
1433  ret i32 %2
1434}
1435
1436define i32 @stack_fold_cvttsd2si_int(<2 x double> %a0) {
1437; CHECK-LABEL: stack_fold_cvttsd2si_int:
1438; CHECK:       # %bb.0:
1439; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1440; CHECK-NEXT:    #APP
1441; CHECK-NEXT:    nop
1442; CHECK-NEXT:    #NO_APP
1443; CHECK-NEXT:    vcvttsd2si {{[-0-9]+}}(%r{{[sb]}}p), %eax # 16-byte Folded Reload
1444; CHECK-NEXT:    retq
1445  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1446  %2 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %a0)
1447  ret i32 %2
1448}
1449declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone
1450
1451define i64 @stack_fold_cvttsd2si64(double %a0) {
1452; CHECK-LABEL: stack_fold_cvttsd2si64:
1453; CHECK:       # %bb.0:
1454; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1455; CHECK-NEXT:    #APP
1456; CHECK-NEXT:    nop
1457; CHECK-NEXT:    #NO_APP
1458; CHECK-NEXT:    vcvttsd2si {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
1459; CHECK-NEXT:    retq
1460  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1461  %2 = fptosi double %a0 to i64
1462  ret i64 %2
1463}
1464
1465define i64 @stack_fold_cvttsd2si64_int(<2 x double> %a0) {
1466; CHECK-LABEL: stack_fold_cvttsd2si64_int:
1467; CHECK:       # %bb.0:
1468; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1469; CHECK-NEXT:    #APP
1470; CHECK-NEXT:    nop
1471; CHECK-NEXT:    #NO_APP
1472; CHECK-NEXT:    vcvttsd2si {{[-0-9]+}}(%r{{[sb]}}p), %rax # 16-byte Folded Reload
1473; CHECK-NEXT:    retq
1474  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1475  %2 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %a0)
1476  ret i64 %2
1477}
1478declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone
1479
1480define i32 @stack_fold_cvttss2si(float %a0) {
1481; CHECK-LABEL: stack_fold_cvttss2si:
1482; CHECK:       # %bb.0:
1483; CHECK-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1484; CHECK-NEXT:    #APP
1485; CHECK-NEXT:    nop
1486; CHECK-NEXT:    #NO_APP
1487; CHECK-NEXT:    vcvttss2si {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
1488; CHECK-NEXT:    retq
1489  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1490  %2 = fptosi float %a0 to i32
1491  ret i32 %2
1492}
1493
1494define i32 @stack_fold_cvttss2si_int(<4 x float> %a0) {
1495; CHECK-LABEL: stack_fold_cvttss2si_int:
1496; CHECK:       # %bb.0:
1497; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1498; CHECK-NEXT:    #APP
1499; CHECK-NEXT:    nop
1500; CHECK-NEXT:    #NO_APP
1501; CHECK-NEXT:    vcvttss2si {{[-0-9]+}}(%r{{[sb]}}p), %eax # 16-byte Folded Reload
1502; CHECK-NEXT:    retq
1503  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1504  %2 = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
1505  ret i32 %2
1506}
1507declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
1508
1509define i64 @stack_fold_cvttss2si64(float %a0) {
1510; CHECK-LABEL: stack_fold_cvttss2si64:
1511; CHECK:       # %bb.0:
1512; CHECK-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1513; CHECK-NEXT:    #APP
1514; CHECK-NEXT:    nop
1515; CHECK-NEXT:    #NO_APP
1516; CHECK-NEXT:    vcvttss2si {{[-0-9]+}}(%r{{[sb]}}p), %rax # 4-byte Folded Reload
1517; CHECK-NEXT:    retq
1518  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1519  %2 = fptosi float %a0 to i64
1520  ret i64 %2
1521}
1522
1523define i64 @stack_fold_cvttss2si64_int(<4 x float> %a0) {
1524; CHECK-LABEL: stack_fold_cvttss2si64_int:
1525; CHECK:       # %bb.0:
1526; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1527; CHECK-NEXT:    #APP
1528; CHECK-NEXT:    nop
1529; CHECK-NEXT:    #NO_APP
1530; CHECK-NEXT:    vcvttss2si {{[-0-9]+}}(%r{{[sb]}}p), %rax # 16-byte Folded Reload
1531; CHECK-NEXT:    retq
1532  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1533  %2 = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %a0)
1534  ret i64 %2
1535}
1536declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone
1537
1538define <2 x double> @stack_fold_divpd(<2 x double> %a0, <2 x double> %a1) {
1539; CHECK-LABEL: stack_fold_divpd:
1540; CHECK:       # %bb.0:
1541; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1542; CHECK-NEXT:    #APP
1543; CHECK-NEXT:    nop
1544; CHECK-NEXT:    #NO_APP
1545; CHECK-NEXT:    vdivpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1546; CHECK-NEXT:    retq
1547  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1548  %2 = fdiv <2 x double> %a0, %a1
1549  ret <2 x double> %2
1550}
1551
1552define <4 x double> @stack_fold_divpd_ymm(<4 x double> %a0, <4 x double> %a1) {
1553; CHECK-LABEL: stack_fold_divpd_ymm:
1554; CHECK:       # %bb.0:
1555; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1556; CHECK-NEXT:    #APP
1557; CHECK-NEXT:    nop
1558; CHECK-NEXT:    #NO_APP
1559; CHECK-NEXT:    vdivpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1560; CHECK-NEXT:    retq
1561  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1562  %2 = fdiv <4 x double> %a0, %a1
1563  ret <4 x double> %2
1564}
1565
1566define <4 x float> @stack_fold_divps(<4 x float> %a0, <4 x float> %a1) {
1567; CHECK-LABEL: stack_fold_divps:
1568; CHECK:       # %bb.0:
1569; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1570; CHECK-NEXT:    #APP
1571; CHECK-NEXT:    nop
1572; CHECK-NEXT:    #NO_APP
1573; CHECK-NEXT:    vdivps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1574; CHECK-NEXT:    retq
1575  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1576  %2 = fdiv <4 x float> %a0, %a1
1577  ret <4 x float> %2
1578}
1579
1580define <8 x float> @stack_fold_divps_ymm(<8 x float> %a0, <8 x float> %a1) {
1581; CHECK-LABEL: stack_fold_divps_ymm:
1582; CHECK:       # %bb.0:
1583; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1584; CHECK-NEXT:    #APP
1585; CHECK-NEXT:    nop
1586; CHECK-NEXT:    #NO_APP
1587; CHECK-NEXT:    vdivps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1588; CHECK-NEXT:    retq
1589  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1590  %2 = fdiv <8 x float> %a0, %a1
1591  ret <8 x float> %2
1592}
1593
1594define double @stack_fold_divsd(double %a0, double %a1) {
1595; CHECK-LABEL: stack_fold_divsd:
1596; CHECK:       # %bb.0:
1597; CHECK-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1598; CHECK-NEXT:    #APP
1599; CHECK-NEXT:    nop
1600; CHECK-NEXT:    #NO_APP
1601; CHECK-NEXT:    vdivsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
1602; CHECK-NEXT:    retq
1603  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1604  %2 = fdiv double %a0, %a1
1605  ret double %2
1606}
1607
1608define <2 x double> @stack_fold_divsd_int(<2 x double> %a0, <2 x double> %a1) {
1609; CHECK-LABEL: stack_fold_divsd_int:
1610; CHECK:       # %bb.0:
1611; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1612; CHECK-NEXT:    #APP
1613; CHECK-NEXT:    nop
1614; CHECK-NEXT:    #NO_APP
1615; CHECK-NEXT:    vdivsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1616; CHECK-NEXT:    retq
1617  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1618  %2 = extractelement <2 x double> %a0, i32 0
1619  %3 = extractelement <2 x double> %a1, i32 0
1620  %4 = fdiv double %2, %3
1621  %5 = insertelement <2 x double> %a0, double %4, i32 0
1622  ret <2 x double> %5
1623}
1624
1625define float @stack_fold_divss(float %a0, float %a1) {
1626; CHECK-LABEL: stack_fold_divss:
1627; CHECK:       # %bb.0:
1628; CHECK-NEXT:    vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1629; CHECK-NEXT:    #APP
1630; CHECK-NEXT:    nop
1631; CHECK-NEXT:    #NO_APP
1632; CHECK-NEXT:    vdivss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
1633; CHECK-NEXT:    retq
1634  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1635  %2 = fdiv float %a0, %a1
1636  ret float %2
1637}
1638
1639define <4 x float> @stack_fold_divss_int(<4 x float> %a0, <4 x float> %a1) {
1640; CHECK-LABEL: stack_fold_divss_int:
1641; CHECK:       # %bb.0:
1642; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1643; CHECK-NEXT:    #APP
1644; CHECK-NEXT:    nop
1645; CHECK-NEXT:    #NO_APP
1646; CHECK-NEXT:    vdivss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1647; CHECK-NEXT:    retq
1648  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1649  %2 = extractelement <4 x float> %a0, i32 0
1650  %3 = extractelement <4 x float> %a1, i32 0
1651  %4 = fdiv float %2, %3
1652  %5 = insertelement <4 x float> %a0, float %4, i32 0
1653  ret <4 x float> %5
1654}
1655
1656define <2 x double> @stack_fold_dppd(<2 x double> %a0, <2 x double> %a1) {
1657; CHECK-LABEL: stack_fold_dppd:
1658; CHECK:       # %bb.0:
1659; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1660; CHECK-NEXT:    #APP
1661; CHECK-NEXT:    nop
1662; CHECK-NEXT:    #NO_APP
1663; CHECK-NEXT:    vdppd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1664; CHECK-NEXT:    retq
1665  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1666  %2 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7)
1667  ret <2 x double> %2
1668}
1669declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone
1670
1671define <4 x float> @stack_fold_dpps(<4 x float> %a0, <4 x float> %a1) {
1672; CHECK-LABEL: stack_fold_dpps:
1673; CHECK:       # %bb.0:
1674; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1675; CHECK-NEXT:    #APP
1676; CHECK-NEXT:    nop
1677; CHECK-NEXT:    #NO_APP
1678; CHECK-NEXT:    vdpps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1679; CHECK-NEXT:    retq
1680  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1681  %2 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7)
1682  ret <4 x float> %2
1683}
1684declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone
1685
1686define <8 x float> @stack_fold_dpps_ymm(<8 x float> %a0, <8 x float> %a1) {
1687; CHECK-LABEL: stack_fold_dpps_ymm:
1688; CHECK:       # %bb.0:
1689; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1690; CHECK-NEXT:    #APP
1691; CHECK-NEXT:    nop
1692; CHECK-NEXT:    #NO_APP
1693; CHECK-NEXT:    vdpps $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1694; CHECK-NEXT:    retq
1695  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1696  %2 = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7)
1697  ret <8 x float> %2
1698}
1699declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
1700
1701define <4 x float> @stack_fold_extractf128(<8 x float> %a0, <8 x float> %a1) {
1702; CHECK-LABEL: stack_fold_extractf128:
1703; CHECK:       # %bb.0:
1704; CHECK-NEXT:    vextractf128 $1, %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
1705; CHECK-NEXT:    #APP
1706; CHECK-NEXT:    nop
1707; CHECK-NEXT:    #NO_APP
1708; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1709; CHECK-NEXT:    vzeroupper
1710; CHECK-NEXT:    retq
1711  %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1712  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1713  ret <4 x float> %1
1714}
1715
1716define i32 @stack_fold_extractps(<4 x float> %a0, <4 x float> %a1) {
1717; CHECK-LABEL: stack_fold_extractps:
1718; CHECK:       # %bb.0:
1719; CHECK-NEXT:    pushq %rbp
1720; CHECK-NEXT:    .cfi_def_cfa_offset 16
1721; CHECK-NEXT:    pushq %r15
1722; CHECK-NEXT:    .cfi_def_cfa_offset 24
1723; CHECK-NEXT:    pushq %r14
1724; CHECK-NEXT:    .cfi_def_cfa_offset 32
1725; CHECK-NEXT:    pushq %r13
1726; CHECK-NEXT:    .cfi_def_cfa_offset 40
1727; CHECK-NEXT:    pushq %r12
1728; CHECK-NEXT:    .cfi_def_cfa_offset 48
1729; CHECK-NEXT:    pushq %rbx
1730; CHECK-NEXT:    .cfi_def_cfa_offset 56
1731; CHECK-NEXT:    .cfi_offset %rbx, -56
1732; CHECK-NEXT:    .cfi_offset %r12, -48
1733; CHECK-NEXT:    .cfi_offset %r13, -40
1734; CHECK-NEXT:    .cfi_offset %r14, -32
1735; CHECK-NEXT:    .cfi_offset %r15, -24
1736; CHECK-NEXT:    .cfi_offset %rbp, -16
1737; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
1738; CHECK-NEXT:    vextractps $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
1739; CHECK-NEXT:    #APP
1740; CHECK-NEXT:    nop
1741; CHECK-NEXT:    #NO_APP
1742; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
1743; CHECK-NEXT:    popq %rbx
1744; CHECK-NEXT:    .cfi_def_cfa_offset 48
1745; CHECK-NEXT:    popq %r12
1746; CHECK-NEXT:    .cfi_def_cfa_offset 40
1747; CHECK-NEXT:    popq %r13
1748; CHECK-NEXT:    .cfi_def_cfa_offset 32
1749; CHECK-NEXT:    popq %r14
1750; CHECK-NEXT:    .cfi_def_cfa_offset 24
1751; CHECK-NEXT:    popq %r15
1752; CHECK-NEXT:    .cfi_def_cfa_offset 16
1753; CHECK-NEXT:    popq %rbp
1754; CHECK-NEXT:    .cfi_def_cfa_offset 8
1755; CHECK-NEXT:    retq
1756  ; fadd forces execution domain
1757  %1 = fadd <4 x float> %a0, %a1
1758  %2 = extractelement <4 x float> %1, i32 1
1759  %3 = bitcast float %2 to i32
1760  %4 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1761  ret i32 %3
1762}
1763
1764define <2 x double> @stack_fold_haddpd(<2 x double> %a0, <2 x double> %a1) {
1765; CHECK-LABEL: stack_fold_haddpd:
1766; CHECK:       # %bb.0:
1767; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1768; CHECK-NEXT:    #APP
1769; CHECK-NEXT:    nop
1770; CHECK-NEXT:    #NO_APP
1771; CHECK-NEXT:    vhaddpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1772; CHECK-NEXT:    retq
1773  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1774  %2 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1)
1775  ret <2 x double> %2
1776}
1777declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>) nounwind readnone
1778
1779define <4 x double> @stack_fold_haddpd_ymm(<4 x double> %a0, <4 x double> %a1) {
1780; CHECK-LABEL: stack_fold_haddpd_ymm:
1781; CHECK:       # %bb.0:
1782; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1783; CHECK-NEXT:    #APP
1784; CHECK-NEXT:    nop
1785; CHECK-NEXT:    #NO_APP
1786; CHECK-NEXT:    vhaddpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1787; CHECK-NEXT:    retq
1788  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1789  %2 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1)
1790  ret <4 x double> %2
1791}
1792declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
1793
1794define <4 x float> @stack_fold_haddps(<4 x float> %a0, <4 x float> %a1) {
1795; CHECK-LABEL: stack_fold_haddps:
1796; CHECK:       # %bb.0:
1797; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1798; CHECK-NEXT:    #APP
1799; CHECK-NEXT:    nop
1800; CHECK-NEXT:    #NO_APP
1801; CHECK-NEXT:    vhaddps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1802; CHECK-NEXT:    retq
1803  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1804  %2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1)
1805  ret <4 x float> %2
1806}
1807declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
1808
1809define <8 x float> @stack_fold_haddps_ymm(<8 x float> %a0, <8 x float> %a1) {
1810; CHECK-LABEL: stack_fold_haddps_ymm:
1811; CHECK:       # %bb.0:
1812; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1813; CHECK-NEXT:    #APP
1814; CHECK-NEXT:    nop
1815; CHECK-NEXT:    #NO_APP
1816; CHECK-NEXT:    vhaddps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1817; CHECK-NEXT:    retq
1818  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1819  %2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1)
1820  ret <8 x float> %2
1821}
1822declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone
1823
1824define <2 x double> @stack_fold_hsubpd(<2 x double> %a0, <2 x double> %a1) {
1825; CHECK-LABEL: stack_fold_hsubpd:
1826; CHECK:       # %bb.0:
1827; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1828; CHECK-NEXT:    #APP
1829; CHECK-NEXT:    nop
1830; CHECK-NEXT:    #NO_APP
1831; CHECK-NEXT:    vhsubpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1832; CHECK-NEXT:    retq
1833  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1834  %2 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1)
1835  ret <2 x double> %2
1836}
1837declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>) nounwind readnone
1838
1839define <4 x double> @stack_fold_hsubpd_ymm(<4 x double> %a0, <4 x double> %a1) {
1840; CHECK-LABEL: stack_fold_hsubpd_ymm:
1841; CHECK:       # %bb.0:
1842; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1843; CHECK-NEXT:    #APP
1844; CHECK-NEXT:    nop
1845; CHECK-NEXT:    #NO_APP
1846; CHECK-NEXT:    vhsubpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1847; CHECK-NEXT:    retq
1848  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1849  %2 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1)
1850  ret <4 x double> %2
1851}
1852declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>) nounwind readnone
1853
1854define <4 x float> @stack_fold_hsubps(<4 x float> %a0, <4 x float> %a1) {
1855; CHECK-LABEL: stack_fold_hsubps:
1856; CHECK:       # %bb.0:
1857; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1858; CHECK-NEXT:    #APP
1859; CHECK-NEXT:    nop
1860; CHECK-NEXT:    #NO_APP
1861; CHECK-NEXT:    vhsubps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1862; CHECK-NEXT:    retq
1863  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1864  %2 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1)
1865  ret <4 x float> %2
1866}
1867declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>) nounwind readnone
1868
1869define <8 x float> @stack_fold_hsubps_ymm(<8 x float> %a0, <8 x float> %a1) {
1870; CHECK-LABEL: stack_fold_hsubps_ymm:
1871; CHECK:       # %bb.0:
1872; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1873; CHECK-NEXT:    #APP
1874; CHECK-NEXT:    nop
1875; CHECK-NEXT:    #NO_APP
1876; CHECK-NEXT:    vhsubps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1877; CHECK-NEXT:    retq
1878  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1879  %2 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1)
1880  ret <8 x float> %2
1881}
1882declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>) nounwind readnone
1883
1884define <8 x float> @stack_fold_insertf128(<4 x float> %a0, <4 x float> %a1) {
1885; CHECK-LABEL: stack_fold_insertf128:
1886; CHECK:       # %bb.0:
1887; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1888; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1889; CHECK-NEXT:    #APP
1890; CHECK-NEXT:    nop
1891; CHECK-NEXT:    #NO_APP
1892; CHECK-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
1893; CHECK-NEXT:    retq
1894  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1895  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1896  ret <8 x float> %2
1897}
1898
1899define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) {
1900; CHECK-LABEL: stack_fold_insertps:
1901; CHECK:       # %bb.0:
1902; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1903; CHECK-NEXT:    #APP
1904; CHECK-NEXT:    nop
1905; CHECK-NEXT:    #NO_APP
1906; CHECK-NEXT:    vinsertps $17, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1907; CHECK-NEXT:    # xmm0 = zero,mem[0],xmm0[2,3]
1908; CHECK-NEXT:    retq
1909  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1910  %2 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 209)
1911  ret <4 x float> %2
1912}
1913declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
1914
1915define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) #0 {
1916; CHECK-LABEL: stack_fold_maxpd:
1917; CHECK:       # %bb.0:
1918; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1919; CHECK-NEXT:    #APP
1920; CHECK-NEXT:    nop
1921; CHECK-NEXT:    #NO_APP
1922; CHECK-NEXT:    vmaxpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1923; CHECK-NEXT:    retq
1924  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1925  %2 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1)
1926  ret <2 x double> %2
1927}
1928declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
1929
1930define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double> %a1) #1 {
1931; CHECK-LABEL: stack_fold_maxpd_commutable:
1932; CHECK:       # %bb.0:
1933; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1934; CHECK-NEXT:    #APP
1935; CHECK-NEXT:    nop
1936; CHECK-NEXT:    #NO_APP
1937; CHECK-NEXT:    vmaxpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1938; CHECK-NEXT:    retq
1939  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1940  %2 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1)
1941  ret <2 x double> %2
1942}
1943
1944define <4 x double> @stack_fold_maxpd_ymm(<4 x double> %a0, <4 x double> %a1) #0 {
1945; CHECK-LABEL: stack_fold_maxpd_ymm:
1946; CHECK:       # %bb.0:
1947; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1948; CHECK-NEXT:    #APP
1949; CHECK-NEXT:    nop
1950; CHECK-NEXT:    #NO_APP
1951; CHECK-NEXT:    vmaxpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1952; CHECK-NEXT:    retq
1953  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1954  %2 = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1)
1955  ret <4 x double> %2
1956}
1957declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
1958
1959define <4 x double> @stack_fold_maxpd_ymm_commutable(<4 x double> %a0, <4 x double> %a1) #1 {
1960; CHECK-LABEL: stack_fold_maxpd_ymm_commutable:
1961; CHECK:       # %bb.0:
1962; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1963; CHECK-NEXT:    #APP
1964; CHECK-NEXT:    nop
1965; CHECK-NEXT:    #NO_APP
1966; CHECK-NEXT:    vmaxpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1967; CHECK-NEXT:    retq
1968  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1969  %2 = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1)
1970  ret <4 x double> %2
1971}
1972
1973define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) #0 {
1974; CHECK-LABEL: stack_fold_maxps:
1975; CHECK:       # %bb.0:
1976; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1977; CHECK-NEXT:    #APP
1978; CHECK-NEXT:    nop
1979; CHECK-NEXT:    #NO_APP
1980; CHECK-NEXT:    vmaxps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1981; CHECK-NEXT:    retq
1982  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1983  %2 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
1984  ret <4 x float> %2
1985}
1986declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
1987
1988define <4 x float> @stack_fold_maxps_commutable(<4 x float> %a0, <4 x float> %a1) #1 {
1989; CHECK-LABEL: stack_fold_maxps_commutable:
1990; CHECK:       # %bb.0:
1991; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1992; CHECK-NEXT:    #APP
1993; CHECK-NEXT:    nop
1994; CHECK-NEXT:    #NO_APP
1995; CHECK-NEXT:    vmaxps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1996; CHECK-NEXT:    retq
1997  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1998  %2 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
1999  ret <4 x float> %2
2000}
2001
2002define <8 x float> @stack_fold_maxps_ymm(<8 x float> %a0, <8 x float> %a1) #0 {
2003; CHECK-LABEL: stack_fold_maxps_ymm:
2004; CHECK:       # %bb.0:
2005; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2006; CHECK-NEXT:    #APP
2007; CHECK-NEXT:    nop
2008; CHECK-NEXT:    #NO_APP
2009; CHECK-NEXT:    vmaxps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2010; CHECK-NEXT:    retq
2011  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2012  %2 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1)
2013  ret <8 x float> %2
2014}
2015declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
2016
2017define <8 x float> @stack_fold_maxps_ymm_commutable(<8 x float> %a0, <8 x float> %a1) #1 {
2018; CHECK-LABEL: stack_fold_maxps_ymm_commutable:
2019; CHECK:       # %bb.0:
2020; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2021; CHECK-NEXT:    #APP
2022; CHECK-NEXT:    nop
2023; CHECK-NEXT:    #NO_APP
2024; CHECK-NEXT:    vmaxps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2025; CHECK-NEXT:    retq
2026  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2027  %2 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1)
2028  ret <8 x float> %2
2029}
2030
2031define double @stack_fold_maxsd(double %a0, double %a1) #0 {
2032; CHECK-LABEL: stack_fold_maxsd:
2033; CHECK:       # %bb.0:
2034; CHECK-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2035; CHECK-NEXT:    #APP
2036; CHECK-NEXT:    nop
2037; CHECK-NEXT:    #NO_APP
2038; CHECK-NEXT:    vmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
2039; CHECK-NEXT:    retq
2040  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2041  %2 = fcmp ogt double %a0, %a1
2042  %3 = select i1 %2, double %a0, double %a1
2043  ret double %3
2044}
2045
2046define double @stack_fold_maxsd_commutable(double %a0, double %a1) #1 {
2047; CHECK-LABEL: stack_fold_maxsd_commutable:
2048; CHECK:       # %bb.0:
2049; CHECK-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2050; CHECK-NEXT:    #APP
2051; CHECK-NEXT:    nop
2052; CHECK-NEXT:    #NO_APP
2053; CHECK-NEXT:    vmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
2054; CHECK-NEXT:    retq
2055  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2056  %2 = fcmp ogt double %a0, %a1
2057  %3 = select i1 %2, double %a0, double %a1
2058  ret double %3
2059}
2060
2061define <2 x double> @stack_fold_maxsd_int(<2 x double> %a0, <2 x double> %a1) #0 {
2062; CHECK-LABEL: stack_fold_maxsd_int:
2063; CHECK:       # %bb.0:
2064; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2065; CHECK-NEXT:    #APP
2066; CHECK-NEXT:    nop
2067; CHECK-NEXT:    #NO_APP
2068; CHECK-NEXT:    vmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2069; CHECK-NEXT:    retq
2070  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2071  %2 = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1)
2072  ret <2 x double> %2
2073}
2074declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
2075
2076define float @stack_fold_maxss(float %a0, float %a1) #0 {
2077; CHECK-LABEL: stack_fold_maxss:
2078; CHECK:       # %bb.0:
2079; CHECK-NEXT:    vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2080; CHECK-NEXT:    #APP
2081; CHECK-NEXT:    nop
2082; CHECK-NEXT:    #NO_APP
2083; CHECK-NEXT:    vmaxss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
2084; CHECK-NEXT:    retq
2085  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2086  %2 = fcmp ogt float %a0, %a1
2087  %3 = select i1 %2, float %a0, float %a1
2088  ret float %3
2089}
2090
2091define float @stack_fold_maxss_commutable(float %a0, float %a1) #1 {
2092; CHECK-LABEL: stack_fold_maxss_commutable:
2093; CHECK:       # %bb.0:
2094; CHECK-NEXT:    vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2095; CHECK-NEXT:    #APP
2096; CHECK-NEXT:    nop
2097; CHECK-NEXT:    #NO_APP
2098; CHECK-NEXT:    vmaxss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
2099; CHECK-NEXT:    retq
2100  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2101  %2 = fcmp ogt float %a0, %a1
2102  %3 = select i1 %2, float %a0, float %a1
2103  ret float %3
2104}
2105
2106define <4 x float> @stack_fold_maxss_int(<4 x float> %a0, <4 x float> %a1) #0 {
2107; CHECK-LABEL: stack_fold_maxss_int:
2108; CHECK:       # %bb.0:
2109; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2110; CHECK-NEXT:    #APP
2111; CHECK-NEXT:    nop
2112; CHECK-NEXT:    #NO_APP
2113; CHECK-NEXT:    vmaxss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2114; CHECK-NEXT:    retq
2115  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2116  %2 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1)
2117  ret <4 x float> %2
2118}
2119declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
2120
2121define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) #0 {
2122; CHECK-LABEL: stack_fold_minpd:
2123; CHECK:       # %bb.0:
2124; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2125; CHECK-NEXT:    #APP
2126; CHECK-NEXT:    nop
2127; CHECK-NEXT:    #NO_APP
2128; CHECK-NEXT:    vminpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2129; CHECK-NEXT:    retq
2130  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2131  %2 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1)
2132  ret <2 x double> %2
2133}
2134declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
2135
2136define <2 x double> @stack_fold_minpd_commutable(<2 x double> %a0, <2 x double> %a1) #1 {
2137; CHECK-LABEL: stack_fold_minpd_commutable:
2138; CHECK:       # %bb.0:
2139; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2140; CHECK-NEXT:    #APP
2141; CHECK-NEXT:    nop
2142; CHECK-NEXT:    #NO_APP
2143; CHECK-NEXT:    vminpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2144; CHECK-NEXT:    retq
2145  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2146  %2 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1)
2147  ret <2 x double> %2
2148}
2149
2150define <4 x double> @stack_fold_minpd_ymm(<4 x double> %a0, <4 x double> %a1) #0 {
2151; CHECK-LABEL: stack_fold_minpd_ymm:
2152; CHECK:       # %bb.0:
2153; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2154; CHECK-NEXT:    #APP
2155; CHECK-NEXT:    nop
2156; CHECK-NEXT:    #NO_APP
2157; CHECK-NEXT:    vminpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2158; CHECK-NEXT:    retq
2159  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2160  %2 = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1)
2161  ret <4 x double> %2
2162}
2163declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
2164
2165define <4 x double> @stack_fold_minpd_ymm_commutable(<4 x double> %a0, <4 x double> %a1) #1 {
2166; CHECK-LABEL: stack_fold_minpd_ymm_commutable:
2167; CHECK:       # %bb.0:
2168; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2169; CHECK-NEXT:    #APP
2170; CHECK-NEXT:    nop
2171; CHECK-NEXT:    #NO_APP
2172; CHECK-NEXT:    vminpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2173; CHECK-NEXT:    retq
2174  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2175  %2 = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1)
2176  ret <4 x double> %2
2177}
2178
2179define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) #0 {
2180; CHECK-LABEL: stack_fold_minps:
2181; CHECK:       # %bb.0:
2182; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2183; CHECK-NEXT:    #APP
2184; CHECK-NEXT:    nop
2185; CHECK-NEXT:    #NO_APP
2186; CHECK-NEXT:    vminps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2187; CHECK-NEXT:    retq
2188  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2189  %2 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
2190  ret <4 x float> %2
2191}
2192declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
2193
2194define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1) #1 {
2195; CHECK-LABEL: stack_fold_minps_commutable:
2196; CHECK:       # %bb.0:
2197; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2198; CHECK-NEXT:    #APP
2199; CHECK-NEXT:    nop
2200; CHECK-NEXT:    #NO_APP
2201; CHECK-NEXT:    vminps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2202; CHECK-NEXT:    retq
2203  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2204  %2 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
2205  ret <4 x float> %2
2206}
2207
2208define <8 x float> @stack_fold_minps_ymm(<8 x float> %a0, <8 x float> %a1) #0 {
2209; CHECK-LABEL: stack_fold_minps_ymm:
2210; CHECK:       # %bb.0:
2211; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2212; CHECK-NEXT:    #APP
2213; CHECK-NEXT:    nop
2214; CHECK-NEXT:    #NO_APP
2215; CHECK-NEXT:    vminps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2216; CHECK-NEXT:    retq
2217  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2218  %2 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1)
2219  ret <8 x float> %2
2220}
2221declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
2222
2223define <8 x float> @stack_fold_minps_ymm_commutable(<8 x float> %a0, <8 x float> %a1) #1 {
2224; CHECK-LABEL: stack_fold_minps_ymm_commutable:
2225; CHECK:       # %bb.0:
2226; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2227; CHECK-NEXT:    #APP
2228; CHECK-NEXT:    nop
2229; CHECK-NEXT:    #NO_APP
2230; CHECK-NEXT:    vminps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2231; CHECK-NEXT:    retq
2232  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2233  %2 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1)
2234  ret <8 x float> %2
2235}
2236
2237define double @stack_fold_minsd(double %a0, double %a1) #0 {
2238; CHECK-LABEL: stack_fold_minsd:
2239; CHECK:       # %bb.0:
2240; CHECK-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2241; CHECK-NEXT:    #APP
2242; CHECK-NEXT:    nop
2243; CHECK-NEXT:    #NO_APP
2244; CHECK-NEXT:    vminsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
2245; CHECK-NEXT:    retq
2246  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2247  %2 = fcmp olt double %a0, %a1
2248  %3 = select i1 %2, double %a0, double %a1
2249  ret double %3
2250}
2251
2252define double @stack_fold_minsd_commutable(double %a0, double %a1) #1 {
2253; CHECK-LABEL: stack_fold_minsd_commutable:
2254; CHECK:       # %bb.0:
2255; CHECK-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2256; CHECK-NEXT:    #APP
2257; CHECK-NEXT:    nop
2258; CHECK-NEXT:    #NO_APP
2259; CHECK-NEXT:    vminsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
2260; CHECK-NEXT:    retq
2261  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2262  %2 = fcmp olt double %a0, %a1
2263  %3 = select i1 %2, double %a0, double %a1
2264  ret double %3
2265}
2266
2267define <2 x double> @stack_fold_minsd_int(<2 x double> %a0, <2 x double> %a1) {
2268; CHECK-LABEL: stack_fold_minsd_int:
2269; CHECK:       # %bb.0:
2270; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2271; CHECK-NEXT:    #APP
2272; CHECK-NEXT:    nop
2273; CHECK-NEXT:    #NO_APP
2274; CHECK-NEXT:    vminsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2275; CHECK-NEXT:    retq
2276  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2277  %2 = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1)
2278  ret <2 x double> %2
2279}
2280declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
2281
2282define float @stack_fold_minss(float %a0, float %a1) #0 {
2283; CHECK-LABEL: stack_fold_minss:
2284; CHECK:       # %bb.0:
2285; CHECK-NEXT:    vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2286; CHECK-NEXT:    #APP
2287; CHECK-NEXT:    nop
2288; CHECK-NEXT:    #NO_APP
2289; CHECK-NEXT:    vminss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
2290; CHECK-NEXT:    retq
2291  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2292  %2 = fcmp olt float %a0, %a1
2293  %3 = select i1 %2, float %a0, float %a1
2294  ret float %3
2295}
2296
2297define float @stack_fold_minss_commutable(float %a0, float %a1) #1 {
2298; CHECK-LABEL: stack_fold_minss_commutable:
2299; CHECK:       # %bb.0:
2300; CHECK-NEXT:    vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2301; CHECK-NEXT:    #APP
2302; CHECK-NEXT:    nop
2303; CHECK-NEXT:    #NO_APP
2304; CHECK-NEXT:    vminss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
2305; CHECK-NEXT:    retq
2306  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2307  %2 = fcmp olt float %a0, %a1
2308  %3 = select i1 %2, float %a0, float %a1
2309  ret float %3
2310}
2311
2312define <4 x float> @stack_fold_minss_int(<4 x float> %a0, <4 x float> %a1) #0 {
2313; CHECK-LABEL: stack_fold_minss_int:
2314; CHECK:       # %bb.0:
2315; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2316; CHECK-NEXT:    #APP
2317; CHECK-NEXT:    nop
2318; CHECK-NEXT:    #NO_APP
2319; CHECK-NEXT:    vminss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2320; CHECK-NEXT:    retq
2321  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2322  %2 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1)
2323  ret <4 x float> %2
2324}
2325declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
2326
2327define <2 x double> @stack_fold_movddup(<2 x double> %a0) {
2328; CHECK-LABEL: stack_fold_movddup:
2329; CHECK:       # %bb.0:
2330; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2331; CHECK-NEXT:    #APP
2332; CHECK-NEXT:    nop
2333; CHECK-NEXT:    #NO_APP
2334; CHECK-NEXT:    vmovddup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2335; CHECK-NEXT:    # xmm0 = mem[0,0]
2336; CHECK-NEXT:    retq
2337  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2338  %2 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> <i32 0, i32 0>
2339  ret <2 x double> %2
2340}
2341
2342define <4 x double> @stack_fold_movddup_ymm(<4 x double> %a0) {
2343; CHECK-LABEL: stack_fold_movddup_ymm:
2344; CHECK:       # %bb.0:
2345; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2346; CHECK-NEXT:    #APP
2347; CHECK-NEXT:    nop
2348; CHECK-NEXT:    #NO_APP
2349; CHECK-NEXT:    vmovddup {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2350; CHECK-NEXT:    # ymm0 = mem[0,0,2,2]
2351; CHECK-NEXT:    retq
2352  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2353  %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
2354  ret <4 x double> %2
2355}
2356
2357; TODO stack_fold_movhpd (load / store)
2358; TODO stack_fold_movhps (load / store)
2359
2360; TODO stack_fold_movlpd (load / store)
2361; TODO stack_fold_movlps (load / store)
2362
2363define <4 x float> @stack_fold_movshdup(<4 x float> %a0) {
2364; CHECK-LABEL: stack_fold_movshdup:
2365; CHECK:       # %bb.0:
2366; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2367; CHECK-NEXT:    #APP
2368; CHECK-NEXT:    nop
2369; CHECK-NEXT:    #NO_APP
2370; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2371; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
2372; CHECK-NEXT:    retq
2373  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2374  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
2375  ret <4 x float> %2
2376}
2377
2378define <8 x float> @stack_fold_movshdup_ymm(<8 x float> %a0) {
2379; CHECK-LABEL: stack_fold_movshdup_ymm:
2380; CHECK:       # %bb.0:
2381; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2382; CHECK-NEXT:    #APP
2383; CHECK-NEXT:    nop
2384; CHECK-NEXT:    #NO_APP
2385; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2386; CHECK-NEXT:    # ymm0 = mem[1,1,3,3,5,5,7,7]
2387; CHECK-NEXT:    retq
2388  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2389  %2 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
2390  ret <8 x float> %2
2391}
2392
2393define <4 x float> @stack_fold_movsldup(<4 x float> %a0) {
2394; CHECK-LABEL: stack_fold_movsldup:
2395; CHECK:       # %bb.0:
2396; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2397; CHECK-NEXT:    #APP
2398; CHECK-NEXT:    nop
2399; CHECK-NEXT:    #NO_APP
2400; CHECK-NEXT:    vmovsldup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2401; CHECK-NEXT:    # xmm0 = mem[0,0,2,2]
2402; CHECK-NEXT:    retq
2403  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2404  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
2405  ret <4 x float> %2
2406}
2407
2408define <8 x float> @stack_fold_movsldup_ymm(<8 x float> %a0) {
2409; CHECK-LABEL: stack_fold_movsldup_ymm:
2410; CHECK:       # %bb.0:
2411; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2412; CHECK-NEXT:    #APP
2413; CHECK-NEXT:    nop
2414; CHECK-NEXT:    #NO_APP
2415; CHECK-NEXT:    vmovsldup {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2416; CHECK-NEXT:    # ymm0 = mem[0,0,2,2,4,4,6,6]
2417; CHECK-NEXT:    retq
2418  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2419  %2 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
2420  ret <8 x float> %2
2421}
2422
2423define <2 x double> @stack_fold_mulpd(<2 x double> %a0, <2 x double> %a1) {
2424; CHECK-LABEL: stack_fold_mulpd:
2425; CHECK:       # %bb.0:
2426; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2427; CHECK-NEXT:    #APP
2428; CHECK-NEXT:    nop
2429; CHECK-NEXT:    #NO_APP
2430; CHECK-NEXT:    vmulpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2431; CHECK-NEXT:    retq
2432  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2433  %2 = fmul <2 x double> %a0, %a1
2434  ret <2 x double> %2
2435}
2436
2437define <4 x double> @stack_fold_mulpd_ymm(<4 x double> %a0, <4 x double> %a1) {
2438; CHECK-LABEL: stack_fold_mulpd_ymm:
2439; CHECK:       # %bb.0:
2440; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2441; CHECK-NEXT:    #APP
2442; CHECK-NEXT:    nop
2443; CHECK-NEXT:    #NO_APP
2444; CHECK-NEXT:    vmulpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2445; CHECK-NEXT:    retq
2446  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2447  %2 = fmul <4 x double> %a0, %a1
2448  ret <4 x double> %2
2449}
2450
2451define <4 x float> @stack_fold_mulps(<4 x float> %a0, <4 x float> %a1) {
2452; CHECK-LABEL: stack_fold_mulps:
2453; CHECK:       # %bb.0:
2454; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2455; CHECK-NEXT:    #APP
2456; CHECK-NEXT:    nop
2457; CHECK-NEXT:    #NO_APP
2458; CHECK-NEXT:    vmulps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2459; CHECK-NEXT:    retq
2460  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2461  %2 = fmul <4 x float> %a0, %a1
2462  ret <4 x float> %2
2463}
2464
2465define <8 x float> @stack_fold_mulps_ymm(<8 x float> %a0, <8 x float> %a1) {
2466; CHECK-LABEL: stack_fold_mulps_ymm:
2467; CHECK:       # %bb.0:
2468; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2469; CHECK-NEXT:    #APP
2470; CHECK-NEXT:    nop
2471; CHECK-NEXT:    #NO_APP
2472; CHECK-NEXT:    vmulps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2473; CHECK-NEXT:    retq
2474  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2475  %2 = fmul <8 x float> %a0, %a1
2476  ret <8 x float> %2
2477}
2478
2479define double @stack_fold_mulsd(double %a0, double %a1) {
2480; CHECK-LABEL: stack_fold_mulsd:
2481; CHECK:       # %bb.0:
2482; CHECK-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2483; CHECK-NEXT:    #APP
2484; CHECK-NEXT:    nop
2485; CHECK-NEXT:    #NO_APP
2486; CHECK-NEXT:    vmulsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
2487; CHECK-NEXT:    retq
2488  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2489  %2 = fmul double %a0, %a1
2490  ret double %2
2491}
2492
2493define <2 x double> @stack_fold_mulsd_int(<2 x double> %a0, <2 x double> %a1) {
2494; CHECK-LABEL: stack_fold_mulsd_int:
2495; CHECK:       # %bb.0:
2496; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2497; CHECK-NEXT:    #APP
2498; CHECK-NEXT:    nop
2499; CHECK-NEXT:    #NO_APP
2500; CHECK-NEXT:    vmulsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2501; CHECK-NEXT:    retq
2502  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2503  %2 = extractelement <2 x double> %a0, i32 0
2504  %3 = extractelement <2 x double> %a1, i32 0
2505  %4 = fmul double %2, %3
2506  %5 = insertelement <2 x double> %a0, double %4, i32 0
2507  ret <2 x double> %5
2508}
2509
2510define float @stack_fold_mulss(float %a0, float %a1) {
2511; CHECK-LABEL: stack_fold_mulss:
2512; CHECK:       # %bb.0:
2513; CHECK-NEXT:    vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2514; CHECK-NEXT:    #APP
2515; CHECK-NEXT:    nop
2516; CHECK-NEXT:    #NO_APP
2517; CHECK-NEXT:    vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
2518; CHECK-NEXT:    retq
2519  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2520  %2 = fmul float %a0, %a1
2521  ret float %2
2522}
2523
2524define <4 x float> @stack_fold_mulss_int(<4 x float> %a0, <4 x float> %a1) {
2525; CHECK-LABEL: stack_fold_mulss_int:
2526; CHECK:       # %bb.0:
2527; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2528; CHECK-NEXT:    #APP
2529; CHECK-NEXT:    nop
2530; CHECK-NEXT:    #NO_APP
2531; CHECK-NEXT:    vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2532; CHECK-NEXT:    retq
2533  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2534  %2 = extractelement <4 x float> %a0, i32 0
2535  %3 = extractelement <4 x float> %a1, i32 0
2536  %4 = fmul float %2, %3
2537  %5 = insertelement <4 x float> %a0, float %4, i32 0
2538  ret <4 x float> %5
2539}
2540
2541define <2 x double> @stack_fold_orpd(<2 x double> %a0, <2 x double> %a1) {
2542; CHECK-LABEL: stack_fold_orpd:
2543; CHECK:       # %bb.0:
2544; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2545; CHECK-NEXT:    #APP
2546; CHECK-NEXT:    nop
2547; CHECK-NEXT:    #NO_APP
2548; CHECK-NEXT:    vorpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2549; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
2550; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
2551; CHECK-NEXT:    retq
2552  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2553  %2 = bitcast <2 x double> %a0 to <2 x i64>
2554  %3 = bitcast <2 x double> %a1 to <2 x i64>
2555  %4 = or <2 x i64> %2, %3
2556  %5 = bitcast <2 x i64> %4 to <2 x double>
2557  ; fadd forces execution domain
2558  %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
2559  ret <2 x double> %6
2560}
2561
2562define <4 x double> @stack_fold_orpd_ymm(<4 x double> %a0, <4 x double> %a1) {
2563; CHECK-LABEL: stack_fold_orpd_ymm:
2564; CHECK:       # %bb.0:
2565; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2566; CHECK-NEXT:    #APP
2567; CHECK-NEXT:    nop
2568; CHECK-NEXT:    #NO_APP
2569; CHECK-NEXT:    vorpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2570; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
2571; CHECK-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
2572; CHECK-NEXT:    retq
2573  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2574  %2 = bitcast <4 x double> %a0 to <4 x i64>
2575  %3 = bitcast <4 x double> %a1 to <4 x i64>
2576  %4 = or <4 x i64> %2, %3
2577  %5 = bitcast <4 x i64> %4 to <4 x double>
2578  ; fadd forces execution domain
2579  %6 = fadd <4 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0>
2580  ret <4 x double> %6
2581}
2582
2583define <4 x float> @stack_fold_orps(<4 x float> %a0, <4 x float> %a1) {
2584; CHECK-LABEL: stack_fold_orps:
2585; CHECK:       # %bb.0:
2586; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2587; CHECK-NEXT:    #APP
2588; CHECK-NEXT:    nop
2589; CHECK-NEXT:    #NO_APP
2590; CHECK-NEXT:    vorps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2591; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
2592; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
2593; CHECK-NEXT:    retq
2594  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2595  %2 = bitcast <4 x float> %a0 to <2 x i64>
2596  %3 = bitcast <4 x float> %a1 to <2 x i64>
2597  %4 = or <2 x i64> %2, %3
2598  %5 = bitcast <2 x i64> %4 to <4 x float>
2599  ; fadd forces execution domain
2600  %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
2601  ret <4 x float> %6
2602}
2603
2604define <8 x float> @stack_fold_orps_ymm(<8 x float> %a0, <8 x float> %a1) {
2605; CHECK-LABEL: stack_fold_orps_ymm:
2606; CHECK:       # %bb.0:
2607; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2608; CHECK-NEXT:    #APP
2609; CHECK-NEXT:    nop
2610; CHECK-NEXT:    #NO_APP
2611; CHECK-NEXT:    vorps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2612; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
2613; CHECK-NEXT:    vaddps %ymm1, %ymm0, %ymm0
2614; CHECK-NEXT:    retq
2615  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2616  %2 = bitcast <8 x float> %a0 to <4 x i64>
2617  %3 = bitcast <8 x float> %a1 to <4 x i64>
2618  %4 = or <4 x i64> %2, %3
2619  %5 = bitcast <4 x i64> %4 to <8 x float>
2620  ; fadd forces execution domain
2621  %6 = fadd <8 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
2622  ret <8 x float> %6
2623}
2624
2625define <8 x float> @stack_fold_perm2f128(<8 x float> %a0, <8 x float> %a1) {
2626; CHECK-LABEL: stack_fold_perm2f128:
2627; CHECK:       # %bb.0:
2628; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2629; CHECK-NEXT:    #APP
2630; CHECK-NEXT:    nop
2631; CHECK-NEXT:    #NO_APP
2632; CHECK-NEXT:    vperm2f128 $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2633; CHECK-NEXT:    # ymm0 = ymm0[2,3],mem[0,1]
2634; CHECK-NEXT:    retq
2635  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2636  %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
2637  ret <8 x float> %2
2638}
2639
2640define <2 x double> @stack_fold_permilpd(<2 x double> %a0) {
2641; CHECK-LABEL: stack_fold_permilpd:
2642; CHECK:       # %bb.0:
2643; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2644; CHECK-NEXT:    #APP
2645; CHECK-NEXT:    nop
2646; CHECK-NEXT:    #NO_APP
2647; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2648; CHECK-NEXT:    # xmm0 = mem[1,0]
2649; CHECK-NEXT:    retq
2650  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2651  %2 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> <i32 1, i32 0>
2652  ret <2 x double> %2
2653}
2654
2655define <4 x double> @stack_fold_permilpd_ymm(<4 x double> %a0) {
2656; CHECK-LABEL: stack_fold_permilpd_ymm:
2657; CHECK:       # %bb.0:
2658; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2659; CHECK-NEXT:    #APP
2660; CHECK-NEXT:    nop
2661; CHECK-NEXT:    #NO_APP
2662; CHECK-NEXT:    vpermilpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2663; CHECK-NEXT:    # ymm0 = mem[1,0,3,2]
2664; CHECK-NEXT:    retq
2665  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2666  %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
2667  ret <4 x double> %2
2668}
2669
2670define <2 x double> @stack_fold_permilpdvar(<2 x double> %a0, <2 x i64> %a1) {
2671; CHECK-LABEL: stack_fold_permilpdvar:
2672; CHECK:       # %bb.0:
2673; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2674; CHECK-NEXT:    #APP
2675; CHECK-NEXT:    nop
2676; CHECK-NEXT:    #NO_APP
2677; CHECK-NEXT:    vpermilpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2678; CHECK-NEXT:    retq
2679  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2680  %2 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1)
2681  ret <2 x double> %2
2682}
2683declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwind readnone
2684
2685define <4 x double> @stack_fold_permilpdvar_ymm(<4 x double> %a0, <4 x i64> %a1) {
2686; CHECK-LABEL: stack_fold_permilpdvar_ymm:
2687; CHECK:       # %bb.0:
2688; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2689; CHECK-NEXT:    #APP
2690; CHECK-NEXT:    nop
2691; CHECK-NEXT:    #NO_APP
2692; CHECK-NEXT:    vpermilpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2693; CHECK-NEXT:    retq
2694  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2695  %2 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1)
2696  ret <4 x double> %2
2697}
2698declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) nounwind readnone
2699
2700define <4 x float> @stack_fold_permilps(<4 x float> %a0) {
2701; CHECK-LABEL: stack_fold_permilps:
2702; CHECK:       # %bb.0:
2703; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2704; CHECK-NEXT:    #APP
2705; CHECK-NEXT:    nop
2706; CHECK-NEXT:    #NO_APP
2707; CHECK-NEXT:    vpermilps $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2708; CHECK-NEXT:    # xmm0 = mem[3,2,1,0]
2709; CHECK-NEXT:    retq
2710  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2711  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
2712  ret <4 x float> %2
2713}
2714
2715define <8 x float> @stack_fold_permilps_ymm(<8 x float> %a0) {
2716; CHECK-LABEL: stack_fold_permilps_ymm:
2717; CHECK:       # %bb.0:
2718; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2719; CHECK-NEXT:    #APP
2720; CHECK-NEXT:    nop
2721; CHECK-NEXT:    #NO_APP
2722; CHECK-NEXT:    vpermilps $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2723; CHECK-NEXT:    # ymm0 = mem[3,2,1,0,7,6,5,4]
2724; CHECK-NEXT:    retq
2725  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2726  %2 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
2727  ret <8 x float> %2
2728}
2729
2730define <4 x float> @stack_fold_permilpsvar(<4 x float> %a0, <4 x i32> %a1) {
2731; CHECK-LABEL: stack_fold_permilpsvar:
2732; CHECK:       # %bb.0:
2733; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2734; CHECK-NEXT:    #APP
2735; CHECK-NEXT:    nop
2736; CHECK-NEXT:    #NO_APP
2737; CHECK-NEXT:    vpermilps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2738; CHECK-NEXT:    retq
2739  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2740  %2 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a1)
2741  ret <4 x float> %2
2742}
2743declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind readnone
2744
2745define <8 x float> @stack_fold_permilpsvar_ymm(<8 x float> %a0, <8 x i32> %a1) {
2746; CHECK-LABEL: stack_fold_permilpsvar_ymm:
2747; CHECK:       # %bb.0:
2748; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2749; CHECK-NEXT:    #APP
2750; CHECK-NEXT:    nop
2751; CHECK-NEXT:    #NO_APP
2752; CHECK-NEXT:    vpermilps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2753; CHECK-NEXT:    retq
2754  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2755  %2 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %a1)
2756  ret <8 x float> %2
2757}
2758declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) nounwind readnone
2759
2760; TODO stack_fold_rcpps
2761
2762define <4 x float> @stack_fold_rcpps_int(<4 x float> %a0) {
2763; CHECK-LABEL: stack_fold_rcpps_int:
2764; CHECK:       # %bb.0:
2765; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2766; CHECK-NEXT:    #APP
2767; CHECK-NEXT:    nop
2768; CHECK-NEXT:    #NO_APP
2769; CHECK-NEXT:    vrcpps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2770; CHECK-NEXT:    retq
2771  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2772  %2 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0)
2773  ret <4 x float> %2
2774}
2775declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
2776
2777; TODO stack_fold_rcpps_ymm
2778
2779define <8 x float> @stack_fold_rcpps_ymm_int(<8 x float> %a0) {
2780; CHECK-LABEL: stack_fold_rcpps_ymm_int:
2781; CHECK:       # %bb.0:
2782; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2783; CHECK-NEXT:    #APP
2784; CHECK-NEXT:    nop
2785; CHECK-NEXT:    #NO_APP
2786; CHECK-NEXT:    vrcpps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2787; CHECK-NEXT:    retq
2788  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2789  %2 = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0)
2790  ret <8 x float> %2
2791}
2792declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
2793
2794; TODO stack_fold_rcpss
2795; TODO stack_fold_rcpss_int
2796
2797define <2 x double> @stack_fold_roundpd(<2 x double> %a0) {
2798; CHECK-LABEL: stack_fold_roundpd:
2799; CHECK:       # %bb.0:
2800; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2801; CHECK-NEXT:    #APP
2802; CHECK-NEXT:    nop
2803; CHECK-NEXT:    #NO_APP
2804; CHECK-NEXT:    vroundpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2805; CHECK-NEXT:    retq
2806  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2807  %2 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7)
2808  ret <2 x double> %2
2809}
2810declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
2811
2812define <4 x double> @stack_fold_roundpd_ymm(<4 x double> %a0) {
2813; CHECK-LABEL: stack_fold_roundpd_ymm:
2814; CHECK:       # %bb.0:
2815; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2816; CHECK-NEXT:    #APP
2817; CHECK-NEXT:    nop
2818; CHECK-NEXT:    #NO_APP
2819; CHECK-NEXT:    vroundpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2820; CHECK-NEXT:    retq
2821  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2822  %2 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 7)
2823  ret <4 x double> %2
2824}
2825declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
2826
2827define <4 x float> @stack_fold_roundps(<4 x float> %a0) {
2828; CHECK-LABEL: stack_fold_roundps:
2829; CHECK:       # %bb.0:
2830; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2831; CHECK-NEXT:    #APP
2832; CHECK-NEXT:    nop
2833; CHECK-NEXT:    #NO_APP
2834; CHECK-NEXT:    vroundps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2835; CHECK-NEXT:    retq
2836  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2837  %2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7)
2838  ret <4 x float> %2
2839}
2840declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
2841
2842define <8 x float> @stack_fold_roundps_ymm(<8 x float> %a0) {
2843; CHECK-LABEL: stack_fold_roundps_ymm:
2844; CHECK:       # %bb.0:
2845; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2846; CHECK-NEXT:    #APP
2847; CHECK-NEXT:    nop
2848; CHECK-NEXT:    #NO_APP
2849; CHECK-NEXT:    vroundps $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2850; CHECK-NEXT:    retq
2851  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2852  %2 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 7)
2853  ret <8 x float> %2
2854}
2855declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
2856
2857define double @stack_fold_roundsd(double %a0) optsize {
2858; CHECK-LABEL: stack_fold_roundsd:
2859; CHECK:       # %bb.0:
2860; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2861; CHECK-NEXT:    #APP
2862; CHECK-NEXT:    nop
2863; CHECK-NEXT:    #NO_APP
2864; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
2865; CHECK-NEXT:    vroundsd $9, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
2866; CHECK-NEXT:    retq
2867  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2868  %2 = call double @llvm.floor.f64(double %a0)
2869  ret double %2
2870}
2871
2872define double @stack_fold_roundsd_minsize(double %a0) minsize {
2873; CHECK-LABEL: stack_fold_roundsd_minsize:
2874; CHECK:       # %bb.0:
2875; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2876; CHECK-NEXT:    #APP
2877; CHECK-NEXT:    nop
2878; CHECK-NEXT:    #NO_APP
2879; CHECK-NEXT:    vroundsd $9, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
2880; CHECK-NEXT:    retq
2881  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2882  %2 = call double @llvm.floor.f64(double %a0)
2883  ret double %2
2884}
2885declare double @llvm.floor.f64(double) nounwind readnone
2886
2887define <2 x double> @stack_fold_roundsd_int(<2 x double> %a0, <2 x double> %a1) optsize {
2888; CHECK-LABEL: stack_fold_roundsd_int:
2889; CHECK:       # %bb.0:
2890; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2891; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2892; CHECK-NEXT:    #APP
2893; CHECK-NEXT:    nop
2894; CHECK-NEXT:    #NO_APP
2895; CHECK-NEXT:    vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2896; CHECK-NEXT:    vroundsd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2897; CHECK-NEXT:    retq
2898  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2899  %2 = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 7)
2900  ret <2 x double> %2
2901}
2902declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
2903
2904define float @stack_fold_roundss(float %a0) optsize {
2905; CHECK-LABEL: stack_fold_roundss:
2906; CHECK:       # %bb.0:
2907; CHECK-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2908; CHECK-NEXT:    #APP
2909; CHECK-NEXT:    nop
2910; CHECK-NEXT:    #NO_APP
2911; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
2912; CHECK-NEXT:    vroundss $9, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
2913; CHECK-NEXT:    retq
2914  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2915  %2 = call float @llvm.floor.f32(float %a0)
2916  ret float %2
2917}
2918declare float @llvm.floor.f32(float) nounwind readnone
2919
2920define <4 x float> @stack_fold_roundss_int(<4 x float> %a0, <4 x float> %a1) optsize {
2921; CHECK-LABEL: stack_fold_roundss_int:
2922; CHECK:       # %bb.0:
2923; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2924; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2925; CHECK-NEXT:    #APP
2926; CHECK-NEXT:    nop
2927; CHECK-NEXT:    #NO_APP
2928; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2929; CHECK-NEXT:    vroundss $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2930; CHECK-NEXT:    retq
2931  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2932  %2 = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 7)
2933  ret <4 x float> %2
2934}
2935declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
2936
2937; TODO stack_fold_rsqrtps
2938
2939define <4 x float> @stack_fold_rsqrtps_int(<4 x float> %a0) {
2940; CHECK-LABEL: stack_fold_rsqrtps_int:
2941; CHECK:       # %bb.0:
2942; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2943; CHECK-NEXT:    #APP
2944; CHECK-NEXT:    nop
2945; CHECK-NEXT:    #NO_APP
2946; CHECK-NEXT:    vrsqrtps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2947; CHECK-NEXT:    retq
2948  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2949  %2 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0)
2950  ret <4 x float> %2
2951}
2952declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
2953
2954; TODO stack_fold_rsqrtps_ymm
2955
2956define <8 x float> @stack_fold_rsqrtps_ymm_int(<8 x float> %a0) {
2957; CHECK-LABEL: stack_fold_rsqrtps_ymm_int:
2958; CHECK:       # %bb.0:
2959; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2960; CHECK-NEXT:    #APP
2961; CHECK-NEXT:    nop
2962; CHECK-NEXT:    #NO_APP
2963; CHECK-NEXT:    vrsqrtps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2964; CHECK-NEXT:    retq
2965  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2966  %2 = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0)
2967  ret <8 x float> %2
2968}
2969declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
2970
2971; TODO stack_fold_rsqrtss
2972; TODO stack_fold_rsqrtss_int
2973
2974define <2 x double> @stack_fold_shufpd(<2 x double> %a0, <2 x double> %a1) {
2975; CHECK-LABEL: stack_fold_shufpd:
2976; CHECK:       # %bb.0:
2977; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2978; CHECK-NEXT:    #APP
2979; CHECK-NEXT:    nop
2980; CHECK-NEXT:    #NO_APP
2981; CHECK-NEXT:    vshufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2982; CHECK-NEXT:    # xmm0 = xmm0[1],mem[0]
2983; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
2984; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
2985; CHECK-NEXT:    retq
2986  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2987  %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 2>
2988  ; fadd forces execution domain
2989  %3 = fadd <2 x double> %2, <double 0x0, double 0x0>
2990  ret <2 x double> %3
2991}
2992
2993define <4 x double> @stack_fold_shufpd_ymm(<4 x double> %a0, <4 x double> %a1) {
2994; CHECK-LABEL: stack_fold_shufpd_ymm:
2995; CHECK:       # %bb.0:
2996; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2997; CHECK-NEXT:    #APP
2998; CHECK-NEXT:    nop
2999; CHECK-NEXT:    #NO_APP
3000; CHECK-NEXT:    vshufpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3001; CHECK-NEXT:    # ymm0 = ymm0[1],mem[0],ymm0[3],mem[2]
3002; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
3003; CHECK-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
3004; CHECK-NEXT:    retq
3005  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3006  %2 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 4, i32 3, i32 6>
3007  ; fadd forces execution domain
3008  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
3009  ret <4 x double> %3
3010}
3011
3012define <4 x float> @stack_fold_shufps(<4 x float> %a0, <4 x float> %a1) {
3013; CHECK-LABEL: stack_fold_shufps:
3014; CHECK:       # %bb.0:
3015; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3016; CHECK-NEXT:    #APP
3017; CHECK-NEXT:    nop
3018; CHECK-NEXT:    #NO_APP
3019; CHECK-NEXT:    vshufps $200, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3020; CHECK-NEXT:    # xmm0 = xmm0[0,2],mem[0,3]
3021; CHECK-NEXT:    retq
3022  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3023  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 2, i32 4, i32 7>
3024  ret <4 x float> %2
3025}
3026
3027define <8 x float> @stack_fold_shufps_ymm(<8 x float> %a0, <8 x float> %a1) {
3028; CHECK-LABEL: stack_fold_shufps_ymm:
3029; CHECK:       # %bb.0:
3030; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3031; CHECK-NEXT:    #APP
3032; CHECK-NEXT:    nop
3033; CHECK-NEXT:    #NO_APP
3034; CHECK-NEXT:    vshufps $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3035; CHECK-NEXT:    # ymm0 = ymm0[0,1],mem[1,2],ymm0[4,5],mem[5,6]
3036; CHECK-NEXT:    retq
3037  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3038  %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 9, i32 10, i32 4, i32 5, i32 13, i32 14>
3039  ret <8 x float> %2
3040}
3041
3042define <2 x double> @stack_fold_sqrtpd(<2 x double> %a0) {
3043; CHECK-LABEL: stack_fold_sqrtpd:
3044; CHECK:       # %bb.0:
3045; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3046; CHECK-NEXT:    #APP
3047; CHECK-NEXT:    nop
3048; CHECK-NEXT:    #NO_APP
3049; CHECK-NEXT:    vsqrtpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3050; CHECK-NEXT:    retq
3051  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3052  %2 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a0)
3053  ret <2 x double> %2
3054}
3055declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)
3056
3057define <4 x double> @stack_fold_sqrtpd_ymm(<4 x double> %a0) {
3058; CHECK-LABEL: stack_fold_sqrtpd_ymm:
3059; CHECK:       # %bb.0:
3060; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3061; CHECK-NEXT:    #APP
3062; CHECK-NEXT:    nop
3063; CHECK-NEXT:    #NO_APP
3064; CHECK-NEXT:    vsqrtpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
3065; CHECK-NEXT:    retq
3066  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3067  %2 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %a0)
3068  ret <4 x double> %2
3069}
3070declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)
3071
3072define <4 x float> @stack_fold_sqrtps(<4 x float> %a0) {
3073; CHECK-LABEL: stack_fold_sqrtps:
3074; CHECK:       # %bb.0:
3075; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3076; CHECK-NEXT:    #APP
3077; CHECK-NEXT:    nop
3078; CHECK-NEXT:    #NO_APP
3079; CHECK-NEXT:    vsqrtps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3080; CHECK-NEXT:    retq
3081  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3082  %2 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a0)
3083  ret <4 x float> %2
3084}
3085declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
3086
3087define <8 x float> @stack_fold_sqrtps_ymm(<8 x float> %a0) {
3088; CHECK-LABEL: stack_fold_sqrtps_ymm:
3089; CHECK:       # %bb.0:
3090; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3091; CHECK-NEXT:    #APP
3092; CHECK-NEXT:    nop
3093; CHECK-NEXT:    #NO_APP
3094; CHECK-NEXT:    vsqrtps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
3095; CHECK-NEXT:    retq
3096  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3097  %2 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %a0)
3098  ret <8 x float> %2
3099}
3100declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
3101
3102define double @stack_fold_sqrtsd(double %a0) optsize {
3103; CHECK-LABEL: stack_fold_sqrtsd:
3104; CHECK:       # %bb.0:
3105; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
3106; CHECK-NEXT:    #APP
3107; CHECK-NEXT:    nop
3108; CHECK-NEXT:    #NO_APP
3109; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
3110; CHECK-NEXT:    vsqrtsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
3111; CHECK-NEXT:    retq
3112  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3113  %2 = call double @llvm.sqrt.f64(double %a0)
3114  ret double %2
3115}
3116declare double @llvm.sqrt.f64(double) nounwind readnone
3117
3118; TODO stack_fold_sqrtsd_int
3119
3120define float @stack_fold_sqrtss(float %a0) optsize {
3121; CHECK-LABEL: stack_fold_sqrtss:
3122; CHECK:       # %bb.0:
3123; CHECK-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3124; CHECK-NEXT:    #APP
3125; CHECK-NEXT:    nop
3126; CHECK-NEXT:    #NO_APP
3127; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
3128; CHECK-NEXT:    vsqrtss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
3129; CHECK-NEXT:    retq
3130  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3131  %2 = call float @llvm.sqrt.f32(float %a0)
3132  ret float %2
3133}
3134declare float @llvm.sqrt.f32(float) nounwind readnone
3135
3136; TODO stack_fold_sqrtss_int
3137
3138define <2 x double> @stack_fold_subpd(<2 x double> %a0, <2 x double> %a1) {
3139; CHECK-LABEL: stack_fold_subpd:
3140; CHECK:       # %bb.0:
3141; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3142; CHECK-NEXT:    #APP
3143; CHECK-NEXT:    nop
3144; CHECK-NEXT:    #NO_APP
3145; CHECK-NEXT:    vsubpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3146; CHECK-NEXT:    retq
3147  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3148  %2 = fsub <2 x double> %a0, %a1
3149  ret <2 x double> %2
3150}
3151
3152define <4 x double> @stack_fold_subpd_ymm(<4 x double> %a0, <4 x double> %a1) {
3153; CHECK-LABEL: stack_fold_subpd_ymm:
3154; CHECK:       # %bb.0:
3155; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3156; CHECK-NEXT:    #APP
3157; CHECK-NEXT:    nop
3158; CHECK-NEXT:    #NO_APP
3159; CHECK-NEXT:    vsubpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3160; CHECK-NEXT:    retq
3161  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3162  %2 = fsub <4 x double> %a0, %a1
3163  ret <4 x double> %2
3164}
3165
3166define <4 x float> @stack_fold_subps(<4 x float> %a0, <4 x float> %a1) {
3167; CHECK-LABEL: stack_fold_subps:
3168; CHECK:       # %bb.0:
3169; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3170; CHECK-NEXT:    #APP
3171; CHECK-NEXT:    nop
3172; CHECK-NEXT:    #NO_APP
3173; CHECK-NEXT:    vsubps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3174; CHECK-NEXT:    retq
3175  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3176  %2 = fsub <4 x float> %a0, %a1
3177  ret <4 x float> %2
3178}
3179
3180define <8 x float> @stack_fold_subps_ymm(<8 x float> %a0, <8 x float> %a1) {
3181; CHECK-LABEL: stack_fold_subps_ymm:
3182; CHECK:       # %bb.0:
3183; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3184; CHECK-NEXT:    #APP
3185; CHECK-NEXT:    nop
3186; CHECK-NEXT:    #NO_APP
3187; CHECK-NEXT:    vsubps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3188; CHECK-NEXT:    retq
3189  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3190  %2 = fsub <8 x float> %a0, %a1
3191  ret <8 x float> %2
3192}
3193
3194define double @stack_fold_subsd(double %a0, double %a1) {
3195; CHECK-LABEL: stack_fold_subsd:
3196; CHECK:       # %bb.0:
3197; CHECK-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
3198; CHECK-NEXT:    #APP
3199; CHECK-NEXT:    nop
3200; CHECK-NEXT:    #NO_APP
3201; CHECK-NEXT:    vsubsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
3202; CHECK-NEXT:    retq
3203  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3204  %2 = fsub double %a0, %a1
3205  ret double %2
3206}
3207
3208define <2 x double> @stack_fold_subsd_int(<2 x double> %a0, <2 x double> %a1) {
3209; CHECK-LABEL: stack_fold_subsd_int:
3210; CHECK:       # %bb.0:
3211; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3212; CHECK-NEXT:    #APP
3213; CHECK-NEXT:    nop
3214; CHECK-NEXT:    #NO_APP
3215; CHECK-NEXT:    vsubsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3216; CHECK-NEXT:    retq
3217  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3218  %2 = extractelement <2 x double> %a0, i32 0
3219  %3 = extractelement <2 x double> %a1, i32 0
3220  %4 = fsub double %2, %3
3221  %5 = insertelement <2 x double> %a0, double %4, i32 0
3222  ret <2 x double> %5
3223}
3224
3225define float @stack_fold_subss(float %a0, float %a1) {
3226; CHECK-LABEL: stack_fold_subss:
3227; CHECK:       # %bb.0:
3228; CHECK-NEXT:    vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3229; CHECK-NEXT:    #APP
3230; CHECK-NEXT:    nop
3231; CHECK-NEXT:    #NO_APP
3232; CHECK-NEXT:    vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
3233; CHECK-NEXT:    retq
3234  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3235  %2 = fsub float %a0, %a1
3236  ret float %2
3237}
3238
3239define <4 x float> @stack_fold_subss_int(<4 x float> %a0, <4 x float> %a1) {
3240; CHECK-LABEL: stack_fold_subss_int:
3241; CHECK:       # %bb.0:
3242; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3243; CHECK-NEXT:    #APP
3244; CHECK-NEXT:    nop
3245; CHECK-NEXT:    #NO_APP
3246; CHECK-NEXT:    vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3247; CHECK-NEXT:    retq
3248  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3249  %2 = extractelement <4 x float> %a0, i32 0
3250  %3 = extractelement <4 x float> %a1, i32 0
3251  %4 = fsub float %2, %3
3252  %5 = insertelement <4 x float> %a0, float %4, i32 0
3253  ret <4 x float> %5
3254}
3255
3256define i32 @stack_fold_testpd(<2 x double> %a0, <2 x double> %a1) {
3257; CHECK-LABEL: stack_fold_testpd:
3258; CHECK:       # %bb.0:
3259; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3260; CHECK-NEXT:    #APP
3261; CHECK-NEXT:    nop
3262; CHECK-NEXT:    #NO_APP
3263; CHECK-NEXT:    xorl %eax, %eax
3264; CHECK-NEXT:    vtestpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3265; CHECK-NEXT:    setb %al
3266; CHECK-NEXT:    retq
3267  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3268  %2 = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1)
3269  ret i32 %2
3270}
3271declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnone
3272
3273define i32 @stack_fold_testpd_ymm(<4 x double> %a0, <4 x double> %a1) {
3274; CHECK-LABEL: stack_fold_testpd_ymm:
3275; CHECK:       # %bb.0:
3276; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3277; CHECK-NEXT:    #APP
3278; CHECK-NEXT:    nop
3279; CHECK-NEXT:    #NO_APP
3280; CHECK-NEXT:    xorl %eax, %eax
3281; CHECK-NEXT:    vtestpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
3282; CHECK-NEXT:    setb %al
3283; CHECK-NEXT:    vzeroupper
3284; CHECK-NEXT:    retq
3285  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3286  %2 = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1)
3287  ret i32 %2
3288}
3289declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind readnone
3290
3291define i32 @stack_fold_testps(<4 x float> %a0, <4 x float> %a1) {
3292; CHECK-LABEL: stack_fold_testps:
3293; CHECK:       # %bb.0:
3294; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3295; CHECK-NEXT:    #APP
3296; CHECK-NEXT:    nop
3297; CHECK-NEXT:    #NO_APP
3298; CHECK-NEXT:    xorl %eax, %eax
3299; CHECK-NEXT:    vtestps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3300; CHECK-NEXT:    setb %al
3301; CHECK-NEXT:    retq
3302  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3303  %2 = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1)
3304  ret i32 %2
3305}
3306declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone
3307
3308define i32 @stack_fold_testps_ymm(<8 x float> %a0, <8 x float> %a1) {
3309; CHECK-LABEL: stack_fold_testps_ymm:
3310; CHECK:       # %bb.0:
3311; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3312; CHECK-NEXT:    #APP
3313; CHECK-NEXT:    nop
3314; CHECK-NEXT:    #NO_APP
3315; CHECK-NEXT:    xorl %eax, %eax
3316; CHECK-NEXT:    vtestps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
3317; CHECK-NEXT:    setb %al
3318; CHECK-NEXT:    vzeroupper
3319; CHECK-NEXT:    retq
3320  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3321  %2 = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1)
3322  ret i32 %2
3323}
3324declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readnone
3325
3326define i32 @stack_fold_ucomisd(double %a0, double %a1) {
3327; CHECK-LABEL: stack_fold_ucomisd:
3328; CHECK:       # %bb.0:
3329; CHECK-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
3330; CHECK-NEXT:    #APP
3331; CHECK-NEXT:    nop
3332; CHECK-NEXT:    #NO_APP
3333; CHECK-NEXT:    xorl %eax, %eax
3334; CHECK-NEXT:    vucomisd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
3335; CHECK-NEXT:    sete %al
3336; CHECK-NEXT:    leal -1(%rax,%rax), %eax
3337; CHECK-NEXT:    retq
3338  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3339  %2 = fcmp ueq double %a0, %a1
3340  %3 = select i1 %2, i32 1, i32 -1
3341  ret i32 %3
3342}
3343
3344define i32 @stack_fold_ucomisd_int(<2 x double> %a0, <2 x double> %a1) {
3345; CHECK-LABEL: stack_fold_ucomisd_int:
3346; CHECK:       # %bb.0:
3347; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3348; CHECK-NEXT:    #APP
3349; CHECK-NEXT:    nop
3350; CHECK-NEXT:    #NO_APP
3351; CHECK-NEXT:    vucomisd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3352; CHECK-NEXT:    setnp %al
3353; CHECK-NEXT:    sete %cl
3354; CHECK-NEXT:    andb %al, %cl
3355; CHECK-NEXT:    movzbl %cl, %eax
3356; CHECK-NEXT:    retq
3357  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3358  %2 = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1)
3359  ret i32 %2
3360}
3361declare i32 @llvm.x86.sse2.ucomieq.sd(<2 x double>, <2 x double>) nounwind readnone
3362
3363define i32 @stack_fold_ucomiss(float %a0, float %a1) {
3364; CHECK-LABEL: stack_fold_ucomiss:
3365; CHECK:       # %bb.0:
3366; CHECK-NEXT:    vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3367; CHECK-NEXT:    #APP
3368; CHECK-NEXT:    nop
3369; CHECK-NEXT:    #NO_APP
3370; CHECK-NEXT:    xorl %eax, %eax
3371; CHECK-NEXT:    vucomiss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
3372; CHECK-NEXT:    sete %al
3373; CHECK-NEXT:    leal -1(%rax,%rax), %eax
3374; CHECK-NEXT:    retq
3375  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3376  %2 = fcmp ueq float %a0, %a1
3377  %3 = select i1 %2, i32 1, i32 -1
3378  ret i32 %3
3379}
3380
3381define i32 @stack_fold_ucomiss_int(<4 x float> %a0, <4 x float> %a1) {
3382; CHECK-LABEL: stack_fold_ucomiss_int:
3383; CHECK:       # %bb.0:
3384; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3385; CHECK-NEXT:    #APP
3386; CHECK-NEXT:    nop
3387; CHECK-NEXT:    #NO_APP
3388; CHECK-NEXT:    vucomiss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3389; CHECK-NEXT:    setnp %al
3390; CHECK-NEXT:    sete %cl
3391; CHECK-NEXT:    andb %al, %cl
3392; CHECK-NEXT:    movzbl %cl, %eax
3393; CHECK-NEXT:    retq
3394  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3395  %2 = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1)
3396  ret i32 %2
3397}
3398declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone
3399
3400define <2 x double> @stack_fold_unpckhpd(<2 x double> %a0, <2 x double> %a1) {
3401; CHECK-LABEL: stack_fold_unpckhpd:
3402; CHECK:       # %bb.0:
3403; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3404; CHECK-NEXT:    #APP
3405; CHECK-NEXT:    nop
3406; CHECK-NEXT:    #NO_APP
3407; CHECK-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3408; CHECK-NEXT:    # xmm0 = xmm0[1],mem[1]
3409; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
3410; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
3411; CHECK-NEXT:    retq
3412  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3413  %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3>
3414  ; fadd forces execution domain
3415  %3 = fadd <2 x double> %2, <double 0x0, double 0x0>
3416  ret <2 x double> %3
3417}
3418
3419define <4 x double> @stack_fold_unpckhpd_ymm(<4 x double> %a0, <4 x double> %a1) {
3420; CHECK-LABEL: stack_fold_unpckhpd_ymm:
3421; CHECK:       # %bb.0:
3422; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3423; CHECK-NEXT:    #APP
3424; CHECK-NEXT:    nop
3425; CHECK-NEXT:    #NO_APP
3426; CHECK-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3427; CHECK-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
3428; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
3429; CHECK-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
3430; CHECK-NEXT:    retq
3431  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3432  %2 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
3433  ; fadd forces execution domain
3434  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
3435  ret <4 x double> %3
3436}
3437
3438define <4 x float> @stack_fold_unpckhps(<4 x float> %a0, <4 x float> %a1) {
3439; CHECK-LABEL: stack_fold_unpckhps:
3440; CHECK:       # %bb.0:
3441; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3442; CHECK-NEXT:    #APP
3443; CHECK-NEXT:    nop
3444; CHECK-NEXT:    #NO_APP
3445; CHECK-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3446; CHECK-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
3447; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
3448; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
3449; CHECK-NEXT:    retq
3450  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3451  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
3452  ; fadd forces execution domain
3453  %3 = fadd <4 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0>
3454  ret <4 x float> %3
3455}
3456
3457define <8 x float> @stack_fold_unpckhps_ymm(<8 x float> %a0, <8 x float> %a1) {
3458; CHECK-LABEL: stack_fold_unpckhps_ymm:
3459; CHECK:       # %bb.0:
3460; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3461; CHECK-NEXT:    #APP
3462; CHECK-NEXT:    nop
3463; CHECK-NEXT:    #NO_APP
3464; CHECK-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3465; CHECK-NEXT:    # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
3466; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
3467; CHECK-NEXT:    vaddps %ymm1, %ymm0, %ymm0
3468; CHECK-NEXT:    retq
3469  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3470  %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
3471  ; fadd forces execution domain
3472  %3 = fadd <8 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
3473  ret <8 x float> %3
3474}
3475
3476define <2 x double> @stack_fold_unpcklpd(<2 x double> %a0, <2 x double> %a1) {
3477; CHECK-LABEL: stack_fold_unpcklpd:
3478; CHECK:       # %bb.0:
3479; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3480; CHECK-NEXT:    #APP
3481; CHECK-NEXT:    nop
3482; CHECK-NEXT:    #NO_APP
3483; CHECK-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3484; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
3485; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
3486; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
3487; CHECK-NEXT:    retq
3488  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3489  %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 2>
3490  ; fadd forces execution domain
3491  %3 = fadd <2 x double> %2, <double 0x0, double 0x0>
3492  ret <2 x double> %3
3493}
3494
3495define <4 x double> @stack_fold_unpcklpd_ymm(<4 x double> %a0, <4 x double> %a1) {
3496; CHECK-LABEL: stack_fold_unpcklpd_ymm:
3497; CHECK:       # %bb.0:
3498; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3499; CHECK-NEXT:    #APP
3500; CHECK-NEXT:    nop
3501; CHECK-NEXT:    #NO_APP
3502; CHECK-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3503; CHECK-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
3504; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
3505; CHECK-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
3506; CHECK-NEXT:    retq
3507  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3508  %2 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
3509  ; fadd forces execution domain
3510  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
3511  ret <4 x double> %3
3512}
3513
3514define <4 x float> @stack_fold_unpcklps(<4 x float> %a0, <4 x float> %a1) {
3515; CHECK-LABEL: stack_fold_unpcklps:
3516; CHECK:       # %bb.0:
3517; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3518; CHECK-NEXT:    #APP
3519; CHECK-NEXT:    nop
3520; CHECK-NEXT:    #NO_APP
3521; CHECK-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3522; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
3523; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
3524; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
3525; CHECK-NEXT:    retq
3526  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3527  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
3528  ; fadd forces execution domain
3529  %3 = fadd <4 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0>
3530  ret <4 x float> %3
3531}
3532
3533define <8 x float> @stack_fold_unpcklps_ymm(<8 x float> %a0, <8 x float> %a1) {
3534; CHECK-LABEL: stack_fold_unpcklps_ymm:
3535; CHECK:       # %bb.0:
3536; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3537; CHECK-NEXT:    #APP
3538; CHECK-NEXT:    nop
3539; CHECK-NEXT:    #NO_APP
3540; CHECK-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3541; CHECK-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
3542; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
3543; CHECK-NEXT:    vaddps %ymm1, %ymm0, %ymm0
3544; CHECK-NEXT:    retq
3545  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3546  %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
3547  ; fadd forces execution domain
3548  %3 = fadd <8 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
3549  ret <8 x float> %3
3550}
3551
3552define <2 x double> @stack_fold_xorpd(<2 x double> %a0, <2 x double> %a1) {
3553; CHECK-LABEL: stack_fold_xorpd:
3554; CHECK:       # %bb.0:
3555; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3556; CHECK-NEXT:    #APP
3557; CHECK-NEXT:    nop
3558; CHECK-NEXT:    #NO_APP
3559; CHECK-NEXT:    vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3560; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
3561; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
3562; CHECK-NEXT:    retq
3563  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3564  %2 = bitcast <2 x double> %a0 to <2 x i64>
3565  %3 = bitcast <2 x double> %a1 to <2 x i64>
3566  %4 = xor <2 x i64> %2, %3
3567  %5 = bitcast <2 x i64> %4 to <2 x double>
3568  ; fadd forces execution domain
3569  %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
3570  ret <2 x double> %6
3571}
3572
3573define <4 x double> @stack_fold_xorpd_ymm(<4 x double> %a0, <4 x double> %a1) {
3574; CHECK-LABEL: stack_fold_xorpd_ymm:
3575; CHECK:       # %bb.0:
3576; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3577; CHECK-NEXT:    #APP
3578; CHECK-NEXT:    nop
3579; CHECK-NEXT:    #NO_APP
3580; CHECK-NEXT:    vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3581; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
3582; CHECK-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
3583; CHECK-NEXT:    retq
3584  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3585  %2 = bitcast <4 x double> %a0 to <4 x i64>
3586  %3 = bitcast <4 x double> %a1 to <4 x i64>
3587  %4 = xor <4 x i64> %2, %3
3588  %5 = bitcast <4 x i64> %4 to <4 x double>
3589  ; fadd forces execution domain
3590  %6 = fadd <4 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0>
3591  ret <4 x double> %6
3592}
3593
3594define <4 x float> @stack_fold_xorps(<4 x float> %a0, <4 x float> %a1) {
3595; CHECK-LABEL: stack_fold_xorps:
3596; CHECK:       # %bb.0:
3597; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3598; CHECK-NEXT:    #APP
3599; CHECK-NEXT:    nop
3600; CHECK-NEXT:    #NO_APP
3601; CHECK-NEXT:    vxorps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3602; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
3603; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
3604; CHECK-NEXT:    retq
3605  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3606  %2 = bitcast <4 x float> %a0 to <2 x i64>
3607  %3 = bitcast <4 x float> %a1 to <2 x i64>
3608  %4 = xor <2 x i64> %2, %3
3609  %5 = bitcast <2 x i64> %4 to <4 x float>
3610  ; fadd forces execution domain
3611  %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
3612  ret <4 x float> %6
3613}
3614
3615define <8 x float> @stack_fold_xorps_ymm(<8 x float> %a0, <8 x float> %a1) {
3616; CHECK-LABEL: stack_fold_xorps_ymm:
3617; CHECK:       # %bb.0:
3618; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3619; CHECK-NEXT:    #APP
3620; CHECK-NEXT:    nop
3621; CHECK-NEXT:    #NO_APP
3622; CHECK-NEXT:    vxorps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3623; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
3624; CHECK-NEXT:    vaddps %ymm1, %ymm0, %ymm0
3625; CHECK-NEXT:    retq
3626  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3627  %2 = bitcast <8 x float> %a0 to <4 x i64>
3628  %3 = bitcast <8 x float> %a1 to <4 x i64>
3629  %4 = xor <4 x i64> %2, %3
3630  %5 = bitcast <4 x i64> %4 to <8 x float>
3631  ; fadd forces execution domain
3632  %6 = fadd <8 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
3633  ret <8 x float> %6
3634}
3635
3636attributes #0 = { "unsafe-fp-math"="false" }
3637attributes #1 = { "unsafe-fp-math"="true" }
3638