1; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
2
;; External callees: @do_sse takes and returns a 128-bit <4 x float>,
;; @do_avx takes and returns a 256-bit <8 x float>.
3declare <4 x float> @do_sse(<4 x float>)
4declare <8 x float> @do_avx(<8 x float>)
;; AVX intrinsic producing a <4 x float> from an <8 x float> and an i8 immediate.
5declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone
;; Globals used as data sources/sinks: test01 loads and stores @x (128-bit),
;; test02 loads @g (256-bit).
6@x = common global <4 x float> zeroinitializer, align 16
7@g = common global <8 x float> zeroinitializer, align 32
8
9;; Basic checking - don't emit any vzeroupper instruction
10
11; CHECK: _test00
;; test00: only 128-bit values are involved. The function adds %a and %b,
;; passes the sum to @do_sse and returns the result. The directives below
;; require that no vzeroupper appears between the function label and the ret.
12define <4 x float> @test00(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
13entry:
14  ; CHECK-NOT: vzeroupper
15  %add.i = fadd <4 x float> %a, %b
16  %call3 = call <4 x float> @do_sse(<4 x float> %add.i) nounwind
17  ; CHECK: ret
18  ret <4 x float> %call3
19}
20
21;; Check 256-bit parameter passing
22
23; CHECK: _test01
;; test01: receives a 256-bit argument (%c) and returns it unchanged. In
;; between it calls the 128-bit @do_sse twice, feeding the value of @x through
;; both calls and storing each result back to @x. The directives expect a
;; vzeroupper on the line directly before the first call, and none between
;; the first and the second call.
24define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounwind uwtable ssp {
25entry:
26  %tmp = load <4 x float>* @x, align 16
27  ; CHECK: vzeroupper
28  ; CHECK-NEXT: callq _do_sse
29  %call = tail call <4 x float> @do_sse(<4 x float> %tmp) nounwind
30  store <4 x float> %call, <4 x float>* @x, align 16
  ; The second call consumes the result of the first one; a second
  ; vzeroupper must not be emitted for it.
31  ; CHECK-NOT: vzeroupper
32  ; CHECK: callq _do_sse
33  %call2 = tail call <4 x float> @do_sse(<4 x float> %call) nounwind
34  store <4 x float> %call2, <4 x float>* @x, align 16
35  ; CHECK: ret
  ; The 256-bit argument is returned as-is.
36  ret <8 x float> %c
37}
38
39;; Test that the pass converges and that vzeroupper is issued only when
40;; necessary; for this function it should be issued only once.
41
42; CHECK: _test02
;; test02: a loop whose body makes three calls to @do_sse. The first two
;; operate on 128-bit values only; before the third, a 256-bit value is loaded
;; from @g and narrowed to 128 bits with the vextractf128 intrinsic. The
;; directives expect, after a basic-block label, no vzeroupper before the
;; first call, the second call on the line right after the first, and then a
;; vzeroupper on the line directly before the third call.
43define <4 x float> @test02(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
44entry:
45  %add.i = fadd <4 x float> %a, %b
46  br label %for.body
47
48for.body:                                         ; preds = %for.body, %entry
49  ; CHECK: LBB
50  ; CHECK-NOT: vzeroupper
  ; %i.018 is the iteration counter (starts at 0); %c.017 carries the vector
  ; from the previous iteration (initially %a + %b).
51  %i.018 = phi i32 [ 0, %entry ], [ %1, %for.body ]
52  %c.017 = phi <4 x float> [ %add.i, %entry ], [ %call14, %for.body ]
53  ; CHECK: callq _do_sse
54  %call5 = tail call <4 x float> @do_sse(<4 x float> %c.017) nounwind
55  ; CHECK-NEXT: callq _do_sse
56  %call7 = tail call <4 x float> @do_sse(<4 x float> %call5) nounwind
  ; 256-bit load followed by an extract with immediate 1; this is the only
  ; 256-bit operation in the loop body.
57  %tmp11 = load <8 x float>* @g, align 32
58  %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %tmp11, i8 1) nounwind
59  ; CHECK: vzeroupper
60  ; CHECK-NEXT: callq _do_sse
61  %call14 = tail call <4 x float> @do_sse(<4 x float> %0) nounwind
  ; Increment the counter and leave the loop once it reaches 4.
62  %1 = add nsw i32 %i.018, 1
63  %exitcond = icmp eq i32 %1, 4
64  br i1 %exitcond, label %for.end, label %for.body
65
66for.end:                                          ; preds = %for.body
67  ret <4 x float> %call14
68}
69
70;; Check that we also perform vzeroupper when we return from a function.
71
72; CHECK: _test03
;; test03: builds a 256-bit vector from the two 128-bit arguments, passes it
;; to @do_avx, and returns the first four elements of the 256-bit result. The
;; directives expect no vzeroupper before the call, and a vzeroupper after it,
;; ahead of the ret.
73define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
74entry:
  ; Mask 0..7 concatenates %a (elements 0-3) and %b (elements 4-7).
75  %shuf = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
76  ; CHECK-NOT: vzeroupper
77  ; CHECK: call
78  %call = call <8 x float> @do_avx(<8 x float> %shuf) nounwind
  ; Keep only elements 0-3 of the 256-bit call result.
79  %shuf2 = shufflevector <8 x float> %call, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
80  ; CHECK: vzeroupper
81  ; CHECK: ret
82  ret <4 x float> %shuf2
83}
84