; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -O2 -tail-dup-size=1000 -tail-dup-placement-threshold=1000 -enable-tail-merge=0 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s

; Need to trigger tail duplication during MachineBlockPlacement, since calls
; aren't tail duplicated pre-RA.

; External callees used by the call tests below. @nonconvergent_func uses
; attribute group #0 (nounwind only); @convergent_func uses #1, which adds
; `convergent`.
declare void @nonconvergent_func() #0
declare void @convergent_func() #1

; AMDGPU intrinsics exercised below. Each one is declared with an attribute
; group (#1 or #2) that includes `convergent`.
declare void @llvm.amdgcn.s.barrier() #1
declare void @llvm.amdgcn.ds.gws.init(i32, i32) #2
declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) #2
declare void @llvm.amdgcn.ds.gws.sema.release.all(i32 %offset) #2

; barrier shouldn't be duplicated.
;
; Both arms of the branch store to %a and then join in %call, which holds the
; only call to the s.barrier intrinsic (declared convergent). The checks
; require exactly one s_barrier in the emitted function: if %call were cloned
; into %bb1 and %bb2, a second one would appear and trip the negative check.

; GCN-LABEL: {{^}}taildup_barrier:
; GCN: s_barrier
; GCN-NOT: s_barrier
define void @taildup_barrier(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i1 %cond) #0 {
entry:
  br i1 %cond, label %bb1, label %bb2

bb1:
  store i32 0, i32 addrspace(1)* %a
  br label %call

bb2:
  store i32 1, i32 addrspace(1)* %a
  br label %call

; Join block reached from both %bb1 and %bb2; it must stay a single copy.
call:
  call void @llvm.amdgcn.s.barrier()
  br label %ret

ret:
  ret void
}

; Same branch/join shape as above, but the join block calls @convergent_func
; (declared with the convergent attribute group #1). The checks require
; exactly one s_swappc_b64 in the emitted function, i.e. the call block was
; not cloned into its two predecessors.

; GCN-LABEL: {{^}}taildup_convergent_call:
; GCN: s_swappc_b64
; GCN-NOT: s_swappc_b64
define void @taildup_convergent_call(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i1 %cond) #1 {
entry:
  br i1 %cond, label %bb1, label %bb2

bb1:
  store i32 0, i32 addrspace(1)* %a
  br label %call

bb2:
  store i32 1, i32 addrspace(1)* %a
  br label %call

; Join block reached from both %bb1 and %bb2; it must stay a single copy.
call:
  call void @convergent_func()
  br label %ret

ret:
  ret void
}

; TODO: Currently there is only one convergent call pseudo, but this
; theoretically could use a nonconvergent variant.
;
; The callee @nonconvergent_func is declared with attribute group #0 (no
; `convergent`), while this function itself carries #1. The checks still
; require exactly one s_swappc_b64, so the call block is expected to stay
; un-duplicated for now (see the TODO).

; GCN-LABEL: {{^}}taildup_nonconvergent_call:
; GCN: s_swappc_b64
; GCN-NOT: s_swappc_b64
define void @taildup_nonconvergent_call(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i1 %cond) #1 {
entry:
  br i1 %cond, label %bb1, label %bb2

bb1:
  store i32 0, i32 addrspace(1)* %a
  br label %call

bb2:
  store i32 1, i32 addrspace(1)* %a
  br label %call

; Join block reached from both %bb1 and %bb2.
call:
  call void @nonconvergent_func()
  br label %ret

ret:
  ret void
}

; Tail-call variant: the join block ends in `tail call` + `ret void` instead
; of branching to a separate return block. The checks require exactly one
; s_setpc_b64 in the emitted function.

; GCN-LABEL: {{^}}taildup_convergent_tailcall:
; GCN: s_setpc_b64
; GCN-NOT: s_setpc_b64
define void @taildup_convergent_tailcall(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i1 %cond) #1 {
entry:
  br i1 %cond, label %bb1, label %bb2

bb1:
  store i32 0, i32 addrspace(1)* %a
  br label %call

bb2:
  store i32 1, i32 addrspace(1)* %a
  br label %call

; Join block reached from both %bb1 and %bb2; returns directly after the call.
call:
  tail call void @convergent_func()
  ret void
}

; Kernel variant using the ds.gws.init intrinsic (declared convergent via
; attribute group #2) in the join block. The checks require exactly one
; ds_gws_init in the emitted kernel.

; GCN-LABEL: {{^}}taildup_gws_init:
; GCN: ds_gws_init
; GCN-NOT: ds_gws_init
define amdgpu_kernel void @taildup_gws_init(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i1 %cond, i32 %val, i32 %offset) #0 {
entry:
  br i1 %cond, label %bb1, label %bb2

bb1:
  store i32 0, i32 addrspace(1)* %a
  br label %call

bb2:
  store i32 1, i32 addrspace(1)* %a
  br label %call

; Join block reached from both %bb1 and %bb2; it must stay a single copy.
call:
  call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %offset)
  br label %ret

ret:
  ret void
}

; Kernel variant using the ds.gws.barrier intrinsic (declared convergent via
; attribute group #2) in the join block. The checks require exactly one
; ds_gws_barrier in the emitted kernel.

; GCN-LABEL: {{^}}taildup_gws_barrier:
; GCN: ds_gws_barrier
; GCN-NOT: ds_gws_barrier
define amdgpu_kernel void @taildup_gws_barrier(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i1 %cond, i32 %val, i32 %offset) #0 {
entry:
  br i1 %cond, label %bb1, label %bb2

bb1:
  store i32 0, i32 addrspace(1)* %a
  br label %call

bb2:
  store i32 1, i32 addrspace(1)* %a
  br label %call

; Join block reached from both %bb1 and %bb2; it must stay a single copy.
call:
  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %offset)
  br label %ret

ret:
  ret void
}

; Kernel variant using the ds.gws.sema.release.all intrinsic (declared
; convergent via attribute group #2) in the join block. After the single
; expected ds_gws_sema_release_all, the negative check uses the broader
; `ds_gws` pattern, so any further GWS instruction of any kind fails the test.

; GCN-LABEL: {{^}}taildup_gws_sema_release_all:
; GCN: ds_gws_sema_release_all
; GCN-NOT: ds_gws
define amdgpu_kernel void @taildup_gws_sema_release_all(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i1 %cond, i32 %offset) #0 {
entry:
  br i1 %cond, label %bb1, label %bb2

bb1:
  store i32 0, i32 addrspace(1)* %a
  br label %call

bb2:
  store i32 1, i32 addrspace(1)* %a
  br label %call

; Join block reached from both %bb1 and %bb2; it must stay a single copy.
call:
  call void @llvm.amdgcn.ds.gws.sema.release.all(i32 %offset)
  br label %ret

ret:
  ret void
}

; Attribute groups referenced above:
;   #0 - nounwind only (no `convergent`).
;   #1 - nounwind + convergent.
;   #2 - convergent + inaccessiblememonly + nounwind (used by the GWS
;        intrinsic declarations).
attributes #0 = { nounwind }
attributes #1 = { nounwind convergent }
attributes #2 = { convergent inaccessiblememonly nounwind }
178