; Codegen test for the AMDGPU exec-mask initialisation intrinsics
; llvm.amdgcn.init.exec and llvm.amdgcn.init.exec.from.input.
; Both invocations below share one set of checks: gfx900, and gfx1010 with
; wavefrontsize64 selected in place of wavefrontsize32.
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s --check-prefix=GCN
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; full_mask: llvm.amdgcn.init.exec called with the all-ones constant.
; The checks expect one s_mov_b64 of -1 into exec, appearing ahead of the
; v_add_f32 even though the intrinsic call follows the fadd in the IR.
; GCN-LABEL: {{^}}full_mask:
; GCN: s_mov_b64 exec, -1
; GCN: v_add_f32_e32 v0,
define amdgpu_ps float @full_mask(float %a, float %b) {
main_body:
  %s = fadd float %a, %b
  call void @llvm.amdgcn.init.exec(i64 -1)
  ret float %s
}

; partial_mask: llvm.amdgcn.init.exec called with an arbitrary constant.
; The decimal mask 123456 is expected to be printed in hex (0x1e240) as the
; source of the s_mov_b64 into exec, again ahead of the v_add_f32.
; GCN-LABEL: {{^}}partial_mask:
; GCN: s_mov_b64 exec, 0x1e240
; GCN: v_add_f32_e32 v0,
define amdgpu_ps float @partial_mask(float %a, float %b) {
main_body:
  %s = fadd float %a, %b
  call void @llvm.amdgcn.init.exec(i64 123456)
  ret float %s
}

; input_s3off8: llvm.amdgcn.init.exec.from.input reading the fourth inreg
; argument with bit offset 8. Three unnamed inreg i32 arguments precede
; %count, and the checks expect it to be read from s3.
; Expected sequence:
;   - s_bfe_u32 with immediate 0x70008 (low bits = offset 8; the 0x7 in the
;     upper half is presumably the field width -- confirm against the ISA docs)
;   - s_bfm_b64 building exec from the extracted value
;   - s_cmp_eq_u32 against 64 followed by s_cmov_b64 exec, -1, i.e. exec is
;     set to all ones when the extracted value equals 64
; GCN-LABEL: {{^}}input_s3off8:
; GCN: s_bfe_u32 s0, s3, 0x70008
; GCN: s_bfm_b64 exec, s0, 0
; GCN: s_cmp_eq_u32 s0, 64
; GCN: s_cmov_b64 exec, -1
; GCN: v_add_f32_e32 v0,
define amdgpu_ps float @input_s3off8(i32 inreg, i32 inreg, i32 inreg, i32 inreg %count, float %a, float %b) {
main_body:
  %s = fadd float %a, %b
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)
  ret float %s
}

; input_s0off19: llvm.amdgcn.init.exec.from.input reading the first inreg
; argument with bit offset 19. The checks expect the source to be s0 and the
; s_bfe_u32 immediate to be 0x70013 (0x13 == 19), followed by the same
; s_bfm_b64 / s_cmp_eq_u32 64 / s_cmov_b64 sequence as the offset-8 case.
; GCN-LABEL: {{^}}input_s0off19:
; GCN: s_bfe_u32 s0, s0, 0x70013
; GCN: s_bfm_b64 exec, s0, 0
; GCN: s_cmp_eq_u32 s0, 64
; GCN: s_cmov_b64 exec, -1
; GCN: v_add_f32_e32 v0,
define amdgpu_ps float @input_s0off19(i32 inreg %count, float %a, float %b) {
main_body:
  %s = fadd float %a, %b
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19)
  ret float %s
}

; reuse_input: %count feeds the intrinsic and is then used again by an add
; that follows the call. The checks expect the extracted field to go to s1,
; leaving s0 intact so the later vector add can still read it. The add may be
; spelled v_add_u32 or v_add_nc_u32 depending on the target.
; GCN-LABEL: {{^}}reuse_input:
; GCN: s_bfe_u32 s1, s0, 0x70013
; GCN: s_bfm_b64 exec, s1, 0
; GCN: s_cmp_eq_u32 s1, 64
; GCN: s_cmov_b64 exec, -1
; GCN: v_add{{(_nc)?}}_u32_e32 v0, s0, v0
define amdgpu_ps float @reuse_input(i32 inreg %count, i32 %a) {
main_body:
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19)
  %s = add i32 %a, %count
  %f = sitofp i32 %s to float
  ret float %f
}

; reuse_input2: same as reuse_input, except that the add and sitofp using
; %count come before the intrinsic call in the IR. The expected output is
; identical: the exec setup (via s1) is still checked ahead of the vector add
; that reads s0.
; GCN-LABEL: {{^}}reuse_input2:
; GCN: s_bfe_u32 s1, s0, 0x70013
; GCN: s_bfm_b64 exec, s1, 0
; GCN: s_cmp_eq_u32 s1, 64
; GCN: s_cmov_b64 exec, -1
; GCN: v_add{{(_nc)?}}_u32_e32 v0, s0, v0
define amdgpu_ps float @reuse_input2(i32 inreg %count, i32 %a) {
main_body:
  %s = add i32 %a, %count
  %f = sitofp i32 %s to float
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19)
  ret float %f
}

; init_unreachable: llvm.amdgcn.init.exec immediately followed by
; unreachable. Only the function label is checked; the test passes as long as
; compilation completes.
; GCN-LABEL: {{^}}init_unreachable:
;
; This used to crash.
define amdgpu_ps void @init_unreachable() {
main_body:
  call void @llvm.amdgcn.init.exec(i64 -1)
  unreachable
}

; init_exec_before_frame_materialize: llvm.amdgcn.init.exec in a function
; that has two addrspace(5) allocas, constant-index stores and loads indexed
; by %b. The checks require that no line starting with "v_" appears before
; the s_mov_b64 into exec; v_mov and v_add are only expected afterwards.
; NOTE(review): going by the test name, the vector instructions of interest
; are the ones that materialise frame addresses -- confirm.
; GCN-LABEL: {{^}}init_exec_before_frame_materialize:
; GCN-NOT: {{^}}v_
; GCN: s_mov_b64 exec, -1
; GCN: v_mov
; GCN: v_add
define amdgpu_ps float @init_exec_before_frame_materialize(i32 inreg %a, i32 inreg %b) {
main_body:
  %array0 = alloca [1024 x i32], align 16, addrspace(5)
  %array1 = alloca [20 x i32], align 16, addrspace(5)
  call void @llvm.amdgcn.init.exec(i64 -1)

  %ptr0 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 1
  store i32 %a, i32 addrspace(5)* %ptr0, align 4

  %ptr1 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 1
  store i32 %a, i32 addrspace(5)* %ptr1, align 4

  %ptr2 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 2
  store i32 %b, i32 addrspace(5)* %ptr2, align 4

  %ptr3 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 %b
  %v3 = load i32, i32 addrspace(5)* %ptr3, align 4

  %ptr4 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 %b
  %v4 = load i32, i32 addrspace(5)* %ptr4, align 4

  %v5 = add i32 %v3, %v4
  %v = bitcast i32 %v5 to float
  ret float %v
}

; init_exec_input_before_frame_materialize: the from.input variant of the
; test above. %count is the third inreg argument and the offset is 8. The
; checks require that no line starting with "v_" precedes the exec setup, and
; that the s_bfe_u32 / s_bfm_b64 / s_cmp_eq_u32 / s_cmov_b64 instructions
; (operating on s2 in place) are emitted back to back with nothing between
; them. v_mov and v_add are expected only after that sequence.
; GCN-LABEL: {{^}}init_exec_input_before_frame_materialize:
; GCN-NOT: {{^}}v_
; GCN: s_bfe_u32 s2, s2, 0x70008
; GCN-NEXT: s_bfm_b64 exec, s2, 0
; GCN-NEXT: s_cmp_eq_u32 s2, 64
; GCN-NEXT: s_cmov_b64 exec, -1
; GCN: v_mov
; GCN: v_add
define amdgpu_ps float @init_exec_input_before_frame_materialize(i32 inreg %a, i32 inreg %b, i32 inreg %count) {
main_body:
  %array0 = alloca [1024 x i32], align 16, addrspace(5)
  %array1 = alloca [20 x i32], align 16, addrspace(5)
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)

  %ptr0 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 1
  store i32 %a, i32 addrspace(5)* %ptr0, align 4

  %ptr1 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 1
  store i32 %a, i32 addrspace(5)* %ptr1, align 4

  %ptr2 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 2
  store i32 %b, i32 addrspace(5)* %ptr2, align 4

  %ptr3 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 %b
  %v3 = load i32, i32 addrspace(5)* %ptr3, align 4

  %ptr4 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 %b
  %v4 = load i32, i32 addrspace(5)* %ptr4, align 4

  %v5 = add i32 %v3, %v4
  %v = bitcast i32 %v5 to float
  ret float %v
}

; init_exec_input_before_frame_materialize_nonentry: the from.input call
; sits in %endif, which is reached either directly from the entry block or
; through %if. The checks require that no line starting with "v_" appears
; before the "%endif" text in the output, and that the exec setup follows it
; as four consecutive instructions. Here the extracted field goes to s3 and
; s2 is left intact; %count is used again by the final add (%v6).
; GCN-LABEL: {{^}}init_exec_input_before_frame_materialize_nonentry:
; GCN-NOT: {{^}}v_
; GCN: %endif
; GCN: s_bfe_u32 s3, s2, 0x70008
; GCN-NEXT: s_bfm_b64 exec, s3, 0
; GCN-NEXT: s_cmp_eq_u32 s3, 64
; GCN-NEXT: s_cmov_b64 exec, -1
; GCN: v_mov
; GCN: v_add
define amdgpu_ps float @init_exec_input_before_frame_materialize_nonentry(i32 inreg %a, i32 inreg %b, i32 inreg %count) {
main_body:
  ; Ideally these allocas would be in %endif, but that causes problems on
  ; Windows GlobalISel.
  %array0 = alloca [1024 x i32], align 16, addrspace(5)
  %array1 = alloca [20 x i32], align 16, addrspace(5)

  %cc = icmp uge i32 %count, 32
  br i1 %cc, label %endif, label %if

if:
  ; Empty side-effecting inline asm; presumably here so that this block is
  ; not folded away and %endif stays a separate, non-entry block -- confirm.
  call void asm sideeffect "", ""()
  br label %endif

endif:
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)

  %ptr0 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 1
  store i32 %a, i32 addrspace(5)* %ptr0, align 4

  %ptr1 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 1
  store i32 %a, i32 addrspace(5)* %ptr1, align 4

  %ptr2 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 2
  store i32 %b, i32 addrspace(5)* %ptr2, align 4

  %ptr3 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 %b
  %v3 = load i32, i32 addrspace(5)* %ptr3, align 4

  %ptr4 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 %b
  %v4 = load i32, i32 addrspace(5)* %ptr4, align 4

  %v5 = add i32 %v3, %v4
  %v6 = add i32 %v5, %count
  %v = bitcast i32 %v6 to float
  ret float %v
}

; Declarations of the two intrinsics under test. Both carry attribute group
; #1, which marks them convergent.
declare void @llvm.amdgcn.init.exec(i64) #1
declare void @llvm.amdgcn.init.exec.from.input(i32, i32) #1

attributes #1 = { convergent }
