; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s

declare i1 @llvm.amdgcn.class.f32(float, i32) #1
declare i1 @llvm.amdgcn.class.f64(double, i32) #1
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fabs.f32(float) #1
declare double @llvm.fabs.f64(double) #1

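; For reference, the second operand of llvm.amdgcn.class is a 10-bit class
; mask, low bit first: SNaN, QNaN, -inf, -normal, -subnormal, -0, +0,
; +subnormal, +normal, +inf. A mask of 3 therefore tests for any NaN, and
; 0x3ff matches every class.
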
; SI-LABEL: {{^}}test_class_f32:
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[VB]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_f32(i32 addrspace(1)* %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
  %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 %b) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}test_class_fabs_f32:
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_fabs_f32(i32 addrspace(1)* %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
  %a.fabs = call float @llvm.fabs.f32(float %a) #1
  %result = call i1 @llvm.amdgcn.class.f32(float %a.fabs, i32 %b) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}test_class_fneg_f32:
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_fneg_f32(i32 addrspace(1)* %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
  %a.fneg = fsub float -0.0, %a
  %result = call i1 @llvm.amdgcn.class.f32(float %a.fneg, i32 %b) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}test_class_fneg_fabs_f32:
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
  %a.fabs = call float @llvm.fabs.f32(float %a) #1
  %a.fneg.fabs = fsub float -0.0, %a.fabs
  %result = call i1 @llvm.amdgcn.class.f32(float %a.fneg.fabs, i32 %b) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}test_class_1_f32:
; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], [[SA]], 1{{$}}
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_1_f32(i32 addrspace(1)* %out, float %a) #0 {
  %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}test_class_64_f32:
; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], [[SA]], 64{{$}}
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_64_f32(i32 addrspace(1)* %out, float %a) #0 {
  %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 64) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; Set all 10 bits of mask
; SI-LABEL: {{^}}test_class_full_mask_f32:
; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x3ff{{$}}
; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[MASK]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_full_mask_f32(i32 addrspace(1)* %out, float %a) #0 {
  %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1023) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

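; Set the low 9 bits only; 0x1ff covers every class except +inf (bit 9).
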
; SI-LABEL: {{^}}test_class_9bit_mask_f32:
; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}}
; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[MASK]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_9bit_mask_f32(i32 addrspace(1)* %out, float %a) #0 {
  %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 511) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}v_test_class_full_mask_f32:
; SI-DAG: buffer_load_dword [[VA:v[0-9]+]]
; SI-DAG: s_movk_i32 [[MASK:s[0-9]+]], 0x1ff{{$}}
; SI: v_cmp_class_f32_e64 s[{{[0-9]}}:{{[0-9]}}], [[VA]], [[MASK]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, s[{{[0-9]}}:{{[0-9]}}]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep.in

  %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 511) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %gep.out, align 4
  ret void
}

; SI-LABEL: {{^}}test_class_inline_imm_constant_dynamic_mask_f32:
; SI-DAG: buffer_load_dword [[VB:v[0-9]+]]
; SI: v_cmp_class_f32_e32 vcc, 1.0, [[VB]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  %b = load i32, i32 addrspace(1)* %gep.in

  %result = call i1 @llvm.amdgcn.class.f32(float 1.0, i32 %b) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %gep.out, align 4
  ret void
}

; FIXME: Why isn't this using a literal constant operand?
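; (0x44800000 is the IEEE-754 encoding of 1024.0, which is not an inline
; immediate, so it is materialized with s_mov_b32 rather than being folded
; into the compare as a literal.)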
; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f32:
; SI-DAG: buffer_load_dword [[VB:v[0-9]+]]
; SI-DAG: s_mov_b32 [[VK:s[0-9]+]], 0x44800000
; SI: v_cmp_class_f32_e32 vcc, [[VK]], [[VB]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  %b = load i32, i32 addrspace(1)* %gep.in

  %result = call i1 @llvm.amdgcn.class.f32(float 1024.0, i32 %b) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %gep.out, align 4
  ret void
}

; SI-LABEL: {{^}}test_class_f64:
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[VB]]
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_f64(i32 addrspace(1)* %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
  %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 %b) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}test_class_fabs_f64:
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]]
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_fabs_f64(i32 addrspace(1)* %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
  %a.fabs = call double @llvm.fabs.f64(double %a) #1
  %result = call i1 @llvm.amdgcn.class.f64(double %a.fabs, i32 %b) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}test_class_fneg_f64:
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]]
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_fneg_f64(i32 addrspace(1)* %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
  %a.fneg = fsub double -0.0, %a
  %result = call i1 @llvm.amdgcn.class.f64(double %a.fneg, i32 %b) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}test_class_fneg_fabs_f64:
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]]
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
  %a.fabs = call double @llvm.fabs.f64(double %a) #1
  %a.fneg.fabs = fsub double -0.0, %a.fabs
  %result = call i1 @llvm.amdgcn.class.f64(double %a.fneg.fabs, i32 %b) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}test_class_1_f64:
; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 1{{$}}
; SI: s_endpgm
define amdgpu_kernel void @test_class_1_f64(i32 addrspace(1)* %out, double %a) #0 {
  %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 1) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}test_class_64_f64:
; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 64{{$}}
; SI: s_endpgm
define amdgpu_kernel void @test_class_64_f64(i32 addrspace(1)* %out, double %a) #0 {
  %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 64) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; Set all 9 bits of mask
; SI-LABEL: {{^}}test_class_full_mask_f64:
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}}
; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[MASK]]
; SI-NOT: vcc
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_full_mask_f64(i32 addrspace(1)* %out, [8 x i32], double %a) #0 {
  %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 511) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}v_test_class_full_mask_f64:
; SI-DAG: buffer_load_dwordx2 [[VA:v\[[0-9]+:[0-9]+\]]]
; SI-DAG: s_movk_i32 [[MASK:s[0-9]+]], 0x1ff{{$}}
; SI: v_cmp_class_f64_e64 s[{{[0-9]}}:{{[0-9]}}], [[VA]], [[MASK]]
; SI-NOT: vcc
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, s[{{[0-9]}}:{{[0-9]}}]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  %a = load double, double addrspace(1)* %gep.in

  %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 511) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %gep.out, align 4
  ret void
}

; SI-LABEL: {{^}}test_class_inline_imm_constant_dynamic_mask_f64:
; XSI: v_cmp_class_f64_e32 vcc, 1.0,
; SI: v_cmp_class_f64_e32 vcc,
; SI: s_endpgm
define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  %b = load i32, i32 addrspace(1)* %gep.in

  %result = call i1 @llvm.amdgcn.class.f64(double 1.0, i32 %b) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %gep.out, align 4
  ret void
}

; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f64:
; SI: v_cmp_class_f64_e32 vcc, s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
; SI: s_endpgm
define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  %b = load i32, i32 addrspace(1)* %gep.in

  %result = call i1 @llvm.amdgcn.class.f64(double 1024.0, i32 %b) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %gep.out, align 4
  ret void
}

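; The following tests check that multiple class intrinsics applied to the
; same value fold into a single test with the masks OR'd together, e.g.
; class(x, 1) | class(x, 3) --> class(x, 3), so only one v_cmp_class
; should be emitted.
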
; SI-LABEL: {{^}}test_fold_or_class_f32_0:
; SI-NOT: v_cmp_class
; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 3{{$}}
; SI-NOT: v_cmp_class
; SI: s_endpgm
define amdgpu_kernel void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep.in

  %class0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1
  %class1 = call i1 @llvm.amdgcn.class.f32(float %a, i32 3) #1
  %or = or i1 %class0, %class1

  %sext = sext i1 %or to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}test_fold_or3_class_f32_0:
; SI-NOT: v_cmp_class
; SI: v_cmp_class_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}}
; SI-NOT: v_cmp_class
; SI: s_endpgm
define amdgpu_kernel void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep.in

  %class0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1
  %class1 = call i1 @llvm.amdgcn.class.f32(float %a, i32 2) #1
  %class2 = call i1 @llvm.amdgcn.class.f32(float %a, i32 4) #1
  %or.0 = or i1 %class0, %class1
  %or.1 = or i1 %or.0, %class2

  %sext = sext i1 %or.1 to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}test_fold_or_all_tests_class_f32_0:
; SI-NOT: v_cmp_class
; SI: s_movk_i32 [[MASK:s[0-9]+]], 0x3ff{{$}}
; SI: v_cmp_class_f32_e64 s[0:1], v{{[0-9]+}}, [[MASK]]{{$}}
; SI-NOT: v_cmp_class
; SI: s_endpgm
define amdgpu_kernel void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep.in

  %class0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1
  %class1 = call i1 @llvm.amdgcn.class.f32(float %a, i32 2) #1
  %class2 = call i1 @llvm.amdgcn.class.f32(float %a, i32 4) #1
  %class3 = call i1 @llvm.amdgcn.class.f32(float %a, i32 8) #1
  %class4 = call i1 @llvm.amdgcn.class.f32(float %a, i32 16) #1
  %class5 = call i1 @llvm.amdgcn.class.f32(float %a, i32 32) #1
  %class6 = call i1 @llvm.amdgcn.class.f32(float %a, i32 64) #1
  %class7 = call i1 @llvm.amdgcn.class.f32(float %a, i32 128) #1
  %class8 = call i1 @llvm.amdgcn.class.f32(float %a, i32 256) #1
  %class9 = call i1 @llvm.amdgcn.class.f32(float %a, i32 512) #1
  %or.0 = or i1 %class0, %class1
  %or.1 = or i1 %or.0, %class2
  %or.2 = or i1 %or.1, %class3
  %or.3 = or i1 %or.2, %class4
  %or.4 = or i1 %or.3, %class5
  %or.5 = or i1 %or.4, %class6
  %or.6 = or i1 %or.5, %class7
  %or.7 = or i1 %or.6, %class8
  %or.8 = or i1 %or.7, %class9
  %sext = sext i1 %or.8 to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}test_fold_or_class_f32_1:
; SI-NOT: v_cmp_class
; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 12{{$}}
; SI-NOT: v_cmp_class
; SI: s_endpgm
define amdgpu_kernel void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep.in

  %class0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 4) #1
  %class1 = call i1 @llvm.amdgcn.class.f32(float %a, i32 8) #1
  %or = or i1 %class0, %class1

  %sext = sext i1 %or to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}test_fold_or_class_f32_2:
; SI-NOT: v_cmp_class
; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}}
; SI-NOT: v_cmp_class
; SI: s_endpgm
define amdgpu_kernel void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep.in

  %class0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 7) #1
  %class1 = call i1 @llvm.amdgcn.class.f32(float %a, i32 7) #1
  %or = or i1 %class0, %class1

  %sext = sext i1 %or to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

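; The mask-combining fold only applies when both intrinsics test the same
; value; here %a and %b differ, so two compares and an s_or_b64 remain.
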
; SI-LABEL: {{^}}test_no_fold_or_class_f32_0:
; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 4{{$}}
; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 8{{$}}
; SI: s_or_b64
; SI: s_endpgm
define amdgpu_kernel void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in, float %b) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep.in

  %class0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 4) #1
  %class1 = call i1 @llvm.amdgcn.class.f32(float %b, i32 8) #1
  %or = or i1 %class0, %class1

  %sext = sext i1 %or to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

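; A zero mask matches no class, so the intrinsic folds to false and only a
; constant 0 is stored.
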
; SI-LABEL: {{^}}test_class_0_f32:
; SI-NOT: v_cmp_class
; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 {
  %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 0) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}test_class_0_f64:
; SI-NOT: v_cmp_class
; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_0_f64(i32 addrspace(1)* %out, double %a) #0 {
  %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 0) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; FIXME: Why is the extension still here?
; SI-LABEL: {{^}}test_class_undef_f32:
; SI-NOT: v_cmp_class
; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1,
; SI: buffer_store_dword
define amdgpu_kernel void @test_class_undef_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
  %result = call i1 @llvm.amdgcn.class.f32(float undef, i32 %b) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

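; 'fcmp ord x, x' is true exactly when x is not NaN, so AND'ing it with a
; class test clears the NaN bits from the mask:
; 35 (SNaN|QNaN|-0) & ~3 --> 32 (-0).
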
; SI-LABEL: {{^}}test_fold_and_ord:
; SI: s_waitcnt
; SI-NEXT: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v0, 32{{$}}
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, [[COND]]
; SI-NEXT: s_setpc_b64
define i1 @test_fold_and_ord(float %a) {
  %class = call i1 @llvm.amdgcn.class.f32(float %a, i32 35) #1
  %ord = fcmp ord float %a, %a
  %and = and i1 %ord, %class
  ret i1 %and
}

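; Conversely, 'fcmp uno x, x' is true exactly when x is NaN, so the AND
; keeps only the NaN bits: 35 & 3 --> 3.
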
; SI-LABEL: {{^}}test_fold_and_unord:
; SI: s_waitcnt
; SI-NEXT: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v0, 3{{$}}
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, [[COND]]
; SI-NEXT: s_setpc_b64
define i1 @test_fold_and_unord(float %a) {
  %class = call i1 @llvm.amdgcn.class.f32(float %a, i32 35) #1
  %uno = fcmp uno float %a, %a
  %and = and i1 %uno, %class
  ret i1 %and
}

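; The combine is suppressed when the class result has another use; here the
; volatile store keeps %class alive, so both the class test and the ordered
; compare must be emitted.
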
; SI-LABEL: {{^}}test_fold_and_ord_multi_use:
; SI: v_cmp_class
; SI-NOT: v_cmp_class
; SI: v_cmp_o
; SI: s_and_b64
define i1 @test_fold_and_ord_multi_use(float %a) {
  %class = call i1 @llvm.amdgcn.class.f32(float %a, i32 35) #1
  store volatile i1 %class, i1 addrspace(1)* undef
  %ord = fcmp ord float %a, %a
  %and = and i1 %ord, %class
  ret i1 %and
}

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }