; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SIVI %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s

; Intrinsics used by the kernels below: the workitem id supplies a
; per-lane index, and the fabs variants feed the abs/neg select tests.
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare half @llvm.fabs.f16(half)
declare float @llvm.fabs.f32(float)
declare double @llvm.fabs.f64(double)

; Select between a NaN constant and a float loaded per workitem, keyed on
; the scalar kernel argument %c. The literal 0xFFFFFFFFE0000000 narrows to
; the f32 bit pattern 0xffffffff, which the checks expect as the inline
; constant -1.
; GCN-LABEL: {{^}}v_cnd_nan_nosgpr:
; GCN: v_cmp_eq_u32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0
; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, -1, v{{[0-9]+}}, [[COND]]
; GCN-DAG: v{{[0-9]}}
; All nan values are converted to 0xffffffff
; GCN: s_endpgm
define amdgpu_kernel void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 {
  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
  %f.gep = getelementptr float, float addrspace(1)* %fptr, i32 %idx
  %f = load float, float addrspace(1)* %f.gep
  ; The IR tests %c != 0; the checks expect the inverted compare (eq 0)
  ; with the NaN constant as the first v_cndmask source.
  %setcc = icmp ne i32 %c, 0
  %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
  store float %select, float addrspace(1)* %out
  ret void
}


; This requires slightly trickier SGPR operand legalization since the
; single constant bus SGPR usage is the last operand, and it should
; never be moved.
; However on GFX10 constant bus is limited to 2 scalar operands, not one.
;
; Same NaN-vs-%f select as v_cnd_nan_nosgpr, except %f is passed directly
; as a kernel argument; the GFX10 checks expect it as an SGPR source of
; the e64 v_cndmask.

; GCN-LABEL: {{^}}v_cnd_nan:
; SIVI:  v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0
; SIVI:  v_cndmask_b32_e32 v{{[0-9]+}}, -1, v{{[0-9]+}}, vcc
; GFX10: v_cmp_eq_u32_e64 [[CC:s\[[0-9:]+\]]], s{{[0-9]+}}, 0
; GFX10: v_cndmask_b32_e64 v{{[0-9]+}}, -1, s{{[0-9]+}}, [[CC]]
; GCN-DAG: v{{[0-9]}}
; All nan values are converted to 0xffffffff
; GCN: s_endpgm
define amdgpu_kernel void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) #0 {
  %setcc = icmp ne i32 %c, 0
  %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
  store float %select, float addrspace(1)* %out
  ret void
}

; Test different compare and select operand types for optimal code
; shrinking.
; (select (cmp (sgprX, constant)), constant, sgprZ)
;
; %x and %z are both kernel arguments. The checks expect SI/VI to copy Z
; into a VGPR before an e32 v_cndmask, while GFX10 keeps Z in its SGPR and
; uses the e64 form.

; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k1_sgprZ_f32:
; GCN: s_load_dwordx2 s{{\[}}[[X:[0-9]+]]:[[Z:[0-9]+]]{{\]}}, s[0:1], {{0x4c|0x13}}

; SIVI-DAG:  v_cmp_nlg_f32_e64 [[CC:vcc]], s[[X]], 0
; GFX10-DAG: v_cmp_nlg_f32_e64 [[CC:s\[[0-9:]+\]]], s[[X]], 0
; SIVI-DAG:  v_mov_b32_e32 [[VZ:v[0-9]+]], s[[Z]]
; SIVI:      v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], [[CC]]
; GFX10:     v_cndmask_b32_e64 v{{[0-9]+}}, 1.0, s[[Z]], [[CC]]
define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, [8 x i32], float %x, float %z) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  ; "one" in the IR; the checks expect its negation (nlg) with the
  ; constant as the first v_cndmask source.
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 1.0, float %z
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; (select (fcmp one sgprX, 0.0), 1.0, sgprX): the same kernel argument is
; both the compare operand and the false value of the select.
; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k1_sgprX_f32:
; GCN: s_load_dword [[X:s[0-9]+]]
; SIVI-DAG:  v_cmp_nlg_f32_e64 [[CC:vcc]], [[X]], 0
; GFX10-DAG: v_cmp_nlg_f32_e64 [[CC:s\[[0-9:]+\]]], [[X]], 0
; SIVI-DAG:  v_mov_b32_e32 [[VX:v[0-9]+]], [[X]]
; SIVI:      v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VX]], [[CC]]
; GFX10:     v_cndmask_b32_e64 v{{[0-9]+}}, 1.0, [[X]], [[CC]]
define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(float addrspace(1)* %out, float %x) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 1.0, float %x
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; (select (fcmp one sgprX, 0.0), 0.0, sgprZ): the select constant equals
; the compare constant; both variable operands are kernel arguments.
; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k0_sgprZ_f32:
; GCN-DAG: s_load_dwordx2 s{{\[}}[[X:[0-9]+]]:[[Z:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
; SIVI-DAG:  v_cmp_nlg_f32_e64 [[CC:vcc]], s[[X]], 0
; GFX10-DAG: v_cmp_nlg_f32_e64 [[CC:s\[[0-9:]+\]]], s[[X]], 0
; SIVI-DAG:  v_mov_b32_e32 [[VZ:v[0-9]+]], s[[Z]]
; SIVI:      v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[VZ]], [[CC]]
; GFX10:     v_cndmask_b32_e64 v{{[0-9]+}}, 0, s[[Z]], [[CC]]
define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(float addrspace(1)* %out, [8 x i32], float %x, float %z) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 0.0, float %z
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; (select (fcmp one sgprX, 0.0), 0.0, sgprX): one kernel argument serves
; as compare operand and false value, and the select constant equals the
; compare constant.
; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k0_sgprX_f32:
; GCN: s_load_dword [[X:s[0-9]+]]
; SIVI-DAG:  v_cmp_nlg_f32_e64 [[CC:vcc]], [[X]], 0
; GFX10-DAG: v_cmp_nlg_f32_e64 [[CC:s\[[0-9:]+\]]], [[X]], 0
; SIVI-DAG:  v_mov_b32_e32 [[VX:v[0-9]+]], [[X]]
; SIVI:      v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[VX]], [[CC]]
; GFX10:     v_cndmask_b32_e64 v{{[0-9]+}}, 0, [[X]], [[CC]]
define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(float addrspace(1)* %out, float %x) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 0.0, float %x
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; (select (fcmp one sgprX, 0.0), 0.0, vgprZ): %x is a kernel argument and
; %z is loaded per workitem. The checks accept either v_cndmask encoding
; and either vcc or an SGPR pair for the condition.
; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k0_vgprZ_f32:
; GCN-DAG: s_load_dword [[X:s[0-9]+]]
; GCN-DAG: {{buffer|flat|global}}_load_dword [[Z:v[0-9]+]]
; GCN-DAG: v_cmp_nlg_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], [[X]], 0
; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 0, [[Z]], [[COND]]
define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %z = load float, float addrspace(1)* %z.gep
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 0.0, float %z
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; (select (fcmp one sgprX, 0.0), 1.0, vgprZ): %x is a kernel argument,
; %z is loaded per workitem, and the select constant differs from the
; compare constant.
; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k1_vgprZ_f32:
; GCN-DAG: {{buffer|flat|global}}_load_dword [[Z:v[0-9]+]]
; GCN-DAG: s_load_dword [[X:s[0-9]+]]
; GCN-DAG: v_cmp_nlg_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], [[X]], 0
; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 1.0, [[Z]], [[COND]]
define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %z = load float, float addrspace(1)* %z.gep
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 1.0, float %z
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; (select (fcmp olt vgprX, 0.0), 1.0, sgprZ): %x is loaded per workitem
; and %z is a kernel argument. The checks expect the negated, commuted
; compare (ngt 0, x) so the constant is the first v_cndmask source.
; GCN-LABEL: {{^}}fcmp_vgprX_k0_select_k1_sgprZ_f32:
; GCN-DAG: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
; GCN-DAG: s_load_dword [[Z:s[0-9]+]]
; GCN-DAG: v_cmp_ngt_f32_e32 vcc, 0, [[X]]
; SIVI-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
; SIVI:     v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], vcc
; GFX10:    v_cndmask_b32_e64 v{{[0-9]+}}, 1.0, [[Z]], vcc
define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float %z) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %x = load float, float addrspace(1)* %x.gep
  %setcc = fcmp olt float %x, 0.0
  %select = select i1 %setcc, float 1.0, float %z
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; (select (fcmp ult vgprX, 0.0), 1.0, vgprZ): both variable operands are
; loaded per workitem. The checks expect the inverted compare (le 0, x)
; feeding an e32 v_cndmask with the constant first.
; GCN-LABEL: {{^}}fcmp_vgprX_k0_select_k1_vgprZ_f32:
; GCN: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[Z:v[0-9]+]]
; GCN: v_cmp_le_f32_e32 vcc, 0, [[X]]
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[Z]], vcc
define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  ; The checks above match the X load before the Z load.
  %x = load volatile float, float addrspace(1)* %x.gep
  %z = load volatile float, float addrspace(1)* %z.gep
  %setcc = fcmp ult float %x, 0.0
  %select = select i1 %setcc, float 1.0, float %z
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; Integer variant: (select (icmp slt vgprX, 0), 2, vgprZ). The checks
; expect the inverted compare written as (-1 < x) with the constant 2 as
; the first v_cndmask source.
; GCN-LABEL: {{^}}icmp_vgprX_k0_select_k1_vgprZ_i32:
; GCN: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[Z:v[0-9]+]]
; GCN: v_cmp_lt_i32_e32 vcc, -1, [[X]]
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 2, [[Z]], vcc
define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i32 addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds i32, i32 addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %x = load volatile i32, i32 addrspace(1)* %x.gep
  %z = load volatile i32, i32 addrspace(1)* %z.gep
  %setcc = icmp slt i32 %x, 0
  %select = select i1 %setcc, i32 2, i32 %z
  store i32 %select, i32 addrspace(1)* %out.gep
  ret void
}

; 64-bit variant: (select (icmp slt vgprX, 0), 2, vgprZ), expected to
; lower to one 64-bit compare and one v_cndmask per 32-bit half.
; An earlier FIXME here asked why VI made a different regalloc choice;
; the SI and VI expectations were textually identical, so they now share
; the SIVI prefix. No compare/select expectations exist for GFX10.
; GCN-LABEL: {{^}}icmp_vgprX_k0_select_k1_vgprZ_i64:
; GCN: {{buffer|flat|global}}_load_dwordx2 v{{\[}}[[X_LO:[0-9]+]]:[[X_HI:[0-9]+]]{{\]}}
; GCN-DAG: {{buffer|flat|global}}_load_dwordx2 v{{\[}}[[Z_LO:[0-9]+]]:[[Z_HI:[0-9]+]]{{\]}}
; SIVI-DAG: v_cmp_lt_i64_e32 vcc, -1, v{{\[}}[[X_LO]]:[[X_HI]]{{\]}}
; SIVI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v[[Z_HI]], vcc
; SIVI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2, v[[Z_LO]], vcc
define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds i64, i64 addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds i64, i64 addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %tid.ext
  %x = load volatile i64, i64 addrspace(1)* %x.gep
  %z = load volatile i64, i64 addrspace(1)* %z.gep
  %setcc = icmp slt i64 %x, 0
  %select = select i1 %setcc, i64 2, i64 %z
  store i64 %select, i64 addrspace(1)* %out.gep
  ret void
}

; Vector select with the constant vector as the FALSE operand:
; (select (fcmp ugt x, 4.0), vgprZ, <1.0, 2.0, -0.5, 4.0>). One scalar
; compare is expected to drive four v_cndmask instructions.
; GCN-LABEL: {{^}}fcmp_vgprX_k0_select_vgprZ_k1_v4f32:
; GCN: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dwordx4

; GCN: v_cmp_nge_f32_e32 vcc, 4.0, [[X]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %tid.ext
  %x = load volatile float, float addrspace(1)* %x.gep
  %z = load volatile <4 x float>, <4 x float> addrspace(1)* %z.gep
  %setcc = fcmp ugt float %x, 4.0
  %select = select i1 %setcc, <4 x float> %z, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>
  store <4 x float> %select, <4 x float> addrspace(1)* %out.gep
  ret void
}

; Vector select with the constant vector as the TRUE operand:
; (select (fcmp ugt x, 4.0), <1.0, 2.0, -0.5, 4.0>, vgprZ). The checks
; expect the inverted compare (ge 4.0, x) so each constant stays the
; first v_cndmask source.
; GCN-LABEL: {{^}}fcmp_vgprX_k0_select_k1_vgprZ_v4f32:
; GCN: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dwordx4

; GCN: v_cmp_ge_f32_e32 vcc, 4.0, [[X]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %tid.ext
  %x = load volatile float, float addrspace(1)* %x.gep
  %z = load volatile <4 x float>, <4 x float> addrspace(1)* %z.gep
  %setcc = fcmp ugt float %x, 4.0
  %select = select i1 %setcc, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>, <4 x float> %z
  store <4 x float> %select, <4 x float> addrspace(1)* %out.gep
  ret void
}

; This must be swapped as a vector type before the condition has
; multiple uses.
;
; Here the constant is the LEFT operand of the compare:
; (select (fcmp ugt 4.0, x), <1.0, 2.0, -0.5, 4.0>, vgprZ).

; GCN-LABEL: {{^}}fcmp_k0_vgprX_select_k1_vgprZ_v4f32:
; GCN: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dwordx4

; GCN: v_cmp_le_f32_e32 vcc, 4.0, [[X]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %tid.ext
  %x = load volatile float, float addrspace(1)* %x.gep
  %z = load volatile <4 x float>, <4 x float> addrspace(1)* %z.gep
  %setcc = fcmp ugt float 4.0, %x
  %select = select i1 %setcc, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>, <4 x float> %z
  store <4 x float> %select, <4 x float> addrspace(1)* %out.gep
  ret void
}

; i1 select with a constant true value: (select (icmp slt x, 0), true, z).
; The checks expect the select to become an OR of two lane masks (the
; compare result and the loaded bit tested against 1), then a v_cndmask
; producing 0 or 1 for the byte store.
; GCN-LABEL: {{^}}icmp_vgprX_k0_select_k1_vgprZ_i1:
; GCN: load_dword
; GCN: load_ubyte
; GCN-DAG: v_cmp_gt_i32_e32 vcc, 0, v
; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 1,
; GCN-DAG: v_cmp_eq_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, v
; GCN-DAG: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc, s{{\[[0-9]+:[0-9]+\]}}
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, s
; GCN: store_byte
define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i1 addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds i1, i1 addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i1, i1 addrspace(1)* %out, i64 %tid.ext
  %x = load volatile i32, i32 addrspace(1)* %x.gep
  %z = load volatile i1, i1 addrspace(1)* %z.gep
  %setcc = icmp slt i32 %x, 0
  %select = select i1 %setcc, i1 true, i1 %z
  store i1 %select, i1 addrspace(1)* %out.gep
  ret void
}

; Different types compared vs. selected
; f32 compare driving an f64 select of the constant 1.0. The checks
; expect the high dword 0x3ff00000 in a VGPR on SI/VI and as a literal
; operand on GFX10; the low dword is selected against 0.
; GCN-LABEL: {{^}}fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
; SIVI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3ff00000
; GCN-DAG: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
; GCN-DAG: {{buffer|flat|global}}_load_dwordx2

; GCN: v_cmp_le_f32_e32 vcc, 0, [[X]]
; SIVI-DAG:  v_cndmask_b32_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}, vcc
; GFX10-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0x3ff00000, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(double addrspace(1)* %out, float addrspace(1)* %x.ptr, double addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds double, double addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %x = load volatile float, float addrspace(1)* %x.gep
  %z = load volatile double, double addrspace(1)* %z.gep
  %setcc = fcmp ult float %x, 0.0
  %select = select i1 %setcc, double 1.0, double %z
  store double %select, double addrspace(1)* %out.gep
  ret void
}

; Different types compared vs. selected
; f32 compare driving an i64 select of the constant 3, expected to split
; into two v_cndmask instructions with constants 3 and 0.
; GCN-LABEL: {{^}}fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
; GCN: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dwordx2

; GCN: v_cmp_nlg_f32_e32 vcc, 0, [[X]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 3, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(i64 addrspace(1)* %out, float addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds i64, i64 addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %tid.ext
  %x = load volatile float, float addrspace(1)* %x.gep
  %z = load volatile i64, i64 addrspace(1)* %z.gep
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, i64 3, i64 %z
  store i64 %select, i64 addrspace(1)* %out.gep
  ret void
}

; Different types compared vs. selected
; i32 compare driving an f32 select. The IR tests (x ugt 1); the checks
; expect the inverted compare written as (2 > x) with 4.0 as the first
; v_cndmask source.
; GCN-LABEL: {{^}}icmp_vgprX_k0_selectf32_k1_vgprZ_i32:
; GCN: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[Z:v[0-9]+]]

; GCN: v_cmp_gt_u32_e32 vcc, 2, [[X]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, [[Z]], vcc
define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(float addrspace(1)* %out, i32 addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %x = load volatile i32, i32 addrspace(1)* %x.gep
  %z = load volatile float, float addrspace(1)* %z.gep
  %setcc = icmp ugt i32 %x, 1
  %select = select i1 %setcc, float 4.0, float %z
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; FIXME: Should be able to handle multiple uses
;
; The compare result feeds two selects. The checks record the current
; output: the compare is not inverted (nle 4.0, x) and each constant stays
; in the second source slot of an e64 v_cndmask.

; GCN-LABEL: {{^}}fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2:
; GCN: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]

; GCN: v_cmp_nle_f32_e32 vcc, 4.0, [[X]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -1.0, vcc
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -2.0, vcc
define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %x = load volatile float, float addrspace(1)* %x.gep
  %z = load volatile float, float addrspace(1)* %z.gep
  %setcc = fcmp ugt float 4.0, %x
  %select0 = select i1 %setcc, float -1.0, float %z
  %select1 = select i1 %setcc, float -2.0, float %z
  ; Both stores are volatile and target the same address.
  store volatile float %select0, float addrspace(1)* %out.gep
  store volatile float %select1, float addrspace(1)* %out.gep
  ret void
}

; Source modifiers abs/neg only work for f32
;
; f16 case: select between fabs(%f) and fneg(%f). The check expects a
; plain e32 v_cndmask whose sources are unmodified VGPRs.

; GCN-LABEL: {{^}}v_cndmask_abs_neg_f16:
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}},
define amdgpu_kernel void @v_cndmask_abs_neg_f16(half addrspace(1)* %out, i32 %c, half addrspace(1)* %fptr) #0 {
  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
  %f.gep = getelementptr half, half addrspace(1)* %fptr, i32 %idx
  %f = load half, half addrspace(1)* %f.gep
  %f.abs = call half @llvm.fabs.f16(half %f)
  %f.neg = fneg half %f
  %setcc = icmp ne i32 %c, 0
  %select = select i1 %setcc, half %f.abs, half %f.neg
  store half %select, half addrspace(1)* %out
  ret void
}

; f32 case: select between fabs(%f) and fneg(%f). The check expects both
; operations as source modifiers (-v and |v|) on one e64 v_cndmask.
; GCN-LABEL: {{^}}v_cndmask_abs_neg_f32:
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, |v{{[0-9]+}}|,
define amdgpu_kernel void @v_cndmask_abs_neg_f32(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 {
  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
  %f.gep = getelementptr float, float addrspace(1)* %fptr, i32 %idx
  %f = load float, float addrspace(1)* %f.gep
  %f.abs = call float @llvm.fabs.f32(float %f)
  %f.neg = fneg float %f
  %setcc = icmp ne i32 %c, 0
  %select = select i1 %setcc, float %f.abs, float %f.neg
  store float %select, float addrspace(1)* %out
  ret void
}

; f64 case: select between fabs(%f) and fneg(%f). The checks expect two
; plain e32 v_cndmask instructions with unmodified VGPR sources.
; GCN-LABEL: {{^}}v_cndmask_abs_neg_f64:
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}},
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}},
define amdgpu_kernel void @v_cndmask_abs_neg_f64(double addrspace(1)* %out, i32 %c, double addrspace(1)* %fptr) #0 {
  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
  %f.gep = getelementptr double, double addrspace(1)* %fptr, i32 %idx
  %f = load double, double addrspace(1)* %f.gep
  %f.abs = call double @llvm.fabs.f64(double %f)
  %f.neg = fneg double %f
  %setcc = icmp ne i32 %c, 0
  %select = select i1 %setcc, double %f.abs, double %f.neg
  store double %select, double addrspace(1)* %out
  ret void
}

; #0 is applied to every kernel definition in this file.
attributes #0 = { nounwind }
; #1 is applied to the workitem-id intrinsic declaration and its call sites.
attributes #1 = { nounwind readnone }
