1; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
2; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
3
4declare float @llvm.fma.f32(float, float, float) #1
5declare double @llvm.fma.f64(double, double, double) #1
6declare float @llvm.fmuladd.f32(float, float, float) #1
7declare float @llvm.amdgcn.div.fixup.f32(float, float, float) #1
8
9
10; GCN-LABEL: {{^}}test_sgpr_use_twice_binop:
11; GCN: s_load_dword [[SGPR:s[0-9]+]],
12; GCN: v_add_f32_e64 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]]
13; GCN: buffer_store_dword [[RESULT]]
; Kernel under test: stores %a + %a to %out.
; Both fadd operands are the same kernel argument; the checks above expect a
; single s_load_dword whose SGPR appears as both sources of v_add_f32_e64.
14define amdgpu_kernel void @test_sgpr_use_twice_binop(float addrspace(1)* %out, float %a) #0 {
15  %dbl = fadd float %a, %a
16  store float %dbl, float addrspace(1)* %out, align 4
17  ret void
18}
19
20; GCN-LABEL: {{^}}test_sgpr_use_three_ternary_op:
21; GCN: s_load_dword [[SGPR:s[0-9]+]],
22; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], [[SGPR]]
23; GCN: buffer_store_dword [[RESULT]]
; Kernel under test: stores fma(%a, %a, %a) to %out.
; The same kernel argument feeds all three fma operands; the checks above
; expect one loaded SGPR to be used for every source of v_fma_f32.
24define amdgpu_kernel void @test_sgpr_use_three_ternary_op(float addrspace(1)* %out, float %a) #0 {
25  %fma = call float @llvm.fma.f32(float %a, float %a, float %a) #1
26  store float %fma, float addrspace(1)* %out, align 4
27  ret void
28}
29
30; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_b:
31; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
32; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
33; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[SGPR1]]
34; GCN: v_fma_f32 [[RESULT:v[0-9]+]], s[[SGPR0]], s[[SGPR0]], [[VGPR1]]
35; GCN: buffer_store_dword [[RESULT]]
; Kernel under test: stores fma(%a, %a, %b) to %out.
; %a is repeated in the first two operands and %b is the third. The checks
; above expect %b's SGPR to be copied into a VGPR with v_mov_b32 while %a's
; SGPR is used directly, twice, by v_fma_f32.
36define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, float %a, float %b) #0 {
37  %fma = call float @llvm.fma.f32(float %a, float %a, float %b) #1
38  store float %fma, float addrspace(1)* %out, align 4
39  ret void
40}
41
42; GCN-LABEL: {{^}}test_use_s_v_s:
43; GCN-DAG: s_load_dwordx2 s{{\[}}[[SA:[0-9]+]]:[[SB:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
44; SI: buffer_load_dword [[VA0:v[0-9]+]]
45; SI-NEXT: buffer_load_dword [[VA1:v[0-9]+]]
46
47; GCN-NOT: v_mov_b32
48
49; VI: buffer_load_dword [[VA0:v[0-9]+]]
50; VI-NEXT: buffer_load_dword [[VA1:v[0-9]+]]
51
52; GCN-NOT: v_mov_b32
53; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], s[[SB]]
54; GCN-NOT: v_mov_b32
55
56; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], s[[SA]], [[VA0]], [[VB]]
57; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], s[[SA]], [[VA1]], [[VB]]
58; GCN: buffer_store_dword [[RESULT0]]
59; GCN: buffer_store_dword [[RESULT1]]
; Kernel under test: two fma calls of the form fma(%a, <loaded value>, %b),
; each stored to %out.
; The middle operand of each fma comes from memory, while %a and %b are kernel
; arguments shared by both calls. The checks above expect exactly one v_mov_b32
; (copying %b's SGPR into a VGPR) that is reused by both v_fma_f32
; instructions, with %a's SGPR used directly.
60define amdgpu_kernel void @test_use_s_v_s(float addrspace(1)* %out, float %a, float %b, float addrspace(1)* %in) #0 {
  ; Volatile loads from the same address, so the two loads stay distinct.
61  %va0 = load volatile float, float addrspace(1)* %in
62  %va1 = load volatile float, float addrspace(1)* %in
63  %fma0 = call float @llvm.fma.f32(float %a, float %va0, float %b) #1
64  %fma1 = call float @llvm.fma.f32(float %a, float %va1, float %b) #1
  ; Volatile stores to the same address, so both results are written out.
65  store volatile float %fma0, float addrspace(1)* %out
66  store volatile float %fma1, float addrspace(1)* %out
67  ret void
68}
69
70; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_b_a:
71; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
72; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
73; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[SGPR1]]
74; GCN: v_fma_f32 [[RESULT:v[0-9]+]], s[[SGPR0]], [[VGPR1]], s[[SGPR0]]
75; GCN: buffer_store_dword [[RESULT]]
; Kernel under test: stores fma(%a, %b, %a) to %out.
; %a is repeated in the first and third operands. The checks above expect %b's
; SGPR to be copied into a VGPR and placed in the middle source, with %a's
; SGPR used directly for the outer two sources.
76define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, float %a, float %b) #0 {
77  %fma = call float @llvm.fma.f32(float %a, float %b, float %a) #1
78  store float %fma, float addrspace(1)* %out, align 4
79  ret void
80}
81
82; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_b_a_a:
83; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
84; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
85; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[SGPR1]]
86; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], s[[SGPR0]], s[[SGPR0]]
87; GCN: buffer_store_dword [[RESULT]]
; Kernel under test: stores fma(%b, %a, %a) to %out.
; %a is repeated in the last two operands. The checks above expect %b's SGPR
; to be copied into a VGPR and used as the first source, with %a's SGPR used
; directly for the remaining two sources.
88define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, float %a, float %b) #0 {
89  %fma = call float @llvm.fma.f32(float %b, float %a, float %a) #1
90  store float %fma, float addrspace(1)* %out, align 4
91  ret void
92}
93
94; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_imm:
95; GCN: s_load_dword [[SGPR:s[0-9]+]]
96; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], 2.0
97; GCN: buffer_store_dword [[RESULT]]
; Kernel under test: stores fma(%a, %a, 2.0) to %out.
; The checks above expect the constant 2.0 to appear directly as the third
; v_fma_f32 source alongside the twice-used SGPR, with no separate move.
98define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_imm(float addrspace(1)* %out, float %a) #0 {
99  %fma = call float @llvm.fma.f32(float %a, float %a, float 2.0) #1
100  store float %fma, float addrspace(1)* %out, align 4
101  ret void
102}
103
104; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_imm_a:
105; GCN: s_load_dword [[SGPR:s[0-9]+]]
106; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], 2.0, [[SGPR]]
107; GCN: buffer_store_dword [[RESULT]]
; Kernel under test: stores fma(%a, 2.0, %a) to %out.
; The checks above expect the constant 2.0 to appear directly as the middle
; v_fma_f32 source, with the same SGPR used for the first and third sources.
108define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 {
109  %fma = call float @llvm.fma.f32(float %a, float 2.0, float %a) #1
110  store float %fma, float addrspace(1)* %out, align 4
111  ret void
112}
113
114; This case uses div_fixup instead of fma, since fma c, x, y is canonicalized to fma x, c, y
115; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_imm_a_a:
116; GCN: s_load_dword [[SGPR:s[0-9]+]]
117; GCN: v_div_fixup_f32 [[RESULT:v[0-9]+]], 2.0, [[SGPR]], [[SGPR]]
118; GCN: buffer_store_dword [[RESULT]]
; Kernel under test: stores llvm.amdgcn.div.fixup.f32(2.0, %a, %a) to %out.
; This is the immediate-first operand arrangement. The checks above expect
; v_div_fixup_f32 with 2.0 as the first source and the same SGPR for the
; second and third sources.
119define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_imm_a_a(float addrspace(1)* %out, float %a) #0 {
120  %val = call float @llvm.amdgcn.div.fixup.f32(float 2.0, float %a, float %a) #1
121  store float %val, float addrspace(1)* %out, align 4
122  ret void
123}
124
125; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_kimm:
126; GCN-DAG: s_load_dword [[SGPR:s[0-9]+]]
127; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
128; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], [[VK]]
129; GCN: buffer_store_dword [[RESULT]]
; Kernel under test: stores fma(%a, %a, 1024.0) to %out.
; Unlike the 2.0 cases, the checks above expect 1024.0 (bit pattern
; 0x44800000) to be moved into a VGPR first and passed as the third
; v_fma_f32 source, with the SGPR used for the first two sources.
130define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_kimm(float addrspace(1)* %out, float %a) #0 {
131  %fma = call float @llvm.fma.f32(float %a, float %a, float 1024.0) #1
132  store float %fma, float addrspace(1)* %out, align 4
133  ret void
134}
135
136; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_k_k_s:
137; GCN-DAG: s_load_dword [[SGPR:s[0-9]+]]
138; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
139; GCN: v_fma_f32 [[RESULT0:v[0-9]+]], [[VK]], [[VK]], [[SGPR]]
140; GCN: buffer_store_dword [[RESULT0]]
; Kernel under test: stores fma(1024.0, 1024.0, %a) to %out.
; The literal is repeated in the first two operands. The checks above expect
; it to be materialized once into a VGPR (0x44800000) that is used for both
; sources, with %a's SGPR as the third source.
141define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_k_s(float addrspace(1)* %out, float %a) #0 {
142  %fma = call float @llvm.fma.f32(float 1024.0, float 1024.0, float %a) #1
143  store float %fma, float addrspace(1)* %out
144  ret void
145}
146
147; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_k_k_s_x2:
148; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
149; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
150; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
151; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VK]], [[VK]], s[[SGPR0]]
152; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VK]], [[VK]], s[[SGPR1]]
153; GCN: buffer_store_dword [[RESULT0]]
154; GCN: buffer_store_dword [[RESULT1]]
155; GCN: s_endpgm
; Kernel under test: stores fma(1024.0, 1024.0, %a) and then
; fma(1024.0, 1024.0, %b) to %out.
; Two-call variant of the k_k_s case. The checks above expect a single VGPR
; holding 0x44800000 to be shared by both v_fma_f32 instructions, each taking
; a different half of the loaded SGPR pair as its third source.
156define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_k_s_x2(float addrspace(1)* %out, float %a, float %b) #0 {
157  %fma0 = call float @llvm.fma.f32(float 1024.0, float 1024.0, float %a) #1
158  %fma1 = call float @llvm.fma.f32(float 1024.0, float 1024.0, float %b) #1
  ; Volatile stores to the same address, so both results are written out.
159  store volatile float %fma0, float addrspace(1)* %out
160  store volatile float %fma1, float addrspace(1)* %out
161  ret void
162}
163
164; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_k_s_k:
165; GCN-DAG: s_load_dword [[SGPR:s[0-9]+]]
166; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
167; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[VK]], [[VK]]
168; GCN: buffer_store_dword [[RESULT]]
; Kernel under test: stores fma(1024.0, %a, 1024.0) to %out.
; The literal is the first and third IR operand. Note that the checks above
; expect the SGPR as the FIRST v_fma_f32 source and the literal's VGPR as the
; second and third, i.e. the first two operands appear swapped relative to
; the IR.
169define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_s_k(float addrspace(1)* %out, float %a) #0 {
170  %fma = call float @llvm.fma.f32(float 1024.0, float %a, float 1024.0) #1
171  store float %fma, float addrspace(1)* %out
172  ret void
173}
174
175; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_k_s_k_x2:
176; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
177; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
178; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
179; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], s[[SGPR0]], [[VK]], [[VK]]
180; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], s[[SGPR1]], [[VK]], [[VK]]
181; GCN: buffer_store_dword [[RESULT0]]
182; GCN: buffer_store_dword [[RESULT1]]
183; GCN: s_endpgm
; Kernel under test: stores fma(1024.0, %a, 1024.0) and then
; fma(1024.0, %b, 1024.0) to %out.
; Two-call variant of the k_s_k case. The checks above expect one VGPR holding
; 0x44800000 to be shared by both v_fma_f32 instructions, with each scalar
; argument appearing as the first source.
184define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_s_k_x2(float addrspace(1)* %out, float %a, float %b) #0 {
185  %fma0 = call float @llvm.fma.f32(float 1024.0, float %a, float 1024.0) #1
186  %fma1 = call float @llvm.fma.f32(float 1024.0, float %b, float 1024.0) #1
  ; Volatile stores to the same address, so both results are written out.
187  store volatile float %fma0, float addrspace(1)* %out
188  store volatile float %fma1, float addrspace(1)* %out
189  ret void
190}
191
192; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_s_k_k:
193; GCN-DAG: s_load_dword [[SGPR:s[0-9]+]]
194; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
195; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[VK]], [[VK]]
196; GCN: buffer_store_dword [[RESULT]]
; Kernel under test: stores fma(%a, 1024.0, 1024.0) to %out.
; The literal is repeated in the last two operands. The checks above expect
; it to be materialized once into a VGPR (0x44800000) used for the second and
; third v_fma_f32 sources, with %a's SGPR as the first source.
197define amdgpu_kernel void @test_literal_use_twice_ternary_op_s_k_k(float addrspace(1)* %out, float %a) #0 {
198  %fma = call float @llvm.fma.f32(float %a, float 1024.0, float 1024.0) #1
199  store float %fma, float addrspace(1)* %out
200  ret void
201}
202
203; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_s_k_k_x2:
204; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
205; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
206; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
207; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], s[[SGPR0]], [[VK]], [[VK]]
208; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], s[[SGPR1]], [[VK]], [[VK]]
209; GCN: buffer_store_dword [[RESULT0]]
210; GCN: buffer_store_dword [[RESULT1]]
211; GCN: s_endpgm
; Kernel under test: stores fma(%a, 1024.0, 1024.0) and then
; fma(%b, 1024.0, 1024.0) to %out.
; Two-call variant of the s_k_k case. The checks above expect one VGPR holding
; 0x44800000 to be shared by both v_fma_f32 instructions, each taking a
; different half of the loaded SGPR pair as its first source.
212define amdgpu_kernel void @test_literal_use_twice_ternary_op_s_k_k_x2(float addrspace(1)* %out, float %a, float %b) #0 {
213  %fma0 = call float @llvm.fma.f32(float %a, float 1024.0, float 1024.0) #1
214  %fma1 = call float @llvm.fma.f32(float %b, float 1024.0, float 1024.0) #1
  ; Volatile stores to the same address, so both results are written out.
215  store volatile float %fma0, float addrspace(1)* %out
216  store volatile float %fma1, float addrspace(1)* %out
217  ret void
218}
219
220; GCN-LABEL: {{^}}test_s0_s1_k_f32:
221; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
222; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
223; GCN-DAG: v_mov_b32_e32 [[VK0:v[0-9]+]], 0x44800000
224; GCN-DAG: v_mov_b32_e32 [[VS1:v[0-9]+]], s[[SGPR1]]
225
226; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], s[[SGPR0]], [[VS1]], [[VK0]]
227; GCN-DAG: v_mov_b32_e32 [[VK1:v[0-9]+]], 0x45800000
228; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], s[[SGPR0]], [[VS1]], [[VK1]]
229
230; GCN: buffer_store_dword [[RESULT0]]
231; GCN: buffer_store_dword [[RESULT1]]
; Kernel under test: stores fma(%a, %b, 1024.0) and then fma(%a, %b, 4096.0)
; to %out.
; Two distinct scalar arguments plus a different literal per call. The checks
; above expect %b's SGPR to be copied into one VGPR shared by both v_fma_f32
; instructions, %a's SGPR to be used directly, and each literal (0x44800000,
; 0x45800000) to be moved into its own VGPR.
232define amdgpu_kernel void @test_s0_s1_k_f32(float addrspace(1)* %out, float %a, float %b) #0 {
233  %fma0 = call float @llvm.fma.f32(float %a, float %b, float 1024.0) #1
234  %fma1 = call float @llvm.fma.f32(float %a, float %b, float 4096.0) #1
  ; Volatile stores to the same address, so both results are written out.
235  store volatile float %fma0, float addrspace(1)* %out
236  store volatile float %fma1, float addrspace(1)* %out
237  ret void
238}
239
240; FIXME: Immediate in SGPRs just copied to VGPRs
241; GCN-LABEL: {{^}}test_s0_s1_k_f64:
242; GCN-DAG: s_load_dwordx2 [[SGPR0:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
243; GCN-DAG: s_load_dwordx2 s{{\[}}[[SGPR1_SUB0:[0-9]+]]:[[SGPR1_SUB1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x1d|0x74}}
244; GCN-DAG: v_mov_b32_e32 v[[VK0_SUB1:[0-9]+]], 0x40900000
245; GCN-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0{{$}}
246
247; GCN-DAG: v_mov_b32_e32 v[[VS1_SUB0:[0-9]+]], s[[SGPR1_SUB0]]
248; GCN-DAG: v_mov_b32_e32 v[[VS1_SUB1:[0-9]+]], s[[SGPR1_SUB1]]
249; GCN: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, v{{\[}}[[VZERO]]:[[VK0_SUB1]]{{\]}}
250
251; Same zero component is re-used for half of each immediate.
252; GCN: v_mov_b32_e32 v[[VK1_SUB1:[0-9]+]], 0x40b00000
253; GCN: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, v{{\[}}[[VZERO]]:[[VK1_SUB1]]{{\]}}
254
255; GCN: buffer_store_dwordx2 [[RESULT0]]
256; GCN: buffer_store_dwordx2 [[RESULT1]]
; Kernel under test: f64 counterpart of test_s0_s1_k_f32. Stores
; fma(%a, %b, 1024.0) and then fma(%a, %b, 4096.0) to %out.
; Unnamed [8 x i32] parameters sit before %a and before %b; the checks above
; expect %a and %b to arrive through two separate s_load_dwordx2 instructions
; at different offsets (presumably the reason for the padding -- confirm).
; %b's SGPR pair is expected to be copied into a VGPR pair, and each double
; literal to be built from a shared zero low half plus its own high half
; (0x40900000 for 1024.0, 0x40b00000 for 4096.0).
257define amdgpu_kernel void @test_s0_s1_k_f64(double addrspace(1)* %out, [8 x i32], double %a, [8 x i32], double %b) #0 {
258  %fma0 = call double @llvm.fma.f64(double %a, double %b, double 1024.0) #1
259  %fma1 = call double @llvm.fma.f64(double %a, double %b, double 4096.0) #1
  ; Volatile stores to the same address, so both results are written out.
260  store volatile double %fma0, double addrspace(1)* %out
261  store volatile double %fma1, double addrspace(1)* %out
262  ret void
263}
264
265attributes #0 = { nounwind }
266attributes #1 = { nounwind readnone }
267