1; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,MESA %s
2; RUN: llc -march=amdgcn -mcpu=hawaii -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,MESA %s
3; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,VI,MESA %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,HSA %s
5
; External callees used by the kernels below. Each is a declaration only, so
; every call in this file targets a function with no body in this module.
; The name of each callee spells out its parameter type(s).

; i1 / i8 / i16 scalars: a plain form plus signext and zeroext parameter forms.
6declare void @external_void_func_i1(i1) #0
7declare void @external_void_func_i1_signext(i1 signext) #0
8declare void @external_void_func_i1_zeroext(i1 zeroext) #0
9
10declare void @external_void_func_i8(i8) #0
11declare void @external_void_func_i8_signext(i8 signext) #0
12declare void @external_void_func_i8_zeroext(i8 zeroext) #0
13
14declare void @external_void_func_i16(i16) #0
15declare void @external_void_func_i16_signext(i16 signext) #0
16declare void @external_void_func_i16_zeroext(i16 zeroext) #0
17
; 32-bit and 64-bit integers, scalar and vector.
18declare void @external_void_func_i32(i32) #0
19declare void @external_void_func_i64(i64) #0
20declare void @external_void_func_v2i64(<2 x i64>) #0
21declare void @external_void_func_v3i64(<3 x i64>) #0
22declare void @external_void_func_v4i64(<4 x i64>) #0
23
; Floating point, scalar and vector.
24declare void @external_void_func_f16(half) #0
25declare void @external_void_func_f32(float) #0
26declare void @external_void_func_f64(double) #0
27declare void @external_void_func_v2f32(<2 x float>) #0
28declare void @external_void_func_v2f64(<2 x double>) #0
29declare void @external_void_func_v3f64(<3 x double>) #0
30
; Vectors of 16-bit elements.
31declare void @external_void_func_v2i16(<2 x i16>) #0
32declare void @external_void_func_v2f16(<2 x half>) #0
33declare void @external_void_func_v3i16(<3 x i16>) #0
34declare void @external_void_func_v3f16(<3 x half>) #0
35declare void @external_void_func_v4i16(<4 x i16>) #0
36declare void @external_void_func_v4f16(<4 x half>) #0
37
; Vectors of i32, including two callees that take a trailing i32 after the vector.
38declare void @external_void_func_v2i32(<2 x i32>) #0
39declare void @external_void_func_v3i32(<3 x i32>) #0
40declare void @external_void_func_v3i32_i32(<3 x i32>, i32) #0
41declare void @external_void_func_v4i32(<4 x i32>) #0
42declare void @external_void_func_v8i32(<8 x i32>) #0
43declare void @external_void_func_v16i32(<16 x i32>) #0
44declare void @external_void_func_v32i32(<32 x i32>) #0
45declare void @external_void_func_v32i32_i32(<32 x i32>, i32) #0
46
47; return value and argument
48declare i32 @external_i32_func_i32(i32) #0
49
50; Structs
; One callee takes the struct by value, one through a byval pointer in
; addrspace(5), and one takes an sret pointer followed by a byval pointer.
51declare void @external_void_func_struct_i8_i32({ i8, i32 }) #0
52declare void @external_void_func_byval_struct_i8_i32({ i8, i32 } addrspace(5)* byval) #0
53declare void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32({ i8, i32 } addrspace(5)* sret, { i8, i32 } addrspace(5)* byval) #0
54
55declare void @external_void_func_v16i8(<16 x i8>) #0
56
57
; Calls @external_void_func_i1 with the constant `i1 true`. The checks expect
; the callee address to be built with s_getpc_b64 plus a rel32 add/addc pair,
; the argument to appear as the value 1 in v0, and s_endpgm to follow the
; s_swappc_b64 call directly. The Mesa runs additionally expect the scratch
; resource to be set up in s[36:39] and copied into s[0:3].
58; FIXME: Should be passing -1
59; GCN-LABEL: {{^}}test_call_external_void_func_i1_imm:
60; MESA: s_mov_b32 s36, SCRATCH_RSRC_DWORD
61
62; MESA-DAG: s_mov_b64 s[0:1], s[36:37]
63
64; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
65; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1@rel32@lo+4
66; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1@rel32@hi+4
67; GCN-DAG: v_mov_b32_e32 v0, 1{{$}}
68; MESA-DAG: s_mov_b64 s[2:3], s[38:39]
69
70; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
71; GCN-NEXT: s_endpgm
72define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
73  call void @external_void_func_i1(i1 true)
74  ret void
75}
76
; Passes a volatile-loaded i1 to a callee whose parameter is declared signext.
; The checks expect the byte load to be followed, after the vmcnt wait, by
; `v_bfe_i32 v0, v0, 0, 1` (a signed one-bit field extract) immediately before
; the call. s33 is expected to be copied from s3 on the Mesa runs and from s9
; on the amdhsa run, and then used to initialise s4 and s32.
77; GCN-LABEL: {{^}}test_call_external_void_func_i1_signext:
78; MESA: s_mov_b32 s33, s3{{$}}
79; HSA: s_mov_b32 s33, s9{{$}}
80
81; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
82; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_signext@rel32@lo+4
83; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_signext@rel32@hi+4
84; GCN-NEXT: buffer_load_ubyte [[VAR:v[0-9]+]]
85; HSA-NEXT: s_mov_b32 s4, s33
86; HSA-NEXT: s_mov_b32 s32, s33
87
88; MESA-DAG: s_mov_b32 s4, s33{{$}}
89; MESA-DAG: s_mov_b32 s32, s33{{$}}
90
91; GCN: s_waitcnt vmcnt(0)
92; GCN-NEXT: v_bfe_i32 v0, v0, 0, 1
93; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
94; GCN-NEXT: s_endpgm
95define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
  ; The value comes from a volatile load through an undef pointer, so it is
  ; not a compile-time constant.
96  %var = load volatile i1, i1 addrspace(1)* undef
  ; The call site carries no signext of its own; the attribute is only on the
  ; declaration of the callee.
97  call void @external_void_func_i1_signext(i1 %var)
98  ret void
99}
100
; Passes a volatile-loaded i1 to a callee whose parameter is declared zeroext.
; The checks expect the byte to be loaded into v0 and, after the vmcnt wait,
; masked with `v_and_b32_e32 v0, 1, v0` immediately before the call.
101; FIXME: load should be scheduled before getpc
102; GCN-LABEL: {{^}}test_call_external_void_func_i1_zeroext:
103; MESA: s_mov_b32 s33, s3{{$}}
104
105; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
106; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_zeroext@rel32@lo+4
107; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_zeroext@rel32@hi+4
108; GCN-NEXT: buffer_load_ubyte v0
109
110; GCN-DAG: s_mov_b32 s4, s33{{$}}
111; GCN-DAG: s_mov_b32 s32, s33{{$}}
112
113; GCN: s_waitcnt vmcnt(0)
114; GCN-NEXT: v_and_b32_e32 v0, 1, v0
115; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
116; GCN-NEXT: s_endpgm
117define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
118  %var = load volatile i1, i1 addrspace(1)* undef
  ; The call site carries no zeroext of its own; the attribute is only on the
  ; declaration of the callee.
119  call void @external_void_func_i1_zeroext(i1 %var)
120  ret void
121}
122
; Calls @external_void_func_i8 with the constant 123. The checks expect the
; value to be materialized as 0x7b in v0 directly after the callee address
; computation, and s_endpgm to follow the call.
123; GCN-LABEL: {{^}}test_call_external_void_func_i8_imm:
124; MESA-DAG: s_mov_b32 s33, s3{{$}}
125
126; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
127; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8@rel32@lo+4
128; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8@rel32@hi+4
129; GCN-NEXT: v_mov_b32_e32 v0, 0x7b
130
131; HSA-DAG: s_mov_b32 s4, s33{{$}}
132; GCN-DAG: s_mov_b32 s32, s33{{$}}
133
134; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
135; GCN-NEXT: s_endpgm
136define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
137  call void @external_void_func_i8(i8 123)
138  ret void
139}
140
; Passes a volatile-loaded i8 to a callee whose parameter is declared signext.
; The checks expect a sign-extending buffer_load_sbyte straight into v0, with
; no separate extension instruction between the vmcnt wait and the call.
; s33 is expected to be copied from s9 (amdhsa) or s3 (Mesa) and then used to
; initialise both s4 and s32.
141; FIXME: don't wait before call
142; GCN-LABEL: {{^}}test_call_external_void_func_i8_signext:
143; HSA-DAG: s_mov_b32 s33, s9{{$}}
144; MESA-DAG: s_mov_b32 s33, s3{{$}}
145
146; GCN-DAG: buffer_load_sbyte v0
147; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
148; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_signext@rel32@lo+4
149; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_signext@rel32@hi+4
150
151; GCN-DAG: s_mov_b32 s4, s33
152; GCN-DAG: s_mov_b32 s32, s33
153
154; GCN: s_waitcnt vmcnt(0)
155; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
156; GCN-NEXT: s_endpgm
157define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
  ; The value comes from a volatile load through an undef pointer, so it is
  ; not a compile-time constant.
158  %var = load volatile i8, i8 addrspace(1)* undef
  ; The call site carries no signext of its own; the attribute is only on the
  ; declaration of the callee.
159  call void @external_void_func_i8_signext(i8 %var)
160  ret void
161}
162
; Passes a volatile-loaded i8 to a callee whose parameter is declared zeroext.
; The checks expect a buffer_load_ubyte straight into v0, with no separate
; masking instruction between the vmcnt wait and the call.
163; GCN-LABEL: {{^}}test_call_external_void_func_i8_zeroext:
164; MESA-DAG: s_mov_b32 s33, s3{{$}}
165; HSA-DAG: s_mov_b32 s33, s9{{$}}
166
167; GCN-DAG: buffer_load_ubyte v0
168; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
169; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_zeroext@rel32@lo+4
170; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_zeroext@rel32@hi+4
171
172; GCN-DAG: s_mov_b32 s4, s33
173; GCN-DAG: s_mov_b32 s32, s33
174
175; GCN: s_waitcnt vmcnt(0)
176; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
177; GCN-NEXT: s_endpgm
178define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
179  %var = load volatile i8, i8 addrspace(1)* undef
  ; The call site carries no zeroext of its own; the attribute is only on the
  ; declaration of the callee.
180  call void @external_void_func_i8_zeroext(i8 %var)
181  ret void
182}
183
; Calls @external_void_func_i16 with the constant 123. The checks expect 0x7b
; in v0 and s4/s32 initialised from s33 at some point before the call; the
; exact callee address sequence is not checked here.
184; GCN-LABEL: {{^}}test_call_external_void_func_i16_imm:
185; GCN-DAG: v_mov_b32_e32 v0, 0x7b{{$}}
186
187; GCN-DAG: s_mov_b32 s4, s33
188; GCN-DAG: s_mov_b32 s32, s33
189
190; GCN: s_swappc_b64
191define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
192  call void @external_void_func_i16(i16 123)
193  ret void
194}
195
; Passes a volatile-loaded i16 to a callee whose parameter is declared signext.
; The checks expect a sign-extending buffer_load_sshort straight into v0, with
; the call directly after the vmcnt wait.
196; GCN-LABEL: {{^}}test_call_external_void_func_i16_signext:
197; MESA-DAG: s_mov_b32 s33, s3{{$}}
198
199; GCN-DAG: buffer_load_sshort v0
200; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
201; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_signext@rel32@lo+4
202; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_signext@rel32@hi+4
203
204; GCN-DAG: s_mov_b32 s4, s33
205; GCN-DAG: s_mov_b32 s32, s33
206
207; GCN: s_waitcnt vmcnt(0)
208; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
209; GCN-NEXT: s_endpgm
210define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
211  %var = load volatile i16, i16 addrspace(1)* undef
  ; The call site carries no signext of its own; the attribute is only on the
  ; declaration of the callee.
212  call void @external_void_func_i16_signext(i16 %var)
213  ret void
214}
215
; Passes a volatile-loaded i16 to a callee whose parameter is declared zeroext.
; The checks expect a buffer_load_ushort straight into v0, with the call
; directly after the vmcnt wait.
216; GCN-LABEL: {{^}}test_call_external_void_func_i16_zeroext:
217; MESA-DAG: s_mov_b32 s33, s3{{$}}
218
219
220; GCN-DAG: buffer_load_ushort v0
221; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
222; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_zeroext@rel32@lo+4
223; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_zeroext@rel32@hi+4
224
225; GCN-DAG: s_mov_b32 s4, s33
226; GCN-DAG: s_mov_b32 s32, s33
227
228; GCN: s_waitcnt vmcnt(0)
229; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
230; GCN-NEXT: s_endpgm
231define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
232  %var = load volatile i16, i16 addrspace(1)* undef
  ; The call site carries no zeroext of its own; the attribute is only on the
  ; declaration of the callee.
233  call void @external_void_func_i16_zeroext(i16 %var)
234  ret void
235}
236
; Calls @external_void_func_i32 with the constant 42. The checks expect the
; callee address sequence, then 42 in v0, then s4/s32 initialised from s33,
; and s_endpgm directly after the call.
237; GCN-LABEL: {{^}}test_call_external_void_func_i32_imm:
238; MESA-DAG: s_mov_b32 s33, s3{{$}}
239
240; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
241; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i32@rel32@lo+4
242; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i32@rel32@hi+4
243; GCN: v_mov_b32_e32 v0, 42
244; GCN-DAG: s_mov_b32 s4, s33
245; GCN-DAG: s_mov_b32 s32, s33
246
247; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
248; GCN-NEXT: s_endpgm
249define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 {
250  call void @external_void_func_i32(i32 42)
251  ret void
252}
253
; Calls @external_void_func_i64 with the constant 123. The checks expect the
; two halves (0x7b and 0) to be materialized in scalar registers first and
; then copied into v0 and v1, in any order relative to the callee address
; computation, before the call.
254; GCN-LABEL: {{^}}test_call_external_void_func_i64_imm:
255; GCN-DAG: s_movk_i32 [[K0:s[0-9]+]], 0x7b{{$}}
256; GCN-DAG: s_mov_b32 [[K1:s[0-9]+]], 0{{$}}
257; GCN-DAG: v_mov_b32_e32 v0, [[K0]]
258; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
259; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i64@rel32@lo+4
260; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i64@rel32@hi+4
261; GCN-DAG: v_mov_b32_e32 v1, [[K1]]
262; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
263; GCN-NEXT: s_endpgm
264define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 {
265  call void @external_void_func_i64(i64 123)
266  ret void
267}
268
; Loads a <2 x i64> from the null global pointer and passes it by value. The
; checks expect one buffer_load_dwordx4 into v[0:3] and the call directly
; after a wait.
269; GCN-LABEL: {{^}}test_call_external_void_func_v2i64:
270; GCN: buffer_load_dwordx4 v[0:3]
271; GCN: s_waitcnt
272; GCN-NEXT: s_swappc_b64
273define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
274  %val = load <2 x i64>, <2 x i64> addrspace(1)* null
275  call void @external_void_func_v2i64(<2 x i64> %val)
276  ret void
277}
278
; Passes a constant <2 x i64>. The two elements are 0x0000000200000001 and
; 0x0000000400000003, so the checks expect the dwords 1, 2, 3, 4 in v0..v3
; (low half first for each element).
279; GCN-LABEL: {{^}}test_call_external_void_func_v2i64_imm:
280; GCN-DAG: v_mov_b32_e32 v0, 1
281; GCN-DAG: v_mov_b32_e32 v1, 2
282; GCN-DAG: v_mov_b32_e32 v2, 3
283; GCN-DAG: v_mov_b32_e32 v3, 4
284; GCN: s_swappc_b64
285define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 {
286  call void @external_void_func_v2i64(<2 x i64> <i64 8589934593, i64 17179869187>)
287  ret void
288}
289
; Passes a <3 x i64> whose first two elements are loaded and whose third is a
; constant. The checks expect the loaded part in v[0:3] and the constant
; 0x0000000200000001 as the dwords 1 and 2 in v4 and v5.
290; GCN-LABEL: {{^}}test_call_external_void_func_v3i64:
291; GCN: buffer_load_dwordx4 v[0:3]
292; GCN: v_mov_b32_e32 v4, 1
293; GCN: v_mov_b32_e32 v5, 2
294; GCN: s_waitcnt
295; GCN-NEXT: s_swappc_b64
296define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
297  %load = load <2 x i64>, <2 x i64> addrspace(1)* null
  ; Mask <0, 1, 2> takes both loaded elements and then element 0 of the
  ; constant operand.
298  %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 undef>, <3 x i32> <i32 0, i32 1, i32 2>
299
300  call void @external_void_func_v3i64(<3 x i64> %val)
301  ret void
302}
303
; Passes a <4 x i64> whose first two elements are loaded and whose last two
; are constants. The checks expect the loaded part in v[0:3] and the constant
; dwords 1, 2, 3, 4 in v4..v7.
304; GCN-LABEL: {{^}}test_call_external_void_func_v4i64:
305; GCN: buffer_load_dwordx4 v[0:3]
306; GCN-DAG: v_mov_b32_e32 v4, 1
307; GCN-DAG: v_mov_b32_e32 v5, 2
308; GCN-DAG: v_mov_b32_e32 v6, 3
309; GCN-DAG: v_mov_b32_e32 v7, 4
310
311; GCN: s_waitcnt
312; GCN-NEXT: s_swappc_b64
313define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
314  %load = load <2 x i64>, <2 x i64> addrspace(1)* null
  ; Mask <0, 1, 2, 3> concatenates the loaded vector with the constant vector.
315  %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 17179869187>, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
316  call void @external_void_func_v4i64(<4 x i64> %val)
317  ret void
318}
319
; Passes the constant half 4.0. The fiji and gfx900 runs expect the literal
; 0x4400 in v0, while the hawaii run expects the operand printed as 4.0.
; No other mention of v0 is allowed between that move and the call.
320; GCN-LABEL: {{^}}test_call_external_void_func_f16_imm:
321; VI: v_mov_b32_e32 v0, 0x4400
322; CI: v_mov_b32_e32 v0, 4.0
323; GCN-NOT: v0
324; GCN: s_swappc_b64
325define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 {
326  call void @external_void_func_f16(half 4.0)
327  ret void
328}
329
; Passes the constant float 4.0. The checks expect it in v0 and no further
; mention of v0 before the call.
330; GCN-LABEL: {{^}}test_call_external_void_func_f32_imm:
331; GCN: v_mov_b32_e32 v0, 4.0
332; GCN-NOT: v0
333; GCN: s_swappc_b64
334define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 {
335  call void @external_void_func_f32(float 4.0)
336  ret void
337}
338
; Passes the constant <2 x float> <1.0, 2.0>. The checks expect element 0 in
; v0 and element 1 in v1 before the call.
339; GCN-LABEL: {{^}}test_call_external_void_func_v2f32_imm:
340; GCN-DAG: v_mov_b32_e32 v0, 1.0
341; GCN-DAG: v_mov_b32_e32 v1, 2.0
342; GCN: s_swappc_b64
343define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 {
344  call void @external_void_func_v2f32(<2 x float> <float 1.0, float 2.0>)
345  ret void
346}
347
; Passes the constant double 4.0. The checks expect the low dword (0) in v0
; and the high dword (0x40100000) in v1, in that order, before the call.
348; GCN-LABEL: {{^}}test_call_external_void_func_f64_imm:
349; GCN: v_mov_b32_e32 v0, 0{{$}}
350; GCN: v_mov_b32_e32 v1, 0x40100000
351; GCN: s_swappc_b64
352define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 {
353  call void @external_void_func_f64(double 4.0)
354  ret void
355}
356
; Passes the constant <2 x double> <2.0, 4.0>. The checks expect each element
; as a low/high dword pair: v0 = 0 and v1 printed as 2.0 for the first
; element, v2 = 0 and v3 = 0x40100000 for the second, in that order.
357; GCN-LABEL: {{^}}test_call_external_void_func_v2f64_imm:
358; GCN: v_mov_b32_e32 v0, 0{{$}}
359; GCN: v_mov_b32_e32 v1, 2.0
360; GCN: v_mov_b32_e32 v2, 0{{$}}
361; GCN: v_mov_b32_e32 v3, 0x40100000
362; GCN: s_swappc_b64
363define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 {
364  call void @external_void_func_v2f64(<2 x double> <double 2.0, double 4.0>)
365  ret void
366}
367
; Passes the constant <3 x double> <2.0, 4.0, 8.0>. The checks expect each
; element as a low/high dword pair in v0..v5; the six moves may appear in any
; order, but all of them must come before the call, which is why the call is
; an ordered check rather than a member of the unordered group.
368; GCN-LABEL: {{^}}test_call_external_void_func_v3f64_imm:
369; GCN-DAG: v_mov_b32_e32 v0, 0{{$}}
370; GCN-DAG: v_mov_b32_e32 v1, 2.0
371; GCN-DAG: v_mov_b32_e32 v2, 0{{$}}
372; GCN-DAG: v_mov_b32_e32 v3, 0x40100000
373; GCN-DAG: v_mov_b32_e32 v4, 0{{$}}
374; GCN-DAG: v_mov_b32_e32 v5, 0x40200000
375; GCN: s_swappc_b64
376define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 {
377  call void @external_void_func_v3f64(<3 x double> <double 2.0, double 4.0, double 8.0>)
378  ret void
379}
380
; Loads a <2 x i16> and passes it by value. Only the gfx900 run has checks
; beyond the label; it expects a single dword load into v0 and no further
; mention of v0 before the call.
381; GCN-LABEL: {{^}}test_call_external_void_func_v2i16:
382; GFX9: buffer_load_dword v0
383; GFX9-NOT: v0
384; GFX9: s_swappc_b64
385define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 {
386  %val = load <2 x i16>, <2 x i16> addrspace(1)* undef
387  call void @external_void_func_v2i16(<2 x i16> %val)
388  ret void
389}
390
; Loads a <3 x i16> and passes it by value. Only the gfx900 run has checks
; beyond the label; it expects a two-dword load into v[0:1] and no further
; mention of v0 or v1 before the call.
391; GCN-LABEL: {{^}}test_call_external_void_func_v3i16:
392; GFX9: buffer_load_dwordx2 v[0:1]
393; GFX9-NOT: v0
394; GFX9-NOT: v1
395; GFX9: s_swappc_b64
396define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 {
397  %val = load <3 x i16>, <3 x i16> addrspace(1)* undef
398  call void @external_void_func_v3i16(<3 x i16> %val)
399  ret void
400}
401
; Passes the constant <3 x i16> <1, 2, 3>. The gfx900 checks expect elements
; 0 and 1 packed as 0x20001 in a scalar register, element 2 produced with
; s_pack_ll_b32_b16, and both scalars then copied into v0 and v1.
402; FIXME: materialize constant directly in VGPR
403; GCN-LABEL: {{^}}test_call_external_void_func_v3i16_imm:
404; GFX9-DAG: s_mov_b32 [[K01:s[0-9]+]], 0x20001
405; GFX9-DAG: s_pack_ll_b32_b16 [[K23:s[0-9]+]], 3, s{{[0-9]+}}
406; GFX9: v_mov_b32_e32 v0, [[K01]]
407; GFX9: v_mov_b32_e32 v1, [[K23]]
408; GFX9: s_swappc_b64
409define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 {
410  call void @external_void_func_v3i16(<3 x i16> <i16 1, i16 2, i16 3>)
411  ret void
412}
413
; Loads a <4 x i16> and passes it by value. Only the gfx900 run has checks
; beyond the label; it expects a two-dword load into v[0:1] and no further
; mention of v0 or v1 before the call.
414; GCN-LABEL: {{^}}test_call_external_void_func_v4i16:
415; GFX9: buffer_load_dwordx2 v[0:1]
416; GFX9-NOT: v0
417; GFX9-NOT: v1
418; GFX9: s_swappc_b64
419define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 {
420  %val = load <4 x i16>, <4 x i16> addrspace(1)* undef
421  call void @external_void_func_v4i16(<4 x i16> %val)
422  ret void
423}
424
; Passes the constant <4 x i16> <1, 2, 3, 4>. The gfx900 checks expect the
; elements packed two per dword: 0x20001 in v0 and 0x40003 in v1.
425; GCN-LABEL: {{^}}test_call_external_void_func_v4i16_imm:
426; GFX9-DAG: v_mov_b32_e32 v0, 0x20001
427; GFX9-DAG: v_mov_b32_e32 v1, 0x40003
428; GFX9: s_swappc_b64
429define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 {
430  call void @external_void_func_v4i16(<4 x i16> <i16 1, i16 2, i16 3, i16 4>)
431  ret void
432}
433
; Loads a <2 x half> and passes it by value. Only the gfx900 run has checks
; beyond the label; it expects a single dword load into v0 and no further
; mention of v0 before the call.
434; GCN-LABEL: {{^}}test_call_external_void_func_v2f16:
435; GFX9: buffer_load_dword v0
436; GFX9-NOT: v0
437; GFX9: s_swappc_b64
438define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 {
439  %val = load <2 x half>, <2 x half> addrspace(1)* undef
440  call void @external_void_func_v2f16(<2 x half> %val)
441  ret void
442}
443
; Loads a <2 x i32> and passes it by value. The checks expect one two-dword
; load into v[0:1] and the call directly after a wait.
444; GCN-LABEL: {{^}}test_call_external_void_func_v2i32:
445; GCN: buffer_load_dwordx2 v[0:1]
446; GCN: s_waitcnt
447; GCN-NEXT: s_swappc_b64
448define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 {
449  %val = load <2 x i32>, <2 x i32> addrspace(1)* undef
450  call void @external_void_func_v2i32(<2 x i32> %val)
451  ret void
452}
453
; Passes the constant <2 x i32> <1, 2>. The checks expect 1 in v0 and 2 in v1
; before the call.
454; GCN-LABEL: {{^}}test_call_external_void_func_v2i32_imm:
455; GCN-DAG: v_mov_b32_e32 v0, 1
456; GCN-DAG: v_mov_b32_e32 v1, 2
457; GCN: s_swappc_b64
458define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 {
459  call void @external_void_func_v2i32(<2 x i32> <i32 1, i32 2>)
460  ret void
461}
462
; Passes the constant <3 x i32> <3, 4, 5>. The checks expect the elements in
; v0..v2 and no mention of v3 before the call, i.e. the three-element vector
; is not padded out to a fourth register.
463; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_imm:
464; HSA-DAG: s_mov_b32 s33, s9
465; MESA-DAG: s_mov_b32 s33, s3{{$}}
466
467; GCN-DAG: v_mov_b32_e32 v0, 3
468; GCN-DAG: v_mov_b32_e32 v1, 4
469; GCN-DAG: v_mov_b32_e32 v2, 5
470; GCN-NOT: v3
471
472; GCN: s_swappc_b64
473define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 {
474  call void @external_void_func_v3i32(<3 x i32> <i32 3, i32 4, i32 5>)
475  ret void
476}
477
; Passes the constant <3 x i32> <3, 4, 5> followed by the scalar 6. The checks
; expect the vector in v0..v2 and the trailing scalar in v3, i.e. directly
; after the vector with no padding register, all before the call.
478; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_i32:
479; GCN-DAG: v_mov_b32_e32 v0, 3
480; GCN-DAG: v_mov_b32_e32 v1, 4
481; GCN-DAG: v_mov_b32_e32 v2, 5
482; GCN-DAG: v_mov_b32_e32 v3, 6
; GCN: s_swappc_b64
483define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 {
484  call void @external_void_func_v3i32_i32(<3 x i32> <i32 3, i32 4, i32 5>, i32 6)
485  ret void
486}
487
; Loads a <4 x i32> and passes it by value. The checks expect one four-dword
; load into v[0:3] and the call directly after a wait.
488; GCN-LABEL: {{^}}test_call_external_void_func_v4i32:
489; GCN: buffer_load_dwordx4 v[0:3]
490; GCN: s_waitcnt
491; GCN-NEXT: s_swappc_b64
492define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 {
493  %val = load <4 x i32>, <4 x i32> addrspace(1)* undef
494  call void @external_void_func_v4i32(<4 x i32> %val)
495  ret void
496}
497
; Passes the constant <4 x i32> <1, 2, 3, 4>. The checks expect element N in
; register vN for N = 0..3 before the call.
498; GCN-LABEL: {{^}}test_call_external_void_func_v4i32_imm:
499; GCN-DAG: v_mov_b32_e32 v0, 1
500; GCN-DAG: v_mov_b32_e32 v1, 2
501; GCN-DAG: v_mov_b32_e32 v2, 3
502; GCN-DAG: v_mov_b32_e32 v3, 4
503; GCN: s_swappc_b64
504define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 {
505  call void @external_void_func_v4i32(<4 x i32> <i32 1, i32 2, i32 3, i32 4>)
506  ret void
507}
508
; Loads an <8 x i32> through a pointer that is itself loaded, and passes the
; vector by value. The checks expect two four-dword loads filling v[0:7] and
; the call directly after a wait.
509; GCN-LABEL: {{^}}test_call_external_void_func_v8i32:
510; GCN-DAG: buffer_load_dwordx4 v[0:3], off
511; GCN-DAG: buffer_load_dwordx4 v[4:7], off
512; GCN: s_waitcnt
513; GCN-NEXT: s_swappc_b64
514define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 {
  ; The global pointer is first fetched from an undef addrspace(4) location.
515  %ptr = load <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(4)* undef
516  %val = load <8 x i32>, <8 x i32> addrspace(1)* %ptr
517  call void @external_void_func_v8i32(<8 x i32> %val)
518  ret void
519}
520
; Passes the constant <8 x i32> <1..8>. The checks expect element N in
; register vN for N = 0..7 before the call.
521; GCN-LABEL: {{^}}test_call_external_void_func_v8i32_imm:
522; GCN-DAG: v_mov_b32_e32 v0, 1
523; GCN-DAG: v_mov_b32_e32 v1, 2
524; GCN-DAG: v_mov_b32_e32 v2, 3
525; GCN-DAG: v_mov_b32_e32 v3, 4
526; GCN-DAG: v_mov_b32_e32 v4, 5
527; GCN-DAG: v_mov_b32_e32 v5, 6
528; GCN-DAG: v_mov_b32_e32 v6, 7
529; GCN-DAG: v_mov_b32_e32 v7, 8
530; GCN: s_swappc_b64
531define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 {
532  call void @external_void_func_v8i32(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>)
533  ret void
534}
535
; Loads a <16 x i32> through a loaded pointer and passes it by value. The
; checks expect four four-dword loads filling v[0:15] and the call directly
; after a wait.
536; GCN-LABEL: {{^}}test_call_external_void_func_v16i32:
537; GCN-DAG: buffer_load_dwordx4 v[0:3], off
538; GCN-DAG: buffer_load_dwordx4 v[4:7], off
539; GCN-DAG: buffer_load_dwordx4 v[8:11], off
540; GCN-DAG: buffer_load_dwordx4 v[12:15], off
541; GCN: s_waitcnt
542; GCN-NEXT: s_swappc_b64
543define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
544  %ptr = load <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(4)* undef
545  %val = load <16 x i32>, <16 x i32> addrspace(1)* %ptr
546  call void @external_void_func_v16i32(<16 x i32> %val)
547  ret void
548}
549
; Loads a <32 x i32> through a loaded pointer and passes it by value. The
; checks expect eight four-dword loads filling v[0:31] and the call directly
; after a wait.
550; GCN-LABEL: {{^}}test_call_external_void_func_v32i32:
551; GCN-DAG: buffer_load_dwordx4 v[0:3], off
552; GCN-DAG: buffer_load_dwordx4 v[4:7], off
553; GCN-DAG: buffer_load_dwordx4 v[8:11], off
554; GCN-DAG: buffer_load_dwordx4 v[12:15], off
555; GCN-DAG: buffer_load_dwordx4 v[16:19], off
556; GCN-DAG: buffer_load_dwordx4 v[20:23], off
557; GCN-DAG: buffer_load_dwordx4 v[24:27], off
558; GCN-DAG: buffer_load_dwordx4 v[28:31], off
559; GCN: s_waitcnt
560; GCN-NEXT: s_swappc_b64
561define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
562  %ptr = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef
563  %val = load <32 x i32>, <32 x i32> addrspace(1)* %ptr
564  call void @external_void_func_v32i32(<32 x i32> %val)
565  ret void
566}
567
; Passes a <32 x i32> followed by one more i32. The checks expect the vector
; in v[0:31] and the extra scalar to be loaded into some other register and
; then stored to memory at s32 offset:4 before the call, rather than passed
; in a register. No s_add_u32 on s32 is expected on any run.
568; GCN-LABEL: {{^}}test_call_external_void_func_v32i32_i32:
569; HSA-DAG: s_mov_b32 s33, s9
570; HSA-NOT: s_add_u32 s32
571
572; MESA-DAG: s_mov_b32 s33, s3{{$}}
573; MESA-NOT: s_add_u32 s32
574
575; GCN-DAG: buffer_load_dword [[VAL1:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
576; GCN-DAG: buffer_load_dwordx4 v[0:3], off
577; GCN-DAG: buffer_load_dwordx4 v[4:7], off
578; GCN-DAG: buffer_load_dwordx4 v[8:11], off
579; GCN-DAG: buffer_load_dwordx4 v[12:15], off
580; GCN-DAG: buffer_load_dwordx4 v[16:19], off
581; GCN-DAG: buffer_load_dwordx4 v[20:23], off
582; GCN-DAG: buffer_load_dwordx4 v[24:27], off
583; GCN-DAG: buffer_load_dwordx4 v[28:31], off
584
585; GCN: s_waitcnt
586; GCN: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], s32 offset:4{{$}}
587; GCN: s_swappc_b64
588; GCN-NEXT: s_endpgm
589define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
590  %ptr0 = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef
591  %val0 = load <32 x i32>, <32 x i32> addrspace(1)* %ptr0
592  %val1 = load i32, i32 addrspace(1)* undef
593  call void @external_void_func_v32i32_i32(<32 x i32> %val0, i32 %val1)
594  ret void
595}
596
; Calls a function that both takes and returns an i32. The checks expect the
; argument 42 in v0 before the call and, right after the call, an lgkmcnt
; wait followed by a store of v0 (the returned value) through s[36:39].
597; FIXME: No wait after call
598; GCN-LABEL: {{^}}test_call_external_i32_func_i32_imm:
599; GCN: v_mov_b32_e32 v0, 42
600; GCN: s_swappc_b64 s[30:31],
601; GCN-NEXT: s_waitcnt lgkmcnt(0)
602; GCN-NEXT: buffer_store_dword v0, off, s[36:39], 0
603define amdgpu_kernel void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %out) #0 {
604  %val = call i32 @external_i32_func_i32(i32 42)
605  store volatile i32 %val, i32 addrspace(1)* %out
606  ret void
607}
608
; Loads a { i8, i32 } struct and passes it by value (not byval). The checks
; expect the i8 field loaded as a byte into v0 and the i32 field into v1, with
; the call directly after the vmcnt wait.
609; GCN-LABEL: {{^}}test_call_external_void_func_struct_i8_i32:
610; GCN: buffer_load_ubyte v0, off
611; GCN: buffer_load_dword v1, off
612; GCN: s_waitcnt vmcnt(0)
613; GCN-NEXT: s_swappc_b64
614define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
615  %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef
616  %val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0
617  call void @external_void_func_struct_i8_i32({ i8, i32 } %val)
618  ret void
619}
620
; Passes a stack-allocated { i8, i32 } to a callee that takes it as a byval
; pointer. The checks expect:
;   - the outgoing stack pointer register to be s33 + 0x400, with no second
;     s_add_u32 into that register;
;   - the constants 3 and 8 stored to the local object at s33 offset:8/12;
;   - both fields reloaded as dwords and copied to offsets 4 and 8 relative to
;     the outgoing stack pointer, i.e. the struct is copied for the callee;
;   - the call immediately after, with no later mention of that register.
; The Mesa runs use s[36:39] as the buffer resource and the amdhsa run uses
; s[0:3].
621; GCN-LABEL: {{^}}test_call_external_void_func_byval_struct_i8_i32:
622; GCN-DAG: s_add_u32 [[SP:s[0-9]+]], s33, 0x400{{$}}
623
624; GCN-DAG: v_mov_b32_e32 [[VAL0:v[0-9]+]], 3
625; GCN-DAG: v_mov_b32_e32 [[VAL1:v[0-9]+]], 8
626; MESA-DAG: buffer_store_byte [[VAL0]], off, s[36:39], s33 offset:8
627; MESA-DAG: buffer_store_dword [[VAL1]], off, s[36:39], s33 offset:12
628
629; HSA-DAG: buffer_store_byte [[VAL0]], off, s[0:3], s33 offset:8
630; HSA-DAG: buffer_store_dword [[VAL1]], off, s[0:3], s33 offset:12
631
632; GCN-NOT: s_add_u32 [[SP]],
633
634; HSA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[0:3], s33 offset:8
635; HSA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[0:3], s33 offset:12
636
637; HSA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[0:3], [[SP]] offset:4
638; HSA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[0:3], [[SP]] offset:8
639
640
641; MESA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[36:39], s33 offset:8
642; MESA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[36:39], s33 offset:12
643
644; MESA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[36:39], [[SP]] offset:4
645; MESA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[36:39], [[SP]] offset:8
646
647; GCN-NEXT: s_swappc_b64
648; GCN-NOT: [[SP]]
649define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 {
650  %val = alloca { i8, i32 }, align 4, addrspace(5)
651  %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %val, i32 0, i32 0
652  %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %val, i32 0, i32 1
  ; Initialise both fields so the copy made for the byval argument has known
  ; contents.
653  store i8 3, i8 addrspace(5)* %gep0
654  store i32 8, i32 addrspace(5)* %gep1
655  call void @external_void_func_byval_struct_i8_i32({ i8, i32 } addrspace(5)* %val)
656  ret void
657}
658
; Calls a function taking an sret pointer for its result and a byval pointer
; for its input, both to stack-allocated { i8, i32 } objects. The checks
; expect:
;   - the outgoing stack pointer register to be a base register + 0x800, with
;     no second s_add_u32 into it before the call and no s_sub_u32 after;
;   - the input fields (3 and 8) stored at base offset:8/12, reloaded, and
;     copied to offsets 4 and 8 relative to the outgoing stack pointer;
;   - after the call, the result fields read back from base offset:16/20 and
;     stored out with a byte store and a dword store.
659; GCN-LABEL: {{^}}test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
660; MESA-DAG: s_add_u32 [[SP:s[0-9]+]], [[FP_REG:s[0-9]+]], 0x800{{$}}
661; HSA-DAG: s_add_u32 [[SP:s[0-9]+]], [[FP_REG:s[0-9]+]], 0x800{{$}}
662
663; GCN-DAG: v_mov_b32_e32 [[VAL0:v[0-9]+]], 3
664; GCN-DAG: v_mov_b32_e32 [[VAL1:v[0-9]+]], 8
665; GCN-DAG: buffer_store_byte [[VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:8
666; GCN-DAG: buffer_store_dword [[VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:12
667
668; GCN-DAG: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:8
669; GCN-DAG: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:12
670
671; GCN-NOT: s_add_u32 [[SP]]
672; GCN-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:4
673; GCN-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:8
674; GCN-NEXT: s_swappc_b64
675; GCN-DAG: buffer_load_ubyte [[LOAD_OUT_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:16
676; GCN-DAG: buffer_load_dword [[LOAD_OUT_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:20
677; GCN-NOT: s_sub_u32 [[SP]]
678
679; GCN: buffer_store_byte [[LOAD_OUT_VAL0]], off
680; GCN: buffer_store_dword [[LOAD_OUT_VAL1]], off
681define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32(i32) #0 {
682  %in.val = alloca { i8, i32 }, align 4, addrspace(5)
683  %out.val = alloca { i8, i32 }, align 4, addrspace(5)
684  %in.gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %in.val, i32 0, i32 0
685  %in.gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %in.val, i32 0, i32 1
686  store i8 3, i8 addrspace(5)* %in.gep0
687  store i32 8, i32 addrspace(5)* %in.gep1
  ; First argument is the sret result slot, second is the byval input.
688  call void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32({ i8, i32 } addrspace(5)* %out.val, { i8, i32 } addrspace(5)* %in.val)
689  %out.gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %out.val, i32 0, i32 0
690  %out.gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %out.val, i32 0, i32 1
691  %out.val0 = load i8, i8 addrspace(5)* %out.gep0
692  %out.val1 = load i32, i32 addrspace(5)* %out.gep1
693
  ; Volatile stores keep the loaded result fields live so the reads of the
  ; sret slot show up in the output.
694  store volatile i8 %out.val0, i8 addrspace(1)* undef
695  store volatile i32 %out.val1, i32 addrspace(1)* undef
696  ret void
697}
698
; Loads a <16 x i8> through a loaded pointer and passes it by value. Only the
; function label is checked; no instruction-level expectations are given, so
; this only verifies that the kernel is emitted.
; NOTE(review): presumably a compile-without-crashing test -- confirm whether
; argument register checks were intended here.
699; GCN-LABEL: {{^}}test_call_external_void_func_v16i8:
700define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 {
701  %ptr = load <16 x i8> addrspace(1)*, <16 x i8> addrspace(1)* addrspace(4)* undef
702  %val = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
703  call void @external_void_func_v16i8(<16 x i8> %val)
704  ret void
705}
706
; Attribute groups. Every declaration and definition visible in this file
; uses #0; groups #1 and #2 are not referenced by anything shown here.
707attributes #0 = { nounwind }
708attributes #1 = { nounwind readnone }
709attributes #2 = { nounwind noinline }
710