1; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=2 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,HSA %s
2; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MESA %s
3
; Kernel with no explicit arguments (attribute set #0). It performs a volatile
; i32 load through the pointer returned by llvm.amdgcn.implicitarg.ptr; the
; loaded value is not used afterwards.
; Expected: the kernarg segment pointer SGPR is enabled in both runs, the
; kernarg segment is 0 bytes in the amdhsa run and 16 bytes in the mesa3d run,
; and the amdhsa run loads from s[4:5] at offset 0.
4; GCN-LABEL: {{^}}kernel_implicitarg_ptr_empty:
5; GCN: enable_sgpr_kernarg_segment_ptr = 1
6
7; HSA: kernarg_segment_byte_size = 0
8; MESA: kernarg_segment_byte_size = 16
9
10; HSA: s_load_dword s0, s[4:5], 0x0
11define amdgpu_kernel void @kernel_implicitarg_ptr_empty() #0 {
12  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
13  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
14  %load = load volatile i32, i32 addrspace(4)* %cast
15  ret void
16}
17
; Same body as the kernel without explicit arguments, but using attribute
; set #1 ("amdgpu-implicitarg-num-bytes"="48").
; Expected: the kernarg segment pointer SGPR is enabled in both runs, the
; kernarg segment is 48 bytes in the amdhsa run and 16 bytes in the mesa3d run,
; and the amdhsa run loads from s[4:5] at offset 0.
18; GCN-LABEL: {{^}}opencl_kernel_implicitarg_ptr_empty:
19; GCN: enable_sgpr_kernarg_segment_ptr = 1
20
21; HSA: kernarg_segment_byte_size = 48
22; MESA: kernarg_segment_byte_size = 16
23
24; HSA: s_load_dword s0, s[4:5], 0x0
25define amdgpu_kernel void @opencl_kernel_implicitarg_ptr_empty() #1 {
26  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
27  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
28  %load = load volatile i32, i32 addrspace(4)* %cast
29  ret void
30}
31
; Kernel taking one unnamed [112 x i8] explicit argument (attribute set #0).
; It performs a volatile i32 load through llvm.amdgcn.implicitarg.ptr.
; Expected: the kernarg segment is 112 bytes in the amdhsa run and 128 bytes in
; the mesa3d run, and the amdhsa run loads from s[4:5] at offset 0x1c.
; NOTE(review): 0x1c is presumably the 112-byte explicit-argument size
; expressed in dwords (28 * 4 = 112) - confirm against the scalar-load offset
; encoding of the target.
32; GCN-LABEL: {{^}}kernel_implicitarg_ptr:
33; GCN: enable_sgpr_kernarg_segment_ptr = 1
34
35; HSA: kernarg_segment_byte_size = 112
36; MESA: kernarg_segment_byte_size = 128
37
38; HSA: s_load_dword s0, s[4:5], 0x1c
39define amdgpu_kernel void @kernel_implicitarg_ptr([112 x i8]) #0 {
40  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
41  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
42  %load = load volatile i32, i32 addrspace(4)* %cast
43  ret void
44}
45
; Kernel taking one unnamed [112 x i8] explicit argument, using attribute
; set #1 ("amdgpu-implicitarg-num-bytes"="48"). It performs a volatile i32 load
; through llvm.amdgcn.implicitarg.ptr.
; Expected: the kernarg segment is 160 bytes (112 + 48) in the amdhsa run and
; 128 bytes in the mesa3d run, and the amdhsa run loads from s[4:5] at
; offset 0x1c.
46; GCN-LABEL: {{^}}opencl_kernel_implicitarg_ptr:
47; GCN: enable_sgpr_kernarg_segment_ptr = 1
48
49; HSA: kernarg_segment_byte_size = 160
50; MESA: kernarg_segment_byte_size = 128
51
52; HSA: s_load_dword s0, s[4:5], 0x1c
53define amdgpu_kernel void @opencl_kernel_implicitarg_ptr([112 x i8]) #1 {
54  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
55  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
56  %load = load volatile i32, i32 addrspace(4)* %cast
57  ret void
58}
59
; Non-kernel function (attribute set #0) that performs a volatile i32 load
; through llvm.amdgcn.implicitarg.ptr.
; Expected in both runs: an s_waitcnt, then a scalar load from s[4:5] at
; offset 0 into any SGPR, immediately followed by another s_waitcnt and the
; s_setpc_b64 return.
60; GCN-LABEL: {{^}}func_implicitarg_ptr:
61; GCN: s_waitcnt
62; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
63; GCN-NEXT: s_waitcnt
64; GCN-NEXT: s_setpc_b64
65define void @func_implicitarg_ptr() #0 {
66  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
67  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
68  %load = load volatile i32, i32 addrspace(4)* %cast
69  ret void
70}
71
; Non-kernel function that performs a volatile i32 load through
; llvm.amdgcn.implicitarg.ptr. The expectations are the same as for
; @func_implicitarg_ptr: s_waitcnt, a scalar load from s[4:5] at offset 0,
; then s_waitcnt and s_setpc_b64 on the following lines.
; NOTE(review): despite the "opencl" name this function uses attribute set #0,
; not #1 - confirm whether #1 was intended.
72; GCN-LABEL: {{^}}opencl_func_implicitarg_ptr:
73; GCN: s_waitcnt
74; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
75; GCN-NEXT: s_waitcnt
76; GCN-NEXT: s_setpc_b64
77define void @opencl_func_implicitarg_ptr() #0 {
78  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
79  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
80  %load = load volatile i32, i32 addrspace(4)* %cast
81  ret void
82}
83
; Kernel with no explicit arguments (attribute set #0) whose only action is a
; call to @func_implicitarg_ptr.
; Expected: the kernarg segment pointer SGPR is enabled, the kernarg segment is
; 0 bytes in the amdhsa run and 16 bytes in the mesa3d run, and neither
; s[4:5], s4 nor s5 is mentioned between the segment-size line and the
; s_swappc_b64 call.
84; GCN-LABEL: {{^}}kernel_call_implicitarg_ptr_func_empty:
85; GCN: enable_sgpr_kernarg_segment_ptr = 1
86; HSA: kernarg_segment_byte_size = 0
87; MESA: kernarg_segment_byte_size = 16
88; GCN-NOT: s[4:5]
89; GCN-NOT: s4
90; GCN-NOT: s5
91; GCN: s_swappc_b64
92define amdgpu_kernel void @kernel_call_implicitarg_ptr_func_empty() #0 {
93  call void @func_implicitarg_ptr()
94  ret void
95}
96
; Kernel with no explicit arguments, using attribute set #1
; ("amdgpu-implicitarg-num-bytes"="48"), whose only action is a call to
; @func_implicitarg_ptr.
; Expected: the kernarg segment pointer SGPR is enabled, the kernarg segment is
; 48 bytes in the amdhsa run and 16 bytes in the mesa3d run, and neither
; s[4:5], s4 nor s5 is mentioned between the segment-size line and the
; s_swappc_b64 call.
97; GCN-LABEL: {{^}}opencl_kernel_call_implicitarg_ptr_func_empty:
98; GCN: enable_sgpr_kernarg_segment_ptr = 1
99; HSA: kernarg_segment_byte_size = 48
100; MESA: kernarg_segment_byte_size = 16
101; GCN-NOT: s[4:5]
102; GCN-NOT: s4
103; GCN-NOT: s5
104; GCN: s_swappc_b64
105define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func_empty() #1 {
106  call void @func_implicitarg_ptr()
107  ret void
108}
109
; Kernel taking one unnamed [112 x i8] explicit argument (attribute set #0)
; whose only action is a call to @func_implicitarg_ptr.
; Expected: the kernarg segment is 112 bytes in the amdhsa run and 128 bytes in
; the mesa3d run. In both runs s[4:5] is advanced by 0x70 (112) with an
; s_add_u32 / s_addc_u32 pair before the s_swappc_b64 call; the add is
; identical for both targets, so it is checked once under the common prefix.
110; GCN-LABEL: {{^}}kernel_call_implicitarg_ptr_func:
111; GCN: enable_sgpr_kernarg_segment_ptr = 1
112; HSA: kernarg_segment_byte_size = 112
113; MESA: kernarg_segment_byte_size = 128
114
115; GCN: s_add_u32 s4, s4, 0x70
118; GCN: s_addc_u32 s5, s5, 0{{$}}
119; GCN: s_swappc_b64
120define amdgpu_kernel void @kernel_call_implicitarg_ptr_func([112 x i8]) #0 {
121  call void @func_implicitarg_ptr()
122  ret void
123}
124
; Kernel taking one unnamed [112 x i8] explicit argument, using attribute
; set #1 ("amdgpu-implicitarg-num-bytes"="48"), whose only action is a call to
; @func_implicitarg_ptr.
; Expected: the kernarg segment is 160 bytes in the amdhsa run and 128 bytes in
; the mesa3d run. In both runs s[4:5] is advanced by 0x70 (112) with an
; s_add_u32 / s_addc_u32 pair before the s_swappc_b64 call.
125; GCN-LABEL: {{^}}opencl_kernel_call_implicitarg_ptr_func:
126; GCN: enable_sgpr_kernarg_segment_ptr = 1
127; HSA: kernarg_segment_byte_size = 160
128; MESA: kernarg_segment_byte_size = 128
129
130; GCN: s_add_u32 s4, s4, 0x70
131; GCN: s_addc_u32 s5, s5, 0{{$}}
132; GCN: s_swappc_b64
133define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func([112 x i8]) #1 {
134  call void @func_implicitarg_ptr()
135  ret void
136}
137
; Non-kernel function (attribute set #0) whose only action is a call to
; @func_implicitarg_ptr.
; Expected in both runs: the output for this function never mentions s4, s5 or
; s[4:5].
138; GCN-LABEL: {{^}}func_call_implicitarg_ptr_func:
139; GCN-NOT: s4
140; GCN-NOT: s5
141; GCN-NOT: s[4:5]
142define void @func_call_implicitarg_ptr_func() #0 {
143  call void @func_implicitarg_ptr()
144  ret void
145}
146
; Non-kernel function whose only action is a call to @func_implicitarg_ptr.
; Expected in both runs: the output for this function never mentions s4, s5 or
; s[4:5].
; NOTE(review): despite the "opencl" name this function uses attribute set #0
; and calls @func_implicitarg_ptr rather than @opencl_func_implicitarg_ptr -
; confirm whether that is intended.
147; GCN-LABEL: {{^}}opencl_func_call_implicitarg_ptr_func:
148; GCN-NOT: s4
149; GCN-NOT: s5
150; GCN-NOT: s[4:5]
151define void @opencl_func_call_implicitarg_ptr_func() #0 {
152  call void @func_implicitarg_ptr()
153  ret void
154}
155
; Non-kernel function (attribute set #0) that performs two volatile i32 loads:
; one through llvm.amdgcn.kernarg.segment.ptr and one through
; llvm.amdgcn.implicitarg.ptr.
; Expected in both runs: an s_waitcnt; in either order, an s_mov_b64 of 0 into
; an SGPR pair and a scalar load from that pair at offset 0; then a scalar load
; from s[4:5] at offset 0 and an s_waitcnt lgkmcnt(0).
; NOTE(review): the zero-based load is presumably the kernarg segment pointer
; being lowered to null inside a non-kernel function - confirm.
156; GCN-LABEL: {{^}}func_kernarg_implicitarg_ptr:
157; GCN: s_waitcnt
158; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0
159; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0
160; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
161; GCN: s_waitcnt lgkmcnt(0)
162define void @func_kernarg_implicitarg_ptr() #0 {
163  %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
164  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
165  %cast.kernarg.segment.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)*
166  %cast.implicitarg = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
167  %load0 = load volatile i32, i32 addrspace(4)* %cast.kernarg.segment.ptr
168  %load1 = load volatile i32, i32 addrspace(4)* %cast.implicitarg
169  ret void
170}
171
; Non-kernel function that performs two volatile i32 loads: one through
; llvm.amdgcn.kernarg.segment.ptr and one through llvm.amdgcn.implicitarg.ptr.
; The expectations are the same as for @func_kernarg_implicitarg_ptr: an
; s_waitcnt; in either order, an s_mov_b64 of 0 into an SGPR pair and a scalar
; load from that pair at offset 0; then a scalar load from s[4:5] at offset 0
; and an s_waitcnt lgkmcnt(0).
; NOTE(review): despite the "opencl" name this function uses attribute set #0,
; not #1 - confirm whether #1 was intended.
172; GCN-LABEL: {{^}}opencl_func_kernarg_implicitarg_ptr:
173; GCN: s_waitcnt
174; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0
175; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0
176; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
177; GCN: s_waitcnt lgkmcnt(0)
178define void @opencl_func_kernarg_implicitarg_ptr() #0 {
179  %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
180  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
181  %cast.kernarg.segment.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)*
182  %cast.implicitarg = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
183  %load0 = load volatile i32, i32 addrspace(4)* %cast.kernarg.segment.ptr
184  %load1 = load volatile i32, i32 addrspace(4)* %cast.implicitarg
185  ret void
186}
187
; Kernel taking one unnamed [112 x i8] explicit argument (attribute set #0)
; whose only action is a call to @func_kernarg_implicitarg_ptr.
; Expected in both runs: s[4:5] is advanced by 0x70 (112) with an
; s_add_u32 / s_addc_u32 pair before the s_swappc_b64 call. The carry add is
; anchored at end of line so that a longer immediate starting with 0 (for
; example 0x70) cannot satisfy it.
188; GCN-LABEL: {{^}}kernel_call_kernarg_implicitarg_ptr_func:
189; GCN: s_add_u32 s4, s4, 0x70
190; GCN: s_addc_u32 s5, s5, 0{{$}}
191; GCN: s_swappc_b64
192define amdgpu_kernel void @kernel_call_kernarg_implicitarg_ptr_func([112 x i8]) #0 {
193  call void @func_kernarg_implicitarg_ptr()
194  ret void
195}
196
; Kernel taking a <16 x i32> followed by an i32 (attribute set #1,
; "amdgpu-implicitarg-num-bytes"="48"). It performs a volatile i32 load through
; llvm.amdgcn.implicitarg.ptr.
; Expected: the kernarg segment is 120 bytes in the amdhsa run and 84 bytes in
; the mesa3d run, and kernarg_segment_alignment is 6 in both runs.
; NOTE(review): the explicit arguments occupy 64 + 4 = 68 bytes; 84 is 68 + 16
; and 120 is presumably 68 rounded up to 72 plus the 48 implicit bytes, i.e.
; the explicit part is not padded out to the 64-byte vector alignment -
; confirm. The alignment value 6 is presumably log2(64) - confirm.
197; GCN-LABEL: {{^}}kernel_implicitarg_no_struct_align_padding:
198; HSA: kernarg_segment_byte_size = 120
199; MESA: kernarg_segment_byte_size = 84
200; GCN: kernarg_segment_alignment = 6
201define amdgpu_kernel void @kernel_implicitarg_no_struct_align_padding(<16 x i32>, i32) #1 {
202  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
203  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
204  %load = load volatile i32, i32 addrspace(4)* %cast
205  ret void
206}
207
; Declarations of the two intrinsics called by the functions in this file; both
; return an i8 pointer in address space 4 and carry attribute set #2.
208declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #2
209declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #2
210
; Attribute sets referenced in this file:
;   #0 - nounwind noinline only.
;   #1 - as #0 plus "amdgpu-implicitarg-num-bytes"="48"; kernels using it
;        expect an amdhsa kernarg segment 48 bytes larger than their #0
;        counterparts (0 vs 48, 112 vs 160).
;   #2 - applied to the intrinsic declarations.
211attributes #0 = { nounwind noinline }
212attributes #1 = { nounwind noinline "amdgpu-implicitarg-num-bytes"="48" }
213attributes #2 = { nounwind readnone speculatable }
214