1// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx904 --amdhsa-code-object-version=4 -mattr=+xnack < %s | FileCheck --check-prefix=ASM %s
2// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx904 --amdhsa-code-object-version=4 -mattr=+xnack -filetype=obj < %s > %t
3// RUN: llvm-readelf -S -r -s %t | FileCheck --check-prefix=READOBJ %s
4// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s
5
6// READOBJ: Section Headers
7// READOBJ: .text   PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9]+}} AX {{[0-9]+}} {{[0-9]+}} 256
8// READOBJ: .rodata PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}}        000100 {{[0-9]+}}  A {{[0-9]+}} {{[0-9]+}} 64
9
10// READOBJ: Relocation section '.rela.rodata' at offset
11// READOBJ: 0000000000000010 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 10
12// READOBJ: 0000000000000050 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 110
13// READOBJ: 0000000000000090 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 210
14// READOBJ: 00000000000000d0 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 310
15
16// READOBJ: Symbol table '.symtab' contains {{[0-9]+}} entries:
17// READOBJ:      0000000000000000  0 FUNC    LOCAL  PROTECTED 2 minimal
18// READOBJ-NEXT: 0000000000000100  0 FUNC    LOCAL  PROTECTED 2 complete
19// READOBJ-NEXT: 0000000000000200  0 FUNC    LOCAL  PROTECTED 2 special_sgpr
20// READOBJ-NEXT: 0000000000000300  0 FUNC    LOCAL  PROTECTED 2 disabled_user_sgpr
21// READOBJ-NEXT: 0000000000000000 64 OBJECT  LOCAL  DEFAULT   3 minimal.kd
22// READOBJ-NEXT: 0000000000000040 64 OBJECT  LOCAL  DEFAULT   3 complete.kd
23// READOBJ-NEXT: 0000000000000080 64 OBJECT  LOCAL  DEFAULT   3 special_sgpr.kd
24// READOBJ-NEXT: 00000000000000c0 64 OBJECT  LOCAL  DEFAULT   3 disabled_user_sgpr.kd
25
26// OBJDUMP: Contents of section .rodata
27// Note, relocation for KERNEL_CODE_ENTRY_BYTE_OFFSET is not resolved here.
28// minimal
29// OBJDUMP-NEXT: 0000 00000000 00000000 00000000 00000000
30// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000
31// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000
32// OBJDUMP-NEXT: 0030 0000ac00 80000000 00000000 00000000
33// complete
34// OBJDUMP-NEXT: 0040 01000000 01000000 08000000 00000000
35// OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000
36// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000
37// OBJDUMP-NEXT: 0070 c2500104 1f0f007f 7f000000 00000000
38// special_sgpr
39// OBJDUMP-NEXT: 0080 00000000 00000000 00000000 00000000
40// OBJDUMP-NEXT: 0090 00000000 00000000 00000000 00000000
41// OBJDUMP-NEXT: 00a0 00000000 00000000 00000000 00000000
42// OBJDUMP-NEXT: 00b0 00010000 80000000 00000000 00000000
43// disabled_user_sgpr
44// OBJDUMP-NEXT: 00c0 00000000 00000000 00000000 00000000
45// OBJDUMP-NEXT: 00d0 00000000 00000000 00000000 00000000
46// OBJDUMP-NEXT: 00e0 00000000 00000000 00000000 00000000
47// OBJDUMP-NEXT: 00f0 0000ac00 80000000 00000000 00000000
48
49.text
50// ASM: .text
51
52.amdgcn_target "amdgcn-amd-amdhsa--gfx904:xnack+"
53// ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx904:xnack+"
54
55.p2align 8
56.type minimal,@function
57minimal:
58  s_endpgm
59
60.p2align 8
61.type complete,@function
62complete:
63  s_endpgm
64
65.p2align 8
66.type special_sgpr,@function
67special_sgpr:
68  s_endpgm
69
70.p2align 8
71.type disabled_user_sgpr,@function
72disabled_user_sgpr:
73  s_endpgm
74
75.rodata
76// ASM: .rodata
77
78// Test that only specifying required directives is allowed, and that defaulted
79// values are omitted.
80.p2align 6
81.amdhsa_kernel minimal
82  .amdhsa_next_free_vgpr 0
83  .amdhsa_next_free_sgpr 0
84.end_amdhsa_kernel
85
86// ASM: .amdhsa_kernel minimal
87// ASM: .amdhsa_next_free_vgpr 0
88// ASM-NEXT: .amdhsa_next_free_sgpr 0
89// ASM: .end_amdhsa_kernel
90
91// Test that we can specify all available directives with non-default values.
92.p2align 6
93.amdhsa_kernel complete
94  .amdhsa_group_segment_fixed_size 1
95  .amdhsa_private_segment_fixed_size 1
96  .amdhsa_kernarg_size 8
97  .amdhsa_user_sgpr_private_segment_buffer 1
98  .amdhsa_user_sgpr_dispatch_ptr 1
99  .amdhsa_user_sgpr_queue_ptr 1
100  .amdhsa_user_sgpr_kernarg_segment_ptr 1
101  .amdhsa_user_sgpr_dispatch_id 1
102  .amdhsa_user_sgpr_flat_scratch_init 1
103  .amdhsa_user_sgpr_private_segment_size 1
104  .amdhsa_system_sgpr_private_segment_wavefront_offset 1
105  .amdhsa_system_sgpr_workgroup_id_x 0
106  .amdhsa_system_sgpr_workgroup_id_y 1
107  .amdhsa_system_sgpr_workgroup_id_z 1
108  .amdhsa_system_sgpr_workgroup_info 1
109  .amdhsa_system_vgpr_workitem_id 1
110  .amdhsa_next_free_vgpr 9
111  .amdhsa_next_free_sgpr 27
112  .amdhsa_reserve_vcc 0
113  .amdhsa_reserve_flat_scratch 0
114  .amdhsa_reserve_xnack_mask 1
115  .amdhsa_float_round_mode_32 1
116  .amdhsa_float_round_mode_16_64 1
117  .amdhsa_float_denorm_mode_32 1
118  .amdhsa_float_denorm_mode_16_64 0
119  .amdhsa_dx10_clamp 0
120  .amdhsa_ieee_mode 0
121  .amdhsa_fp16_overflow 1
122  .amdhsa_exception_fp_ieee_invalid_op 1
123  .amdhsa_exception_fp_denorm_src 1
124  .amdhsa_exception_fp_ieee_div_zero 1
125  .amdhsa_exception_fp_ieee_overflow 1
126  .amdhsa_exception_fp_ieee_underflow 1
127  .amdhsa_exception_fp_ieee_inexact 1
128  .amdhsa_exception_int_div_zero 1
129.end_amdhsa_kernel
130
131// ASM: .amdhsa_kernel complete
132// ASM-NEXT: .amdhsa_group_segment_fixed_size 1
133// ASM-NEXT: .amdhsa_private_segment_fixed_size 1
134// ASM-NEXT: .amdhsa_kernarg_size 8
135// ASM-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
136// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1
137// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 1
138// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
139// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 1
140// ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1
141// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 1
142// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
143// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 0
144// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
145// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
146// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 1
147// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 1
148// ASM-NEXT: .amdhsa_next_free_vgpr 9
149// ASM-NEXT: .amdhsa_next_free_sgpr 27
150// ASM-NEXT: .amdhsa_reserve_vcc 0
151// ASM-NEXT: .amdhsa_reserve_flat_scratch 0
152// ASM-NEXT: .amdhsa_reserve_xnack_mask 1
153// ASM-NEXT: .amdhsa_float_round_mode_32 1
154// ASM-NEXT: .amdhsa_float_round_mode_16_64 1
155// ASM-NEXT: .amdhsa_float_denorm_mode_32 1
156// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 0
157// ASM-NEXT: .amdhsa_dx10_clamp 0
158// ASM-NEXT: .amdhsa_ieee_mode 0
159// ASM-NEXT: .amdhsa_fp16_overflow 1
160// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 1
161// ASM-NEXT: .amdhsa_exception_fp_denorm_src 1
162// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 1
163// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 1
164// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 1
165// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 1
166// ASM-NEXT: .amdhsa_exception_int_div_zero 1
167// ASM-NEXT: .end_amdhsa_kernel
168
169// Test that we are including special SGPR usage in the granulated count.
170.p2align 6
171.amdhsa_kernel special_sgpr
172  // Same next_free_sgpr as "complete", but...
173  .amdhsa_next_free_sgpr 27
174  // ...on GFX9 this should require an additional 6 SGPRs, pushing us from
175  // 3 granules to 4
176  .amdhsa_reserve_flat_scratch 1
177
178  .amdhsa_reserve_vcc 0
179  .amdhsa_reserve_xnack_mask 1
180
181  .amdhsa_float_denorm_mode_16_64 0
182  .amdhsa_dx10_clamp 0
183  .amdhsa_ieee_mode 0
184  .amdhsa_next_free_vgpr 0
185.end_amdhsa_kernel
186
187// ASM: .amdhsa_kernel special_sgpr
188// ASM: .amdhsa_next_free_vgpr 0
189// ASM-NEXT: .amdhsa_next_free_sgpr 27
190// ASM-NEXT: .amdhsa_reserve_vcc 0
191// ASM-NEXT: .amdhsa_reserve_xnack_mask 1
192// ASM: .amdhsa_float_denorm_mode_16_64 0
193// ASM-NEXT: .amdhsa_dx10_clamp 0
194// ASM-NEXT: .amdhsa_ieee_mode 0
195// ASM: .end_amdhsa_kernel
196
197// Test that explicitly disabling user_sgpr's does not affect the user_sgpr
198// count, i.e. this should produce the same descriptor as minimal.
199.p2align 6
200.amdhsa_kernel disabled_user_sgpr
201  .amdhsa_user_sgpr_private_segment_buffer 0
202  .amdhsa_next_free_vgpr 0
203  .amdhsa_next_free_sgpr 0
204.end_amdhsa_kernel
205
206// ASM: .amdhsa_kernel disabled_user_sgpr
207// ASM: .amdhsa_next_free_vgpr 0
208// ASM-NEXT: .amdhsa_next_free_sgpr 0
209// ASM: .end_amdhsa_kernel
210
211.section .foo
212
213.byte .amdgcn.gfx_generation_number
214// ASM: .byte 9
215
216.byte .amdgcn.gfx_generation_minor
217// ASM: .byte 0
218
219.byte .amdgcn.gfx_generation_stepping
220// ASM: .byte 4
221
222.byte .amdgcn.next_free_vgpr
223// ASM: .byte 0
224.byte .amdgcn.next_free_sgpr
225// ASM: .byte 0
226
227v_mov_b32_e32 v7, s10
228
229.byte .amdgcn.next_free_vgpr
230// ASM: .byte 8
231.byte .amdgcn.next_free_sgpr
232// ASM: .byte 11
233
234.set .amdgcn.next_free_vgpr, 0
235.set .amdgcn.next_free_sgpr, 0
236
237.byte .amdgcn.next_free_vgpr
238// ASM: .byte 0
239.byte .amdgcn.next_free_sgpr
240// ASM: .byte 0
241
242v_mov_b32_e32 v16, s3
243
244.byte .amdgcn.next_free_vgpr
245// ASM: .byte 17
246.byte .amdgcn.next_free_sgpr
247// ASM: .byte 4
248
249// Metadata
250
251.amdgpu_metadata
252  amdhsa.version:
253    - 3
254    - 0
255  amdhsa.kernels:
256    - .name:       amd_kernel_code_t_test_all
257      .symbol: amd_kernel_code_t_test_all@kd
258      .kernarg_segment_size: 8
259      .group_segment_fixed_size: 16
260      .private_segment_fixed_size: 32
261      .kernarg_segment_align: 64
262      .wavefront_size: 128
263      .sgpr_count: 14
264      .vgpr_count: 40
265      .max_flat_workgroup_size: 256
266    - .name:       amd_kernel_code_t_minimal
267      .symbol: amd_kernel_code_t_minimal@kd
268      .kernarg_segment_size: 8
269      .group_segment_fixed_size: 16
270      .private_segment_fixed_size: 32
271      .kernarg_segment_align: 64
272      .wavefront_size: 128
273      .sgpr_count: 14
274      .vgpr_count: 40
275      .max_flat_workgroup_size: 256
276.end_amdgpu_metadata
277
278// ASM:      	.amdgpu_metadata
279// ASM:      amdhsa.kernels:
280// ASM:        - .group_segment_fixed_size: 16
281// ASM:          .kernarg_segment_align: 64
282// ASM:          .kernarg_segment_size: 8
283// ASM:          .max_flat_workgroup_size: 256
284// ASM:          .name:           amd_kernel_code_t_test_all
285// ASM:          .private_segment_fixed_size: 32
286// ASM:          .sgpr_count:     14
287// ASM:          .symbol:         'amd_kernel_code_t_test_all@kd'
288// ASM:          .vgpr_count:     40
289// ASM:          .wavefront_size: 128
290// ASM:        - .group_segment_fixed_size: 16
291// ASM:          .kernarg_segment_align: 64
292// ASM:          .kernarg_segment_size: 8
293// ASM:          .max_flat_workgroup_size: 256
294// ASM:          .name:           amd_kernel_code_t_minimal
295// ASM:          .private_segment_fixed_size: 32
296// ASM:          .sgpr_count:     14
297// ASM:          .symbol:         'amd_kernel_code_t_minimal@kd'
298// ASM:          .vgpr_count:     40
299// ASM:          .wavefront_size: 128
300// ASM:      amdhsa.version:
301// ASM-NEXT:   - 3
302// ASM-NEXT:   - 0
303// ASM:      	.end_amdgpu_metadata
304