//===------- target_impl.hip - AMDGCN OpenMP GPU implementation --- HIP -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Definitions of target-specific functions
//
//===----------------------------------------------------------------------===//
#pragma omp declare target

#include "common/omptarget.h"
#include "target_impl.h"
#include "target_interface.h"

// Implementations initially derived from hcc

// Initialized with a 64-bit mask with bits set in positions less than the
// thread's lane number in the warp
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
  uint32_t lane = GetLaneId();
  uint64_t ballot = __kmpc_impl_activemask();
  uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
  return mask & ballot;
}

// Initialized with a 64-bit mask with bits set in positions greater than the
// thread's lane number in the warp
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
  uint32_t lane = GetLaneId();
  if (lane == (WARPSIZE - 1))
    return 0;
  uint64_t ballot = __kmpc_impl_activemask();
  uint64_t mask = (~((uint64_t)0)) << (lane + 1);
  return mask & ballot;
}
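
// Illustrative example (not part of the runtime): with a fully active wave64,
// i.e. __kmpc_impl_activemask() == ~0ull, a thread in lane 3 sees
//   __kmpc_impl_lanemask_lt() == 0x0000000000000007 (lanes 0..2)
//   __kmpc_impl_lanemask_gt() == 0xfffffffffffffff0 (lanes 4..63)
// Under divergence, inactive lanes are masked out of both results; if only
// lanes 0..3 are active, lane 3 sees __kmpc_impl_lanemask_gt() == 0.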

EXTERN double __kmpc_impl_get_wtick() { return ((double)1E-9); }

EXTERN double __kmpc_impl_get_wtime() {
  // The intrinsics for measuring time have an undocumented frequency, which
  // will probably need to be determined by measurement on a number of
  // architectures. Until then, return 0, which is very inaccurate as a timer
  // but resolves the undefined symbol at link time.
  return 0;
}

// Warp vote function
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
  return __builtin_amdgcn_read_exec();
}

static void pteam_mem_barrier(uint32_t num_threads, uint32_t *barrier_state) {
  __atomic_thread_fence(__ATOMIC_ACQUIRE);

  uint32_t num_waves = (num_threads + WARPSIZE - 1) / WARPSIZE;

  // Partial barrier implementation for amdgcn.
  // Uses two 16-bit unsigned counters packed in a single atomically accessed
  // 32-bit integer: the low bits count the waves that have reached the
  // barrier (assumed zero before this call), and the high bits count how many
  // times the barrier has been passed.

  // precondition: num_waves != 0
  // precondition: num_waves < 0xffffu
  // invariant: num_waves * WARPSIZE == num_threads

  // Increment the low 16 bits once, using the lowest active thread.
  uint64_t lowestActiveThread = __kmpc_impl_ffs(__kmpc_impl_activemask()) - 1;
  bool isLowest = GetLaneId() == lowestActiveThread;

  if (isLowest) {
    uint32_t load = __atomic_fetch_add(barrier_state, 1,
                                       __ATOMIC_RELAXED); // commutative

    // Record the number of times the barrier has been passed
    uint32_t generation = load & 0xffff0000u;

    if ((load & 0x0000ffffu) == (num_waves - 1)) {
      // Reached num_waves in the low bits, so this is the last wave.
      // Increment the generation in the high bits and zero the wave counter.
      load += 0x00010000u; // wrap is safe
      load &= 0xffff0000u; // because bits are zeroed second

      // Reset the wave counter and release the waiting waves
      __atomic_store_n(barrier_state, load, __ATOMIC_RELAXED);
    } else {
      // More waves still to go; spin until the generation counter changes.
      do {
        __builtin_amdgcn_s_sleep(0);
        load = __atomic_load_n(barrier_state, __ATOMIC_RELAXED);
      } while ((load & 0xffff0000u) == generation);
    }
  }
  __atomic_thread_fence(__ATOMIC_RELEASE);
}
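
// Worked example (hypothetical trace, not part of the runtime): two waves
// enter pteam_mem_barrier with *barrier_state == 0x00000000, num_waves == 2.
//   Wave A arrives first: fetch_add returns 0x00000000; its low half (0) is
//   not num_waves - 1, so A spins watching the generation (high half, 0x0000).
//   Wave B arrives: fetch_add returns 0x00000001; its low half (1) equals
//   num_waves - 1, so B stores 0x00010000 (generation 1, wave count 0).
//   A observes the generation change 0x0000 -> 0x0001 and exits the loop.
// The wave counter is zero again, so the state is ready for the next barrier.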

uint32_t __kmpc_L0_Barrier [[clang::loader_uninitialized]];
#pragma allocate(__kmpc_L0_Barrier) allocator(omp_pteam_mem_alloc)

EXTERN void __kmpc_impl_target_init() {
  // There are no global ctors on the device, and shared memory is not
  // zero-initialized, so clear the barrier state explicitly.
  __atomic_store_n(&__kmpc_L0_Barrier, 0u, __ATOMIC_RELEASE);
}

EXTERN void __kmpc_impl_named_sync(uint32_t num_threads) {
  pteam_mem_barrier(num_threads, &__kmpc_L0_Barrier);
}
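
// Usage sketch (hypothetical): all 256 threads of a team synchronize with
//   __kmpc_impl_named_sync(/*num_threads=*/256);
// Per the preconditions of pteam_mem_barrier, num_threads is expected to be a
// nonzero multiple of WARPSIZE, so every wave contributes exactly one
// increment through its lowest active lane.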

namespace {
// Number of workgroups needed to cover n work-items, i.e. ceil(n / d).
uint32_t get_grid_dim(uint32_t n, uint16_t d) {
  uint32_t q = n / d;
  return q + (n > q * d);
}
// Size of the given workgroup: group_size, except for a trailing partial
// group, which only gets the remaining work-items.
uint32_t get_workgroup_dim(uint32_t group_id, uint32_t grid_size,
                           uint16_t group_size) {
  uint32_t r = grid_size - group_id * group_size;
  return (r < group_size) ? r : group_size;
}
} // namespace
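
// Worked example (illustrative values): for grid_size == 1000 work-items and
// group_size == 256,
//   get_grid_dim(1000, 256)         == 4    (three full groups + remainder)
//   get_workgroup_dim(1, 1000, 256) == 256  (interior groups are full)
//   get_workgroup_dim(3, 1000, 256) == 232  (1000 - 3 * 256 work-items remain
//                                            for the trailing partial group)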

EXTERN int __kmpc_get_hardware_num_blocks() {
  return get_grid_dim(__builtin_amdgcn_grid_size_x(),
                      __builtin_amdgcn_workgroup_size_x());
}

EXTERN int __kmpc_get_hardware_num_threads_in_block() {
  return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(),
                           __builtin_amdgcn_grid_size_x(),
                           __builtin_amdgcn_workgroup_size_x());
}

EXTERN unsigned GetWarpId() {
  return __kmpc_get_hardware_thread_id_in_block() / WARPSIZE;
}
EXTERN unsigned GetWarpSize() { return WARPSIZE; }
EXTERN unsigned GetLaneId() {
  // mbcnt_lo/mbcnt_hi count, for each lane, the bits set below that lane in
  // the given mask; chained over ~0u they yield the lane id within the wave.
  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
}

EXTERN uint32_t __kmpc_amdgcn_gpu_num_threads() {
  return __kmpc_get_hardware_num_threads_in_block();
}

// Atomics
uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
  return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
}
uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) {
  // Hardware wrapping increment: stores 0 when *Address >= Val, otherwise
  // *Address + 1; returns the old value.
  return __builtin_amdgcn_atomic_inc32(Address, Val, __ATOMIC_SEQ_CST, "");
}
uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) {
  return __atomic_fetch_max(Address, Val, __ATOMIC_SEQ_CST);
}

uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) {
  uint32_t R;
  __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
  return R;
}
uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare, uint32_t Val) {
  // On failure, __atomic_compare_exchange updates Compare with the observed
  // value, so this returns the old value of *Address in either case.
  (void)__atomic_compare_exchange(Address, &Compare, &Val, false,
                                  __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
  return Compare;
}
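
// Usage sketch (hypothetical, not part of the runtime): because
// __kmpc_atomic_cas returns the value observed at *Address, a caller could
// build a simple spin lock on a zero-initialized word. Note that naive
// per-lane locking can deadlock under wavefront divergence, so a real caller
// would take the lock from a single lane per wave.
//
//   void lock(uint32_t *Lock) {
//     // Try to swing 0 (unlocked) to 1 (locked); 0 means we acquired it.
//     while (__kmpc_atomic_cas(Lock, 0, 1) != 0)
//       __builtin_amdgcn_s_sleep(0); // back off before retrying
//   }
//   void unlock(uint32_t *Lock) { __kmpc_atomic_exchange(Lock, 0); }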

unsigned long long __kmpc_atomic_exchange(unsigned long long *Address,
                                          unsigned long long Val) {
  unsigned long long R;
  __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
  return R;
}
unsigned long long __kmpc_atomic_add(unsigned long long *Address,
                                     unsigned long long Val) {
  return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
}

// Stub implementations
// Weak, so they can be overridden by local versions while different potential
// implementations are being compared.
__attribute__((weak)) EXTERN void *__kmpc_impl_malloc(size_t) {
  return nullptr;
}
__attribute__((weak)) EXTERN void __kmpc_impl_free(void *) {}

EXTERN void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
  lo = (uint32_t)(val & UINT64_C(0x00000000FFFFFFFF));
  hi = (uint32_t)((val & UINT64_C(0xFFFFFFFF00000000)) >> 32);
}

EXTERN uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
  return (((uint64_t)hi) << 32) | (uint64_t)lo;
}
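
// Round-trip example (illustrative): pack and unpack are inverses.
//   uint32_t lo, hi;
//   __kmpc_impl_unpack(0x0123456789abcdefull, lo, hi);
//   // lo == 0x89abcdef, hi == 0x01234567
//   // __kmpc_impl_pack(lo, hi) == 0x0123456789abcdefull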

EXTERN void __kmpc_impl_syncthreads() { __builtin_amdgcn_s_barrier(); }

EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t) {
  // AMDGCN doesn't need to sync threads in a warp; a wavefront executes in
  // lockstep.
}

EXTERN void __kmpc_impl_threadfence() {
  // "agent" scope: visible to all workgroups on the device.
  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
}

EXTERN void __kmpc_impl_threadfence_block() {
  // "workgroup" scope: visible within the current workgroup.
  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
}

EXTERN void __kmpc_impl_threadfence_system() {
  // Empty scope string: system scope, visible to the host as well.
  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
}

// Calls to the AMDGCN layer (assuming 1D layout)
EXTERN int __kmpc_get_hardware_thread_id_in_block() {
  return __builtin_amdgcn_workitem_id_x();
}
EXTERN int GetBlockIdInKernel() { return __builtin_amdgcn_workgroup_id_x(); }

#pragma omp end declare target