//===------- target_impl.hip - AMDGCN OpenMP GPU implementation --- HIP -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Definitions of target specific functions
//
//===----------------------------------------------------------------------===//
#pragma omp declare target

#include "common/omptarget.h"
#include "target_impl.h"
#include "target_interface.h"

// Implementations initially derived from hcc

// Initialized with a 64-bit mask with bits set in positions less than the
// thread's lane number in the warp. Only bits belonging to currently
// executing lanes survive the final AND with the active mask.
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
  uint32_t lane = GetLaneId();
  int64_t ballot = __kmpc_impl_activemask();
  // All ones strictly below this lane's bit position, zero at and above it.
  uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
  return mask & ballot;
}

// Initialized with a 64-bit mask with bits set in positions greater than the
// thread's lane number in the warp, again restricted to active lanes.
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
  uint32_t lane = GetLaneId();
  // Highest lane has no lanes above it; also avoids shifting by the full
  // 64-bit width below, which would be undefined behavior.
  if (lane == (WARPSIZE - 1))
    return 0;
  uint64_t ballot = __kmpc_impl_activemask();
  uint64_t mask = (~((uint64_t)0)) << (lane + 1);
  return mask & ballot;
}

// Advertised timer resolution; see __kmpc_impl_get_wtime for why the timer
// itself is currently a stub.
EXTERN double __kmpc_impl_get_wtick() { return ((double)1E-9); }

EXTERN double __kmpc_impl_get_wtime() {
  // The intrinsics for measuring time have undocumented frequency
  // This will probably need to be found by measurement on a number of
  // architectures. Until then, return 0, which is very inaccurate as a
  // timer but resolves the undefined symbol at link time.
  return 0;
}

// Warp vote function: the wavefront's exec mask, one bit per executing lane.
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
  return __builtin_amdgcn_read_exec();
}

// Barrier over num_threads threads of the team, built on an atomic counter
// word rather than s_barrier so it can cover a subset of the workgroup.
static void pteam_mem_barrier(uint32_t num_threads, uint32_t *barrier_state) {
  // Acquire: make prior writes by other waves visible after the barrier.
  __atomic_thread_fence(__ATOMIC_ACQUIRE);

  uint32_t num_waves = (num_threads + WARPSIZE - 1) / WARPSIZE;

  // Partial barrier implementation for amdgcn.
  // Uses two 16 bit unsigned counters. One for the number of waves to have
  // reached the barrier, and one to count how many times the barrier has been
  // passed. These are packed in a single atomically accessed 32 bit integer.
  // Low bits for the number of waves, assumed zero before this call.
  // High bits to count the number of times the barrier has been passed.

  // precondition: num_waves != 0;
  // invariant: num_waves * WARPSIZE == num_threads;
  // precondition: num_waves < 0xffffu;

  // Increment the low 16 bits once, using the lowest active thread.
  uint64_t lowestActiveThread = __kmpc_impl_ffs(__kmpc_impl_activemask()) - 1;
  bool isLowest = GetLaneId() == lowestActiveThread;

  if (isLowest) {
    uint32_t load = __atomic_fetch_add(barrier_state, 1,
                                       __ATOMIC_RELAXED); // commutative

    // Record the number of times the barrier has been passed
    uint32_t generation = load & 0xffff0000u;

    if ((load & 0x0000ffffu) == (num_waves - 1)) {
      // Reached num_waves in low bits so this is the last wave.
      // Set low bits to zero and increment high bits
      load += 0x00010000u; // wrap is safe
      load &= 0xffff0000u; // because bits zeroed second

      // Reset the wave counter and release the waiting waves
      __atomic_store_n(barrier_state, load, __ATOMIC_RELAXED);
    } else {
      // more waves still to go, spin until generation counter changes
      do {
        __builtin_amdgcn_s_sleep(0);
        load = __atomic_load_n(barrier_state, __ATOMIC_RELAXED);
      } while ((load & 0xffff0000u) == generation);
    }
  }
  // Release: writes made before the barrier are published to other waves.
  __atomic_thread_fence(__ATOMIC_RELEASE);
}

// Per-team barrier state word. Deliberately not zero-initialized (there are
// no global ctors on device and the memory is not zero-init); the initial
// store happens in __kmpc_impl_target_init. Placement is controlled by the
// omp_pteam_mem_alloc allocator pragma below.
uint32_t __kmpc_L0_Barrier [[clang::loader_uninitialized]];
#pragma allocate(__kmpc_L0_Barrier) allocator(omp_pteam_mem_alloc)

EXTERN void __kmpc_impl_target_init() {
  // Don't have global ctors, and shared memory is not zero init
  __atomic_store_n(&__kmpc_L0_Barrier, 0u, __ATOMIC_RELEASE);
}

// Named barrier across num_threads threads, backed by the shared state word.
EXTERN void __kmpc_impl_named_sync(uint32_t num_threads) {
  pteam_mem_barrier(num_threads, &__kmpc_L0_Barrier);
}

namespace {
// ceil(n / d), computed without forming n + d - 1 (which could overflow).
uint32_t get_grid_dim(uint32_t n, uint16_t d) {
  uint32_t q = n / d;
  return q + (n > q * d);
}
// Workitems in workgroup group_id: group_size for full groups, or the
// remainder for a trailing partial group at the end of the grid.
uint32_t get_workgroup_dim(uint32_t group_id, uint32_t grid_size,
                           uint16_t group_size) {
  uint32_t r = grid_size - group_id * group_size;
  return (r < group_size) ? r : group_size;
}
} // namespace

EXTERN int __kmpc_get_hardware_num_blocks() {
  return get_grid_dim(__builtin_amdgcn_grid_size_x(),
                      __builtin_amdgcn_workgroup_size_x());
}

EXTERN int __kmpc_get_hardware_num_threads_in_block() {
  return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(),
                           __builtin_amdgcn_grid_size_x(),
                           __builtin_amdgcn_workgroup_size_x());
}

EXTERN unsigned GetWarpId() { return __kmpc_get_hardware_thread_id_in_block() / WARPSIZE; }
EXTERN unsigned GetWarpSize() { return WARPSIZE; }
// mbcnt counts set bits of the operand mask in positions below this lane;
// with an all-ones mask the result is the lane index within the wavefront.
EXTERN unsigned GetLaneId() {
  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
}

EXTERN uint32_t __kmpc_amdgcn_gpu_num_threads() {
  return __kmpc_get_hardware_num_threads_in_block();
}

// Atomics
// Each wrapper returns the value observed at *Address before the update.
uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
  return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
}
// Hardware "inc" operation. NOTE(review): amdgcn atomic inc is presumed to
// wrap the stored value back to 0 once it reaches Val — confirm against the
// AMDGPU builtin/ISA documentation.
uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) {
  return __builtin_amdgcn_atomic_inc32(Address, Val, __ATOMIC_SEQ_CST, "");
}
uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) {
  return __atomic_fetch_max(Address, Val, __ATOMIC_SEQ_CST);
}

uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) {
  uint32_t R;
  __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
  return R;
}
// Compare-and-swap. Returns the prior value of *Address whether or not the
// exchange happened: on failure __atomic_compare_exchange writes the
// observed value back into Compare; on success Compare already equals it.
uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare, uint32_t Val) {
  (void)__atomic_compare_exchange(Address, &Compare, &Val, false,
                                  __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
  return Compare;
}

unsigned long long __kmpc_atomic_exchange(unsigned long long *Address,
                                          unsigned long long Val) {
  unsigned long long R;
  __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
  return R;
}
unsigned long long __kmpc_atomic_add(unsigned long long *Address,
                                     unsigned long long Val) {
  return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
}
Stub implementations 180// Weak to allow overriding by local versions while comparing different 181// potential implementations 182__attribute__((weak)) EXTERN void *__kmpc_impl_malloc(size_t) { 183 return nullptr; 184} 185__attribute__((weak)) EXTERN void __kmpc_impl_free(void *) {} 186 187EXTERN void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) { 188 lo = (uint32_t)(val & UINT64_C(0x00000000FFFFFFFF)); 189 hi = (uint32_t)((val & UINT64_C(0xFFFFFFFF00000000)) >> 32); 190} 191 192EXTERN uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) { 193 return (((uint64_t)hi) << 32) | (uint64_t)lo; 194} 195 196EXTERN void __kmpc_impl_syncthreads() { __builtin_amdgcn_s_barrier(); } 197 198EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t) { 199 // AMDGCN doesn't need to sync threads in a warp 200} 201 202EXTERN void __kmpc_impl_threadfence() { 203 __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent"); 204} 205 206EXTERN void __kmpc_impl_threadfence_block() { 207 __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup"); 208} 209 210EXTERN void __kmpc_impl_threadfence_system() { 211 __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, ""); 212} 213 214// Calls to the AMDGCN layer (assuming 1D layout) 215EXTERN int __kmpc_get_hardware_thread_id_in_block() { return __builtin_amdgcn_workitem_id_x(); } 216EXTERN int GetBlockIdInKernel() { return __builtin_amdgcn_workgroup_id_x(); } 217 218#pragma omp end declare target 219