//===---------- target_impl.cu - NVPTX OpenMP GPU options ------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Definitions of target specific functions
//
//===----------------------------------------------------------------------===//
#pragma omp declare target

#include "target_impl.h"
#include "common/debug.h"

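// Split a 64-bit value into (or rebuild it from) its low and high 32-bit
// halves using the PTX mov.b64 vector form; 'lo' holds bits [31:0] and 'hi'
// bits [63:32].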
DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
}

DEVICE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
  uint64_t val;
  asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
  return val;
}

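// %lanemask_lt / %lanemask_gt are PTX special registers holding the mask of
// lanes in the warp with IDs below / above the calling lane.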
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
  __kmpc_impl_lanemask_t res;
  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(res));
  return res;
}

DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
  __kmpc_impl_lanemask_t res;
  asm("mov.u32 %0, %%lanemask_gt;" : "=r"(res));
  return res;
}

DEVICE uint32_t __kmpc_impl_smid() {
  uint32_t id;
  asm("mov.u32 %0, %%smid;" : "=r"(id));
  return id;
}

DEVICE double __kmpc_impl_get_wtick() {
  // Timer precision is 1ns
  return ((double)1E-9);
}

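// %globaltimer is a 64-bit nanosecond-resolution global timer, so scaling the
// reading by the tick (1ns) converts it to seconds.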
DEVICE double __kmpc_impl_get_wtime() {
  unsigned long long nsecs;
  asm("mov.u64  %0, %%globaltimer;" : "=l"(nsecs));
  return (double)nsecs * __kmpc_impl_get_wtick();
}

// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask().
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
#if CUDA_VERSION < 9020
  return __nvvm_vote_ballot(1);
#else
  unsigned int Mask;
  asm volatile("activemask.b32 %0;" : "=r"(Mask));
  return Mask;
#endif
}

// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
                                     int32_t SrcLane) {
#if CUDA_VERSION >= 9000
  return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f);
#else
  return __nvvm_shfl_idx_i32(Var, SrcLane, 0x1f);
#endif // CUDA_VERSION
}

DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
                                          int32_t Var, uint32_t Delta,
                                          int32_t Width) {
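  // Pack the PTX shfl 'c' operand: bits [12:8] carry the segment mask
  // (WARPSIZE - Width) and bits [4:0] the lane clamp, matching the encoding
  // CUDA uses for __shfl_down with a sub-warp width.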
  int32_t T = ((WARPSIZE - Width) << 8) | 0x1f;
#if CUDA_VERSION >= 9000
  return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
#else
  return __nvvm_shfl_down_i32(Var, Delta, T);
#endif // CUDA_VERSION
}

DEVICE void __kmpc_impl_syncthreads() { __syncthreads(); }

DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
#if CUDA_VERSION >= 9000
  __nvvm_bar_warp_sync(Mask);
#else
  // In Cuda < 9.0 no need to sync threads in warps.
#endif // CUDA_VERSION
}

// NVPTX specific kernel initialization
DEVICE void __kmpc_impl_target_init() { /* nvptx needs no extra setup */
}

// Barrier until num_threads arrive.
DEVICE void __kmpc_impl_named_sync(uint32_t num_threads) {
  // The named barrier for active parallel threads of a team in an L1 parallel
  // region to synchronize with each other.
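  // Barrier #1 is used so as not to clash with the default barrier #0 that
  // bar.sync 0 (__syncthreads) uses; PTX expects the thread count passed to
  // bar.sync to be a multiple of the warp size.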
  int barrier = 1;
  asm volatile("bar.sync %0, %1;"
               :
               : "r"(barrier), "r"(num_threads)
               : "memory");
}

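// Memory fences: membar.gl orders at device scope, membar.cta at thread-block
// scope, and membar.sys at system scope (including the host).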
DEVICE void __kmpc_impl_threadfence() { __nvvm_membar_gl(); }
DEVICE void __kmpc_impl_threadfence_block() { __nvvm_membar_cta(); }
DEVICE void __kmpc_impl_threadfence_system() { __nvvm_membar_sys(); }

// Calls to the NVPTX layer (assuming 1D layout)
DEVICE int GetThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); }
DEVICE int GetBlockIdInKernel() { return __nvvm_read_ptx_sreg_ctaid_x(); }
DEVICE int GetNumberOfBlocksInKernel() {
  return __nvvm_read_ptx_sreg_nctaid_x();
}
DEVICE int GetNumberOfThreadsInBlock() { return __nvvm_read_ptx_sreg_ntid_x(); }
DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
DEVICE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }

// Atomics
DEVICE uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
  return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
}
DEVICE uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) {
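  // atom.inc semantics: atomically store 0 if the old value is >= Val,
  // otherwise old + 1, and return the old value (wrapping increment).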
  return __nvvm_atom_inc_gen_ui(Address, Val);
}

DEVICE uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) {
  return __atomic_fetch_max(Address, Val, __ATOMIC_SEQ_CST);
}

DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) {
  uint32_t R;
  __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
  return R;
}

DEVICE uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare,
                                  uint32_t Val) {
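  // On failure __atomic_compare_exchange writes the value observed at *Address
  // back into Compare, so returning Compare yields the old value either way.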
  (void)__atomic_compare_exchange(Address, &Compare, &Val, false,
                                  __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
  return Compare;
}

DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *Address,
                                                 unsigned long long Val) {
  unsigned long long R;
  __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
  return R;
}

DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *Address,
                                            unsigned long long Val) {
  return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
}

#define __OMP_SPIN 1000
#define UNSET 0u
#define SET 1u

DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock) {
  __kmpc_impl_unset_lock(lock);
}

DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *lock) {
  __kmpc_impl_unset_lock(lock);
}

DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock) {
  // TODO: not sure spinning is a good idea here..
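  // Test-and-set with clock-based backoff: after a failed CAS, spin for a
  // number of SM clock cycles (handling 32-bit clock wrap-around) before
  // trying again.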
  while (__kmpc_atomic_cas(lock, UNSET, SET) != UNSET) {
    int32_t start = __nvvm_read_ptx_sreg_clock();
    int32_t now;
    for (;;) {
      now = __nvvm_read_ptx_sreg_clock();
      int32_t cycles = now > start ? now - start : now + (0xffffffff - start);
      if (cycles >= __OMP_SPIN * GetBlockIdInKernel()) {
        break;
      }
    }
  } // wait for 0 to be the read value
}

DEVICE void __kmpc_impl_unset_lock(omp_lock_t *lock) {
  (void)__kmpc_atomic_exchange(lock, UNSET);
}

DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock) {
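  // An atomic add of 0 is used as a sequentially consistent read of the lock
  // word.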
  return __kmpc_atomic_add(lock, 0u);
}

DEVICE void *__kmpc_impl_malloc(size_t x) { return malloc(x); }
DEVICE void __kmpc_impl_free(void *x) { free(x); }

#pragma omp end declare target