//===------------ target_impl.h - NVPTX OpenMP GPU options ------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Definitions of target specific functions
//
//===----------------------------------------------------------------------===//
#ifndef _TARGET_IMPL_H_
#define _TARGET_IMPL_H_

#include <assert.h>
#include <cuda.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>

#include "nvptx_interface.h"

#define DEVICE __device__
#define INLINE __forceinline__ DEVICE
#define NOINLINE __noinline__ DEVICE
#define SHARED __shared__
#define ALIGN(N) __align__(N)

////////////////////////////////////////////////////////////////////////////////
// Kernel options
////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////////////
// The following def must match the absolute limit hardwired in the host RTL:
// the maximum number of threads per team.
#define MAX_THREADS_PER_TEAM 1024

#define WARPSIZE 32

// The named barrier for active parallel threads of a team in an L1 parallel
// region to synchronize with each other.
#define L1_BARRIER (1)

// Maximum number of preallocated arguments to an outlined parallel/simd
// function. Anything more requires dynamic memory allocation.
#define MAX_SHARED_ARGS 20

// Maximum number of omp state objects per SM allocated statically in global
// memory.
#if __CUDA_ARCH__ >= 700
#define OMP_STATE_COUNT 32
#define MAX_SM 84
#elif __CUDA_ARCH__ >= 600
#define OMP_STATE_COUNT 32
#define MAX_SM 56
#else
#define OMP_STATE_COUNT 16
#define MAX_SM 16
#endif
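
// Illustrative note (not part of the original source): the runtime statically
// reserves OMP_STATE_COUNT state objects for each of up to MAX_SM
// multiprocessors, so e.g. on sm_70 and newer 32 * 84 = 2688 objects are
// preallocated in global memory.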

#define OMP_ACTIVE_PARALLEL_LEVEL 128

// Data sharing related quantities; these need to match what is used in the
// compiler.
enum DATA_SHARING_SIZES {
  // The maximum number of workers in a kernel (MAX_THREADS_PER_TEAM minus the
  // master warp).
  DS_Max_Worker_Threads = 992,
  // The size reserved for data in a shared memory slot.
  DS_Slot_Size = 256,
  // The slot size that should be reserved for a working warp.
  DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size,
  // The maximum number of warps in use.
  DS_Max_Warp_Number = 32,
  // The size of the preallocated shared memory buffer per team.
  DS_Shared_Memory_Size = 128,
};

// Split a 64-bit value into its low and high 32-bit halves.
INLINE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
}

// Combine two 32-bit halves into a 64-bit value.
INLINE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
  uint64_t val;
  asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
  return val;
}
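
// Usage sketch (illustrative only): __kmpc_impl_unpack and __kmpc_impl_pack
// are exact inverses, splitting a 64-bit value into 32-bit halves and back:
//   uint32_t lo, hi;
//   __kmpc_impl_unpack(UINT64_C(0x0123456789abcdef), lo, hi);
//   // lo == 0x89abcdef, hi == 0x01234567
//   uint64_t val = __kmpc_impl_pack(lo, hi); // val == 0x0123456789abcdef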

// Mask with all 32 lane bits set.
static const __kmpc_impl_lanemask_t __kmpc_impl_all_lanes =
    UINT32_C(0xffffffff);

// Mask of lanes in the warp with an id lower than the calling lane's.
INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
  __kmpc_impl_lanemask_t res;
  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(res));
  return res;
}

// Mask of lanes in the warp with an id higher than the calling lane's.
INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
  __kmpc_impl_lanemask_t res;
  asm("mov.u32 %0, %%lanemask_gt;" : "=r"(res));
  return res;
}
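
// Usage sketch (illustrative only): the rank of the calling lane among the
// currently active lanes of its warp can be computed as
//   __kmpc_impl_popc(__kmpc_impl_activemask() & __kmpc_impl_lanemask_lt());
// (__kmpc_impl_activemask and __kmpc_impl_popc are defined further below.)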

// Id of the multiprocessor (SM) the calling thread is running on.
INLINE uint32_t __kmpc_impl_smid() {
  uint32_t id;
  asm("mov.u32 %0, %%smid;" : "=r"(id));
  return id;
}
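
// Illustrative note: the SM id is commonly used to index per-SM data, e.g.
//   StateArray[__kmpc_impl_smid() % MAX_SM]
// where StateArray is a hypothetical per-SM state table.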

INLINE double __kmpc_impl_get_wtick() {
  // Timer precision is 1ns.
  return ((double)1E-9);
}

INLINE double __kmpc_impl_get_wtime() {
  // %globaltimer is a 64-bit, nanosecond-resolution global clock.
  unsigned long long nsecs;
  asm("mov.u64  %0, %%globaltimer;" : "=l"(nsecs));
  return (double)nsecs * __kmpc_impl_get_wtick();
}
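
// Usage sketch (illustrative only): timing a region of device code.
//   double Start = __kmpc_impl_get_wtime();
//   /* ... work ... */
//   double ElapsedSeconds = __kmpc_impl_get_wtime() - Start;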

// Find the position (1-based) of the least significant set bit; 0 if none.
INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __ffs(x); }

// Count the number of set bits.
INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __popc(x); }

template <typename T> INLINE T __kmpc_impl_min(T x, T y) {
  return min(x, y);
}

#ifndef CUDA_VERSION
#error CUDA_VERSION macro is undefined; it should be provided by cuda.h.
#endif

// In CUDA 9.0, __ballot(1) from CUDA 8.0 is replaced with __activemask().

INLINE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
#if CUDA_VERSION >= 9000
  return __activemask();
#else
  return __ballot(1);
#endif
}
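
// Usage sketch (illustrative only): inside divergent code, the returned mask
// can be fed to the *_sync operations below, e.g. to broadcast from the
// lowest active lane:
//   __kmpc_impl_lanemask_t Active = __kmpc_impl_activemask();
//   int32_t Leader = __kmpc_impl_ffs(Active) - 1; // lowest active lane id
//   Var = __kmpc_impl_shfl_sync(Active, Var, Leader);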

// In CUDA 9.0, the *_sync() versions take an extra 'mask' argument.

INLINE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
                                     int32_t SrcLane) {
#if CUDA_VERSION >= 9000
  return __shfl_sync(Mask, Var, SrcLane);
#else
  return __shfl(Var, SrcLane);
#endif // CUDA_VERSION
}

INLINE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
                                          int32_t Var, uint32_t Delta,
                                          int32_t Width) {
#if CUDA_VERSION >= 9000
  return __shfl_down_sync(Mask, Var, Delta, Width);
#else
  return __shfl_down(Var, Delta, Width);
#endif // CUDA_VERSION
}
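
// Usage sketch (illustrative only): a full-warp sum reduction built on
// __kmpc_impl_shfl_down_sync; after the loop, lane 0 holds the total.
//   for (uint32_t Delta = WARPSIZE / 2; Delta > 0; Delta /= 2)
//     Val += __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, Val, Delta,
//                                       WARPSIZE);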

INLINE void __kmpc_impl_syncthreads() {
  // Use the original __syncthreads if compiled by nvcc or Clang >= 9.0.
#if !defined(__clang__) || __clang_major__ >= 9
  __syncthreads();
#else
  asm volatile("bar.sync %0;" : : "r"(0) : "memory");
#endif // __clang__
}

INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
#if CUDA_VERSION >= 9000
  __syncwarp(Mask);
#else
  // In CUDA < 9.0 there is no need to sync threads in a warp: they execute in
  // lockstep.
#endif // CUDA_VERSION
}

INLINE void __kmpc_impl_named_sync(int barrier, uint32_t num_threads) {
  asm volatile("bar.sync %0, %1;"
               :
               : "r"(barrier), "r"(num_threads)
               : "memory");
}
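
// Usage sketch (illustrative only): the active threads of an L1 parallel
// region synchronize on the dedicated named barrier declared above, e.g.
//   __kmpc_impl_named_sync(L1_BARRIER, NumActiveThreads);
// where NumActiveThreads is a hypothetical count that must be a multiple of
// the warp size.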

INLINE void __kmpc_impl_threadfence(void) { __threadfence(); }
INLINE void __kmpc_impl_threadfence_block(void) { __threadfence_block(); }
INLINE void __kmpc_impl_threadfence_system(void) { __threadfence_system(); }

// Calls to the NVPTX layer (assuming 1D layout)
INLINE int GetThreadIdInBlock() { return threadIdx.x; }
INLINE int GetBlockIdInKernel() { return blockIdx.x; }
INLINE int GetNumberOfBlocksInKernel() { return gridDim.x; }
INLINE int GetNumberOfThreadsInBlock() { return blockDim.x; }
INLINE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
INLINE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
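
// Illustrative note: with the 1D layout assumed above, these decompose the
// thread id as
//   GetThreadIdInBlock() == GetWarpId() * WARPSIZE + GetLaneId()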

// Locks
EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock);
EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock);
EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock);
EXTERN void __kmpc_impl_unset_lock(omp_lock_t *lock);
EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock);

// Memory
INLINE void *__kmpc_impl_malloc(size_t x) { return malloc(x); }
INLINE void __kmpc_impl_free(void *x) { free(x); }
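
// Illustrative note: these forward to CUDA's device-side heap; the amount of
// memory available is bounded by the cudaLimitMallocHeapSize device limit
// configured on the host.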

#endif // _TARGET_IMPL_H_