/* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO LICENSEE:
 *
 * The source code and/or documentation ("Licensed Deliverables") are
 * subject to NVIDIA intellectual property rights under U.S. and
 * international Copyright laws.
 *
 * The Licensed Deliverables contained herein are PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and are being provided under the terms and
 * conditions of a form of NVIDIA software license agreement by and
 * between NVIDIA and Licensee ("License Agreement") or electronically
 * accepted by Licensee.  Notwithstanding any terms or conditions to
 * the contrary in the License Agreement, reproduction or disclosure
 * of the Licensed Deliverables to any third party without the express
 * written consent of NVIDIA is prohibited.
 *
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users.  These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
 * only as a commercial end item.  Consistent with 48 C.F.R. 12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */

/*
** Define: _CG_VERSION
*/
# define _CG_VERSION 1000

/*
** Define: _CG_ABI_VERSION
*/
# ifndef _CG_ABI_VERSION
#  define _CG_ABI_VERSION 1
# endif

/*
** Define: _CG_ABI_EXPERIMENTAL
** Desc: If enabled, sets all features enabled (ABI-breaking or experimental)
*/
# if defined(_CG_ABI_EXPERIMENTAL)
# endif

# define _CG_CONCAT_INNER(x, y) x ## y
# define _CG_CONCAT_OUTER(x, y) _CG_CONCAT_INNER(x, y)
# define _CG_NAMESPACE _CG_CONCAT_OUTER(__v, _CG_ABI_VERSION)

# define _CG_BEGIN_NAMESPACE \
    namespace cooperative_groups { namespace _CG_NAMESPACE {
# define _CG_END_NAMESPACE \
    }; using namespace _CG_NAMESPACE; };

# if !defined(_CG_STATIC_QUALIFIER)
#  define _CG_STATIC_QUALIFIER static __forceinline__ __device__
# endif
# if !defined(_CG_QUALIFIER)
#  define _CG_QUALIFIER __forceinline__ __device__
# endif

# if (__CUDA_ARCH__ >= 600) || !defined(__CUDA_ARCH__)
#  define _CG_HAS_GRID_GROUP
# endif
# if (__CUDA_ARCH__ >= 600) || !defined(__CUDA_ARCH__)
#  define _CG_HAS_MULTI_GRID_GROUP
# endif
# if (__CUDA_ARCH__ >= 700) || !defined(__CUDA_ARCH__)
#  define _CG_HAS_MATCH_COLLECTIVE
# endif

// Has __half and __half2.
// Only usable if you include the cuda_fp16.h extension, and
// _before_ including cooperative_groups.h
# ifdef __CUDA_FP16_TYPES_EXIST__
#  define _CG_HAS_FP16_COLLECTIVE
# endif

/*
** Define: _CG_DEBUG
** What: Enables various runtime safety checks
*/
#if defined(__CUDACC_DEBUG__) && !defined(_CG_DEBUG)
# define _CG_DEBUG 1
#endif

#if defined(_CG_DEBUG) && (_CG_DEBUG == 1) && !defined(NDEBUG)
# include <assert.h>
# define _CG_ASSERT(x) assert((x));
# define _CG_ABORT() assert(0);
#else
# define _CG_ASSERT(x)
# define _CG_ABORT() __trap();
#endif

_CG_BEGIN_NAMESPACE

namespace __internal {

    enum groupType {
        CoalescedTile,
        Coalesced,
        ThreadBlock,
        Grid,
        MultiGrid,
    };

#if defined(_CG_HAS_GRID_GROUP)

    namespace grid {

        _CG_STATIC_QUALIFIER unsigned long long get_intrinsic_handle()
        {
            return (cudaCGGetIntrinsicHandle(cudaCGScopeGrid));
        }

        // Grid-wide barrier via the runtime intrinsic (flags = 0)
        _CG_STATIC_QUALIFIER void sync(const unsigned long long handle)
        {
            cudaCGSynchronizeGrid(handle, 0);
        }

        // Total number of threads launched across the whole grid
        _CG_STATIC_QUALIFIER unsigned int size(const unsigned long long handle)
        {
            return (blockDim.z * gridDim.z) *
                   (blockDim.y * gridDim.y) *
                   (blockDim.x * gridDim.x);
        }

        // Linearized rank of the calling thread within the whole grid
        _CG_STATIC_QUALIFIER unsigned int thread_rank(const unsigned long long handle)
        {
            unsigned int blkIdx = ((blockIdx.z * gridDim.y * gridDim.x) +
                                   (blockIdx.y * gridDim.x) +
                                   blockIdx.x);
            return (blkIdx * (blockDim.x * blockDim.y * blockDim.z) +
                    ((threadIdx.z * blockDim.y * blockDim.x) +
                     (threadIdx.y * blockDim.x) +
                     threadIdx.x));
        }

        _CG_STATIC_QUALIFIER dim3 grid_dim()
        {
            return (dim3(gridDim.x, gridDim.y, gridDim.z));
        }
    };

#endif
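    // Usage sketch (illustrative only, not part of this header): the grid
    // helpers above back the public grid_group API. A kernel launched with
    // cudaLaunchCooperativeKernel can rank its threads across the whole grid
    // and synchronize at grid scope like this:
    //
    //     #include <cooperative_groups.h>
    //     namespace cg = cooperative_groups;
    //
    //     __global__ void step(float *buf, unsigned int n)
    //     {
    //         cg::grid_group g = cg::this_grid();
    //         unsigned int rank = g.thread_rank(); // same linearization as grid::thread_rank()
    //         if (rank < n)
    //             buf[rank] += 1.0f;
    //         g.sync();                            // grid-wide barrier, backed by grid::sync()
    //     }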
#if defined(_CG_HAS_MULTI_GRID_GROUP)

    namespace multi_grid {

        _CG_STATIC_QUALIFIER unsigned long long get_intrinsic_handle()
        {
            return (cudaCGGetIntrinsicHandle(cudaCGScopeMultiGrid));
        }

        // Barrier across all participating grids; the error status is ignored
        _CG_STATIC_QUALIFIER void sync(const unsigned long long handle)
        {
            cudaCGSynchronize(handle, 0);
        }

        // Total number of threads across all participating grids
        _CG_STATIC_QUALIFIER unsigned int size(const unsigned long long handle)
        {
            unsigned int numThreads = 0;
            cudaCGGetSize(&numThreads, NULL, handle);
            return numThreads;
        }

        _CG_STATIC_QUALIFIER unsigned int thread_rank(const unsigned long long handle)
        {
            unsigned int threadRank = 0;
            cudaCGGetRank(&threadRank, NULL, handle);
            return threadRank;
        }

        _CG_STATIC_QUALIFIER unsigned int grid_rank(const unsigned long long handle)
        {
            unsigned int gridRank = 0;
            cudaCGGetRank(NULL, &gridRank, handle);
            return gridRank;
        }

        _CG_STATIC_QUALIFIER unsigned int num_grids(const unsigned long long handle)
        {
            unsigned int numGrids = 0;
            cudaCGGetSize(NULL, &numGrids, handle);
            return numGrids;
        }

    };

#endif

    namespace cta {

        // Block-wide barrier (named barrier 0)
        _CG_STATIC_QUALIFIER void sync()
        {
            __barrier_sync(0);
        }

        _CG_STATIC_QUALIFIER unsigned int size()
        {
            return (blockDim.x * blockDim.y * blockDim.z);
        }

        // Linearized rank of the calling thread within its block
        _CG_STATIC_QUALIFIER unsigned int thread_rank()
        {
            return ((threadIdx.z * blockDim.y * blockDim.x) +
                    (threadIdx.y * blockDim.x) +
                    threadIdx.x);
        }

        _CG_STATIC_QUALIFIER dim3 group_index()
        {
            return (dim3(blockIdx.x, blockIdx.y, blockIdx.z));
        }

        _CG_STATIC_QUALIFIER dim3 thread_index()
        {
            return (dim3(threadIdx.x, threadIdx.y, threadIdx.z));
        }

        _CG_STATIC_QUALIFIER dim3 block_dim()
        {
            return (dim3(blockDim.x, blockDim.y, blockDim.z));
        }

    };

    // Lane ID within the warp, read from the %laneid PTX special register
    _CG_STATIC_QUALIFIER unsigned int laneid()
    {
        unsigned int laneid;
        asm volatile("mov.u32 %0, %%laneid;" : "=r"(laneid));
        return laneid;
    }

    // Warp width, read from the WARP_SZ PTX constant
    _CG_STATIC_QUALIFIER unsigned int warpsz()
    {
        unsigned int warpSize;
        asm volatile("mov.u32 %0, WARP_SZ;" : "=r"(warpSize));
        return warpSize;
    }

    // Mask with only the calling lane's bit set
    _CG_STATIC_QUALIFIER unsigned int lanemask32_eq()
    {
        unsigned int lanemask32_eq;
        asm volatile("mov.u32 %0, %%lanemask_eq;" : "=r"(lanemask32_eq));
        return (lanemask32_eq);
    }

    // Mask of all lanes with a lane ID lower than the calling lane's
    _CG_STATIC_QUALIFIER unsigned int lanemask32_lt()
    {
        unsigned int lanemask32_lt;
        asm volatile("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask32_lt));
        return (lanemask32_lt);
    }

    _CG_STATIC_QUALIFIER void abort()
    {
        _CG_ABORT();
    }

}; // !Namespace internal

_CG_END_NAMESPACE
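
// Usage sketch (illustrative only, not part of this header): lanemask32_lt()
// returns a mask of lanes with a lower lane ID, so the population count of
// (active_mask & lanemask32_lt()) gives a thread's rank among the currently
// active lanes -- the same trick a coalesced group's thread_rank() relies on.
// A minimal device helper, assuming CUDA 9+ for __activemask():
//
//     __device__ unsigned int rank_among_active()
//     {
//         unsigned int active = __activemask(); // lanes currently active in this warp
//         return __popc(active & cooperative_groups::__internal::lanemask32_lt());
//     }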