/* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO LICENSEE:
 *
 * The source code and/or documentation ("Licensed Deliverables") are
 * subject to NVIDIA intellectual property rights under U.S. and
 * international Copyright laws.
 *
 * The Licensed Deliverables contained herein are PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and are being provided under the terms and
 * conditions of a form of NVIDIA software license agreement by and
 * between NVIDIA and Licensee ("License Agreement") or electronically
 * accepted by Licensee.  Notwithstanding any terms or conditions to
 * the contrary in the License Agreement, reproduction or disclosure
 * of the Licensed Deliverables to any third party without the express
 * written consent of NVIDIA is prohibited.
 *
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users.  These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
 * only as a commercial end item.  Consistent with 48 C.F.R. 12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */

/*
** Define: _CG_VERSION
*/
# define _CG_VERSION 1000

/*
** Define: _CG_ABI_VERSION
*/
# ifndef _CG_ABI_VERSION
#  define _CG_ABI_VERSION 1
# endif

/*
** Define: _CG_ABI_EXPERIMENTAL
** Desc: If defined, enables all ABI-breaking and experimental features
**       (none are gated in this version, so the guard below is empty).
*/
# if defined(_CG_ABI_EXPERIMENTAL)
# endif

# define _CG_CONCAT_INNER(x, y) x ## y
# define _CG_CONCAT_OUTER(x, y) _CG_CONCAT_INNER(x, y)
# define _CG_NAMESPACE _CG_CONCAT_OUTER(__v, _CG_ABI_VERSION)

# define _CG_BEGIN_NAMESPACE \
    namespace cooperative_groups { namespace _CG_NAMESPACE {
# define _CG_END_NAMESPACE \
    }; using namespace _CG_NAMESPACE; };
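
/*
** With the default _CG_ABI_VERSION of 1, the macros above expand so that
** every declaration lands in an ABI-versioned inner namespace. A sketch of
** the expansion (illustrative only, not part of this header):
**
**   namespace cooperative_groups { namespace __v1 {
**       // ... declarations ...
**   }; using namespace __v1; };
**
** The using-directive re-exports the versioned symbols as
** cooperative_groups::*, so user code never spells out __v1.
*/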

# if !defined(_CG_STATIC_QUALIFIER)
#  define _CG_STATIC_QUALIFIER static __forceinline__ __device__
# endif
# if !defined(_CG_QUALIFIER)
#  define _CG_QUALIFIER __forceinline__ __device__
# endif
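
/*
** For example (illustrative only), a helper declared as
**
**   _CG_STATIC_QUALIFIER unsigned int size();
**
** compiles, by default, as
**
**   static __forceinline__ __device__ unsigned int size();
**
** i.e. an always-inlined, device-only function with internal linkage.
** Either qualifier can be overridden by defining the corresponding macro
** before including this header.
*/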

# if (__CUDA_ARCH__ >= 600) || !defined(__CUDA_ARCH__)
#  define _CG_HAS_GRID_GROUP
# endif
# if (__CUDA_ARCH__ >= 600) || !defined(__CUDA_ARCH__)
#  define _CG_HAS_MULTI_GRID_GROUP
# endif
# if (__CUDA_ARCH__ >= 700) || !defined(__CUDA_ARCH__)
#  define _CG_HAS_MATCH_COLLECTIVE
# endif
// Has __half and __half2 collectives.
// Only available if cuda_fp16.h (which defines __CUDA_FP16_TYPES_EXIST__)
// is included _before_ cooperative_groups.h.
# ifdef __CUDA_FP16_TYPES_EXIST__
#  define _CG_HAS_FP16_COLLECTIVE
# endif
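
/*
** A minimal sketch of the required include order (illustrative only):
**
**   #include <cuda_fp16.h>           // defines __CUDA_FP16_TYPES_EXIST__
**   #include <cooperative_groups.h>  // _CG_HAS_FP16_COLLECTIVE is now set
**
** Reversing the two includes leaves _CG_HAS_FP16_COLLECTIVE undefined and
** the __half/__half2 collective overloads unavailable.
*/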

/*
** Define: _CG_DEBUG
** Desc: Enables various runtime safety checks
*/
#if defined(__CUDACC_DEBUG__) && !defined(_CG_DEBUG)
# define _CG_DEBUG 1
#endif

#if defined(_CG_DEBUG) && (_CG_DEBUG == 1) && !defined(NDEBUG)
# include <assert.h>
# define _CG_ASSERT(x) assert((x));
# define _CG_ABORT() assert(0);
#else
# define _CG_ASSERT(x)
# define _CG_ABORT() __trap();
#endif
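
/*
** _CG_DEBUG is turned on automatically by device-debug builds, since
** nvcc's -G flag defines __CUDACC_DEBUG__; it can also be forced from the
** command line. A sketch (illustrative only):
**
**   nvcc -G kernel.cu              // debug build: checks enabled
**   nvcc -D_CG_DEBUG=1 kernel.cu   // release build with checks forced on
**
** With checks enabled, _CG_ASSERT(x) expands to assert((x)) and
** _CG_ABORT() to assert(0); otherwise _CG_ABORT() falls back to __trap().
*/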

_CG_BEGIN_NAMESPACE

namespace __internal {

    enum groupType {
        CoalescedTile,
        Coalesced,
        ThreadBlock,
        Grid,
        MultiGrid,
    };

#if defined(_CG_HAS_GRID_GROUP)

    namespace grid {

        _CG_STATIC_QUALIFIER unsigned long long get_intrinsic_handle()
        {
            return (cudaCGGetIntrinsicHandle(cudaCGScopeGrid));
        }

        _CG_STATIC_QUALIFIER void sync(const unsigned long long handle)
        {
            cudaCGSynchronizeGrid(handle, 0);
        }

        // Total number of threads in the grid. The handle is not used by
        // the grid-scope implementation; it is kept for interface symmetry
        // with multi_grid::size().
        _CG_STATIC_QUALIFIER unsigned int size(const unsigned long long handle)
        {
            return (blockDim.z * gridDim.z) *
                (blockDim.y * gridDim.y) *
                (blockDim.x * gridDim.x);
        }

        // Linearize (blockIdx, threadIdx) into a single grid-wide rank:
        // rank = linear_block_index * threads_per_block + intra_block_rank.
        _CG_STATIC_QUALIFIER unsigned int thread_rank(const unsigned long long handle)
        {
            unsigned int blkIdx = ((blockIdx.z * gridDim.y * gridDim.x) +
                               (blockIdx.y * gridDim.x) +
                               blockIdx.x);
            return (blkIdx * (blockDim.x * blockDim.y * blockDim.z) +
                    ((threadIdx.z * blockDim.y * blockDim.x) +
                     (threadIdx.y * blockDim.x) +
                     threadIdx.x));
        }
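
        /*
        ** Worked example (illustrative only): for a launch with
        ** gridDim = (2,1,1) and blockDim = (32,1,1), the thread at
        ** blockIdx.x == 1, threadIdx.x == 3 has blkIdx = 1, so
        ** rank = 1 * 32 + 3 = 35, matching the x-fastest row-major
        ** ordering used above.
        */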

        _CG_STATIC_QUALIFIER dim3 grid_dim()
        {
            return (dim3(gridDim.x, gridDim.y, gridDim.z));
        }
    };

#endif

#if defined(_CG_HAS_MULTI_GRID_GROUP)

    namespace multi_grid {

        _CG_STATIC_QUALIFIER unsigned long long get_intrinsic_handle()
        {
            return (cudaCGGetIntrinsicHandle(cudaCGScopeMultiGrid));
        }

        _CG_STATIC_QUALIFIER void sync(const unsigned long long handle)
        {
            // The return status is discarded; cast to void to avoid an
            // unused-variable warning.
            (void)cudaCGSynchronize(handle, 0);
        }

        _CG_STATIC_QUALIFIER unsigned int size(const unsigned long long handle)
        {
            unsigned int numThreads = 0;
            cudaCGGetSize(&numThreads, NULL, handle);
            return numThreads;
        }

        _CG_STATIC_QUALIFIER unsigned int thread_rank(const unsigned long long handle)
        {
            unsigned int threadRank = 0;
            cudaCGGetRank(&threadRank, NULL, handle);
            return threadRank;
        }

        _CG_STATIC_QUALIFIER unsigned int grid_rank(const unsigned long long handle)
        {
            unsigned int gridRank = 0;
            cudaCGGetRank(NULL, &gridRank, handle);
            return gridRank;
        }

        _CG_STATIC_QUALIFIER unsigned int num_grids(const unsigned long long handle)
        {
            unsigned int numGrids = 0;
            cudaCGGetSize(NULL, &numGrids, handle);
            return numGrids;
        }

    };

#endif

    namespace cta {

        _CG_STATIC_QUALIFIER void sync()
        {
            __barrier_sync(0);
        }

        _CG_STATIC_QUALIFIER unsigned int size()
        {
            return (blockDim.x * blockDim.y * blockDim.z);
        }

        _CG_STATIC_QUALIFIER unsigned int thread_rank()
        {
            return ((threadIdx.z * blockDim.y * blockDim.x) +
                    (threadIdx.y * blockDim.x) +
                    threadIdx.x);
        }

        _CG_STATIC_QUALIFIER dim3 group_index()
        {
            return (dim3(blockIdx.x, blockIdx.y, blockIdx.z));
        }

        _CG_STATIC_QUALIFIER dim3 thread_index()
        {
            return (dim3(threadIdx.x, threadIdx.y, threadIdx.z));
        }

        _CG_STATIC_QUALIFIER dim3 block_dim()
        {
            return (dim3(blockDim.x, blockDim.y, blockDim.z));
        }

    };

    _CG_STATIC_QUALIFIER unsigned int laneid()
    {
        unsigned int laneid;
        asm volatile("mov.u32 %0, %%laneid;" : "=r"(laneid));
        return laneid;
    }

    _CG_STATIC_QUALIFIER unsigned int warpsz()
    {
        unsigned int warpSize;
        asm volatile("mov.u32 %0, WARP_SZ;" : "=r"(warpSize));
        return warpSize;
    }

    // Mask with exactly the calling lane's bit set.
    _CG_STATIC_QUALIFIER unsigned int lanemask32_eq()
    {
        unsigned int lanemask32_eq;
        asm volatile("mov.u32 %0, %%lanemask_eq;" : "=r"(lanemask32_eq));
        return (lanemask32_eq);
    }

    // Mask with the bits of all lanes below the calling lane set.
    _CG_STATIC_QUALIFIER unsigned int lanemask32_lt()
    {
        unsigned int lanemask32_lt;
        asm volatile("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask32_lt));
        return (lanemask32_lt);
    }
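
    /*
    ** Worked example (illustrative only): for the thread in lane 5,
    **   laneid()        == 5
    **   lanemask32_eq() == 0x00000020   (only bit 5 set)
    **   lanemask32_lt() == 0x0000001F   (bits 0-4 set)
    ** so __popc(lanemask32_lt()) equals laneid(), and ANDing the mask
    ** with an active-lane mask first yields a rank among only the
    ** active lanes.
    */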

    _CG_STATIC_QUALIFIER void abort()
    {
        _CG_ABORT();
    }

}; // !Namespace internal

_CG_END_NAMESPACE