/*
 * Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO LICENSEE:
 *
 * This source code and/or documentation ("Licensed Deliverables") are
 * subject to NVIDIA intellectual property rights under U.S. and
 * international Copyright laws.
 *
 * These Licensed Deliverables contained herein is PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 * conditions of a form of NVIDIA software license agreement by and
 * between NVIDIA and Licensee ("License Agreement") or electronically
 * accepted by Licensee.  Notwithstanding any terms or conditions to
 * the contrary in the License Agreement, reproduction or disclosure
 * of the Licensed Deliverables to any third party without the express
 * written consent of NVIDIA is prohibited.
 *
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users.  These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */

#ifndef _COOPERATIVE_GROUPS_H_
# define _COOPERATIVE_GROUPS_H_

#if defined(__cplusplus) && defined(__CUDACC__)

# include "cooperative_groups_helpers.h"

_CG_BEGIN_NAMESPACE

/**
 * class thread_group;
 *
 * Generic thread group type, into which all groups are convertible.
 * It acts as a container for all storage necessary for the derived groups,
 * and will dispatch the API calls to the correct derived group. This means
 * that all derived groups must implement the same interface as thread_group.
 */
class thread_group
{
    friend _CG_QUALIFIER thread_group this_thread();
    friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
    friend class thread_block;

 protected:
    union __align__(8) {
        unsigned int type : 8;
        struct {
            unsigned int type : 8;
            unsigned int size : 24;
            unsigned int mask;
        } coalesced;
        struct {
            void* ptr[2];
        } buffer;
    } _data;

    _CG_QUALIFIER thread_group operator=(const thread_group& src);
    _CG_QUALIFIER thread_group(__internal::groupType type) {
        _data.type = type;
    }

#if __cplusplus >= 201103L
    static_assert(sizeof(_data) == 16, "Failed size check");
#endif

public:
    _CG_QUALIFIER unsigned int size() const;
    _CG_QUALIFIER unsigned int thread_rank() const;
    _CG_QUALIFIER void sync() const;
};
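
/*
 * Example (illustrative sketch, not part of the shipped header): because every
 * group type converts to thread_group, a __device__ helper can accept any group
 * and rely only on the generic size()/thread_rank()/sync() interface. The
 * helper name and workspace layout below are hypothetical; workspace is assumed
 * to hold at least g.size() elements visible to all members of the group.
 *
 *   __device__ int sum_over_group(cooperative_groups::thread_group g,
 *                                 int *workspace, int value) {
 *       unsigned int lane = g.thread_rank();
 *       workspace[lane] = value;    // each member contributes one value
 *       g.sync();                   // wait for all members of the group
 *       int total = 0;
 *       for (unsigned int i = 0; i < g.size(); i++)
 *           total += workspace[i];
 *       g.sync();                   // keep workspace stable until all have read it
 *       return total;
 *   }
 */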

/**
 * thread_group this_thread()
 *
 * Constructs a generic thread_group containing only the calling thread
 */
_CG_QUALIFIER thread_group this_thread()
{
    thread_group g = thread_group(__internal::Coalesced);
    g._data.coalesced.mask = __internal::lanemask32_eq();
    g._data.coalesced.size = 1;
    return (g);
}

#if defined(_CG_HAS_MULTI_GRID_GROUP)

/**
 * class multi_grid_group;
 *
 * Threads within this group are guaranteed to be co-resident on the
 * same system, across multiple devices within the same launched kernels.
 * To use this group, the kernel must have been launched with
 * cuLaunchCooperativeKernelMultiDevice (or the CUDA Runtime equivalent),
 * and the device must support it (queryable device attribute).
 *
 * Constructed via this_multi_grid();
 */
class multi_grid_group
{
    friend _CG_QUALIFIER multi_grid_group this_multi_grid();

    struct __align__(8) {
        unsigned long long handle;
        unsigned int size;
        unsigned int rank;
    } _data;

#if __cplusplus >= 201103L
    static_assert(sizeof(_data) == 16, "Failed size check");
#endif

public:
    _CG_QUALIFIER multi_grid_group() {
        _data.handle = __internal::multi_grid::get_intrinsic_handle();
        _data.size = __internal::multi_grid::size(_data.handle);
        _data.rank = __internal::multi_grid::thread_rank(_data.handle);
    }

    _CG_QUALIFIER bool is_valid() const {
        return (_data.handle != 0);
    }

    _CG_QUALIFIER void sync() const {
        _CG_ASSERT(is_valid());
        __internal::multi_grid::sync(_data.handle);
    }

    _CG_QUALIFIER unsigned int size() const {
        _CG_ASSERT(is_valid());
        return (_data.size);
    }

    _CG_QUALIFIER unsigned int thread_rank() const {
        _CG_ASSERT(is_valid());
        return (_data.rank);
    }

    _CG_QUALIFIER unsigned int grid_rank() const {
        _CG_ASSERT(is_valid());
        return (__internal::multi_grid::grid_rank(_data.handle));
    }

    _CG_QUALIFIER unsigned int num_grids() const {
        _CG_ASSERT(is_valid());
        return (__internal::multi_grid::num_grids(_data.handle));
    }
};

/**
 * multi_grid_group this_multi_grid()
 *
 * Constructs a multi_grid_group
 */
_CG_QUALIFIER multi_grid_group this_multi_grid()
{
    return (multi_grid_group());
}
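
/*
 * Example (illustrative sketch, not part of the shipped header): synchronizing
 * every device participating in a multi-device cooperative launch. The kernel
 * below is hypothetical; it assumes the launch was performed with
 * cudaLaunchCooperativeKernelMultiDevice (or the driver equivalent) on devices
 * that report the cooperative multi-device launch attribute.
 *
 *   __global__ void multi_device_step(float *local_data) {
 *       cooperative_groups::multi_grid_group mg = cooperative_groups::this_multi_grid();
 *       // ... each grid works on its device-local portion of local_data ...
 *       mg.sync();   // all threads on all participating devices reach this point
 *       // ... safe to consume results produced by the other grids ...
 *   }
 */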

#endif

#if defined(_CG_HAS_GRID_GROUP)

/**
 * class grid_group;
 *
 * Threads within this group are guaranteed to be co-resident on the
 * same device within the same launched kernel. To use this group, the kernel
 * must have been launched with cuLaunchCooperativeKernel (or the CUDA Runtime equivalent),
 * and the device must support it (queryable device attribute).
 *
 * Constructed via this_grid();
 */
class grid_group
{
    friend _CG_QUALIFIER grid_group this_grid();

    struct __align__(8) {
        unsigned long long handle;
        unsigned int size;
        unsigned int rank;
    } _data;

#if __cplusplus >= 201103L
    static_assert(sizeof(_data) == 16, "Failed size check");
#endif

 public:
    _CG_QUALIFIER grid_group() {
        _data.handle = (__internal::grid::get_intrinsic_handle());
        _data.size = __internal::grid::size(_data.handle);
        _data.rank = __internal::grid::thread_rank(_data.handle);
    }

    _CG_QUALIFIER bool is_valid() const {
        return (_data.handle != 0);
    }

    _CG_QUALIFIER void sync() const {
        _CG_ASSERT(is_valid());
        __internal::grid::sync(_data.handle);
    }

    _CG_QUALIFIER unsigned int size() const {
        _CG_ASSERT(is_valid());
        return (_data.size);
    }

    _CG_QUALIFIER unsigned int thread_rank() const {
        _CG_ASSERT(is_valid());
        return (_data.rank);
    }

    _CG_QUALIFIER dim3 group_dim() const {
        _CG_ASSERT(is_valid());
        return (__internal::grid::grid_dim());
    }

};

/**
 * grid_group this_grid()
 *
 * Constructs a grid_group
 */
_CG_QUALIFIER grid_group this_grid()
{
    return (grid_group());
}
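
/*
 * Example (illustrative sketch, not part of the shipped header): a kernel that
 * needs every thread in the grid to finish one phase before any thread starts
 * the next. The kernel name is hypothetical; it must be launched with
 * cudaLaunchCooperativeKernel on a device reporting the cooperative launch
 * attribute, otherwise this_grid() is not valid.
 *
 *   __global__ void two_phase(float *buf, unsigned int n) {
 *       cooperative_groups::grid_group grid = cooperative_groups::this_grid();
 *       for (unsigned int i = grid.thread_rank(); i < n; i += grid.size())
 *           buf[i] *= 2.0f;          // phase 1: grid-stride update
 *       grid.sync();                 // whole-grid barrier
 *       // phase 2 may now read any element written in phase 1
 *   }
 */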

#endif

/**
 * class thread_block
 *
 * Every GPU kernel is executed by a grid of thread blocks, and threads within
 * each block are guaranteed to reside on the same streaming multiprocessor.
 * A thread_block represents a thread block whose dimensions are not known until runtime.
 *
 * Constructed via this_thread_block();
 */
class thread_block : public thread_group
{
    friend _CG_QUALIFIER thread_block this_thread_block();
    friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
    friend _CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz);

    _CG_QUALIFIER thread_block() : thread_group(__internal::ThreadBlock) {
    }

    // Internal Use
    _CG_QUALIFIER thread_group _get_tiled_threads(unsigned int tilesz) const {
        const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);

        // Invalid, immediately fail
        if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) {
            __internal::abort();
            return (thread_block());
        }

        unsigned int mask;
        unsigned int base_offset = thread_rank() & (~(tilesz - 1));
        unsigned int masklength = min(size() - base_offset, tilesz);

        mask = (unsigned int)(-1) >> (32 - masklength);
        mask <<= (__internal::laneid() & ~(tilesz - 1));
        thread_group tile = thread_group(__internal::CoalescedTile);
        tile._data.coalesced.mask = mask;
        tile._data.coalesced.size = __popc(mask);
        return (tile);
    }

 public:
    _CG_QUALIFIER void sync() const {
        __internal::cta::sync();
    }

    _CG_QUALIFIER unsigned int size() const {
        return (__internal::cta::size());
    }

    _CG_QUALIFIER unsigned int thread_rank() const {
        return (__internal::cta::thread_rank());
    }

    // Additional functionality exposed by the group
    _CG_QUALIFIER dim3 group_index() const {
        return (__internal::cta::group_index());
    }

    _CG_QUALIFIER dim3 thread_index() const {
        return (__internal::cta::thread_index());
    }

    _CG_QUALIFIER dim3 group_dim() const {
        return (__internal::cta::block_dim());
    }

};

/**
 * thread_block this_thread_block()
 *
 * Constructs a thread_block group
 */
_CG_QUALIFIER thread_block this_thread_block()
{
    return (thread_block());
}
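
/*
 * Example (illustrative sketch, not part of the shipped header): the common
 * pattern of replacing a bare __syncthreads() with an explicit group object.
 * The kernel name and shared-memory size below are hypothetical and assume a
 * one-dimensional block of 256 threads.
 *
 *   __global__ void block_sum(const int *in, int *out) {
 *       __shared__ int scratch[256];
 *       cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
 *       scratch[block.thread_rank()] = in[blockIdx.x * blockDim.x + block.thread_rank()];
 *       block.sync();                        // equivalent to __syncthreads()
 *       if (block.thread_rank() == 0) {
 *           int total = 0;
 *           for (unsigned int i = 0; i < block.size(); i++)
 *               total += scratch[i];
 *           out[blockIdx.x] = total;
 *       }
 *   }
 */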

/**
 * class coalesced_group
 *
 * A group representing the current set of converged threads in a warp.
 * The size of the group is not guaranteed; coalesced_threads() may return
 * a group of only one thread (the caller).
 *
 * This group exposes warp-synchronous builtins.
 * Constructed via coalesced_threads();
 */
class coalesced_group : public thread_group
{
    friend _CG_QUALIFIER coalesced_group coalesced_threads();
    friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
    friend _CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz);

    _CG_QUALIFIER unsigned int _packLanes(unsigned laneMask) const {
        unsigned int member_pack = 0;
        unsigned int member_rank = 0;
        for (int bit_idx = 0; bit_idx < 32; bit_idx++) {
            unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx);
            if (lane_bit) {
                if (laneMask & lane_bit)
                    member_pack |= 1 << member_rank;
                member_rank++;
            }
        }
        return (member_pack);
    }

    // Internal Use
    _CG_QUALIFIER coalesced_group _get_tiled_threads(unsigned int tilesz) const {
        const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);

        // Invalid, immediately fail
        if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) {
            __internal::abort();
            return (coalesced_group(0));
        }
        if (size() <= tilesz) {
            return (*this);
        }

        if ((_data.type == __internal::CoalescedTile) && pow2_tilesz) {
            unsigned int base_offset = (thread_rank() & (~(tilesz - 1)));
            unsigned int masklength = min(size() - base_offset, tilesz);
            unsigned int mask = (unsigned int)(-1) >> (32 - masklength);

            mask <<= (__internal::laneid() & ~(tilesz - 1));
            coalesced_group coalesced_tile = coalesced_group(mask);
            coalesced_tile._data.type = __internal::CoalescedTile;
            return (coalesced_tile);
        }
        else if ((_data.type == __internal::Coalesced) && pow2_tilesz) {
            unsigned int mask = 0;
            unsigned int member_rank = 0;
            int seen_lanes = (thread_rank() / tilesz) * tilesz;
            for (unsigned int bit_idx = 0; bit_idx < 32; bit_idx++) {
                unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx);
                if (lane_bit) {
                    if (seen_lanes <= 0 && member_rank < tilesz) {
                        mask |= lane_bit;
                        member_rank++;
                    }
                    seen_lanes--;
                }
            }
            return (coalesced_group(mask));
        }
        else {
            // None in _CG_VERSION 1000
            __internal::abort();
        }

        return (coalesced_group(0));
    }

 protected:
    // Construct a group from scratch (coalesced_threads)
    _CG_QUALIFIER coalesced_group(unsigned int mask) : thread_group(__internal::Coalesced) {
        _data.coalesced.mask = mask;
        _data.coalesced.size = __popc(mask);
    }

 public:
    _CG_QUALIFIER unsigned int size() const {
        return (_data.coalesced.size);
    }
    _CG_QUALIFIER unsigned int thread_rank() const {
        return (__popc(_data.coalesced.mask & __internal::lanemask32_lt()));
    }
    _CG_QUALIFIER void sync() const {
        __syncwarp(_data.coalesced.mask);
    }

#define COALESCED_SHFL_FUNCTION(type)                                   \
    _CG_QUALIFIER type shfl(type var, unsigned int src_rank) const {    \
        unsigned int lane = (src_rank == 0) ? __ffs(_data.coalesced.mask) - 1 : \
            (size() == 32) ? src_rank : __fns(_data.coalesced.mask, 0, (src_rank + 1)); \
        return (__shfl_sync(_data.coalesced.mask, var, lane, 32));      \
    }

#define COALESCED_SHFL_UP_FUNCTION(type)                                \
    _CG_QUALIFIER type shfl_up(type var, int delta) const {             \
        if (size() == 32) {                                             \
            return (__shfl_up_sync(0xFFFFFFFF, var, delta, 32));        \
        }                                                               \
        unsigned lane = __fns(_data.coalesced.mask, __internal::laneid(), -(delta + 1)); \
        if (lane >= 32) lane = __internal::laneid();                    \
        return (__shfl_sync(_data.coalesced.mask, var, lane, 32));      \
    }

#define COALESCED_SHFL_DOWN_FUNCTION(type)                              \
    _CG_QUALIFIER type shfl_down(type var, int delta) const {           \
        if (size() == 32) {                                             \
            return (__shfl_down_sync(0xFFFFFFFF, var, delta, 32));      \
        }                                                               \
        unsigned int lane = __fns(_data.coalesced.mask, __internal::laneid(), delta + 1); \
        if (lane >= 32) lane = __internal::laneid();                    \
        return (__shfl_sync(_data.coalesced.mask, var, lane, 32));      \
    }

    COALESCED_SHFL_FUNCTION(int);
    COALESCED_SHFL_FUNCTION(unsigned int);
    COALESCED_SHFL_FUNCTION(long);
    COALESCED_SHFL_FUNCTION(unsigned long);
    COALESCED_SHFL_FUNCTION(long long);
    COALESCED_SHFL_FUNCTION(unsigned long long);
    COALESCED_SHFL_FUNCTION(float);
    COALESCED_SHFL_FUNCTION(double);

    COALESCED_SHFL_UP_FUNCTION(int);
    COALESCED_SHFL_UP_FUNCTION(unsigned int);
    COALESCED_SHFL_UP_FUNCTION(long);
    COALESCED_SHFL_UP_FUNCTION(unsigned long);
    COALESCED_SHFL_UP_FUNCTION(long long);
    COALESCED_SHFL_UP_FUNCTION(unsigned long long);
    COALESCED_SHFL_UP_FUNCTION(float);
    COALESCED_SHFL_UP_FUNCTION(double);

    COALESCED_SHFL_DOWN_FUNCTION(int);
    COALESCED_SHFL_DOWN_FUNCTION(unsigned int);
    COALESCED_SHFL_DOWN_FUNCTION(long);
    COALESCED_SHFL_DOWN_FUNCTION(unsigned long);
    COALESCED_SHFL_DOWN_FUNCTION(long long);
    COALESCED_SHFL_DOWN_FUNCTION(unsigned long long);
    COALESCED_SHFL_DOWN_FUNCTION(float);
    COALESCED_SHFL_DOWN_FUNCTION(double);

# ifdef _CG_HAS_FP16_COLLECTIVE
    COALESCED_SHFL_FUNCTION(__half);
    COALESCED_SHFL_UP_FUNCTION(__half);
    COALESCED_SHFL_DOWN_FUNCTION(__half);

    COALESCED_SHFL_FUNCTION(__half2);
    COALESCED_SHFL_UP_FUNCTION(__half2);
    COALESCED_SHFL_DOWN_FUNCTION(__half2);
# endif

#undef COALESCED_SHFL_FUNCTION
#undef COALESCED_SHFL_UP_FUNCTION
#undef COALESCED_SHFL_DOWN_FUNCTION

    _CG_QUALIFIER int any(int predicate) const {
        return (__ballot_sync(_data.coalesced.mask, predicate) != 0);
    }
    _CG_QUALIFIER int all(int predicate) const {
        return (__ballot_sync(_data.coalesced.mask, predicate) == _data.coalesced.mask);
    }
    _CG_QUALIFIER unsigned int ballot(int predicate) const {
        if (size() == 32) {
            return (__ballot_sync(0xFFFFFFFF, predicate));
        }
        unsigned int lane_ballot = __ballot_sync(_data.coalesced.mask, predicate);
        return (_packLanes(lane_ballot));
    }

#ifdef _CG_HAS_MATCH_COLLECTIVE

# define COALESCED_MATCH_ANY_FUNCTION(type)                             \
    _CG_QUALIFIER unsigned int match_any(type val) const {              \
        if (size() == 32) {                                             \
            return (__match_any_sync(0xFFFFFFFF, val));                 \
        }                                                               \
        unsigned int lane_match = __match_any_sync(_data.coalesced.mask, val); \
        return (_packLanes(lane_match));                                \
    }
# define COALESCED_MATCH_ALL_FUNCTION(type)                             \
    _CG_QUALIFIER unsigned int match_all(type val, int &pred) const {   \
        if (size() == 32) {                                             \
            return (__match_all_sync(0xFFFFFFFF, val, &pred));          \
        }                                                               \
        unsigned int lane_match = __match_all_sync(_data.coalesced.mask, val, &pred); \
        return (_packLanes(lane_match));                                \
    }

    COALESCED_MATCH_ANY_FUNCTION(int);
    COALESCED_MATCH_ANY_FUNCTION(unsigned int);
    COALESCED_MATCH_ANY_FUNCTION(long);
    COALESCED_MATCH_ANY_FUNCTION(unsigned long);
    COALESCED_MATCH_ANY_FUNCTION(long long);
    COALESCED_MATCH_ANY_FUNCTION(unsigned long long);
    COALESCED_MATCH_ANY_FUNCTION(float);
    COALESCED_MATCH_ANY_FUNCTION(double);

    COALESCED_MATCH_ALL_FUNCTION(int);
    COALESCED_MATCH_ALL_FUNCTION(unsigned int);
    COALESCED_MATCH_ALL_FUNCTION(long);
    COALESCED_MATCH_ALL_FUNCTION(unsigned long);
    COALESCED_MATCH_ALL_FUNCTION(long long);
    COALESCED_MATCH_ALL_FUNCTION(unsigned long long);
    COALESCED_MATCH_ALL_FUNCTION(float);
    COALESCED_MATCH_ALL_FUNCTION(double);

# undef COALESCED_MATCH_ANY_FUNCTION
# undef COALESCED_MATCH_ALL_FUNCTION

#endif /* !_CG_HAS_MATCH_COLLECTIVE */

};

_CG_QUALIFIER coalesced_group coalesced_threads()
{
    return (coalesced_group(__activemask()));
}
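
/*
 * Example (illustrative sketch, not part of the shipped header): opportunistic
 * warp-level aggregation inside a divergent branch. Only the threads that took
 * the branch are active, so coalesced_threads() captures exactly that set, and
 * one member performs a single atomic on behalf of the whole group. The
 * function name is hypothetical.
 *
 *   __device__ int aggregated_increment(int *counter) {
 *       cooperative_groups::coalesced_group active = cooperative_groups::coalesced_threads();
 *       int base = 0;
 *       if (active.thread_rank() == 0)
 *           base = atomicAdd(counter, (int)active.size());  // one atomic for the whole group
 *       base = active.shfl(base, 0);                        // broadcast the old counter value
 *       return base + active.thread_rank();                 // each thread gets a unique slot
 *   }
 */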

template <unsigned int Size>
class __thread_block_tile_base : public thread_group
{
    static const unsigned int numThreads = Size;

    _CG_QUALIFIER unsigned int build_mask() const {
        unsigned int mask;

        if (numThreads == 32) {
            mask = 0xFFFFFFFF;
        }
        else {
            mask = (unsigned int)(-1) >> (32 - numThreads);
            mask <<= (__internal::laneid() & (~(numThreads - 1)));
        }
        return (mask);
    }

 protected:
    _CG_QUALIFIER __thread_block_tile_base() : thread_group(__internal::CoalescedTile) {
        _data.coalesced.mask = build_mask();
        _data.coalesced.size = numThreads;
    }

 public:
    _CG_QUALIFIER void sync() const {
        __syncwarp(build_mask());
    }
    _CG_QUALIFIER unsigned int thread_rank() const {
        return (__internal::laneid() & (numThreads - 1));
    }
    _CG_QUALIFIER unsigned int size() const {
        return (numThreads);
    }

    // PTX supported collectives
    _CG_QUALIFIER int shfl(int var, int srcRank) const {
        return (__shfl_sync(build_mask(), var, srcRank, numThreads));
    }
    _CG_QUALIFIER int shfl_down(int var, unsigned int delta) const {
        return (__shfl_down_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER int shfl_up(int var, unsigned int delta) const {
        return (__shfl_up_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER int shfl_xor(int var, unsigned int laneMask) const {
        return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads));
    }
    _CG_QUALIFIER unsigned int shfl(unsigned int var, int srcRank) const {
        return (__shfl_sync(build_mask(), var, srcRank, numThreads));
    }
    _CG_QUALIFIER unsigned int shfl_down(unsigned int var, unsigned int delta) const {
        return (__shfl_down_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER unsigned int shfl_up(unsigned int var, unsigned int delta) const {
        return (__shfl_up_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER unsigned int shfl_xor(unsigned int var, unsigned int laneMask) const {
        return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads));
    }
    _CG_QUALIFIER long shfl(long var, int srcRank) const {
        return (__shfl_sync(build_mask(), var, srcRank, numThreads));
    }
    _CG_QUALIFIER long shfl_down(long var, unsigned int delta) const {
        return (__shfl_down_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER long shfl_up(long var, unsigned int delta) const {
        return (__shfl_up_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER long shfl_xor(long var, unsigned int laneMask) const {
        return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads));
    }
    _CG_QUALIFIER unsigned long shfl(unsigned long var, int srcRank) const {
        return (__shfl_sync(build_mask(), var, srcRank, numThreads));
    }
    _CG_QUALIFIER unsigned long shfl_down(unsigned long var, unsigned int delta) const {
        return (__shfl_down_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER unsigned long shfl_up(unsigned long var, unsigned int delta) const {
        return (__shfl_up_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER unsigned long shfl_xor(unsigned long var, unsigned int laneMask) const {
        return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads));
    }
    _CG_QUALIFIER long long shfl(long long var, int srcRank) const {
        return (__shfl_sync(build_mask(), var, srcRank, numThreads));
    }
    _CG_QUALIFIER long long shfl_down(long long var, unsigned int delta) const {
        return (__shfl_down_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER long long shfl_up(long long var, unsigned int delta) const {
        return (__shfl_up_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER long long shfl_xor(long long var, unsigned int laneMask) const {
        return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads));
    }
    _CG_QUALIFIER unsigned long long shfl(unsigned long long var, int srcRank) const {
        return (__shfl_sync(build_mask(), var, srcRank, numThreads));
    }
    _CG_QUALIFIER unsigned long long shfl_down(unsigned long long var, unsigned int delta) const {
        return (__shfl_down_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER unsigned long long shfl_up(unsigned long long var, unsigned int delta) const {
        return (__shfl_up_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER unsigned long long shfl_xor(unsigned long long var, unsigned int laneMask) const {
        return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads));
    }
    _CG_QUALIFIER float shfl(float var, int srcRank) const {
        return (__shfl_sync(build_mask(), var, srcRank, numThreads));
    }
    _CG_QUALIFIER float shfl_down(float var, unsigned int delta) const {
        return (__shfl_down_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER float shfl_up(float var, unsigned int delta) const {
        return (__shfl_up_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER float shfl_xor(float var, unsigned int laneMask) const {
        return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads));
    }
    _CG_QUALIFIER double shfl(double var, int srcRank) const {
        return (__shfl_sync(build_mask(), var, srcRank, numThreads));
    }
    _CG_QUALIFIER double shfl_down(double var, unsigned int delta) const {
        return (__shfl_down_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER double shfl_up(double var, unsigned int delta) const {
        return (__shfl_up_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER double shfl_xor(double var, unsigned int laneMask) const {
        return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads));
    }
    _CG_QUALIFIER int any(int predicate) const {
        unsigned int lane_ballot = build_mask() & __ballot_sync(build_mask(), predicate);
        return (lane_ballot != 0);
    }
    _CG_QUALIFIER int all(int predicate) const {
        unsigned int lane_ballot = build_mask() & __ballot_sync(build_mask(), predicate);
        return (lane_ballot == build_mask());
    }
    _CG_QUALIFIER unsigned int ballot(int predicate) const {
        unsigned int lane_ballot = build_mask() & __ballot_sync(build_mask(), predicate);
        return (lane_ballot >> (__internal::laneid() & (~(numThreads - 1))));
    }

#ifdef _CG_HAS_FP16_COLLECTIVE
    _CG_QUALIFIER __half shfl(__half var, int srcRank) const {
        return (__shfl_sync(build_mask(), var, srcRank, numThreads));
    }
    _CG_QUALIFIER __half shfl_down(__half var, unsigned int delta) const {
        return (__shfl_down_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER __half shfl_up(__half var, unsigned int delta) const {
        return (__shfl_up_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER __half shfl_xor(__half var, unsigned int laneMask) const {
        return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads));
    }
    _CG_QUALIFIER __half2 shfl(__half2 var, int srcRank) const {
        return (__shfl_sync(build_mask(), var, srcRank, numThreads));
    }
    _CG_QUALIFIER __half2 shfl_down(__half2 var, unsigned int delta) const {
        return (__shfl_down_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER __half2 shfl_up(__half2 var, unsigned int delta) const {
        return (__shfl_up_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER __half2 shfl_xor(__half2 var, unsigned int laneMask) const {
        return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads));
    }
#endif

#ifdef _CG_HAS_MATCH_COLLECTIVE
    _CG_QUALIFIER unsigned int match_any(int val) const {
        unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_any(unsigned int val) const {
        unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_any(long val) const {
        unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_any(unsigned long val) const {
        unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_any(long long val) const {
        unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_any(unsigned long long val) const {
        unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_any(float val) const {
        unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_any(double val) const {
        unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }

    _CG_QUALIFIER unsigned int match_all(int val, int &pred) const {
        unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_all(unsigned int val, int &pred) const {
        unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_all(long val, int &pred) const {
        unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_all(unsigned long val, int &pred) const {
        unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_all(long long val, int &pred) const {
        unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_all(unsigned long long val, int &pred) const {
        unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_all(float val, int &pred) const {
        unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_all(double val, int &pred) const {
        unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
#endif

};

/**
 * class thread_block_tile<unsigned int Size>
 *
 * Statically-sized group type, representing one tile of a thread block.
 * The only specializations currently supported are those with native
 * hardware support (1/2/4/8/16/32).
 *
 * This group exposes warp-synchronous builtins.
 * Constructed via tiled_partition<Size>(class thread_block);
 */
template <unsigned int Size>
class thread_block_tile;
template <> class thread_block_tile<32> : public __thread_block_tile_base<32> { };
template <> class thread_block_tile<16> : public __thread_block_tile_base<16> { };
template <> class thread_block_tile<8>  : public __thread_block_tile_base<8> { };
template <> class thread_block_tile<4>  : public __thread_block_tile_base<4> { };
template <> class thread_block_tile<2>  : public __thread_block_tile_base<2> { };
template <> class thread_block_tile<1>  : public __thread_block_tile_base<1> { };
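
/*
 * Example (illustrative sketch, not part of the shipped header): a warp-wide
 * reduction on a statically-sized tile. Because the size is a compile-time
 * constant, the shuffle loop can be fully unrolled by the compiler. The
 * function name is hypothetical.
 *
 *   __device__ int tile_reduce_sum(int value) {
 *       cooperative_groups::thread_block_tile<32> warp =
 *           cooperative_groups::tiled_partition<32>(cooperative_groups::this_thread_block());
 *       for (int offset = warp.size() / 2; offset > 0; offset /= 2)
 *           value += warp.shfl_down(value, offset);   // tree fold toward lane 0
 *       return value;                                 // lane 0 holds the tile-wide sum
 *   }
 */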

/**
 * Outer level API calls
 * void sync(GroupT) - see <group_type>.sync()
 * unsigned int thread_rank(GroupT) - see <group_type>.thread_rank()
 * unsigned int group_size(GroupT) - see <group_type>.size()
 */
template <class GroupT> _CG_QUALIFIER void sync(GroupT const &g)
{
    g.sync();
}

template <class GroupT> _CG_QUALIFIER unsigned int thread_rank(GroupT const& g)
{
    return (g.thread_rank());
}

template <class GroupT> _CG_QUALIFIER unsigned int group_size(GroupT const &g)
{
    return (g.size());
}
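
/*
 * Example (illustrative sketch, not part of the shipped header): the free
 * functions mirror the member functions, which is convenient in generic code
 * that only receives a group by value. The helper name is hypothetical.
 *
 *   template <typename Group>
 *   __device__ void strided_fill(Group g, int *data, unsigned int n, int value) {
 *       for (unsigned int i = cooperative_groups::thread_rank(g); i < n;
 *            i += cooperative_groups::group_size(g))
 *           data[i] = value;
 *       cooperative_groups::sync(g);   // same as g.sync()
 *   }
 */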

/**
 * <group_type>.sync()
 *
 * Executes a barrier across the group
 *
 * Implements both a compiler fence and an architectural fence to prevent
 * memory reordering around the barrier.
 */
_CG_QUALIFIER void thread_group::sync() const
{
    if (_data.type == __internal::Coalesced || _data.type == __internal::CoalescedTile) {
        static_cast<const coalesced_group*>(this)->sync();
    }
    else {
        static_cast<const thread_block*>(this)->sync();
    }
}

/**
 * <group_type>.size()
 *
 * Returns the total number of threads in the group.
 */
_CG_QUALIFIER unsigned int thread_group::size() const
{
    if (_data.type == __internal::Coalesced || _data.type == __internal::CoalescedTile) {
        return (static_cast<const coalesced_group*>(this)->size());
    }
    else {
        return (static_cast<const thread_block*>(this)->size());
    }
}

/**
 * <group_type>.thread_rank()
 *
 * Returns the linearized rank of the calling thread within the interval [0, size()).
 */
_CG_QUALIFIER unsigned int thread_group::thread_rank() const
{
    if (_data.type == __internal::Coalesced || _data.type == __internal::CoalescedTile) {
        return (static_cast<const coalesced_group*>(this)->thread_rank());
    }
    else {
        return (static_cast<const thread_block*>(this)->thread_rank());
    }
}

/**
 * tiled_partition
 *
 * The tiled_partition(parent, tilesz) method is a collective operation that
 * partitions the parent group into a one-dimensional, row-major tiling of subgroups.
 *
 * A total of ((size(parent)+tilesz-1)/tilesz) subgroups will
 * be created where threads having identical k = (thread_rank(parent)/tilesz)
 * will be members of the same subgroup.
 *
 * The implementation may cause the calling thread to wait until all the members
 * of the parent group have invoked the operation before resuming execution.
 *
 * Functionality is limited to power-of-two sized subgroup instances of at most
 * 32 threads. Only thread_block, thread_block_tile<>, and their subgroups can be
 * tiled_partition() in _CG_VERSION 1000.
 */
_CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz)
{
    if (parent._data.type == __internal::Coalesced || parent._data.type == __internal::CoalescedTile) {
        return (static_cast<const coalesced_group&>(parent)._get_tiled_threads(tilesz));
    }
    else {
        return (static_cast<const thread_block&>(parent)._get_tiled_threads(tilesz));
    }
}
// Thread block type overload: returns a basic thread_group for now (may be specialized later)
_CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz)
{
    return (parent._get_tiled_threads(tilesz));
}
// Coalesced group type overload: retains its ability to stay coalesced
_CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz)
{
    return (parent._get_tiled_threads(tilesz));
}
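
/*
 * Example (illustrative sketch, not part of the shipped header): partitioning a
 * block with a tile size chosen at runtime. The result is a generic
 * thread_group, so only size()/thread_rank()/sync() are available on it. The
 * kernel name is hypothetical and tilesz must be a power of two no larger
 * than 32.
 *
 *   __global__ void dynamic_tiles(unsigned int tilesz) {
 *       cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
 *       cooperative_groups::thread_group tile =
 *           cooperative_groups::tiled_partition(block, tilesz);
 *       // threads sharing the same block.thread_rank() / tilesz now form one tile
 *       tile.sync();
 *   }
 */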

namespace __internal {

    // For specializing on different tiled_partition template arguments
    template <unsigned int Size, typename ParentT>
    struct tiled_partition_impl;

    template <unsigned int Size>
    struct tiled_partition_impl<Size, thread_block> : public thread_block_tile<Size> {
        _CG_QUALIFIER tiled_partition_impl(thread_block const &) : thread_block_tile<Size>() {}
    };
    template <unsigned int Size>
    struct tiled_partition_impl<Size, thread_block_tile<32> > : public thread_block_tile<Size> {
        _CG_QUALIFIER tiled_partition_impl(thread_block_tile<32> const&) : thread_block_tile<Size>() {}
    };
    template <unsigned int Size>
    struct tiled_partition_impl<Size, thread_block_tile<16> > : public thread_block_tile<Size> {
        _CG_QUALIFIER tiled_partition_impl(thread_block_tile<16> const&) : thread_block_tile<Size>() {}
    };
    template <unsigned int Size>
    struct tiled_partition_impl<Size, thread_block_tile<8> > : public thread_block_tile<Size> {
        _CG_QUALIFIER tiled_partition_impl(thread_block_tile<8> const&) : thread_block_tile<Size>() {}
    };
    template <unsigned int Size>
    struct tiled_partition_impl<Size, thread_block_tile<4> > : public thread_block_tile<Size> {
        _CG_QUALIFIER tiled_partition_impl(thread_block_tile<4> const&) : thread_block_tile<Size>() {}
    };
    template <unsigned int Size>
    struct tiled_partition_impl<Size, thread_block_tile<2> > : public thread_block_tile<Size> {
        _CG_QUALIFIER tiled_partition_impl(thread_block_tile<2> const&) : thread_block_tile<Size>() {}
    };
    template <>
    struct tiled_partition_impl<1, thread_block_tile<1> > : public thread_block_tile<1> {
        _CG_QUALIFIER tiled_partition_impl(thread_block_tile<1> const&) : thread_block_tile<1>() {}
    };

};

/**
 * tiled_partition<tilesz>
 *
 * The tiled_partition<tilesz>(parent) method is a collective operation that
 * partitions the parent group into a one-dimensional, row-major tiling of subgroups.
 *
 * A total of (size(parent)/tilesz) subgroups will be created,
 * therefore the parent group size must be evenly divisible by the tilesz.
 * The allowed parent groups are thread_block or thread_block_tile<size>.
 *
 * The implementation may cause the calling thread to wait until all the members
 * of the parent group have invoked the operation before resuming execution.
 *
 * Functionality is limited to native hardware sizes, 1/2/4/8/16/32.
 * The size(parent) must be greater than the template Size parameter,
 * otherwise the results are undefined.
 */
template <unsigned int Size, typename ParentT>
_CG_QUALIFIER thread_block_tile<Size> tiled_partition(const ParentT& g)
{
    return (__internal::tiled_partition_impl<Size, ParentT>(g));
}
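
/*
 * Example (illustrative sketch, not part of the shipped header): statically
 * partitioning a block into 16-thread tiles and broadcasting a value from the
 * first thread of each tile. The kernel name is hypothetical and assumes a
 * one-dimensional block whose size is a multiple of 16.
 *
 *   __global__ void broadcast_per_tile(int *out) {
 *       cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
 *       cooperative_groups::thread_block_tile<16> tile =
 *           cooperative_groups::tiled_partition<16>(block);
 *       int value = (tile.thread_rank() == 0) ? (int)block.thread_rank() : 0;
 *       value = tile.shfl(value, 0);     // every member now holds rank-0's value
 *       out[blockIdx.x * blockDim.x + threadIdx.x] = value;
 *   }
 */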

_CG_END_NAMESPACE

# endif /* ! (__cplusplus, __CUDACC__) */

#endif /* !_COOPERATIVE_GROUPS_H_ */