/*
 * Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
 *
 * NOTICE TO LICENSEE:
 *
 * This source code and/or documentation ("Licensed Deliverables") are
 * subject to NVIDIA intellectual property rights under U.S. and
 * international Copyright laws.
 *
 * These Licensed Deliverables contained herein is PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 * conditions of a form of NVIDIA software license agreement by and
 * between NVIDIA and Licensee ("License Agreement") or electronically
 * accepted by Licensee. Notwithstanding any terms or conditions to
 * the contrary in the License Agreement, reproduction or disclosure
 * of the Licensed Deliverables to any third party without the express
 * written consent of NVIDIA is prohibited.
 *
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users. These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */

#ifndef _COOPERATIVE_GROUPS_H_
# define _COOPERATIVE_GROUPS_H_

#if defined(__cplusplus) && defined(__CUDACC__)

# include "cooperative_groups_helpers.h"

_CG_BEGIN_NAMESPACE

/**
 * class thread_group;
 *
 * Generic thread group type, into which all groups are convertible.
 * It acts as a container for all storage necessary for the derived groups,
 * and will dispatch the API calls to the correct derived group. This means
 * that all derived groups must implement the same interface as thread_group.
 */
class thread_group
{
    friend _CG_QUALIFIER thread_group this_thread();
    friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
    friend class thread_block;

protected:
    union __align__(8) {
        unsigned int type : 8;
        struct {
            unsigned int type : 8;
            unsigned int size : 24;
            unsigned int mask;
        } coalesced;
        struct {
            void* ptr[2];
        } buffer;
    } _data;

    _CG_QUALIFIER thread_group operator=(const thread_group& src);

    _CG_QUALIFIER thread_group(__internal::groupType type) {
        _data.type = type;
    }

#if __cplusplus >= 201103L
    static_assert(sizeof(_data) == 16, "Failed size check");
#endif

public:
    _CG_QUALIFIER unsigned int size() const;
    _CG_QUALIFIER unsigned int thread_rank() const;
    _CG_QUALIFIER void sync() const;
};

/**
 * thread_group this_thread()
 *
 * Constructs a generic thread_group containing only the calling thread
 */
_CG_QUALIFIER thread_group this_thread()
{
    thread_group g = thread_group(__internal::Coalesced);
    g._data.coalesced.mask = __internal::lanemask32_eq();
    g._data.coalesced.size = 1;
    return (g);
}

#if defined(_CG_HAS_MULTI_GRID_GROUP)

/**
 * class multi_grid_group;
 *
 * Threads within this group are guaranteed to be co-resident on the
 * same system, on multiple devices within the same launched kernels.
 * To use this group, the kernel must have been launched with
 * cuLaunchCooperativeKernelMultiDevice (or the CUDA Runtime equivalent),
 * and the device must support it (queryable device attribute).
 *
 * Constructed via this_multi_grid();
 */
class multi_grid_group
{
    friend _CG_QUALIFIER multi_grid_group this_multi_grid();

    struct __align__(8) {
        unsigned long long handle;
        unsigned int size;
        unsigned int rank;
    } _data;

#if __cplusplus >= 201103L
    static_assert(sizeof(_data) == 16, "Failed size check");
#endif

public:
    _CG_QUALIFIER multi_grid_group() {
        _data.handle = __internal::multi_grid::get_intrinsic_handle();
        _data.size = __internal::multi_grid::size(_data.handle);
        _data.rank = __internal::multi_grid::thread_rank(_data.handle);
    }

    _CG_QUALIFIER bool is_valid() const {
        return (_data.handle != 0);
    }

    _CG_QUALIFIER void sync() const {
        _CG_ASSERT(is_valid());
        __internal::multi_grid::sync(_data.handle);
    }

    _CG_QUALIFIER unsigned int size() const {
        _CG_ASSERT(is_valid());
        return (_data.size);
    }

    _CG_QUALIFIER unsigned int thread_rank() const {
        _CG_ASSERT(is_valid());
        return (_data.rank);
    }

    _CG_QUALIFIER unsigned int grid_rank() const {
        _CG_ASSERT(is_valid());
        return (__internal::multi_grid::grid_rank(_data.handle));
    }

    _CG_QUALIFIER unsigned int num_grids() const {
        _CG_ASSERT(is_valid());
        return (__internal::multi_grid::num_grids(_data.handle));
    }
};

/**
 * multi_grid_group this_multi_grid()
 *
 * Constructs a multi_grid_group
 */
_CG_QUALIFIER multi_grid_group this_multi_grid()
{
    return (multi_grid_group());
}

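/*
 * Illustrative sketch (not part of the header): synchronizing across every
 * device participating in a cuLaunchCooperativeKernelMultiDevice launch.
 * The kernel and buffer names below are hypothetical.
 *
 *   __global__ void multi_device_step(float *local_slice)
 *   {
 *       cooperative_groups::multi_grid_group mg = cooperative_groups::this_multi_grid();
 *       // Phase 1: each device updates its own slice of local_slice here.
 *       mg.sync();   // wait for every thread of every participating grid
 *       // Phase 2: results written by the other devices (e.g. through peer
 *       // mappings) are now safe to read, since all grids passed the barrier.
 *   }
 */
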
#endif

#if defined(_CG_HAS_GRID_GROUP)

/**
 * class grid_group;
 *
 * Threads within this group are guaranteed to be co-resident on the
 * same device within the same launched kernel. To use this group, the kernel
 * must have been launched with cuLaunchCooperativeKernel (or the CUDA Runtime equivalent),
 * and the device must support it (queryable device attribute).
 *
 * Constructed via this_grid();
 */
class grid_group
{
    friend _CG_QUALIFIER grid_group this_grid();

    struct __align__(8) {
        unsigned long long handle;
        unsigned int size;
        unsigned int rank;
    } _data;

#if __cplusplus >= 201103L
    static_assert(sizeof(_data) == 16, "Failed size check");
#endif

public:
    _CG_QUALIFIER grid_group() {
        _data.handle = (__internal::grid::get_intrinsic_handle());
        _data.size = __internal::grid::size(_data.handle);
        _data.rank = __internal::grid::thread_rank(_data.handle);
    }

    _CG_QUALIFIER bool is_valid() const {
        return (_data.handle != 0);
    }

    _CG_QUALIFIER void sync() const {
        _CG_ASSERT(is_valid());
        __internal::grid::sync(_data.handle);
    }

    _CG_QUALIFIER unsigned int size() const {
        _CG_ASSERT(is_valid());
        return (_data.size);
    }

    _CG_QUALIFIER unsigned int thread_rank() const {
        _CG_ASSERT(is_valid());
        return (_data.rank);
    }

    _CG_QUALIFIER dim3 group_dim() const {
        _CG_ASSERT(is_valid());
        return (__internal::grid::grid_dim());
    }

};

/**
 * grid_group this_grid()
 *
 * Constructs a grid_group
 */
_CG_QUALIFIER grid_group this_grid()
{
    return (grid_group());
}
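
/*
 * Illustrative sketch (not part of the header): a grid-wide barrier between two
 * phases of a kernel launched with cudaLaunchCooperativeKernel. The kernel name
 * and buffers are hypothetical.
 *
 *   __global__ void two_phase(const float *in, float *out, int n)
 *   {
 *       cooperative_groups::grid_group grid = cooperative_groups::this_grid();
 *       for (int i = grid.thread_rank(); i < n; i += grid.size())
 *           out[i] = in[i] * 2.0f;        // phase 1: every thread writes its elements
 *       grid.sync();                      // wait for the whole grid, not just this block
 *       // phase 2: any thread may now read any element of out[]
 *   }
 */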

#endif

/**
 * class thread_block
 *
 * Every GPU kernel is executed by a grid of thread blocks, and threads within
 * each block are guaranteed to reside on the same streaming multiprocessor.
 * A thread_block represents a thread block whose dimensions are not known until runtime.
 *
 * Constructed via this_thread_block();
 */
class thread_block : public thread_group
{
    friend _CG_QUALIFIER thread_block this_thread_block();
    friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
    friend _CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz);

    _CG_QUALIFIER thread_block() : thread_group(__internal::ThreadBlock) {
    }

    // Internal Use
    _CG_QUALIFIER thread_group _get_tiled_threads(unsigned int tilesz) const {
        const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);

        // Invalid, immediately fail
        if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) {
            __internal::abort();
            return (thread_block());
        }

        unsigned int mask;
        unsigned int base_offset = thread_rank() & (~(tilesz - 1));
        unsigned int masklength = min(size() - base_offset, tilesz);

        mask = (unsigned int)(-1) >> (32 - masklength);
        mask <<= (__internal::laneid() & ~(tilesz - 1));
        thread_group tile = thread_group(__internal::CoalescedTile);
        tile._data.coalesced.mask = mask;
        tile._data.coalesced.size = __popc(mask);
        return (tile);
    }

public:
    _CG_QUALIFIER void sync() const {
        __internal::cta::sync();
    }

    _CG_QUALIFIER unsigned int size() const {
        return (__internal::cta::size());
    }

    _CG_QUALIFIER unsigned int thread_rank() const {
        return (__internal::cta::thread_rank());
    }

    // Additional functionality exposed by the group
    _CG_QUALIFIER dim3 group_index() const {
        return (__internal::cta::group_index());
    }

    _CG_QUALIFIER dim3 thread_index() const {
        return (__internal::cta::thread_index());
    }

    _CG_QUALIFIER dim3 group_dim() const {
        return (__internal::cta::block_dim());
    }

};

/**
 * thread_block this_thread_block()
 *
 * Constructs a thread_block group
 */
_CG_QUALIFIER thread_block this_thread_block()
{
    return (thread_block());
}

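/*
 * Illustrative sketch (not part of the header): using thread_block as an
 * explicit handle for the familiar block-wide barrier. Kernel and array names
 * are hypothetical; the shared array assumes blockDim.x == 256.
 *
 *   __global__ void block_sum(const int *in, int *block_totals)
 *   {
 *       __shared__ int scratch[256];
 *       cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
 *       scratch[block.thread_rank()] = in[blockIdx.x * blockDim.x + threadIdx.x];
 *       block.sync();                                  // equivalent to __syncthreads()
 *       if (block.thread_rank() == 0) {
 *           int total = 0;
 *           for (unsigned int i = 0; i < block.size(); i++)
 *               total += scratch[i];
 *           block_totals[block.group_index().x] = total;
 *       }
 *   }
 */
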
/**
 * class coalesced_group
 *
 * A group representing the current set of converged threads in a warp.
 * The size of the group is not guaranteed and it may return a group of
 * only one thread (itself).
 *
 * This group exposes warp-synchronous builtins.
 * Constructed via coalesced_threads();
 */
class coalesced_group : public thread_group
{
    friend _CG_QUALIFIER coalesced_group coalesced_threads();
    friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
    friend _CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz);

    _CG_QUALIFIER unsigned int _packLanes(unsigned laneMask) const {
        unsigned int member_pack = 0;
        unsigned int member_rank = 0;
        for (int bit_idx = 0; bit_idx < 32; bit_idx++) {
            unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx);
            if (lane_bit) {
                if (laneMask & lane_bit)
                    member_pack |= 1 << member_rank;
                member_rank++;
            }
        }
        return (member_pack);
    }

    // Internal Use
    _CG_QUALIFIER coalesced_group _get_tiled_threads(unsigned int tilesz) const {
        const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);

        // Invalid, immediately fail
        if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) {
            __internal::abort();
            return (coalesced_group(0));
        }
        if (size() <= tilesz) {
            return (*this);
        }

        if ((_data.type == __internal::CoalescedTile) && pow2_tilesz) {
            unsigned int base_offset = (thread_rank() & (~(tilesz - 1)));
            unsigned int masklength = min(size() - base_offset, tilesz);
            unsigned int mask = (unsigned int)(-1) >> (32 - masklength);

            mask <<= (__internal::laneid() & ~(tilesz - 1));
            coalesced_group coalesced_tile = coalesced_group(mask);
            coalesced_tile._data.type = __internal::CoalescedTile;
            return (coalesced_tile);
        }
        else if ((_data.type == __internal::Coalesced) && pow2_tilesz) {
            unsigned int mask = 0;
            unsigned int member_rank = 0;
            int seen_lanes = (thread_rank() / tilesz) * tilesz;
            for (unsigned int bit_idx = 0; bit_idx < 32; bit_idx++) {
                unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx);
                if (lane_bit) {
                    if (seen_lanes <= 0 && member_rank < tilesz) {
                        mask |= lane_bit;
                        member_rank++;
                    }
                    seen_lanes--;
                }
            }
            return (coalesced_group(mask));
        }
        else {
            // None in _CG_VERSION 1000
            __internal::abort();
        }

        return (coalesced_group(0));
    }

protected:
    // Construct a group from scratch (coalesced_threads)
    _CG_QUALIFIER coalesced_group(unsigned int mask) : thread_group(__internal::Coalesced) {
        _data.coalesced.mask = mask;
        _data.coalesced.size = __popc(mask);
    }

public:
    _CG_QUALIFIER unsigned int size() const {
        return (_data.coalesced.size);
    }
    _CG_QUALIFIER unsigned int thread_rank() const {
        return (__popc(_data.coalesced.mask & __internal::lanemask32_lt()));
    }
    _CG_QUALIFIER void sync() const {
        __syncwarp(_data.coalesced.mask);
    }

#define COALESCED_SHFL_FUNCTION(type) \
    _CG_QUALIFIER type shfl(type var, unsigned int src_rank) const { \
        unsigned int lane = (src_rank == 0) ? __ffs(_data.coalesced.mask) - 1 : \
            (size() == 32) ? src_rank : __fns(_data.coalesced.mask, 0, (src_rank + 1)); \
        return (__shfl_sync(_data.coalesced.mask, var, lane, 32)); \
    }

#define COALESCED_SHFL_UP_FUNCTION(type) \
    _CG_QUALIFIER type shfl_up(type var, int delta) const { \
        if (size() == 32) { \
            return (__shfl_up_sync(0xFFFFFFFF, var, delta, 32)); \
        } \
        unsigned lane = __fns(_data.coalesced.mask, __internal::laneid(), -(delta + 1)); \
        if (lane >= 32) lane = __internal::laneid(); \
        return (__shfl_sync(_data.coalesced.mask, var, lane, 32)); \
    }

#define COALESCED_SHFL_DOWN_FUNCTION(type) \
    _CG_QUALIFIER type shfl_down(type var, int delta) const { \
        if (size() == 32) { \
            return (__shfl_down_sync(0xFFFFFFFF, var, delta, 32)); \
        } \
        unsigned int lane = __fns(_data.coalesced.mask, __internal::laneid(), delta + 1); \
        if (lane >= 32) lane = __internal::laneid(); \
        return (__shfl_sync(_data.coalesced.mask, var, lane, 32)); \
    }

    COALESCED_SHFL_FUNCTION(int);
    COALESCED_SHFL_FUNCTION(unsigned int);
    COALESCED_SHFL_FUNCTION(long);
    COALESCED_SHFL_FUNCTION(unsigned long);
    COALESCED_SHFL_FUNCTION(long long);
    COALESCED_SHFL_FUNCTION(unsigned long long);
    COALESCED_SHFL_FUNCTION(float);
    COALESCED_SHFL_FUNCTION(double);

    COALESCED_SHFL_UP_FUNCTION(int);
    COALESCED_SHFL_UP_FUNCTION(unsigned int);
    COALESCED_SHFL_UP_FUNCTION(long);
    COALESCED_SHFL_UP_FUNCTION(unsigned long);
    COALESCED_SHFL_UP_FUNCTION(long long);
    COALESCED_SHFL_UP_FUNCTION(unsigned long long);
    COALESCED_SHFL_UP_FUNCTION(float);
    COALESCED_SHFL_UP_FUNCTION(double);

    COALESCED_SHFL_DOWN_FUNCTION(int);
    COALESCED_SHFL_DOWN_FUNCTION(unsigned int);
    COALESCED_SHFL_DOWN_FUNCTION(long);
    COALESCED_SHFL_DOWN_FUNCTION(unsigned long);
    COALESCED_SHFL_DOWN_FUNCTION(long long);
    COALESCED_SHFL_DOWN_FUNCTION(unsigned long long);
    COALESCED_SHFL_DOWN_FUNCTION(float);
    COALESCED_SHFL_DOWN_FUNCTION(double);

# ifdef _CG_HAS_FP16_COLLECTIVE
    COALESCED_SHFL_FUNCTION(__half);
    COALESCED_SHFL_UP_FUNCTION(__half);
    COALESCED_SHFL_DOWN_FUNCTION(__half);

    COALESCED_SHFL_FUNCTION(__half2);
    COALESCED_SHFL_UP_FUNCTION(__half2);
    COALESCED_SHFL_DOWN_FUNCTION(__half2);
# endif

#undef COALESCED_SHFL_FUNCTION
#undef COALESCED_SHFL_UP_FUNCTION
#undef COALESCED_SHFL_DOWN_FUNCTION

    _CG_QUALIFIER int any(int predicate) const {
        return (__ballot_sync(_data.coalesced.mask, predicate) != 0);
    }
    _CG_QUALIFIER int all(int predicate) const {
        return (__ballot_sync(_data.coalesced.mask, predicate) == _data.coalesced.mask);
    }
    _CG_QUALIFIER unsigned int ballot(int predicate) const {
        if (size() == 32) {
            return (__ballot_sync(0xFFFFFFFF, predicate));
        }
        unsigned int lane_ballot = __ballot_sync(_data.coalesced.mask, predicate);
        return (_packLanes(lane_ballot));
    }

#ifdef _CG_HAS_MATCH_COLLECTIVE

# define COALESCED_MATCH_ANY_FUNCTION(type) \
    _CG_QUALIFIER unsigned int match_any(type val) const { \
        if (size() == 32) { \
            return (__match_any_sync(0xFFFFFFFF, val)); \
        } \
        unsigned int lane_match = __match_any_sync(_data.coalesced.mask, val); \
        return (_packLanes(lane_match)); \
    }
# define COALESCED_MATCH_ALL_FUNCTION(type) \
    _CG_QUALIFIER unsigned int match_all(type val, int &pred) const { \
        if (size() == 32) { \
            return (__match_all_sync(0xFFFFFFFF, val, &pred)); \
        } \
        unsigned int lane_match = __match_all_sync(_data.coalesced.mask, val, &pred); \
        return (_packLanes(lane_match)); \
    }

    COALESCED_MATCH_ANY_FUNCTION(int);
    COALESCED_MATCH_ANY_FUNCTION(unsigned int);
    COALESCED_MATCH_ANY_FUNCTION(long);
    COALESCED_MATCH_ANY_FUNCTION(unsigned long);
    COALESCED_MATCH_ANY_FUNCTION(long long);
    COALESCED_MATCH_ANY_FUNCTION(unsigned long long);
    COALESCED_MATCH_ANY_FUNCTION(float);
    COALESCED_MATCH_ANY_FUNCTION(double);

    COALESCED_MATCH_ALL_FUNCTION(int);
    COALESCED_MATCH_ALL_FUNCTION(unsigned int);
    COALESCED_MATCH_ALL_FUNCTION(long);
    COALESCED_MATCH_ALL_FUNCTION(unsigned long);
    COALESCED_MATCH_ALL_FUNCTION(long long);
    COALESCED_MATCH_ALL_FUNCTION(unsigned long long);
    COALESCED_MATCH_ALL_FUNCTION(float);
    COALESCED_MATCH_ALL_FUNCTION(double);

# undef COALESCED_MATCH_ANY_FUNCTION
# undef COALESCED_MATCH_ALL_FUNCTION

#endif /* !_CG_HAS_MATCH_COLLECTIVE */

};

_CG_QUALIFIER coalesced_group coalesced_threads()
{
    return (coalesced_group(__activemask()));
}

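/*
 * Illustrative sketch (not part of the header): aggregating an atomic counter
 * across the threads that happen to be converged inside a divergent branch.
 * The function and counter names are hypothetical.
 *
 *   __device__ int reserve_slot(int *counter)
 *   {
 *       cooperative_groups::coalesced_group active = cooperative_groups::coalesced_threads();
 *       int base = 0;
 *       if (active.thread_rank() == 0)                 // one atomic per coalesced group
 *           base = atomicAdd(counter, (int)active.size());
 *       base = active.shfl(base, 0);                   // broadcast the base index
 *       return base + active.thread_rank();            // each thread gets a unique slot
 *   }
 */
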
template <unsigned int Size>
class __thread_block_tile_base : public thread_group
{
    static const unsigned int numThreads = Size;

    _CG_QUALIFIER unsigned int build_mask() const {
        unsigned int mask;

        if (numThreads == 32) {
            mask = 0xFFFFFFFF;
        }
        else {
            mask = (unsigned int)(-1) >> (32 - numThreads);
            mask <<= (__internal::laneid() & (~(numThreads - 1)));
        }
        return (mask);
    }

protected:
    _CG_QUALIFIER __thread_block_tile_base() : thread_group(__internal::CoalescedTile) {
        _data.coalesced.mask = build_mask();
        _data.coalesced.size = numThreads;
    }

public:
    _CG_QUALIFIER void sync() const {
        __syncwarp(build_mask());
    }
    _CG_QUALIFIER unsigned int thread_rank() const {
        return (__internal::laneid() & (numThreads - 1));
    }
    _CG_QUALIFIER unsigned int size() const {
        return (numThreads);
    }

    // PTX supported collectives
    _CG_QUALIFIER int shfl(int var, int srcRank) const {
        return (__shfl_sync(build_mask(), var, srcRank, numThreads));
    }
    _CG_QUALIFIER int shfl_down(int var, unsigned int delta) const {
        return (__shfl_down_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER int shfl_up(int var, unsigned int delta) const {
        return (__shfl_up_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER int shfl_xor(int var, unsigned int laneMask) const {
        return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads));
    }
    _CG_QUALIFIER unsigned int shfl(unsigned int var, int srcRank) const {
        return (__shfl_sync(build_mask(), var, srcRank, numThreads));
    }
    _CG_QUALIFIER unsigned int shfl_down(unsigned int var, unsigned int delta) const {
        return (__shfl_down_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER unsigned int shfl_up(unsigned int var, unsigned int delta) const {
        return (__shfl_up_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER unsigned int shfl_xor(unsigned int var, unsigned int laneMask) const {
        return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads));
    }
    _CG_QUALIFIER long shfl(long var, int srcRank) const {
        return (__shfl_sync(build_mask(), var, srcRank, numThreads));
    }
    _CG_QUALIFIER long shfl_down(long var, unsigned int delta) const {
        return (__shfl_down_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER long shfl_up(long var, unsigned int delta) const {
        return (__shfl_up_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER long shfl_xor(long var, unsigned int laneMask) const {
        return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads));
    }
    _CG_QUALIFIER unsigned long shfl(unsigned long var, int srcRank) const {
        return (__shfl_sync(build_mask(), var, srcRank, numThreads));
    }
    _CG_QUALIFIER unsigned long shfl_down(unsigned long var, unsigned int delta) const {
        return (__shfl_down_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER unsigned long shfl_up(unsigned long var, unsigned int delta) const {
        return (__shfl_up_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER unsigned long shfl_xor(unsigned long var, unsigned int laneMask) const {
        return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads));
    }
    _CG_QUALIFIER long long shfl(long long var, int srcRank) const {
        return (__shfl_sync(build_mask(), var, srcRank, numThreads));
    }
    _CG_QUALIFIER long long shfl_down(long long var, unsigned int delta) const {
        return (__shfl_down_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER long long shfl_up(long long var, unsigned int delta) const {
        return (__shfl_up_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER long long shfl_xor(long long var, unsigned int laneMask) const {
        return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads));
    }
    _CG_QUALIFIER unsigned long long shfl(unsigned long long var, int srcRank) const {
        return (__shfl_sync(build_mask(), var, srcRank, numThreads));
    }
    _CG_QUALIFIER unsigned long long shfl_down(unsigned long long var, unsigned int delta) const {
        return (__shfl_down_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER unsigned long long shfl_up(unsigned long long var, unsigned int delta) const {
        return (__shfl_up_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER unsigned long long shfl_xor(unsigned long long var, unsigned int laneMask) const {
        return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads));
    }
    _CG_QUALIFIER float shfl(float var, int srcRank) const {
        return (__shfl_sync(build_mask(), var, srcRank, numThreads));
    }
    _CG_QUALIFIER float shfl_down(float var, unsigned int delta) const {
        return (__shfl_down_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER float shfl_up(float var, unsigned int delta) const {
        return (__shfl_up_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER float shfl_xor(float var, unsigned int laneMask) const {
        return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads));
    }
    _CG_QUALIFIER double shfl(double var, int srcRank) const {
        return (__shfl_sync(build_mask(), var, srcRank, numThreads));
    }
    _CG_QUALIFIER double shfl_down(double var, unsigned int delta) const {
        return (__shfl_down_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER double shfl_up(double var, unsigned int delta) const {
        return (__shfl_up_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER double shfl_xor(double var, unsigned int laneMask) const {
        return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads));
    }
    _CG_QUALIFIER int any(int predicate) const {
        unsigned int lane_ballot = build_mask() & __ballot_sync(build_mask(), predicate);
        return (lane_ballot != 0);
    }
    _CG_QUALIFIER int all(int predicate) const {
        unsigned int lane_ballot = build_mask() & __ballot_sync(build_mask(), predicate);
        return (lane_ballot == build_mask());
    }
    _CG_QUALIFIER unsigned int ballot(int predicate) const {
        unsigned int lane_ballot = build_mask() & __ballot_sync(build_mask(), predicate);
        return (lane_ballot >> (__internal::laneid() & (~(numThreads - 1))));
    }

#ifdef _CG_HAS_FP16_COLLECTIVE
    _CG_QUALIFIER __half shfl(__half var, int srcRank) const {
        return (__shfl_sync(build_mask(), var, srcRank, numThreads));
    }
    _CG_QUALIFIER __half shfl_down(__half var, unsigned int delta) const {
        return (__shfl_down_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER __half shfl_up(__half var, unsigned int delta) const {
        return (__shfl_up_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER __half shfl_xor(__half var, unsigned int laneMask) const {
        return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads));
    }
    _CG_QUALIFIER __half2 shfl(__half2 var, int srcRank) const {
        return (__shfl_sync(build_mask(), var, srcRank, numThreads));
    }
    _CG_QUALIFIER __half2 shfl_down(__half2 var, unsigned int delta) const {
        return (__shfl_down_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER __half2 shfl_up(__half2 var, unsigned int delta) const {
        return (__shfl_up_sync(build_mask(), var, delta, numThreads));
    }
    _CG_QUALIFIER __half2 shfl_xor(__half2 var, unsigned int laneMask) const {
        return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads));
    }
#endif

#ifdef _CG_HAS_MATCH_COLLECTIVE
    _CG_QUALIFIER unsigned int match_any(int val) const {
        unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_any(unsigned int val) const {
        unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_any(long val) const {
        unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_any(unsigned long val) const {
        unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_any(long long val) const {
        unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_any(unsigned long long val) const {
        unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_any(float val) const {
        unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_any(double val) const {
        unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }

    _CG_QUALIFIER unsigned int match_all(int val, int &pred) const {
        unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_all(unsigned int val, int &pred) const {
        unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_all(long val, int &pred) const {
        unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_all(unsigned long val, int &pred) const {
        unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_all(long long val, int &pred) const {
        unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_all(unsigned long long val, int &pred) const {
        unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_all(float val, int &pred) const {
        unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_all(double val, int &pred) const {
        unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
#endif

};

/**
 * class thread_block_tile<unsigned int Size>
 *
 * Statically-sized group type, representing one tile of a thread block.
 * The only specializations currently supported are those with native
 * hardware support (1/2/4/8/16/32)
 *
 * This group exposes warp-synchronous builtins.
 * Constructed via tiled_partition<Size>(class thread_block);
 */
template <unsigned int Size>
class thread_block_tile;
template <> class thread_block_tile<32> : public __thread_block_tile_base<32> { };
template <> class thread_block_tile<16> : public __thread_block_tile_base<16> { };
template <> class thread_block_tile<8> : public __thread_block_tile_base<8> { };
template <> class thread_block_tile<4> : public __thread_block_tile_base<4> { };
template <> class thread_block_tile<2> : public __thread_block_tile_base<2> { };
template <> class thread_block_tile<1> : public __thread_block_tile_base<1> { };

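/*
 * Illustrative sketch (not part of the header): warp-synchronous voting on a
 * statically sized tile. The kernel, predicate, and array names are hypothetical.
 *
 *   __global__ void tile_vote(const float *values, int *any_negative)
 *   {
 *       cooperative_groups::thread_block_tile<32> warp =
 *           cooperative_groups::tiled_partition<32>(cooperative_groups::this_thread_block());
 *       float v = values[blockIdx.x * blockDim.x + threadIdx.x];
 *       if (warp.any(v < 0.0f) && warp.thread_rank() == 0)
 *           any_negative[0] = 1;          // one store per warp that saw a negative value
 *   }
 */
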
/**
 * Outer level API calls
 * void sync(GroupT) - see <group_type>.sync()
 * unsigned int thread_rank(GroupT) - see <group_type>.thread_rank()
 * unsigned int group_size(GroupT) - see <group_type>.size()
 */
template <class GroupT> _CG_QUALIFIER void sync(GroupT const &g)
{
    g.sync();
}

template <class GroupT> _CG_QUALIFIER unsigned int thread_rank(GroupT const& g)
{
    return (g.thread_rank());
}

template <class GroupT> _CG_QUALIFIER unsigned int group_size(GroupT const &g)
{
    return (g.size());
}

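/*
 * Illustrative sketch (not part of the header): the free functions above let
 * device code stay agnostic of the concrete group type. The function and
 * buffer names are hypothetical.
 *
 *   // Works unchanged whether g is a thread_block, a thread_block_tile<N>,
 *   // or a coalesced_group, because all of them expose the same interface.
 *   template <typename GroupT>
 *   __device__ void stripe_copy(const GroupT &g, int *dst, const int *src, unsigned int n)
 *   {
 *       for (unsigned int i = cooperative_groups::thread_rank(g); i < n;
 *            i += cooperative_groups::group_size(g))
 *           dst[i] = src[i];
 *       cooperative_groups::sync(g);      // all members see the copy when this returns
 *   }
 */
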
/**
 * <group_type>.sync()
 *
 * Executes a barrier across the group
 *
 * Implements both a compiler fence and an architectural fence to prevent
 * memory reordering around the barrier.
 */
_CG_QUALIFIER void thread_group::sync() const
{
    if (_data.type == __internal::Coalesced || _data.type == __internal::CoalescedTile) {
        static_cast<const coalesced_group*>(this)->sync();
    }
    else {
        static_cast<const thread_block*>(this)->sync();
    }
}

/**
 * <group_type>.size()
 *
 * Returns the total number of threads in the group.
 */
_CG_QUALIFIER unsigned int thread_group::size() const
{
    if (_data.type == __internal::Coalesced || _data.type == __internal::CoalescedTile) {
        return (static_cast<const coalesced_group*>(this)->size());
    }
    else {
        return (static_cast<const thread_block*>(this)->size());
    }
}

/**
 * <group_type>.thread_rank()
 *
 * Returns the linearized rank of the calling thread along the interval [0, size()).
 */
_CG_QUALIFIER unsigned int thread_group::thread_rank() const
{
    if (_data.type == __internal::Coalesced || _data.type == __internal::CoalescedTile) {
        return (static_cast<const coalesced_group*>(this)->thread_rank());
    }
    else {
        return (static_cast<const thread_block*>(this)->thread_rank());
    }
}

/**
 * tiled_partition
 *
 * The tiled_partition(parent, tilesz) method is a collective operation that
 * partitions the parent group into a one-dimensional, row-major, tiling of subgroups.
 *
 * A total of ((size(parent)+tilesz-1)/tilesz) subgroups will
 * be created where threads having identical k = (thread_rank(parent)/tilesz)
 * will be members of the same subgroup.
 *
 * The implementation may cause the calling thread to wait until all the members
 * of the parent group have invoked the operation before resuming execution.
 *
 * Functionality is limited to power-of-two sized subgroup instances of at most
 * 32 threads. Only thread_block, thread_block_tile<>, and their subgroups can be
 * tiled_partition() in _CG_VERSION 1000.
 */
_CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz)
{
    if (parent._data.type == __internal::Coalesced || parent._data.type == __internal::CoalescedTile) {
        return (static_cast<const coalesced_group&>(parent)._get_tiled_threads(tilesz));
    }
    else {
        return (static_cast<const thread_block&>(parent)._get_tiled_threads(tilesz));
    }
}
// Thread block type overload: returns a basic thread_group for now (may be specialized later)
_CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz)
{
    return (parent._get_tiled_threads(tilesz));
}
// Coalesced group type overload: retains its ability to stay coalesced
_CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz)
{
    return (parent._get_tiled_threads(tilesz));
}

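/*
 * Illustrative sketch (not part of the header): partitioning a block at run
 * time, when the tile size is not a compile-time constant. The kernel and
 * parameter names are hypothetical; tilesz must be a power of two <= 32.
 *
 *   __global__ void dynamic_tiles(unsigned int tilesz)
 *   {
 *       cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
 *       cooperative_groups::thread_group tile = cooperative_groups::tiled_partition(block, tilesz);
 *       // tile.thread_rank() runs over [0, tilesz) within each tile; only the generic
 *       // thread_group interface (size/thread_rank/sync) is available on this handle.
 *       tile.sync();
 *   }
 */
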
namespace __internal {

// For specializing on different tiled_partition template arguments
template <unsigned int Size, typename ParentT>
struct tiled_partition_impl;

template <unsigned int Size>
struct tiled_partition_impl<Size, thread_block> : public thread_block_tile<Size> {
    _CG_QUALIFIER tiled_partition_impl(thread_block const &) : thread_block_tile<Size>() {}
};
template <unsigned int Size>
struct tiled_partition_impl<Size, thread_block_tile<32> > : public thread_block_tile<Size> {
    _CG_QUALIFIER tiled_partition_impl(thread_block_tile<32> const&) : thread_block_tile<Size>() {}
};
template <unsigned int Size>
struct tiled_partition_impl<Size, thread_block_tile<16> > : public thread_block_tile<Size> {
    _CG_QUALIFIER tiled_partition_impl(thread_block_tile<16> const&) : thread_block_tile<Size>() {}
};
template <unsigned int Size>
struct tiled_partition_impl<Size, thread_block_tile<8> > : public thread_block_tile<Size> {
    _CG_QUALIFIER tiled_partition_impl(thread_block_tile<8> const&) : thread_block_tile<Size>() {}
};
template <unsigned int Size>
struct tiled_partition_impl<Size, thread_block_tile<4> > : public thread_block_tile<Size> {
    _CG_QUALIFIER tiled_partition_impl(thread_block_tile<4> const&) : thread_block_tile<Size>() {}
};
template <unsigned int Size>
struct tiled_partition_impl<Size, thread_block_tile<2> > : public thread_block_tile<Size> {
    _CG_QUALIFIER tiled_partition_impl(thread_block_tile<2> const&) : thread_block_tile<Size>() {}
};
template <>
struct tiled_partition_impl<1, thread_block_tile<1> > : public thread_block_tile<1> {
    _CG_QUALIFIER tiled_partition_impl(thread_block_tile<1> const&) : thread_block_tile<1>() {}
};

};

/**
 * tiled_partition<tilesz>
 *
 * The tiled_partition<tilesz>(parent) method is a collective operation that
 * partitions the parent group into a one-dimensional, row-major, tiling of subgroups.
 *
 * A total of (size(parent)/tilesz) subgroups will be created,
 * therefore the parent group size must be evenly divisible by the tilesz.
 * The allowed parent groups are thread_block or thread_block_tile<size>.
 *
 * The implementation may cause the calling thread to wait until all the members
 * of the parent group have invoked the operation before resuming execution.
 *
 * Functionality is limited to native hardware sizes, 1/2/4/8/16/32.
 * The size(parent) must be greater than the template Size parameter,
 * otherwise the results are undefined.
 */
template <unsigned int Size, typename ParentT>
_CG_QUALIFIER thread_block_tile<Size> tiled_partition(const ParentT& g)
{
    return (__internal::tiled_partition_impl<Size, ParentT>(g));
}

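/*
 * Illustrative sketch (not part of the header): a tree reduction over a
 * statically sized tile using the warp-synchronous shuffles. The function name
 * is hypothetical.
 *
 *   __device__ int tile_reduce_sum(cooperative_groups::thread_block_tile<32> tile, int val)
 *   {
 *       // After each step the partial sum of 2*offset neighbouring lanes lives
 *       // in the lower lane; lane 0 ends up holding the full sum.
 *       for (int offset = tile.size() / 2; offset > 0; offset /= 2)
 *           val += tile.shfl_down(val, offset);
 *       return val;                       // only meaningful on thread_rank() == 0
 *   }
 *
 *   // Typical call site, inside a kernel:
 *   //   auto tile = cooperative_groups::tiled_partition<32>(cooperative_groups::this_thread_block());
 *   //   int warp_sum = tile_reduce_sum(tile, my_value);
 */
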
_CG_END_NAMESPACE

# endif /* ! (__cplusplus, __CUDACC__) */

#endif /* !_COOPERATIVE_GROUPS_H_ */