/*
 * Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
 *
 * NOTICE TO LICENSEE:
 *
 * This source code and/or documentation ("Licensed Deliverables") are
 * subject to NVIDIA intellectual property rights under U.S. and
 * international Copyright laws.
 *
 * These Licensed Deliverables contained herein is PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 * conditions of a form of NVIDIA software license agreement by and
 * between NVIDIA and Licensee ("License Agreement") or electronically
 * accepted by Licensee. Notwithstanding any terms or conditions to
 * the contrary in the License Agreement, reproduction or disclosure
 * of the Licensed Deliverables to any third party without the express
 * written consent of NVIDIA is prohibited.
 *
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users. These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */

#ifndef _COOPERATIVE_GROUPS_H_
# define _COOPERATIVE_GROUPS_H_

#if defined(__cplusplus) && defined(__CUDACC__)

# include "cooperative_groups_helpers.h"

_CG_BEGIN_NAMESPACE
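
/*
 * Usage overview (illustrative sketch only; the kernel below is hypothetical
 * and not part of this header). The cooperative_groups namespace is opened by
 * _CG_BEGIN_NAMESPACE above; a typical kernel obtains a group handle, queries
 * its rank, and synchronizes on it:
 *
 *     #include <cooperative_groups.h>
 *     using namespace cooperative_groups;
 *
 *     __global__ void example_kernel(int *data) {      // hypothetical kernel
 *         thread_block block = this_thread_block();    // all threads of this block
 *         // ... each thread produces its part of the block's data ...
 *         block.sync();                                // barrier across the block
 *         if (block.thread_rank() == 0) {
 *             // one thread per block consumes the result
 *         }
 *     }
 */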

/**
 * class thread_group;
 *
 * Generic thread group type, into which all groups are convertible.
 * It acts as a container for all storage necessary for the derived groups,
 * and will dispatch the API calls to the correct derived group. This means
 * that all derived groups must implement the same interface as thread_group.
 */
class thread_group
{
    friend _CG_QUALIFIER thread_group this_thread();
    friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
    friend class thread_block;

protected:
    union __align__(8) {
        unsigned int type : 8;
        struct {
            unsigned int type : 8;
            unsigned int size : 24;
            unsigned int mask;
        } coalesced;
        struct {
            void* ptr[2];
        } buffer;
    } _data;

    _CG_QUALIFIER thread_group operator=(const thread_group& src);

    _CG_QUALIFIER thread_group(__internal::groupType type) {
        _data.type = type;
    }

#if __cplusplus >= 201103L
    static_assert(sizeof(_data) == 16, "Failed size check");
#endif

public:
    _CG_QUALIFIER unsigned int size() const;
    _CG_QUALIFIER unsigned int thread_rank() const;
    _CG_QUALIFIER void sync() const;
};

/**
 * thread_group this_thread()
 *
 * Constructs a generic thread_group containing only the calling thread.
 */
_CG_QUALIFIER thread_group this_thread()
{
    thread_group g = thread_group(__internal::Coalesced);
    g._data.coalesced.mask = __internal::lanemask32_eq();
    g._data.coalesced.size = 1;
    return (g);
}

#if defined(_CG_HAS_MULTI_GRID_GROUP)

/**
 * class multi_grid_group;
 *
 * Threads within this group are guaranteed to be co-resident on the same
 * system, across the multiple devices participating in the same kernel launch.
 * To use this group, the kernel must have been launched with
 * cuLaunchCooperativeKernelMultiDevice (or the CUDA Runtime equivalent),
 * and the device must support it (queryable device attribute).
 *
 * Constructed via this_multi_grid();
 */
class multi_grid_group
{
    friend _CG_QUALIFIER multi_grid_group this_multi_grid();

    struct __align__(8) {
        unsigned long long handle;
        unsigned int size;
        unsigned int rank;
    } _data;

#if __cplusplus >= 201103L
    static_assert(sizeof(_data) == 16, "Failed size check");
#endif

public:
    _CG_QUALIFIER multi_grid_group() {
        _data.handle = __internal::multi_grid::get_intrinsic_handle();
        _data.size = __internal::multi_grid::size(_data.handle);
        _data.rank = __internal::multi_grid::thread_rank(_data.handle);
    }

    _CG_QUALIFIER bool is_valid() const { return (_data.handle != 0); }
    _CG_QUALIFIER void sync() const { _CG_ASSERT(is_valid()); __internal::multi_grid::sync(_data.handle); }
    _CG_QUALIFIER unsigned int size() const { _CG_ASSERT(is_valid()); return (_data.size); }
    _CG_QUALIFIER unsigned int thread_rank() const { _CG_ASSERT(is_valid()); return (_data.rank); }
    _CG_QUALIFIER unsigned int grid_rank() const { _CG_ASSERT(is_valid()); return (__internal::multi_grid::grid_rank(_data.handle)); }
    _CG_QUALIFIER unsigned int num_grids() const { _CG_ASSERT(is_valid()); return (__internal::multi_grid::num_grids(_data.handle)); }
};

/**
 * multi_grid_group this_multi_grid()
 *
 * Constructs a multi_grid_group.
 */
_CG_QUALIFIER multi_grid_group this_multi_grid()
{
    return (multi_grid_group());
}

#endif
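
/*
 * Example (illustrative sketch): synchronizing across every device taking part
 * in a multi-device cooperative launch. The kernel and its launch setup are
 * hypothetical; this_multi_grid() is only valid when the kernel was launched
 * with cuLaunchCooperativeKernelMultiDevice (or the CUDA Runtime equivalent).
 *
 *     __global__ void multi_device_kernel() {          // hypothetical kernel
 *         multi_grid_group multi = this_multi_grid();
 *         // ... each grid writes its partial result ...
 *         multi.sync();                                 // all threads on all participating devices
 *         if (multi.thread_rank() == 0) {
 *             // exactly one thread across the whole multi-device launch runs here
 *         }
 *     }
 */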

#if defined(_CG_HAS_GRID_GROUP)

/**
 * class grid_group;
 *
 * Threads within this group are guaranteed to be co-resident on the
 * same device within the same launched kernel. To use this group, the kernel
 * must have been launched with cuLaunchCooperativeKernel (or the CUDA Runtime
 * equivalent), and the device must support it (queryable device attribute).
 *
 * Constructed via this_grid();
 */
class grid_group
{
    friend _CG_QUALIFIER grid_group this_grid();

    struct __align__(8) {
        unsigned long long handle;
        unsigned int size;
        unsigned int rank;
    } _data;

#if __cplusplus >= 201103L
    static_assert(sizeof(_data) == 16, "Failed size check");
#endif

public:
    _CG_QUALIFIER grid_group() {
        _data.handle = (__internal::grid::get_intrinsic_handle());
        _data.size = __internal::grid::size(_data.handle);
        _data.rank = __internal::grid::thread_rank(_data.handle);
    }

    _CG_QUALIFIER bool is_valid() const { return (_data.handle != 0); }
    _CG_QUALIFIER void sync() const { _CG_ASSERT(is_valid()); __internal::grid::sync(_data.handle); }
    _CG_QUALIFIER unsigned int size() const { _CG_ASSERT(is_valid()); return (_data.size); }
    _CG_QUALIFIER unsigned int thread_rank() const { _CG_ASSERT(is_valid()); return (_data.rank); }
    _CG_QUALIFIER dim3 group_dim() const { _CG_ASSERT(is_valid()); return (__internal::grid::grid_dim()); }
};

/**
 * grid_group this_grid()
 *
 * Constructs a grid_group.
 */
_CG_QUALIFIER grid_group this_grid()
{
    return (grid_group());
}

#endif

/**
 * class thread_block
 *
 * Every GPU kernel is executed by a grid of thread blocks, and threads within
 * each block are guaranteed to reside on the same streaming multiprocessor.
 * A thread_block represents a thread block whose dimensions are not known until runtime.
 *
 * Constructed via this_thread_block();
 */
class thread_block : public thread_group
{
    friend _CG_QUALIFIER thread_block this_thread_block();
    friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
    friend _CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz);

    _CG_QUALIFIER thread_block() : thread_group(__internal::ThreadBlock) {
    }

    // Internal Use
    _CG_QUALIFIER thread_group _get_tiled_threads(unsigned int tilesz) const {
        const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);

        // Invalid, immediately fail
        if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) {
            __internal::abort();
            return (thread_block());
        }

        unsigned int mask;
        unsigned int base_offset = thread_rank() & (~(tilesz - 1));
        unsigned int masklength = min(size() - base_offset, tilesz);

        mask = (unsigned int)(-1) >> (32 - masklength);
        mask <<= (__internal::laneid() & ~(tilesz - 1));
        thread_group tile = thread_group(__internal::CoalescedTile);
        tile._data.coalesced.mask = mask;
        tile._data.coalesced.size = __popc(mask);
        return (tile);
    }

public:
    _CG_QUALIFIER void sync() const { __internal::cta::sync(); }
    _CG_QUALIFIER unsigned int size() const { return (__internal::cta::size()); }
    _CG_QUALIFIER unsigned int thread_rank() const { return (__internal::cta::thread_rank()); }

    // Additional functionality exposed by the group
    _CG_QUALIFIER dim3 group_index() const { return (__internal::cta::group_index()); }
    _CG_QUALIFIER dim3 thread_index() const { return (__internal::cta::thread_index()); }
    _CG_QUALIFIER dim3 group_dim() const { return (__internal::cta::block_dim()); }
};

/**
 * thread_block this_thread_block()
 *
 * Constructs a thread_block group.
 */
_CG_QUALIFIER thread_block this_thread_block()
{
    return (thread_block());
}
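
/*
 * Example (illustrative sketch): a device-wide barrier between two phases of a
 * cooperatively launched kernel. The kernel and its parameters are hypothetical;
 * this_grid() is only valid when the kernel was launched through
 * cuLaunchCooperativeKernel (or the CUDA Runtime equivalent).
 *
 *     __global__ void two_phase_kernel(float *buf) {   // hypothetical kernel
 *         thread_block block = this_thread_block();
 *         grid_group   grid  = this_grid();
 *
 *         // Phase 1: every block writes its portion of buf ...
 *         block.sync();    // intra-block barrier
 *         grid.sync();     // wait for every thread of every block in the grid
 *         // Phase 2: safely read data produced by other blocks ...
 *     }
 */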

/**
 * class coalesced_group
 *
 * A group representing the current set of converged threads in a warp.
 * The size of the group is not guaranteed and it may return a group of
 * only one thread (itself).
 *
 * This group exposes warp-synchronous builtins.
 * Constructed via coalesced_threads();
 */
class coalesced_group : public thread_group
{
    friend _CG_QUALIFIER coalesced_group coalesced_threads();
    friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
    friend _CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz);

    _CG_QUALIFIER unsigned int _packLanes(unsigned laneMask) const {
        unsigned int member_pack = 0;
        unsigned int member_rank = 0;
        for (int bit_idx = 0; bit_idx < 32; bit_idx++) {
            unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx);
            if (lane_bit) {
                if (laneMask & lane_bit)
                    member_pack |= 1 << member_rank;
                member_rank++;
            }
        }
        return (member_pack);
    }

    // Internal Use
    _CG_QUALIFIER coalesced_group _get_tiled_threads(unsigned int tilesz) const {
        const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);

        // Invalid, immediately fail
        if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) {
            __internal::abort();
            return (coalesced_group(0));
        }
        if (size() <= tilesz) {
            return (*this);
        }

        if ((_data.type == __internal::CoalescedTile) && pow2_tilesz) {
            unsigned int base_offset = (thread_rank() & (~(tilesz - 1)));
            unsigned int masklength = min(size() - base_offset, tilesz);
            unsigned int mask = (unsigned int)(-1) >> (32 - masklength);

            mask <<= (__internal::laneid() & ~(tilesz - 1));
            coalesced_group coalesced_tile = coalesced_group(mask);
            coalesced_tile._data.type = __internal::CoalescedTile;
            return (coalesced_tile);
        }
        else if ((_data.type == __internal::Coalesced) && pow2_tilesz) {
            unsigned int mask = 0;
            unsigned int member_rank = 0;
            int seen_lanes = (thread_rank() / tilesz) * tilesz;
            for (unsigned int bit_idx = 0; bit_idx < 32; bit_idx++) {
                unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx);
                if (lane_bit) {
                    if (seen_lanes <= 0 && member_rank < tilesz) {
                        mask |= lane_bit;
                        member_rank++;
                    }
                    seen_lanes--;
                }
            }
            return (coalesced_group(mask));
        }
        else {
            // None in _CG_VERSION 1000
            __internal::abort();
        }
        return (coalesced_group(0));
    }

protected:
    // Construct a group from scratch (coalesced_threads)
    _CG_QUALIFIER coalesced_group(unsigned int mask) : thread_group(__internal::Coalesced) {
        _data.coalesced.mask = mask;
        _data.coalesced.size = __popc(mask);
    }

public:
    _CG_QUALIFIER unsigned int size() const { return (_data.coalesced.size); }
    _CG_QUALIFIER unsigned int thread_rank() const { return (__popc(_data.coalesced.mask & __internal::lanemask32_lt())); }
    _CG_QUALIFIER void sync() const { __syncwarp(_data.coalesced.mask); }
#define COALESCED_SHFL_FUNCTION(type) \
    _CG_QUALIFIER type shfl(type var, unsigned int src_rank) const { \
        unsigned int lane = (src_rank == 0) ? __ffs(_data.coalesced.mask) - 1 : \
            (size() == 32) ? src_rank : __fns(_data.coalesced.mask, 0, (src_rank + 1)); \
        return (__shfl_sync(_data.coalesced.mask, var, lane, 32)); \
    }

#define COALESCED_SHFL_UP_FUNCTION(type) \
    _CG_QUALIFIER type shfl_up(type var, int delta) const { \
        if (size() == 32) { \
            return (__shfl_up_sync(0xFFFFFFFF, var, delta, 32)); \
        } \
        unsigned lane = __fns(_data.coalesced.mask, __internal::laneid(), -(delta + 1)); \
        if (lane >= 32) lane = __internal::laneid(); \
        return (__shfl_sync(_data.coalesced.mask, var, lane, 32)); \
    }

#define COALESCED_SHFL_DOWN_FUNCTION(type) \
    _CG_QUALIFIER type shfl_down(type var, int delta) const { \
        if (size() == 32) { \
            return (__shfl_down_sync(0xFFFFFFFF, var, delta, 32)); \
        } \
        unsigned int lane = __fns(_data.coalesced.mask, __internal::laneid(), delta + 1); \
        if (lane >= 32) lane = __internal::laneid(); \
        return (__shfl_sync(_data.coalesced.mask, var, lane, 32)); \
    }

    COALESCED_SHFL_FUNCTION(int);
    COALESCED_SHFL_FUNCTION(unsigned int);
    COALESCED_SHFL_FUNCTION(long);
    COALESCED_SHFL_FUNCTION(unsigned long);
    COALESCED_SHFL_FUNCTION(long long);
    COALESCED_SHFL_FUNCTION(unsigned long long);
    COALESCED_SHFL_FUNCTION(float);
    COALESCED_SHFL_FUNCTION(double);

    COALESCED_SHFL_UP_FUNCTION(int);
    COALESCED_SHFL_UP_FUNCTION(unsigned int);
    COALESCED_SHFL_UP_FUNCTION(long);
    COALESCED_SHFL_UP_FUNCTION(unsigned long);
    COALESCED_SHFL_UP_FUNCTION(long long);
    COALESCED_SHFL_UP_FUNCTION(unsigned long long);
    COALESCED_SHFL_UP_FUNCTION(float);
    COALESCED_SHFL_UP_FUNCTION(double);

    COALESCED_SHFL_DOWN_FUNCTION(int);
    COALESCED_SHFL_DOWN_FUNCTION(unsigned int);
    COALESCED_SHFL_DOWN_FUNCTION(long);
    COALESCED_SHFL_DOWN_FUNCTION(unsigned long);
    COALESCED_SHFL_DOWN_FUNCTION(long long);
    COALESCED_SHFL_DOWN_FUNCTION(unsigned long long);
    COALESCED_SHFL_DOWN_FUNCTION(float);
    COALESCED_SHFL_DOWN_FUNCTION(double);

# ifdef _CG_HAS_FP16_COLLECTIVE
    COALESCED_SHFL_FUNCTION(__half);
    COALESCED_SHFL_UP_FUNCTION(__half);
    COALESCED_SHFL_DOWN_FUNCTION(__half);

    COALESCED_SHFL_FUNCTION(__half2);
    COALESCED_SHFL_UP_FUNCTION(__half2);
    COALESCED_SHFL_DOWN_FUNCTION(__half2);
# endif

#undef COALESCED_SHFL_FUNCTION
#undef COALESCED_SHFL_UP_FUNCTION
#undef COALESCED_SHFL_DOWN_FUNCTION

    _CG_QUALIFIER int any(int predicate) const {
        return (__ballot_sync(_data.coalesced.mask, predicate) != 0);
    }
    _CG_QUALIFIER int all(int predicate) const {
        return (__ballot_sync(_data.coalesced.mask, predicate) == _data.coalesced.mask);
    }
    _CG_QUALIFIER unsigned int ballot(int predicate) const {
        if (size() == 32) {
            return (__ballot_sync(0xFFFFFFFF, predicate));
        }
        unsigned int lane_ballot = __ballot_sync(_data.coalesced.mask, predicate);
        return (_packLanes(lane_ballot));
    }
#ifdef _CG_HAS_MATCH_COLLECTIVE

# define COALESCED_MATCH_ANY_FUNCTION(type) \
    _CG_QUALIFIER unsigned int match_any(type val) const { \
        if (size() == 32) { \
            return (__match_any_sync(0xFFFFFFFF, val)); \
        } \
        unsigned int lane_match = __match_any_sync(_data.coalesced.mask, val); \
        return (_packLanes(lane_match)); \
    }

# define COALESCED_MATCH_ALL_FUNCTION(type) \
    _CG_QUALIFIER unsigned int match_all(type val, int &pred) const { \
        if (size() == 32) { \
            return (__match_all_sync(0xFFFFFFFF, val, &pred)); \
        } \
        unsigned int lane_match = __match_all_sync(_data.coalesced.mask, val, &pred); \
        return (_packLanes(lane_match)); \
    }

    COALESCED_MATCH_ANY_FUNCTION(int);
    COALESCED_MATCH_ANY_FUNCTION(unsigned int);
    COALESCED_MATCH_ANY_FUNCTION(long);
    COALESCED_MATCH_ANY_FUNCTION(unsigned long);
    COALESCED_MATCH_ANY_FUNCTION(long long);
    COALESCED_MATCH_ANY_FUNCTION(unsigned long long);
    COALESCED_MATCH_ANY_FUNCTION(float);
    COALESCED_MATCH_ANY_FUNCTION(double);

    COALESCED_MATCH_ALL_FUNCTION(int);
    COALESCED_MATCH_ALL_FUNCTION(unsigned int);
    COALESCED_MATCH_ALL_FUNCTION(long);
    COALESCED_MATCH_ALL_FUNCTION(unsigned long);
    COALESCED_MATCH_ALL_FUNCTION(long long);
    COALESCED_MATCH_ALL_FUNCTION(unsigned long long);
    COALESCED_MATCH_ALL_FUNCTION(float);
    COALESCED_MATCH_ALL_FUNCTION(double);

# undef COALESCED_MATCH_ANY_FUNCTION
# undef COALESCED_MATCH_ALL_FUNCTION

#endif /* !_CG_HAS_MATCH_COLLECTIVE */
};

_CG_QUALIFIER coalesced_group coalesced_threads()
{
    return (coalesced_group(__activemask()));
}
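
/*
 * Example (illustrative sketch): opportunistic leader election inside a
 * divergent branch, using coalesced_threads() to aggregate an atomic
 * increment. The kernel, its parameters, and the filter condition are
 * hypothetical.
 *
 *     __global__ void filter_kernel(int *counter, const int *in, int *out, int n) {
 *         int i = blockIdx.x * blockDim.x + threadIdx.x;
 *         if (i < n && in[i] > 0) {                        // divergent branch
 *             coalesced_group active = coalesced_threads();
 *             int base = 0;
 *             if (active.thread_rank() == 0)               // one leader per coalesced group
 *                 base = atomicAdd(counter, active.size());
 *             base = active.shfl(base, 0);                 // broadcast the leader's offset
 *             out[base + active.thread_rank()] = in[i];    // compacted write
 *         }
 *     }
 */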

template <unsigned int Size>
class __thread_block_tile_base : public thread_group
{
    static const unsigned int numThreads = Size;

    _CG_QUALIFIER unsigned int build_mask() const {
        unsigned int mask;
        if (numThreads == 32) {
            mask = 0xFFFFFFFF;
        }
        else {
            mask = (unsigned int)(-1) >> (32 - numThreads);
            mask <<= (__internal::laneid() & (~(numThreads - 1)));
        }
        return (mask);
    }

protected:
    _CG_QUALIFIER __thread_block_tile_base() : thread_group(__internal::CoalescedTile) {
        _data.coalesced.mask = build_mask();
        _data.coalesced.size = numThreads;
    }

public:
    _CG_QUALIFIER void sync() const { __syncwarp(build_mask()); }
    _CG_QUALIFIER unsigned int thread_rank() const { return (__internal::laneid() & (numThreads - 1)); }
    _CG_QUALIFIER unsigned int size() const { return (numThreads); }

    // PTX supported collectives
    _CG_QUALIFIER int shfl(int var, int srcRank) const { return (__shfl_sync(build_mask(), var, srcRank, numThreads)); }
    _CG_QUALIFIER int shfl_down(int var, unsigned int delta) const { return (__shfl_down_sync(build_mask(), var, delta, numThreads)); }
    _CG_QUALIFIER int shfl_up(int var, unsigned int delta) const { return (__shfl_up_sync(build_mask(), var, delta, numThreads)); }
    _CG_QUALIFIER int shfl_xor(int var, unsigned int laneMask) const { return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads)); }
    _CG_QUALIFIER unsigned int shfl(unsigned int var, int srcRank) const { return (__shfl_sync(build_mask(), var, srcRank, numThreads)); }
    _CG_QUALIFIER unsigned int shfl_down(unsigned int var, unsigned int delta) const { return (__shfl_down_sync(build_mask(), var, delta, numThreads)); }
    _CG_QUALIFIER unsigned int shfl_up(unsigned int var, unsigned int delta) const { return (__shfl_up_sync(build_mask(), var, delta, numThreads)); }
    _CG_QUALIFIER unsigned int shfl_xor(unsigned int var, unsigned int laneMask) const { return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads)); }
    _CG_QUALIFIER long shfl(long var, int srcRank) const { return (__shfl_sync(build_mask(), var, srcRank, numThreads)); }
    _CG_QUALIFIER long shfl_down(long var, unsigned int delta) const { return (__shfl_down_sync(build_mask(), var, delta, numThreads)); }
    _CG_QUALIFIER long shfl_up(long var, unsigned int delta) const { return (__shfl_up_sync(build_mask(), var, delta, numThreads)); }
    _CG_QUALIFIER long shfl_xor(long var, unsigned int laneMask) const { return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads)); }
    _CG_QUALIFIER unsigned long shfl(unsigned long var, int srcRank) const { return (__shfl_sync(build_mask(), var, srcRank, numThreads)); }
    _CG_QUALIFIER unsigned long shfl_down(unsigned long var, unsigned int delta) const { return (__shfl_down_sync(build_mask(), var, delta, numThreads)); }
    _CG_QUALIFIER unsigned long shfl_up(unsigned long var, unsigned int delta) const { return (__shfl_up_sync(build_mask(), var, delta, numThreads)); }
    _CG_QUALIFIER unsigned long shfl_xor(unsigned long var, unsigned int laneMask) const { return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads)); }
    _CG_QUALIFIER long long shfl(long long var, int srcRank) const { return (__shfl_sync(build_mask(), var, srcRank, numThreads)); }
    _CG_QUALIFIER long long shfl_down(long long var, unsigned int delta) const { return (__shfl_down_sync(build_mask(), var, delta, numThreads)); }
    _CG_QUALIFIER long long shfl_up(long long var, unsigned int delta) const { return (__shfl_up_sync(build_mask(), var, delta, numThreads)); }
    _CG_QUALIFIER long long shfl_xor(long long var, unsigned int laneMask) const { return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads)); }
    _CG_QUALIFIER unsigned long long shfl(unsigned long long var, int srcRank) const { return (__shfl_sync(build_mask(), var, srcRank, numThreads)); }
    _CG_QUALIFIER unsigned long long shfl_down(unsigned long long var, unsigned int delta) const { return (__shfl_down_sync(build_mask(), var, delta, numThreads)); }
    _CG_QUALIFIER unsigned long long shfl_up(unsigned long long var, unsigned int delta) const { return (__shfl_up_sync(build_mask(), var, delta, numThreads)); }
    _CG_QUALIFIER unsigned long long shfl_xor(unsigned long long var, unsigned int laneMask) const { return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads)); }
    _CG_QUALIFIER float shfl(float var, int srcRank) const { return (__shfl_sync(build_mask(), var, srcRank, numThreads)); }
    _CG_QUALIFIER float shfl_down(float var, unsigned int delta) const { return (__shfl_down_sync(build_mask(), var, delta, numThreads)); }
    _CG_QUALIFIER float shfl_up(float var, unsigned int delta) const { return (__shfl_up_sync(build_mask(), var, delta, numThreads)); }
    _CG_QUALIFIER float shfl_xor(float var, unsigned int laneMask) const { return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads)); }
    _CG_QUALIFIER double shfl(double var, int srcRank) const { return (__shfl_sync(build_mask(), var, srcRank, numThreads)); }
    _CG_QUALIFIER double shfl_down(double var, unsigned int delta) const { return (__shfl_down_sync(build_mask(), var, delta, numThreads)); }
    _CG_QUALIFIER double shfl_up(double var, unsigned int delta) const { return (__shfl_up_sync(build_mask(), var, delta, numThreads)); }
    _CG_QUALIFIER double shfl_xor(double var, unsigned int laneMask) const { return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads)); }

    _CG_QUALIFIER int any(int predicate) const {
        unsigned int lane_ballot = build_mask() & __ballot_sync(build_mask(), predicate);
        return (lane_ballot != 0);
    }
    _CG_QUALIFIER int all(int predicate) const {
        unsigned int lane_ballot = build_mask() & __ballot_sync(build_mask(), predicate);
        return (lane_ballot == build_mask());
    }
    _CG_QUALIFIER unsigned int ballot(int predicate) const {
        unsigned int lane_ballot = build_mask() & __ballot_sync(build_mask(), predicate);
        return (lane_ballot >> (__internal::laneid() & (~(numThreads - 1))));
    }

#ifdef _CG_HAS_FP16_COLLECTIVE
    _CG_QUALIFIER __half shfl(__half var, int srcRank) const { return (__shfl_sync(build_mask(), var, srcRank, numThreads)); }
    _CG_QUALIFIER __half shfl_down(__half var, unsigned int delta) const { return (__shfl_down_sync(build_mask(), var, delta, numThreads)); }
    _CG_QUALIFIER __half shfl_up(__half var, unsigned int delta) const { return (__shfl_up_sync(build_mask(), var, delta, numThreads)); }
    _CG_QUALIFIER __half shfl_xor(__half var, unsigned int laneMask) const { return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads)); }
    _CG_QUALIFIER __half2 shfl(__half2 var, int srcRank) const { return (__shfl_sync(build_mask(), var, srcRank, numThreads)); }
    _CG_QUALIFIER __half2 shfl_down(__half2 var, unsigned int delta) const { return (__shfl_down_sync(build_mask(), var, delta, numThreads)); }
    _CG_QUALIFIER __half2 shfl_up(__half2 var, unsigned int delta) const { return (__shfl_up_sync(build_mask(), var, delta, numThreads)); }
    _CG_QUALIFIER __half2 shfl_xor(__half2 var, unsigned int laneMask) const { return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads)); }
#endif
#ifdef _CG_HAS_MATCH_COLLECTIVE
    _CG_QUALIFIER unsigned int match_any(int val) const {
        unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_any(unsigned int val) const {
        unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_any(long val) const {
        unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_any(unsigned long val) const {
        unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_any(long long val) const {
        unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_any(unsigned long long val) const {
        unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_any(float val) const {
        unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_any(double val) const {
        unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }

    _CG_QUALIFIER unsigned int match_all(int val, int &pred) const {
        unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_all(unsigned int val, int &pred) const {
        unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_all(long val, int &pred) const {
        unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_all(unsigned long val, int &pred) const {
        unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_all(long long val, int &pred) const {
        unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_all(unsigned long long val, int &pred) const {
        unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_all(float val, int &pred) const {
        unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    _CG_QUALIFIER unsigned int match_all(double val, int &pred) const {
        unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
#endif
};

/**
 * class thread_block_tile
 *
 * Statically-sized group type, representing one tile of a thread block.
 * The only specializations currently supported are those with native
 * hardware support (1/2/4/8/16/32).
 *
 * This group exposes warp-synchronous builtins.
 * Constructed via tiled_partition(class thread_block);
 */
template <unsigned int Size>
class thread_block_tile;

template <> class thread_block_tile<32> : public __thread_block_tile_base<32> { };
template <> class thread_block_tile<16> : public __thread_block_tile_base<16> { };
template <> class thread_block_tile<8>  : public __thread_block_tile_base<8>  { };
template <> class thread_block_tile<4>  : public __thread_block_tile_base<4>  { };
template <> class thread_block_tile<2>  : public __thread_block_tile_base<2>  { };
template <> class thread_block_tile<1>  : public __thread_block_tile_base<1>  { };

/**
 * Outer level API calls
 * void sync(GroupT) - see <group_type>.sync()
 * unsigned int thread_rank(GroupT) - see <group_type>.thread_rank()
 * unsigned int group_size(GroupT) - see <group_type>.size()
 */
template <class GroupT>
_CG_QUALIFIER void sync(GroupT const &g)
{
    g.sync();
}

template <class GroupT>
_CG_QUALIFIER unsigned int thread_rank(GroupT const& g)
{
    return (g.thread_rank());
}

template <class GroupT>
_CG_QUALIFIER unsigned int group_size(GroupT const &g)
{
    return (g.size());
}

/**
 * .sync()
 *
 * Executes a barrier across the group.
 *
 * Implements both a compiler fence and an architectural fence to prevent
 * memory reordering around the barrier.
 */
_CG_QUALIFIER void thread_group::sync() const
{
    if (_data.type == __internal::Coalesced || _data.type == __internal::CoalescedTile) {
        static_cast<const coalesced_group*>(this)->sync();
    }
    else {
        static_cast<const thread_block*>(this)->sync();
    }
}

/**
 * .size()
 *
 * Returns the total number of threads in the group.
 */
_CG_QUALIFIER unsigned int thread_group::size() const
{
    if (_data.type == __internal::Coalesced || _data.type == __internal::CoalescedTile) {
        return (static_cast<const coalesced_group*>(this)->size());
    }
    else {
        return (static_cast<const thread_block*>(this)->size());
    }
}

/**
 * .thread_rank()
 *
 * Returns the linearized rank of the calling thread within the interval [0, size()).
 */
_CG_QUALIFIER unsigned int thread_group::thread_rank() const
{
    if (_data.type == __internal::Coalesced || _data.type == __internal::CoalescedTile) {
        return (static_cast<const coalesced_group*>(this)->thread_rank());
    }
    else {
        return (static_cast<const thread_block*>(this)->thread_rank());
    }
}
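
/*
 * Example (illustrative sketch): a warp-tile reduction using the statically
 * sized thread_block_tile together with the free functions above. The static
 * tiled_partition<Size>() used here is declared further down in this header;
 * the reduce routine, the kernel, and their parameters are hypothetical.
 *
 *     __device__ int tile_reduce_sum(thread_block_tile<32> tile, int val) {
 *         // Tree reduction over the 32-thread tile
 *         for (int offset = tile.size() / 2; offset > 0; offset /= 2)
 *             val += tile.shfl_down(val, offset);
 *         return val;                                  // rank 0 holds the tile's sum
 *     }
 *
 *     __global__ void reduce_kernel(const int *in, int *out) {
 *         thread_block block = this_thread_block();
 *         thread_block_tile<32> tile = tiled_partition<32>(block);
 *         int sum = tile_reduce_sum(tile, in[blockIdx.x * block.size() + block.thread_rank()]);
 *         if (tile.thread_rank() == 0)
 *             atomicAdd(out, sum);
 *     }
 */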

/**
 * tiled_partition
 *
 * The tiled_partition(parent, tilesz) method is a collective operation that
 * partitions the parent group into a one-dimensional, row-major, tiling of subgroups.
 *
 * A total of ((size(parent)+tilesz-1)/tilesz) subgroups will
 * be created where threads having identical k = (thread_rank(parent)/tilesz)
 * will be members of the same subgroup.
 *
 * The implementation may cause the calling thread to wait until all the members
 * of the parent group have invoked the operation before resuming execution.
 *
 * Functionality is limited to power-of-two sized subgroup instances of at most
 * 32 threads. Only thread_block, thread_block_tile<>, and their subgroups can be
 * tiled_partition() in _CG_VERSION 1000.
 */
_CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz)
{
    if (parent._data.type == __internal::Coalesced || parent._data.type == __internal::CoalescedTile) {
        return (static_cast<const coalesced_group&>(parent)._get_tiled_threads(tilesz));
    }
    else {
        return (static_cast<const thread_block&>(parent)._get_tiled_threads(tilesz));
    }
}

// Thread block type overload: returns a basic thread_group for now (may be specialized later)
_CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz)
{
    return (parent._get_tiled_threads(tilesz));
}

// Coalesced group type overload: retains its ability to stay coalesced
_CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz)
{
    return (parent._get_tiled_threads(tilesz));
}

namespace __internal {

    // For specializing on different tiled_partition template arguments
    template <unsigned int Size, typename ParentT>
    struct tiled_partition_impl;

    template <unsigned int Size>
    struct tiled_partition_impl<Size, thread_block> : public thread_block_tile<Size> {
        _CG_QUALIFIER tiled_partition_impl(thread_block const &) : thread_block_tile<Size>() {}
    };
    template <unsigned int Size>
    struct tiled_partition_impl<Size, thread_block_tile<32> > : public thread_block_tile<Size> {
        _CG_QUALIFIER tiled_partition_impl(thread_block_tile<32> const&) : thread_block_tile<Size>() {}
    };
    template <unsigned int Size>
    struct tiled_partition_impl<Size, thread_block_tile<16> > : public thread_block_tile<Size> {
        _CG_QUALIFIER tiled_partition_impl(thread_block_tile<16> const&) : thread_block_tile<Size>() {}
    };
    template <unsigned int Size>
    struct tiled_partition_impl<Size, thread_block_tile<8> > : public thread_block_tile<Size> {
        _CG_QUALIFIER tiled_partition_impl(thread_block_tile<8> const&) : thread_block_tile<Size>() {}
    };
    template <unsigned int Size>
    struct tiled_partition_impl<Size, thread_block_tile<4> > : public thread_block_tile<Size> {
        _CG_QUALIFIER tiled_partition_impl(thread_block_tile<4> const&) : thread_block_tile<Size>() {}
    };
    template <unsigned int Size>
    struct tiled_partition_impl<Size, thread_block_tile<2> > : public thread_block_tile<Size> {
        _CG_QUALIFIER tiled_partition_impl(thread_block_tile<2> const&) : thread_block_tile<Size>() {}
    };
    template <>
    struct tiled_partition_impl<1, thread_block_tile<1> > : public thread_block_tile<1> {
        _CG_QUALIFIER tiled_partition_impl(thread_block_tile<1> const&) : thread_block_tile<1>() {}
    };

} // namespace __internal

/**
 * tiled_partition<Size>
 *
 * The tiled_partition<Size>(parent) method is a collective operation that
 * partitions the parent group into a one-dimensional, row-major, tiling of subgroups.
 *
 * A total of (size(parent)/Size) subgroups will be created,
 * therefore the parent group size must be evenly divisible by Size.
 * The allowed parent groups are thread_block or thread_block_tile<>.
 *
 * The implementation may cause the calling thread to wait until all the members
 * of the parent group have invoked the operation before resuming execution.
 *
 * Functionality is limited to native hardware sizes, 1/2/4/8/16/32.
 * The size(parent) must be greater than the template Size parameter,
 * otherwise the results are undefined.
 */
template <unsigned int Size, typename ParentT>
_CG_QUALIFIER thread_block_tile<Size> tiled_partition(const ParentT& g)
{
    return (__internal::tiled_partition_impl<Size, ParentT>(g));
}

_CG_END_NAMESPACE

# endif /* ! (__cplusplus, __CUDACC__) */

#endif /* !_COOPERATIVE_GROUPS_H_ */