/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * cub::GridEvenShare is a descriptor utility for distributing input among CUDA
 * thread blocks in an "even-share" fashion.  Each thread block gets roughly the
 * same number of fixed-size work units (grains).
 */


#pragma once

#include "../util_namespace.cuh"
#include "../util_macro.cuh"
#include "grid_mapping.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


/**
 * \addtogroup GridModule
 * @{
 */


/**
 * \brief GridEvenShare is a descriptor utility for distributing input among
 * CUDA thread blocks in an "even-share" fashion.  Each thread block gets roughly
 * the same number of input tiles.
 *
 * \par Overview
 * Each thread block is assigned a consecutive sequence of input tiles.  To help
 * preserve alignment and to eliminate the overhead of guarded loads for all but
 * the last thread block, GridEvenShare assigns one of three different amounts of
 * work to a given thread block: "big", "normal", or "last".  The "big" workloads
 * are one scheduling grain larger than "normal".  The "last" work unit for the
 * last thread block may be partially full if the input is not an even multiple
 * of the scheduling grain size.
 *
 * \par
 * Before invoking a child grid, a parent thread will typically construct an
 * instance of GridEvenShare.  The instance can be passed to child thread blocks,
 * which can initialize their per-thread-block offsets using \p BlockInit().
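 *
 * \par Snippet
 * The snippet below is a minimal sketch of the intended usage pattern.  The
 * kernel \p ExampleKernel, its tile size, and the launch parameters
 * (\p num_items, \p block_threads) are hypothetical placeholders, not part of
 * this header:
 * \par
 * \code
 * #include <cub/cub.cuh>
 *
 * template <int TILE_ITEMS>
 * __global__ void ExampleKernel(cub::GridEvenShare<int> even_share)
 * {
 *     // Initialize this block's [block_offset, block_end) tile range using
 *     // the "raking" mapping of consecutive input tiles
 *     even_share.BlockInit<TILE_ITEMS, cub::GRID_MAPPING_RAKE>();
 *
 *     // ... consume input in [even_share.block_offset, even_share.block_end),
 *     // advancing by even_share.block_stride items per tile
 * }
 *
 * // Host code: carve num_items into 128-item tiles across at most 1024 blocks
 * cub::GridEvenShare<int> even_share;
 * even_share.DispatchInit(num_items, 1024, 128);
 * ExampleKernel<128><<<even_share.grid_size, block_threads>>>(even_share);
 * \endcode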
 */
template <typename OffsetT>
struct GridEvenShare
{
private:

    OffsetT     total_tiles;            ///< Total number of input tiles
    int         big_shares;             ///< Number of thread blocks receiving one extra grain
    OffsetT     big_share_items;        ///< Number of input items in a "big" share
    OffsetT     normal_share_items;     ///< Number of input items in a "normal" share
    OffsetT     normal_base_offset;     ///< Offset into input at which the "normal" shares begin

public:

    /// Total number of input items
    OffsetT     num_items;

    /// Grid size in thread blocks
    int         grid_size;

    /// OffsetT into input marking the beginning of the owning thread block's segment of input tiles
    OffsetT     block_offset;

    /// OffsetT into input marking the end (one-past) of the owning thread block's segment of input tiles
    OffsetT     block_end;

    /// Stride between input tiles
    OffsetT     block_stride;


    /**
     * \brief Constructor.
     */
    __host__ __device__ __forceinline__ GridEvenShare() :
        total_tiles(0),
        big_shares(0),
        big_share_items(0),
        normal_share_items(0),
        normal_base_offset(0),
        num_items(0),
        grid_size(0),
        block_offset(0),
        block_end(0),
        block_stride(0)
    {}


    /**
     * \brief Dispatch initializer.  To be called prior to kernel launch.
     */
    __host__ __device__ __forceinline__ void DispatchInit(
        OffsetT num_items,          ///< Total number of input items
        int     max_grid_size,      ///< Maximum grid size allowable (actual grid size may be less if not warranted by the number of input items)
        int     tile_items)         ///< Number of data items per input tile
    {
        this->block_offset          = num_items;    // Initialize past-the-end
        this->block_end             = num_items;    // Initialize past-the-end
        this->num_items             = num_items;
        this->total_tiles           = (num_items + tile_items - 1) / tile_items;
        this->grid_size             = CUB_MIN(total_tiles, max_grid_size);
        OffsetT avg_tiles_per_block = total_tiles / grid_size;
        this->big_shares            = total_tiles - (avg_tiles_per_block * grid_size);  // Leftover grains go to big blocks
        this->normal_share_items    = avg_tiles_per_block * tile_items;
        this->normal_base_offset    = big_shares * tile_items;
        this->big_share_items       = normal_share_items + tile_items;
    }
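
    // Illustrative example of the partitioning arithmetic above (the values
    // are hypothetical): with num_items = 1000, tile_items = 128, and
    // max_grid_size = 3, DispatchInit computes total_tiles = 8, grid_size = 3,
    // avg_tiles_per_block = 2, and big_shares = 8 - (2 * 3) = 2.  Under the
    // "raking" mapping below, blocks 0 and 1 each receive a big share of 3
    // tiles ([0, 384) and [384, 768)), while block 2 receives a normal share
    // of 2 tiles, clamped to [768, 1000) because the final tile is only
    // partially full.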

    /**
     * \brief Initializes ranges for the specified thread block index.  Specialized
     * for a "raking" access pattern in which each thread block is assigned a
     * consecutive sequence of input tiles.
     */
    template <int TILE_ITEMS>
    __device__ __forceinline__ void BlockInit(
        int block_id,
        Int2Type<GRID_MAPPING_RAKE> /*strategy_tag*/)
    {
        block_stride = TILE_ITEMS;
        if (block_id < big_shares)
        {
            // This thread block gets a big share of grains (avg_tiles_per_block + 1)
            block_offset = (block_id * big_share_items);
            block_end = block_offset + big_share_items;
        }
        else if (block_id < total_tiles)
        {
            // This thread block gets a normal share of grains (avg_tiles_per_block)
            block_offset = normal_base_offset + (block_id * normal_share_items);
            block_end = CUB_MIN(num_items, block_offset + normal_share_items);
        }
        // Else default past-the-end
    }


    /**
     * \brief Initializes ranges for the specified thread block index.  Specialized
     * for a "strip mining" access pattern in which the input tiles assigned to
     * each thread block are separated by a stride equal to the extent of the grid.
     */
    template <int TILE_ITEMS>
    __device__ __forceinline__ void BlockInit(
        int block_id,
        Int2Type<GRID_MAPPING_STRIP_MINE> /*strategy_tag*/)
    {
        block_stride = grid_size * TILE_ITEMS;
        block_offset = (block_id * TILE_ITEMS);
        block_end = num_items;
    }


    /**
     * \brief Block-initialization for the calling thread block (\p blockIdx.x),
     * dispatched to the specified \p STRATEGY ("raking" or "strip mining").
     */
    template <
        int                     TILE_ITEMS,
        GridMappingStrategy     STRATEGY>
    __device__ __forceinline__ void BlockInit()
    {
        BlockInit<TILE_ITEMS>(blockIdx.x, Int2Type<STRATEGY>());
    }


    /**
     * \brief Block-initialization over an explicitly specified range of input,
     * specialized for a "raking" access pattern in which the thread block is
     * assigned a consecutive sequence of input tiles.
     */
    template <int TILE_ITEMS>
    __device__ __forceinline__ void BlockInit(
        OffsetT block_offset,                       ///< [in] Threadblock begin offset (inclusive)
        OffsetT block_end)                          ///< [in] Threadblock end offset (exclusive)
    {
        this->block_offset = block_offset;
        this->block_end = block_end;
        this->block_stride = TILE_ITEMS;
    }

};




/** @} */       // end group GridModule

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)