/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * cub::GridEvenShare is a descriptor utility for distributing input among CUDA thread blocks in an "even-share" fashion.  Each thread block gets roughly the same number of fixed-size work units (grains).
 */


#pragma once

#include "../util_namespace.cuh"
#include "../util_macro.cuh"
#include "grid_mapping.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


/**
 * \addtogroup GridModule
 * @{
 */

/**
 * \brief GridEvenShare is a descriptor utility for distributing input among
 * CUDA thread blocks in an "even-share" fashion.  Each thread block gets roughly
 * the same number of input tiles.
 *
 * \par Overview
 * Each thread block is assigned a consecutive sequence of input tiles.  To help
 * preserve alignment and eliminate the overhead of guarded loads for all but the
 * last thread block, GridEvenShare assigns one of three different amounts of
 * work to a given thread block: "big", "normal", or "last".  The "big" workloads
 * are one scheduling grain larger than "normal".  The "last" work unit for the
 * last thread block may be partially full if the input is not an even multiple of
 * the scheduling grain size.
 *
 * \par
 * Before invoking a child grid, a parent thread will typically construct an
 * instance of GridEvenShare.  The instance can be passed to child thread blocks
 * which can initialize their per-thread block offsets using \p BlockInit().
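 *
 * \par Snippet
 * The sketch below (illustrative only; the kernel, its launch configuration,
 * and the chosen values are hypothetical rather than part of the library)
 * shows the typical division of labor: the host fills in the descriptor with
 * \p DispatchInit(), and each thread block of the launched kernel resolves its
 * own range with \p BlockInit():
 * \par
 * \code
 * #include <cub/grid/grid_even_share.cuh>
 *
 * template <int TILE_ITEMS>
 * __global__ void ExampleKernel(cub::GridEvenShare<int> even_share)
 * {
 *     // Resolve this block's [block_offset, block_end) range of input offsets
 *     even_share.BlockInit<TILE_ITEMS, cub::GRID_MAPPING_RAKE>();
 *
 *     for (int tile_offset = even_share.block_offset;
 *          tile_offset < even_share.block_end;
 *          tile_offset += even_share.block_stride)
 *     {
 *         // ... consume the tile of (up to) TILE_ITEMS items beginning at
 *         // tile_offset, guarding loads against even_share.block_end
 *     }
 * }
 *
 * // Host side (illustrative values)
 * const int TILE_ITEMS = 128;
 * int num_items        = 1 << 20;
 * int max_grid_size    = 1024;
 *
 * cub::GridEvenShare<int> even_share;
 * even_share.DispatchInit(num_items, max_grid_size, TILE_ITEMS);
 *
 * ExampleKernel<TILE_ITEMS><<<even_share.grid_size, 256>>>(even_share);
 * \endcode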
 */
template <typename OffsetT>
struct GridEvenShare
{
private:

    OffsetT     total_tiles;
    int         big_shares;
    OffsetT     big_share_items;
    OffsetT     normal_share_items;
    OffsetT     normal_base_offset;

public:

    /// Total number of input items
    OffsetT     num_items;

    /// Grid size in thread blocks
    int         grid_size;

    /// OffsetT into input marking the beginning of the owning thread block's segment of input tiles
    OffsetT     block_offset;

    /// OffsetT into input marking the end (one-past) of the owning thread block's segment of input tiles
    OffsetT     block_end;

    /// Stride between input tiles
    OffsetT     block_stride;


    /**
     * \brief Constructor.
     */
    __host__ __device__ __forceinline__ GridEvenShare() :
        total_tiles(0),
        big_shares(0),
        big_share_items(0),
        normal_share_items(0),
        normal_base_offset(0),
        num_items(0),
        grid_size(0),
        block_offset(0),
        block_end(0),
        block_stride(0)
    {}


    /**
     * \brief Dispatch initializer.  To be called prior to kernel launch.
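     *
     * \par
     * As an illustration (values chosen for this example, not taken from the
     * library): DispatchInit(1000, 3, 128) yields \p total_tiles = 8,
     * \p grid_size = 3, and \p big_shares = 2, so thread blocks 0 and 1 are
     * each assigned three 128-item tiles (384 items), while thread block 2 is
     * assigned the remaining 232 items and its final tile is only partially
     * full.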
     */
    __host__ __device__ __forceinline__ void DispatchInit(
        OffsetT num_items,          ///< Total number of input items
        int     max_grid_size,      ///< Maximum grid size allowable (actual grid size may be less if not warranted by the number of input items)
        int     tile_items)         ///< Number of data items per input tile
    {
        this->block_offset          = num_items;    // Initialize past-the-end
        this->block_end             = num_items;    // Initialize past-the-end
        this->num_items             = num_items;
        this->total_tiles           = (num_items + tile_items - 1) / tile_items;
        this->grid_size             = CUB_MIN(total_tiles, max_grid_size);
        OffsetT avg_tiles_per_block = total_tiles / grid_size;
        this->big_shares            = total_tiles - (avg_tiles_per_block * grid_size);        // leftover grains go to big blocks
        this->normal_share_items    = avg_tiles_per_block * tile_items;
        this->normal_base_offset    = big_shares * tile_items;
        this->big_share_items       = normal_share_items + tile_items;
    }


    /**
     * \brief Initializes ranges for the specified thread block index.  Specialized
     * for a "raking" access pattern in which each thread block is assigned a
     * consecutive sequence of input tiles.
     */
    template <int TILE_ITEMS>
    __device__ __forceinline__ void BlockInit(
        int block_id,
        Int2Type<GRID_MAPPING_RAKE> /*strategy_tag*/)
    {
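        // Raking: this block's tiles are consecutive, so successive tiles are
        // one tile apart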
        block_stride = TILE_ITEMS;
        if (block_id < big_shares)
        {
            // This thread block gets a big share of grains (avg_tiles_per_block + 1)
            block_offset = (block_id * big_share_items);
            block_end = block_offset + big_share_items;
        }
        else if (block_id < total_tiles)
        {
            // This thread block gets a normal share of grains (avg_tiles_per_block)
            block_offset = normal_base_offset + (block_id * normal_share_items);
            block_end = CUB_MIN(num_items, block_offset + normal_share_items);
        }
        // Else default past-the-end
    }


    /**
     * \brief Initializes ranges for the specified thread block index.  Specialized
     * for a "strip mining" access pattern in which the input tiles assigned to
     * each thread block are separated by a stride equal to the extent of the grid.
     */
    template <int TILE_ITEMS>
    __device__ __forceinline__ void BlockInit(
        int block_id,
        Int2Type<GRID_MAPPING_STRIP_MINE> /*strategy_tag*/)
    {
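        // Strip mining: blocks interleave across the input, so successive tiles
        // for this block are an entire grid's worth of tiles apart; every block's
        // range extends to num_items, and the consuming kernel is expected to
        // guard loads within partial tiles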
        block_stride = grid_size * TILE_ITEMS;
        block_offset = (block_id * TILE_ITEMS);
        block_end = num_items;
    }


    /**
     * \brief Block-initialization for the mapping strategy given by \p STRATEGY.
     * Each thread block initializes its ranges using its own \p blockIdx.x as
     * the block index.
     */
    template <
        int TILE_ITEMS,
        GridMappingStrategy STRATEGY>
    __device__ __forceinline__ void BlockInit()
    {
        BlockInit<TILE_ITEMS>(blockIdx.x, Int2Type<STRATEGY>());
    }


    /**
     * \brief Block-initialization, specialized for a "raking" access pattern
     * over an explicitly specified range of input (rather than a range derived
     * from the block index).
     */
    template <int TILE_ITEMS>
    __device__ __forceinline__ void BlockInit(
        OffsetT block_offset,                       ///< [in] Threadblock begin offset (inclusive)
        OffsetT block_end)                          ///< [in] Threadblock end offset (exclusive)
    {
        this->block_offset = block_offset;
        this->block_end = block_end;
        this->block_stride = TILE_ITEMS;
    }


};




/** @} */       // end group GridModule

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)