/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * cub::AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram.
 */

#pragma once

#include <iterator>

#include "../util_type.cuh"
#include "../block/block_load.cuh"
#include "../grid/grid_queue.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
#include "../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


/******************************************************************************
 * Tuning policy
 ******************************************************************************/

/**
 * Memory preference for the block-privatized histogram bins: global memory (GMEM),
 * shared memory (SMEM), or a per-block blend of the two (BLEND)
 */
enum BlockHistogramMemoryPreference
{
    GMEM,
    SMEM,
    BLEND
};


/**
 * Parameterizable tuning policy type for AgentHistogram
 */
template <
    int                             _BLOCK_THREADS,                 ///< Threads per thread block
    int                             _PIXELS_PER_THREAD,             ///< Pixels per thread (per tile of input)
    BlockLoadAlgorithm              _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
    CacheLoadModifier               _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
    bool                            _RLE_COMPRESS,                  ///< Whether to perform localized RLE to compress samples before histogramming
    BlockHistogramMemoryPreference  _MEM_PREFERENCE,                ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
    bool                            _WORK_STEALING>                 ///< Whether to dequeue tiles from a global work queue
struct AgentHistogramPolicy
{
    enum
    {
        BLOCK_THREADS           = _BLOCK_THREADS,                   ///< Threads per thread block
        PIXELS_PER_THREAD       = _PIXELS_PER_THREAD,               ///< Pixels per thread (per tile of input)
        IS_RLE_COMPRESS         = _RLE_COMPRESS,                    ///< Whether to perform localized RLE to compress samples before histogramming
        MEM_PREFERENCE          = _MEM_PREFERENCE,                  ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
        IS_WORK_STEALING        = _WORK_STEALING,                   ///< Whether to dequeue tiles from a global work queue
    };

    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;          ///< The BlockLoad algorithm to use
    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;           ///< Cache load modifier for reading input elements
};

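//
// Example instantiation (a minimal sketch; the tuning values below are
// illustrative placeholders, not an official CUB tuning):
//
//     // 256 threads/block, 8 pixels/thread, direct loads, default caching,
//     // RLE compression on, shared-memory privatization, no work-stealing
//     typedef AgentHistogramPolicy<
//         256, 8, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, true, SMEM, false>
//         ExampleHistogramPolicyT;
//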

/******************************************************************************
 * Thread block abstractions
 ******************************************************************************/

/**
 * \brief AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram.
 */
template <
    typename    AgentHistogramPolicyT,          ///< Parameterized AgentHistogramPolicy tuning policy type
    int         PRIVATIZED_SMEM_BINS,           ///< Number of privatized shared-memory histogram bins of any channel.  Zero indicates privatized counters to be maintained in device-accessible memory.
    int         NUM_CHANNELS,                   ///< Number of channels interleaved in the input data.  Supports up to four channels.
    int         NUM_ACTIVE_CHANNELS,            ///< Number of channels actively being histogrammed
    typename    SampleIteratorT,                ///< Random-access input iterator type for reading samples
    typename    CounterT,                       ///< Integer type for counting sample occurrences per histogram bin
    typename    PrivatizedDecodeOpT,            ///< The transform operator type for determining privatized counter indices from samples, one for each channel
    typename    OutputDecodeOpT,                ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
    typename    OffsetT,                        ///< Signed integer type for global offsets
    int         PTX_ARCH = CUB_PTX_ARCH>        ///< PTX compute capability
struct AgentHistogram
{
    //---------------------------------------------------------------------
    // Types and constants
    //---------------------------------------------------------------------

    /// The sample type of the input iterator
    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;

    /// The pixel type of SampleT
    typedef typename CubVector<SampleT, NUM_CHANNELS>::Type PixelT;

    /// The quad type of SampleT
    typedef typename CubVector<SampleT, 4>::Type QuadT;

    /// Constants
    enum
    {
        BLOCK_THREADS           = AgentHistogramPolicyT::BLOCK_THREADS,

        PIXELS_PER_THREAD       = AgentHistogramPolicyT::PIXELS_PER_THREAD,
        SAMPLES_PER_THREAD      = PIXELS_PER_THREAD * NUM_CHANNELS,
        QUADS_PER_THREAD        = SAMPLES_PER_THREAD / 4,

        TILE_PIXELS             = PIXELS_PER_THREAD * BLOCK_THREADS,
        TILE_SAMPLES            = SAMPLES_PER_THREAD * BLOCK_THREADS,

        IS_RLE_COMPRESS         = AgentHistogramPolicyT::IS_RLE_COMPRESS,

        MEM_PREFERENCE          = (PRIVATIZED_SMEM_BINS > 0) ?
                                        AgentHistogramPolicyT::MEM_PREFERENCE :
                                        GMEM,

        IS_WORK_STEALING        = AgentHistogramPolicyT::IS_WORK_STEALING,
    };
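
    // For example (illustrative numbers only): with BLOCK_THREADS = 256,
    // PIXELS_PER_THREAD = 8, and NUM_CHANNELS = 4, each thread handles
    // SAMPLES_PER_THREAD = 8 * 4 = 32 samples (QUADS_PER_THREAD = 8), and each
    // tile spans TILE_PIXELS = 2048 pixels, i.e. TILE_SAMPLES = 8192 samples.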

    /// Cache load modifier for reading input elements
    static const CacheLoadModifier LOAD_MODIFIER = AgentHistogramPolicyT::LOAD_MODIFIER;


    /// Input iterator wrapper type (for applying cache modifier)
    typedef typename If<IsPointer<SampleIteratorT>::VALUE,
            CacheModifiedInputIterator<LOAD_MODIFIER, SampleT, OffsetT>,     // Wrap the native input pointer with CacheModifiedInputIterator
            SampleIteratorT>::Type                                           // Directly use the supplied input iterator type
        WrappedSampleIteratorT;

    /// Pixel input iterator type (for applying cache modifier)
    typedef CacheModifiedInputIterator<LOAD_MODIFIER, PixelT, OffsetT>
        WrappedPixelIteratorT;

    /// Quad input iterator type (for applying cache modifier)
    typedef CacheModifiedInputIterator<LOAD_MODIFIER, QuadT, OffsetT>
        WrappedQuadIteratorT;

    /// Parameterized BlockLoad type for samples
    typedef BlockLoad<
            SampleT,
            BLOCK_THREADS,
            SAMPLES_PER_THREAD,
            AgentHistogramPolicyT::LOAD_ALGORITHM>
        BlockLoadSampleT;

    /// Parameterized BlockLoad type for pixels
    typedef BlockLoad<
            PixelT,
            BLOCK_THREADS,
            PIXELS_PER_THREAD,
            AgentHistogramPolicyT::LOAD_ALGORITHM>
        BlockLoadPixelT;

    /// Parameterized BlockLoad type for quads
    typedef BlockLoad<
            QuadT,
            BLOCK_THREADS,
            QUADS_PER_THREAD,
            AgentHistogramPolicyT::LOAD_ALGORITHM>
        BlockLoadQuadT;

    /// Shared memory type required by this thread block
    struct _TempStorage
    {
        CounterT histograms[NUM_ACTIVE_CHANNELS][PRIVATIZED_SMEM_BINS + 1];     // Smem needed for block-privatized smem histogram (with 1 word of padding)

        int tile_idx;

        // Aliasable storage layout
        union Aliasable
        {
            typename BlockLoadSampleT::TempStorage sample_load;     // Smem needed for loading a tile of samples
            typename BlockLoadPixelT::TempStorage pixel_load;       // Smem needed for loading a tile of pixels
            typename BlockLoadQuadT::TempStorage quad_load;         // Smem needed for loading a tile of quads

        } aliasable;
    };


    /// Temporary storage type (unionable)
    struct TempStorage : Uninitialized<_TempStorage> {};


    //---------------------------------------------------------------------
    // Per-thread fields
    //---------------------------------------------------------------------

    /// Reference to temp_storage
    _TempStorage &temp_storage;

    /// Sample input iterator (with cache modifier applied, if possible)
    WrappedSampleIteratorT d_wrapped_samples;

    /// Native pointer for input samples (possibly NULL if unavailable)
    SampleT* d_native_samples;

    /// The number of output bins for each channel
    int (&num_output_bins)[NUM_ACTIVE_CHANNELS];

    /// The number of privatized bins for each channel
    int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS];

    /// Reference to gmem privatized histograms for each channel
    CounterT* d_privatized_histograms[NUM_ACTIVE_CHANNELS];

    /// Reference to final output histograms (gmem)
    CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS];

    /// The transform operator for determining output bin-ids from privatized counter indices, one for each channel
    OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS];

    /// The transform operator for determining privatized counter indices from samples, one for each channel
    PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS];

    /// Whether to prefer privatized smem counters vs privatized global counters
    bool prefer_smem;


    //---------------------------------------------------------------------
    // Initialize privatized bin counters
    //---------------------------------------------------------------------

    // Initialize privatized bin counters
    __device__ __forceinline__ void InitBinCounters(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
    {
        // Initialize histogram bin counts to zeros
        #pragma unroll
        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
        {
            for (int privatized_bin = threadIdx.x; privatized_bin < num_privatized_bins[CHANNEL]; privatized_bin += BLOCK_THREADS)
            {
                privatized_histograms[CHANNEL][privatized_bin] = 0;
            }
        }

        // Barrier to make sure all threads are done updating counters
        CTA_SYNC();
    }


    // Initialize privatized bin counters.  Specialized for privatized shared-memory counters
    __device__ __forceinline__ void InitSmemBinCounters()
    {
        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];

        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];

        InitBinCounters(privatized_histograms);
    }


    // Initialize privatized bin counters.  Specialized for privatized global-memory counters
    __device__ __forceinline__ void InitGmemBinCounters()
    {
        InitBinCounters(d_privatized_histograms);
    }


    //---------------------------------------------------------------------
    // Update final output histograms
    //---------------------------------------------------------------------

    // Update final output histograms from privatized histograms
    __device__ __forceinline__ void StoreOutput(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
    {
        // Barrier to make sure all threads are done updating counters
        CTA_SYNC();

        // Apply privatized bin counts to output bin counts
        #pragma unroll
        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
        {
            int channel_bins = num_privatized_bins[CHANNEL];
            for (int privatized_bin = threadIdx.x;
                    privatized_bin < channel_bins;
                    privatized_bin += BLOCK_THREADS)
            {
                int         output_bin  = -1;
                CounterT    count       = privatized_histograms[CHANNEL][privatized_bin];
                bool        is_valid    = count > 0;

                output_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>((SampleT) privatized_bin, output_bin, is_valid);

                if (output_bin >= 0)
                {
                    atomicAdd(&d_output_histograms[CHANNEL][output_bin], count);
                }
            }
        }
    }


    // Update final output histograms from privatized histograms.  Specialized for privatized shared-memory counters
    __device__ __forceinline__ void StoreSmemOutput()
    {
        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];

        StoreOutput(privatized_histograms);
    }


    // Update final output histograms from privatized histograms.  Specialized for privatized global-memory counters
    __device__ __forceinline__ void StoreGmemOutput()
    {
        StoreOutput(d_privatized_histograms);
    }


    //---------------------------------------------------------------------
    // Tile accumulation
    //---------------------------------------------------------------------

    // Accumulate pixels.  Specialized for RLE compression.
    __device__ __forceinline__ void AccumulatePixels(
        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
        bool                is_valid[PIXELS_PER_THREAD],
        CounterT*           privatized_histograms[NUM_ACTIVE_CHANNELS],
        Int2Type<true>      is_rle_compress)
    {
        #pragma unroll
        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
        {
            // Bin pixels
            int bins[PIXELS_PER_THREAD];

            #pragma unroll
            for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
            {
                bins[PIXEL] = -1;
                privatized_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>(samples[PIXEL][CHANNEL], bins[PIXEL], is_valid[PIXEL]);
            }

            CounterT accumulator = 1;

            #pragma unroll
            for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD - 1; ++PIXEL)
            {
                if (bins[PIXEL] != bins[PIXEL + 1])
                {
                    if (bins[PIXEL] >= 0)
                        atomicAdd(privatized_histograms[CHANNEL] + bins[PIXEL], accumulator);

                    accumulator = 0;
                }
                accumulator++;
            }

            // Last pixel
            if (bins[PIXELS_PER_THREAD - 1] >= 0)
                atomicAdd(privatized_histograms[CHANNEL] + bins[PIXELS_PER_THREAD - 1], accumulator);
        }
    }
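
    // Worked example of the RLE path above (illustrative numbers only): with
    // PIXELS_PER_THREAD = 4 and per-thread bins = {3, 3, 3, 7}, the accumulator
    // grows to 3 across the run of bin 3, a single atomicAdd of 3 is issued at the
    // 3 -> 7 transition, and the trailing atomicAdd adds 1 to bin 7, so the thread
    // issues two atomics instead of four.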


    // Accumulate pixels.  Specialized for individual accumulation of each pixel.
    __device__ __forceinline__ void AccumulatePixels(
        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
        bool                is_valid[PIXELS_PER_THREAD],
        CounterT*           privatized_histograms[NUM_ACTIVE_CHANNELS],
        Int2Type<false>     is_rle_compress)
    {
        #pragma unroll
        for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
        {
            #pragma unroll
            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
            {
                int bin = -1;
                privatized_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>(samples[PIXEL][CHANNEL], bin, is_valid[PIXEL]);
                if (bin >= 0)
                    atomicAdd(privatized_histograms[CHANNEL] + bin, 1);
            }
        }
    }


    /**
     * Accumulate pixels, specialized for smem privatized histogram
     */
    __device__ __forceinline__ void AccumulateSmemPixels(
        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
        bool                is_valid[PIXELS_PER_THREAD])
    {
        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];

        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];

        AccumulatePixels(samples, is_valid, privatized_histograms, Int2Type<IS_RLE_COMPRESS>());
    }


    /**
     * Accumulate pixels, specialized for gmem privatized histogram
     */
    __device__ __forceinline__ void AccumulateGmemPixels(
        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
        bool                is_valid[PIXELS_PER_THREAD])
    {
        AccumulatePixels(samples, is_valid, d_privatized_histograms, Int2Type<IS_RLE_COMPRESS>());
    }



    //---------------------------------------------------------------------
    // Tile loading
    //---------------------------------------------------------------------

    // Load full, aligned tile using pixel iterator (multi-channel)
    template <int _NUM_ACTIVE_CHANNELS>
    __device__ __forceinline__ void LoadFullAlignedTile(
        OffsetT                         block_offset,
        int                             valid_samples,
        SampleT                         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<_NUM_ACTIVE_CHANNELS>  num_active_channels)
    {
        typedef PixelT AliasedPixels[PIXELS_PER_THREAD];

        WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));

        // Load using a wrapped pixel iterator
        BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load(
            d_wrapped_pixels,
            reinterpret_cast<AliasedPixels&>(samples));
    }

    // Load full, aligned tile using quad iterator (single-channel)
    __device__ __forceinline__ void LoadFullAlignedTile(
        OffsetT                         block_offset,
        int                             valid_samples,
        SampleT                         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<1>                     num_active_channels)
    {
        typedef QuadT AliasedQuads[QUADS_PER_THREAD];

        WrappedQuadIteratorT d_wrapped_quads((QuadT*) (d_native_samples + block_offset));

        // Load using a wrapped quad iterator
        BlockLoadQuadT(temp_storage.aliasable.quad_load).Load(
            d_wrapped_quads,
            reinterpret_cast<AliasedQuads&>(samples));
    }

    // Load full, aligned tile
    __device__ __forceinline__ void LoadTile(
        OffsetT         block_offset,
        int             valid_samples,
        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<true>  is_full_tile,
        Int2Type<true>  is_aligned)
    {
        LoadFullAlignedTile(block_offset, valid_samples, samples, Int2Type<NUM_ACTIVE_CHANNELS>());
    }

    // Load full, mis-aligned tile using sample iterator
    __device__ __forceinline__ void LoadTile(
        OffsetT         block_offset,
        int             valid_samples,
        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<true>  is_full_tile,
        Int2Type<false> is_aligned)
    {
        typedef SampleT AliasedSamples[SAMPLES_PER_THREAD];

        // Load using sample iterator
        BlockLoadSampleT(temp_storage.aliasable.sample_load).Load(
            d_wrapped_samples + block_offset,
            reinterpret_cast<AliasedSamples&>(samples));
    }

    // Load partially-full, aligned tile using the pixel iterator
    __device__ __forceinline__ void LoadTile(
        OffsetT         block_offset,
        int             valid_samples,
        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<false> is_full_tile,
        Int2Type<true>  is_aligned)
    {
        typedef PixelT AliasedPixels[PIXELS_PER_THREAD];

        WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));

        int valid_pixels = valid_samples / NUM_CHANNELS;

        // Load using a wrapped pixel iterator
        BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load(
            d_wrapped_pixels,
            reinterpret_cast<AliasedPixels&>(samples),
            valid_pixels);
    }

    // Load partially-full, mis-aligned tile using sample iterator
    __device__ __forceinline__ void LoadTile(
        OffsetT         block_offset,
        int             valid_samples,
        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<false> is_full_tile,
        Int2Type<false> is_aligned)
    {
        typedef SampleT AliasedSamples[SAMPLES_PER_THREAD];

        BlockLoadSampleT(temp_storage.aliasable.sample_load).Load(
            d_wrapped_samples + block_offset,
            reinterpret_cast<AliasedSamples&>(samples),
            valid_samples);
    }


    //---------------------------------------------------------------------
    // Tile processing
    //---------------------------------------------------------------------

    // Consume a tile of data samples
    template <
        bool IS_ALIGNED,        // Whether the tile offset is aligned (quad-aligned for single-channel, pixel-aligned for multi-channel)
        bool IS_FULL_TILE>      // Whether the tile is full
    __device__ __forceinline__ void ConsumeTile(OffsetT block_offset, int valid_samples)
    {
        SampleT     samples[PIXELS_PER_THREAD][NUM_CHANNELS];
        bool        is_valid[PIXELS_PER_THREAD];

        // Load tile
        LoadTile(
            block_offset,
            valid_samples,
            samples,
            Int2Type<IS_FULL_TILE>(),
            Int2Type<IS_ALIGNED>());

        // Set valid flags
        #pragma unroll
        for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
            is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples);

        // Accumulate samples (shared-memory atomics require SM 1.2 or higher)
#if CUB_PTX_ARCH >= 120
        if (prefer_smem)
            AccumulateSmemPixels(samples, is_valid);
        else
            AccumulateGmemPixels(samples, is_valid);
#else
        AccumulateGmemPixels(samples, is_valid);
#endif
    }


    // Consume row tiles.  Specialized for work-stealing from queue
    template <bool IS_ALIGNED>
    __device__ __forceinline__ void ConsumeTiles(
        OffsetT             num_row_pixels,             ///< The number of multi-channel pixels per row in the region of interest
        OffsetT             num_rows,                   ///< The number of rows in the region of interest
        OffsetT             row_stride_samples,         ///< The number of samples between starts of consecutive rows in the region of interest
        int                 tiles_per_row,              ///< Number of image tiles per row
        GridQueue<int>      tile_queue,
        Int2Type<true>      is_work_stealing)
    {
        int         num_tiles                   = num_rows * tiles_per_row;
        int         tile_idx                    = (blockIdx.y * gridDim.x) + blockIdx.x;
        OffsetT     num_even_share_tiles        = gridDim.x * gridDim.y;

        while (tile_idx < num_tiles)
        {
            int     row             = tile_idx / tiles_per_row;
            int     col             = tile_idx - (row * tiles_per_row);
            OffsetT row_offset      = row * row_stride_samples;
            OffsetT col_offset      = (col * TILE_SAMPLES);
            OffsetT tile_offset     = row_offset + col_offset;

            if (col == tiles_per_row - 1)
            {
                // Consume a partially-full tile at the end of the row
                OffsetT num_remaining = (num_row_pixels * NUM_CHANNELS) - col_offset;
                ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);
            }
            else
            {
                // Consume full tile
                ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);
            }

            CTA_SYNC();

            // Get next tile
            if (threadIdx.x == 0)
                temp_storage.tile_idx = tile_queue.Drain(1) + num_even_share_tiles;

            CTA_SYNC();

            tile_idx = temp_storage.tile_idx;
        }
    }


    // Consume row tiles.  Specialized for even-share (striped across thread blocks)
    template <bool IS_ALIGNED>
    __device__ __forceinline__ void ConsumeTiles(
        OffsetT             num_row_pixels,             ///< The number of multi-channel pixels per row in the region of interest
        OffsetT             num_rows,                   ///< The number of rows in the region of interest
        OffsetT             row_stride_samples,         ///< The number of samples between starts of consecutive rows in the region of interest
        int                 tiles_per_row,              ///< Number of image tiles per row
        GridQueue<int>      tile_queue,
        Int2Type<false>     is_work_stealing)
    {
        for (int row = blockIdx.y; row < num_rows; row += gridDim.y)
        {
            OffsetT row_begin   = row * row_stride_samples;
            OffsetT row_end     = row_begin + (num_row_pixels * NUM_CHANNELS);
            OffsetT tile_offset = row_begin + (blockIdx.x * TILE_SAMPLES);

            while (tile_offset < row_end)
            {
                OffsetT num_remaining = row_end - tile_offset;

                if (num_remaining < TILE_SAMPLES)
                {
                    // Consume partial tile
                    ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);
                    break;
                }

                // Consume full tile
                ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);
                tile_offset += gridDim.x * TILE_SAMPLES;
            }
        }
    }


    //---------------------------------------------------------------------
    // Parameter extraction
    //---------------------------------------------------------------------

    // Return a native pixel pointer (specialized for CacheModifiedInputIterator types)
    template <
        CacheLoadModifier   _MODIFIER,
        typename            _ValueT,
        typename            _OffsetT>
    __device__ __forceinline__ SampleT* NativePointer(CacheModifiedInputIterator<_MODIFIER, _ValueT, _OffsetT> itr)
    {
        return itr.ptr;
    }

    // Return a native pixel pointer (specialized for other types)
    template <typename IteratorT>
    __device__ __forceinline__ SampleT* NativePointer(IteratorT itr)
    {
        return NULL;
    }


    //---------------------------------------------------------------------
    // Interface
    //---------------------------------------------------------------------


    /**
     * Constructor
     */
    __device__ __forceinline__ AgentHistogram(
        TempStorage         &temp_storage,                                      ///< Reference to temp_storage
        SampleIteratorT     d_samples,                                          ///< Input data to histogram
        int                 (&num_output_bins)[NUM_ACTIVE_CHANNELS],            ///< The number of bins per final output histogram
        int                 (&num_privatized_bins)[NUM_ACTIVE_CHANNELS],        ///< The number of bins per privatized histogram
        CounterT*           (&d_output_histograms)[NUM_ACTIVE_CHANNELS],        ///< Reference to final output histograms
        CounterT*           (&d_privatized_histograms)[NUM_ACTIVE_CHANNELS],    ///< Reference to privatized histograms
        OutputDecodeOpT     (&output_decode_op)[NUM_ACTIVE_CHANNELS],           ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel
        PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS])       ///< The transform operator for determining privatized counter indices from samples, one for each channel
    :
        temp_storage(temp_storage.Alias()),
        d_wrapped_samples(d_samples),
        num_output_bins(num_output_bins),
        num_privatized_bins(num_privatized_bins),
        d_output_histograms(d_output_histograms),
        privatized_decode_op(privatized_decode_op),
        output_decode_op(output_decode_op),
        d_native_samples(NativePointer(d_wrapped_samples)),
        prefer_smem((MEM_PREFERENCE == SMEM) ?
            true :                              // prefer smem privatized histograms
            (MEM_PREFERENCE == GMEM) ?
                false :                         // prefer gmem privatized histograms
                blockIdx.x & 1)                 // prefer blended privatized histograms
    {
        int blockId = (blockIdx.y * gridDim.x) + blockIdx.x;

        // Initialize the locations of this block's privatized histograms
        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
            this->d_privatized_histograms[CHANNEL] = d_privatized_histograms[CHANNEL] + (blockId * num_privatized_bins[CHANNEL]);
    }


    /**
     * Consume image
     */
    __device__ __forceinline__ void ConsumeTiles(
        OffsetT             num_row_pixels,             ///< The number of multi-channel pixels per row in the region of interest
        OffsetT             num_rows,                   ///< The number of rows in the region of interest
        OffsetT             row_stride_samples,         ///< The number of samples between starts of consecutive rows in the region of interest
        int                 tiles_per_row,              ///< Number of image tiles per row
        GridQueue<int>      tile_queue)                 ///< Queue descriptor for assigning tiles of work to thread blocks
    {
        // Check whether all row starting offsets are quad-aligned (in single-channel) or pixel-aligned (in multi-channel)
        int     quad_mask           = AlignBytes<QuadT>::ALIGN_BYTES - 1;
        int     pixel_mask          = AlignBytes<PixelT>::ALIGN_BYTES - 1;
        size_t  row_bytes           = sizeof(SampleT) * row_stride_samples;

        bool quad_aligned_rows      = (NUM_CHANNELS == 1) && (SAMPLES_PER_THREAD % 4 == 0) &&   // Single channel
                                        ((size_t(d_native_samples) & quad_mask) == 0) &&        // ptr is quad-aligned
                                        ((num_rows == 1) || ((row_bytes & quad_mask) == 0));    // number of row-samples is a multiple of the alignment of the quad

        bool pixel_aligned_rows     = (NUM_CHANNELS > 1) &&                                     // Multi channel
                                        ((size_t(d_native_samples) & pixel_mask) == 0) &&       // ptr is pixel-aligned
                                        ((row_bytes & pixel_mask) == 0);                        // number of row-samples is a multiple of the alignment of the pixel

        // Whether rows are aligned and can be vectorized
        if ((d_native_samples != NULL) && (quad_aligned_rows || pixel_aligned_rows))
            ConsumeTiles<true>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
        else
            ConsumeTiles<false>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
    }
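
    // For example (assuming 8-bit samples with NUM_CHANNELS = 4): PixelT is a
    // 4-byte vector type, so pixel_mask == 3 and the pixel-vectorized path is taken
    // whenever the base sample pointer and the row stride (in bytes) are both
    // multiples of 4; otherwise tiles fall back to sample-by-sample loads.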


    /**
     * Initialize privatized bin counters (selects between shared-memory and global-memory counters)
     */
    __device__ __forceinline__ void InitBinCounters()
    {
        if (prefer_smem)
            InitSmemBinCounters();
        else
            InitGmemBinCounters();
    }


    /**
     * Store privatized histogram to device-accessible memory (selects between shared-memory and global-memory counters)
     */
    __device__ __forceinline__ void StoreOutput()
    {
        if (prefer_smem)
            StoreSmemOutput();
        else
            StoreGmemOutput();
    }


};
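
// Typical usage from a histogramming kernel (a minimal sketch; AgentHistogramT and
// the kernel-side variable names below are illustrative and not defined in this header):
//
//     // Shared memory for AgentHistogram
//     __shared__ typename AgentHistogramT::TempStorage temp_storage;
//
//     // Construct the agent over this thread block's portion of the image
//     AgentHistogramT agent(
//         temp_storage, d_samples,
//         num_output_bins, num_privatized_bins,
//         d_output_histograms, d_privatized_histograms,
//         output_decode_op, privatized_decode_op);
//
//     agent.InitBinCounters();    // zero this block's privatized counters
//     agent.ConsumeTiles(num_row_pixels, num_rows, row_stride_samples,
//                        tiles_per_row, tile_queue);
//     agent.StoreOutput();        // fold privatized counts into the final histograms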



}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)
