/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * cub::AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram.
 */

#pragma once

#include <iterator>

#include "../util_type.cuh"
#include "../block/block_load.cuh"
#include "../grid/grid_queue.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
#include "../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


/******************************************************************************
 * Tuning policy
 ******************************************************************************/

/**
 * Memory preference for storing privatized histogram bin counters
 */
enum BlockHistogramMemoryPreference
{
    GMEM,
    SMEM,
    BLEND
};


/**
 * Parameterizable tuning policy type for AgentHistogram
 */
template <
    int                             _BLOCK_THREADS,         ///< Threads per thread block
    int                             _PIXELS_PER_THREAD,     ///< Pixels per thread (per tile of input)
    BlockLoadAlgorithm              _LOAD_ALGORITHM,        ///< The BlockLoad algorithm to use
    CacheLoadModifier               _LOAD_MODIFIER,         ///< Cache load modifier for reading input elements
    bool                            _RLE_COMPRESS,          ///< Whether to perform localized RLE to compress samples before histogramming
    BlockHistogramMemoryPreference  _MEM_PREFERENCE,        ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
    bool                            _WORK_STEALING>         ///< Whether to dequeue tiles from a global work queue
struct AgentHistogramPolicy
{
    enum
    {
        BLOCK_THREADS       = _BLOCK_THREADS,       ///< Threads per thread block
        PIXELS_PER_THREAD   = _PIXELS_PER_THREAD,   ///< Pixels per thread (per tile of input)
        IS_RLE_COMPRESS     = _RLE_COMPRESS,        ///< Whether to perform localized RLE to compress samples before histogramming
        MEM_PREFERENCE      = _MEM_PREFERENCE,      ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
        IS_WORK_STEALING    = _WORK_STEALING,       ///< Whether to dequeue tiles from a global work queue
    };

    static const BlockLoadAlgorithm LOAD_ALGORITHM  = _LOAD_ALGORITHM;  ///< The BlockLoad algorithm to use
    static const CacheLoadModifier  LOAD_MODIFIER   = _LOAD_MODIFIER;   ///< Cache load modifier for reading input elements
};
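
// For illustration, a hypothetical specialization one might define when
// parameterizing AgentHistogram (the values below are made up for the
// example, not a tuned configuration shipped by CUB):
//
//     typedef AgentHistogramPolicy<
//         256,                    // 256 threads per block
//         8,                      // 8 pixels per thread per tile
//         BLOCK_LOAD_DIRECT,      // Blocked loads from input
//         LOAD_LDG,               // Read samples through the read-only cache
//         true,                   // RLE-compress runs of identically-binned samples
//         SMEM,                   // Prefer shared-memory privatized counters
//         false>                  // Even-share scheduling (no work-stealing)
//     HistogramPolicy256x8;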


/******************************************************************************
 * Thread block abstractions
 ******************************************************************************/

/**
 * \brief AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram.
 */
template <
    typename    AgentHistogramPolicyT,      ///< Parameterized AgentHistogramPolicy tuning policy type
    int         PRIVATIZED_SMEM_BINS,       ///< Number of privatized shared-memory histogram bins of any channel.  Zero indicates privatized counters to be maintained in device-accessible memory.
    int         NUM_CHANNELS,               ///< Number of channels interleaved in the input data.  Supports up to four channels.
    int         NUM_ACTIVE_CHANNELS,        ///< Number of channels actively being histogrammed
    typename    SampleIteratorT,            ///< Random-access input iterator type for reading samples
    typename    CounterT,                   ///< Integer type for counting sample occurrences per histogram bin
    typename    PrivatizedDecodeOpT,        ///< The transform operator type for determining privatized counter indices from samples, one for each channel
    typename    OutputDecodeOpT,            ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
    typename    OffsetT,                    ///< Signed integer type for global offsets
    int         PTX_ARCH = CUB_PTX_ARCH>    ///< PTX compute capability
struct AgentHistogram
{
    //---------------------------------------------------------------------
    // Types and constants
    //---------------------------------------------------------------------

    /// The sample type of the input iterator
    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;

    /// The pixel type of SampleT
    typedef typename CubVector<SampleT, NUM_CHANNELS>::Type PixelT;

    /// The quad type of SampleT
    typedef typename CubVector<SampleT, 4>::Type QuadT;

    /// Constants
    enum
    {
        BLOCK_THREADS       = AgentHistogramPolicyT::BLOCK_THREADS,

        PIXELS_PER_THREAD   = AgentHistogramPolicyT::PIXELS_PER_THREAD,
        SAMPLES_PER_THREAD  = PIXELS_PER_THREAD * NUM_CHANNELS,
        QUADS_PER_THREAD    = SAMPLES_PER_THREAD / 4,

        TILE_PIXELS         = PIXELS_PER_THREAD * BLOCK_THREADS,
        TILE_SAMPLES        = SAMPLES_PER_THREAD * BLOCK_THREADS,

        IS_RLE_COMPRESS     = AgentHistogramPolicyT::IS_RLE_COMPRESS,

        MEM_PREFERENCE      = (PRIVATIZED_SMEM_BINS > 0) ?
                                  AgentHistogramPolicyT::MEM_PREFERENCE :
                                  GMEM,

        IS_WORK_STEALING    = AgentHistogramPolicyT::IS_WORK_STEALING,
    };

    /// Cache load modifier for reading input elements
    static const CacheLoadModifier LOAD_MODIFIER = AgentHistogramPolicyT::LOAD_MODIFIER;


    /// Input iterator wrapper type (for applying cache modifier)
    typedef typename If<IsPointer<SampleIteratorT>::VALUE,
            CacheModifiedInputIterator<LOAD_MODIFIER, SampleT, OffsetT>,    // Wrap the native input pointer with CacheModifiedInputIterator
            SampleIteratorT>::Type                                          // Directly use the supplied input iterator type
        WrappedSampleIteratorT;

    /// Pixel input iterator type (for applying cache modifier)
    typedef CacheModifiedInputIterator<LOAD_MODIFIER, PixelT, OffsetT>
        WrappedPixelIteratorT;

    /// Quad input iterator type (for applying cache modifier)
    typedef CacheModifiedInputIterator<LOAD_MODIFIER, QuadT, OffsetT>
        WrappedQuadIteratorT;

    /// Parameterized BlockLoad type for samples
    typedef BlockLoad<
            SampleT,
            BLOCK_THREADS,
            SAMPLES_PER_THREAD,
            AgentHistogramPolicyT::LOAD_ALGORITHM>
        BlockLoadSampleT;

    /// Parameterized BlockLoad type for pixels
    typedef BlockLoad<
            PixelT,
            BLOCK_THREADS,
            PIXELS_PER_THREAD,
            AgentHistogramPolicyT::LOAD_ALGORITHM>
        BlockLoadPixelT;

    /// Parameterized BlockLoad type for quads
    typedef BlockLoad<
            QuadT,
            BLOCK_THREADS,
            QUADS_PER_THREAD,
            AgentHistogramPolicyT::LOAD_ALGORITHM>
        BlockLoadQuadT;

    /// Shared memory type required by this thread block
    struct _TempStorage
    {
        CounterT histograms[NUM_ACTIVE_CHANNELS][PRIVATIZED_SMEM_BINS + 1];     // Smem needed for block-privatized smem histogram (with 1 word of padding)

        int tile_idx;

        // Aliasable storage layout
        union Aliasable
        {
            typename BlockLoadSampleT::TempStorage sample_load;     // Smem needed for loading a tile of samples
            typename BlockLoadPixelT::TempStorage pixel_load;       // Smem needed for loading a tile of pixels
            typename BlockLoadQuadT::TempStorage quad_load;         // Smem needed for loading a tile of quads

        } aliasable;
    };


    /// Temporary storage type (unionable)
    struct TempStorage : Uninitialized<_TempStorage> {};


    //---------------------------------------------------------------------
    // Per-thread fields
    //---------------------------------------------------------------------

    /// Reference to temp_storage
    _TempStorage &temp_storage;

    /// Sample input iterator (with cache modifier applied, if possible)
    WrappedSampleIteratorT d_wrapped_samples;

    /// Native pointer for input samples (possibly NULL if unavailable)
    SampleT* d_native_samples;

    /// The number of output bins for each channel
    int (&num_output_bins)[NUM_ACTIVE_CHANNELS];

    /// The number of privatized bins for each channel
    int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS];

    /// Reference to gmem privatized histograms for each channel
    CounterT* d_privatized_histograms[NUM_ACTIVE_CHANNELS];

    /// Reference to final output histograms (gmem)
    CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS];

    /// The transform operator for determining output bin-ids from privatized counter indices, one for each channel
    OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS];

    /// The transform operator for determining privatized counter indices from samples, one for each channel
    PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS];

    /// Whether to prefer privatized smem counters vs privatized global counters
    bool prefer_smem;


    //---------------------------------------------------------------------
    // Initialize privatized bin counters
    //---------------------------------------------------------------------

    // Initialize privatized bin counters
    __device__ __forceinline__ void InitBinCounters(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
    {
        // Initialize histogram bin counts to zeros
        #pragma unroll
        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
        {
            for (int privatized_bin = threadIdx.x; privatized_bin < num_privatized_bins[CHANNEL]; privatized_bin += BLOCK_THREADS)
            {
                privatized_histograms[CHANNEL][privatized_bin] = 0;
            }
        }

        // Barrier to make sure all threads are done updating counters
        CTA_SYNC();
    }


    // Initialize privatized bin counters.  Specialized for privatized shared-memory counters
    __device__ __forceinline__ void InitSmemBinCounters()
    {
        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];

        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];

        InitBinCounters(privatized_histograms);
    }


    // Initialize privatized bin counters.  Specialized for privatized global-memory counters
    __device__ __forceinline__ void InitGmemBinCounters()
    {
        InitBinCounters(d_privatized_histograms);
    }


    //---------------------------------------------------------------------
    // Update final output histograms
    //---------------------------------------------------------------------

    // Update final output histograms from privatized histograms
    __device__ __forceinline__ void StoreOutput(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
    {
        // Barrier to make sure all threads are done updating counters
        CTA_SYNC();

        // Apply privatized bin counts to output bin counts
        #pragma unroll
        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
        {
            int channel_bins = num_privatized_bins[CHANNEL];
            for (int privatized_bin = threadIdx.x;
                    privatized_bin < channel_bins;
                    privatized_bin += BLOCK_THREADS)
            {
                int         output_bin  = -1;
                CounterT    count       = privatized_histograms[CHANNEL][privatized_bin];
                bool        is_valid    = count > 0;

                output_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>((SampleT) privatized_bin, output_bin, is_valid);

                if (output_bin >= 0)
                {
                    atomicAdd(&d_output_histograms[CHANNEL][output_bin], count);
                }
            }
        }
    }


    // Update final output histograms from privatized histograms.  Specialized for privatized shared-memory counters
    __device__ __forceinline__ void StoreSmemOutput()
    {
        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];

        StoreOutput(privatized_histograms);
    }


    // Update final output histograms from privatized histograms.  Specialized for privatized global-memory counters
    __device__ __forceinline__ void StoreGmemOutput()
    {
        StoreOutput(d_privatized_histograms);
    }


    //---------------------------------------------------------------------
    // Tile accumulation
    //---------------------------------------------------------------------
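
    // Worked example of the RLE-compression path below: if a thread's decoded
    // bins for one channel are {5, 5, 5, 9}, the run of three 5s is folded into
    // a single atomicAdd of 3 on bin 5, followed by an atomicAdd of 1 on bin 9.
    // For spatially-correlated inputs this can substantially reduce atomic
    // contention on the privatized counters.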
    // Accumulate pixels.  Specialized for RLE compression.
    __device__ __forceinline__ void AccumulatePixels(
        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
        bool                is_valid[PIXELS_PER_THREAD],
        CounterT*           privatized_histograms[NUM_ACTIVE_CHANNELS],
        Int2Type<true>      is_rle_compress)
    {
        #pragma unroll
        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
        {
            // Bin pixels
            int bins[PIXELS_PER_THREAD];

            #pragma unroll
            for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
            {
                bins[PIXEL] = -1;
                privatized_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>(samples[PIXEL][CHANNEL], bins[PIXEL], is_valid[PIXEL]);
            }

            CounterT accumulator = 1;

            #pragma unroll
            for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD - 1; ++PIXEL)
            {
                if (bins[PIXEL] != bins[PIXEL + 1])
                {
                    if (bins[PIXEL] >= 0)
                        atomicAdd(privatized_histograms[CHANNEL] + bins[PIXEL], accumulator);

                    accumulator = 0;
                }
                accumulator++;
            }

            // Last pixel
            if (bins[PIXELS_PER_THREAD - 1] >= 0)
                atomicAdd(privatized_histograms[CHANNEL] + bins[PIXELS_PER_THREAD - 1], accumulator);
        }
    }


    // Accumulate pixels.  Specialized for individual accumulation of each pixel.
    __device__ __forceinline__ void AccumulatePixels(
        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
        bool                is_valid[PIXELS_PER_THREAD],
        CounterT*           privatized_histograms[NUM_ACTIVE_CHANNELS],
        Int2Type<false>     is_rle_compress)
    {
        #pragma unroll
        for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
        {
            #pragma unroll
            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
            {
                int bin = -1;
                privatized_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>(samples[PIXEL][CHANNEL], bin, is_valid[PIXEL]);
                if (bin >= 0)
                    atomicAdd(privatized_histograms[CHANNEL] + bin, 1);
            }
        }
    }


    /**
     * Accumulate pixels, specialized for smem privatized histogram
     */
    __device__ __forceinline__ void AccumulateSmemPixels(
        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
        bool                is_valid[PIXELS_PER_THREAD])
    {
        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];

        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];

        AccumulatePixels(samples, is_valid, privatized_histograms, Int2Type<IS_RLE_COMPRESS>());
    }


    /**
     * Accumulate pixels, specialized for gmem privatized histogram
     */
    __device__ __forceinline__ void AccumulateGmemPixels(
        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
        bool                is_valid[PIXELS_PER_THREAD])
    {
        AccumulatePixels(samples, is_valid, d_privatized_histograms, Int2Type<IS_RLE_COMPRESS>());
    }


    //---------------------------------------------------------------------
    // Tile loading
    //---------------------------------------------------------------------
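
    // Note on the vectorized paths below: e.g., with NUM_CHANNELS == 4 and
    // 1-byte samples (such as interleaved RGBA data), each pixel's four
    // samples alias a single uchar4 and are fetched in one 4-byte load;
    // single-channel inputs are similarly batched four samples at a time as
    // "quads".  Both paths require d_native_samples + block_offset to be
    // suitably aligned.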
    // Load full, aligned tile using pixel iterator (multi-channel)
    template <int _NUM_ACTIVE_CHANNELS>
    __device__ __forceinline__ void LoadFullAlignedTile(
        OffsetT                         block_offset,
        int                             valid_samples,
        SampleT                         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<_NUM_ACTIVE_CHANNELS>  num_active_channels)
    {
        typedef PixelT AliasedPixels[PIXELS_PER_THREAD];

        WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));

        // Load using a wrapped pixel iterator
        BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load(
            d_wrapped_pixels,
            reinterpret_cast<AliasedPixels&>(samples));
    }

    // Load full, aligned tile using quad iterator (single-channel)
    __device__ __forceinline__ void LoadFullAlignedTile(
        OffsetT                         block_offset,
        int                             valid_samples,
        SampleT                         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<1>                     num_active_channels)
    {
        typedef QuadT AliasedQuads[QUADS_PER_THREAD];

        WrappedQuadIteratorT d_wrapped_quads((QuadT*) (d_native_samples + block_offset));

        // Load using a wrapped quad iterator
        BlockLoadQuadT(temp_storage.aliasable.quad_load).Load(
            d_wrapped_quads,
            reinterpret_cast<AliasedQuads&>(samples));
    }

    // Load full, aligned tile
    __device__ __forceinline__ void LoadTile(
        OffsetT         block_offset,
        int             valid_samples,
        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<true>  is_full_tile,
        Int2Type<true>  is_aligned)
    {
        LoadFullAlignedTile(block_offset, valid_samples, samples, Int2Type<NUM_ACTIVE_CHANNELS>());
    }

    // Load full, mis-aligned tile using sample iterator
    __device__ __forceinline__ void LoadTile(
        OffsetT         block_offset,
        int             valid_samples,
        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<true>  is_full_tile,
        Int2Type<false> is_aligned)
    {
        typedef SampleT AliasedSamples[SAMPLES_PER_THREAD];

        // Load using sample iterator
        BlockLoadSampleT(temp_storage.aliasable.sample_load).Load(
            d_wrapped_samples + block_offset,
            reinterpret_cast<AliasedSamples&>(samples));
    }

    // Load partially-full, aligned tile using the pixel iterator
    __device__ __forceinline__ void LoadTile(
        OffsetT         block_offset,
        int             valid_samples,
        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<false> is_full_tile,
        Int2Type<true>  is_aligned)
    {
        typedef PixelT AliasedPixels[PIXELS_PER_THREAD];

        WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));

        int valid_pixels = valid_samples / NUM_CHANNELS;

        // Load using a wrapped pixel iterator
        BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load(
            d_wrapped_pixels,
            reinterpret_cast<AliasedPixels&>(samples),
            valid_pixels);
    }

    // Load partially-full, mis-aligned tile using sample iterator
    __device__ __forceinline__ void LoadTile(
        OffsetT         block_offset,
        int             valid_samples,
        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
        Int2Type<false> is_full_tile,
        Int2Type<false> is_aligned)
    {
        typedef SampleT AliasedSamples[SAMPLES_PER_THREAD];

        BlockLoadSampleT(temp_storage.aliasable.sample_load).Load(
            d_wrapped_samples + block_offset,
            reinterpret_cast<AliasedSamples&>(samples),
            valid_samples);
    }


    //---------------------------------------------------------------------
    // Tile processing
    //---------------------------------------------------------------------
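
    // Indexing example for the valid-flag computation below: thread t's
    // PIXEL'th pixel begins at sample index
    // (t * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS within the tile, so in a
    // partial tile it is histogrammed only when that index is less than
    // valid_samples.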
    // Consume a tile of data samples
    template <
        bool IS_ALIGNED,        // Whether the tile offset is aligned (quad-aligned for single-channel, pixel-aligned for multi-channel)
        bool IS_FULL_TILE>      // Whether the tile is full
    __device__ __forceinline__ void ConsumeTile(OffsetT block_offset, int valid_samples)
    {
        SampleT     samples[PIXELS_PER_THREAD][NUM_CHANNELS];
        bool        is_valid[PIXELS_PER_THREAD];

        // Load tile
        LoadTile(
            block_offset,
            valid_samples,
            samples,
            Int2Type<IS_FULL_TILE>(),
            Int2Type<IS_ALIGNED>());

        // Set valid flags
        #pragma unroll
        for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
            is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples);

        // Accumulate samples
#if CUB_PTX_ARCH >= 120
        if (prefer_smem)
            AccumulateSmemPixels(samples, is_valid);
        else
            AccumulateGmemPixels(samples, is_valid);
#else
        AccumulateGmemPixels(samples, is_valid);
#endif
    }


    // Consume row tiles.  Specialized for work-stealing from queue
    template <bool IS_ALIGNED>
    __device__ __forceinline__ void ConsumeTiles(
        OffsetT             num_row_pixels,         ///< The number of multi-channel pixels per row in the region of interest
        OffsetT             num_rows,               ///< The number of rows in the region of interest
        OffsetT             row_stride_samples,     ///< The number of samples between starts of consecutive rows in the region of interest
        int                 tiles_per_row,          ///< Number of image tiles per row
        GridQueue<int>      tile_queue,             ///< Queue descriptor for assigning tiles of work to thread blocks
        Int2Type<true>      is_work_stealing)
    {
        int     num_tiles               = num_rows * tiles_per_row;
        int     tile_idx                = (blockIdx.y * gridDim.x) + blockIdx.x;
        OffsetT num_even_share_tiles    = gridDim.x * gridDim.y;

        while (tile_idx < num_tiles)
        {
            int     row         = tile_idx / tiles_per_row;
            int     col         = tile_idx - (row * tiles_per_row);
            OffsetT row_offset  = row * row_stride_samples;
            OffsetT col_offset  = (col * TILE_SAMPLES);
            OffsetT tile_offset = row_offset + col_offset;

            if (col == tiles_per_row - 1)
            {
                // Consume a partially-full tile at the end of the row
                OffsetT num_remaining = (num_row_pixels * NUM_CHANNELS) - col_offset;
                ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);
            }
            else
            {
                // Consume full tile
                ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);
            }

            CTA_SYNC();

            // Get next tile
            if (threadIdx.x == 0)
                temp_storage.tile_idx = tile_queue.Drain(1) + num_even_share_tiles;

            CTA_SYNC();

            tile_idx = temp_storage.tile_idx;
        }
    }
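
    // Scheduling note for the work-stealing specialization above: the first
    // gridDim.x * gridDim.y tiles are claimed even-share (one per block rank),
    // after which thread 0 of each block drains the next unclaimed tile index
    // from the global GridQueue, letting faster blocks steal leftover tiles
    // that would otherwise wait behind slower ones.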

    // Consume row tiles.  Specialized for even-share (striped across thread blocks)
    template <bool IS_ALIGNED>
    __device__ __forceinline__ void ConsumeTiles(
        OffsetT             num_row_pixels,         ///< The number of multi-channel pixels per row in the region of interest
        OffsetT             num_rows,               ///< The number of rows in the region of interest
        OffsetT             row_stride_samples,     ///< The number of samples between starts of consecutive rows in the region of interest
        int                 tiles_per_row,          ///< Number of image tiles per row
        GridQueue<int>      tile_queue,             ///< Queue descriptor for assigning tiles of work to thread blocks (unused)
        Int2Type<false>     is_work_stealing)
    {
        for (int row = blockIdx.y; row < num_rows; row += gridDim.y)
        {
            OffsetT row_begin   = row * row_stride_samples;
            OffsetT row_end     = row_begin + (num_row_pixels * NUM_CHANNELS);
            OffsetT tile_offset = row_begin + (blockIdx.x * TILE_SAMPLES);

            while (tile_offset < row_end)
            {
                OffsetT num_remaining = row_end - tile_offset;

                if (num_remaining < TILE_SAMPLES)
                {
                    // Consume partial tile
                    ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);
                    break;
                }

                // Consume full tile
                ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);
                tile_offset += gridDim.x * TILE_SAMPLES;
            }
        }
    }


    //---------------------------------------------------------------------
    // Parameter extraction
    //---------------------------------------------------------------------

    // Return a native pixel pointer (specialized for CacheModifiedInputIterator types)
    template <
        CacheLoadModifier   _MODIFIER,
        typename            _ValueT,
        typename            _OffsetT>
    __device__ __forceinline__ SampleT* NativePointer(CacheModifiedInputIterator<_MODIFIER, _ValueT, _OffsetT> itr)
    {
        return itr.ptr;
    }

    // Return a native pixel pointer (specialized for other types)
    template <typename IteratorT>
    __device__ __forceinline__ SampleT* NativePointer(IteratorT itr)
    {
        return NULL;
    }


    //---------------------------------------------------------------------
    // Interface
    //---------------------------------------------------------------------
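
    // A minimal usage sketch (the policy, decode-op, and surrounding kernel
    // names are hypothetical; the shipped driver for this agent is the
    // histogram sweep kernel in dispatch_histogram.cuh):
    //
    //     typedef AgentHistogram<
    //             MyPolicyT, MY_SMEM_BINS, 1, 1,
    //             SampleT*, int, MyPrivatizedDecodeOpT, MyOutputDecodeOpT, int>
    //         AgentHistogramT;
    //
    //     __shared__ typename AgentHistogramT::TempStorage temp_storage;
    //
    //     AgentHistogramT agent(
    //         temp_storage, d_samples, num_output_bins, num_privatized_bins,
    //         d_output_histograms, d_privatized_histograms,
    //         output_decode_op, privatized_decode_op);
    //
    //     agent.InitBinCounters();    // Zero this block's privatized counters
    //     agent.ConsumeTiles(         // Histogram the region of interest
    //         num_row_pixels, num_rows, row_stride_samples,
    //         tiles_per_row, tile_queue);
    //     agent.StoreOutput();        // Fold privatized counts into the output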

    /**
     * Constructor
     */
    __device__ __forceinline__ AgentHistogram(
        TempStorage         &temp_storage,                                      ///< Reference to temp_storage
        SampleIteratorT     d_samples,                                          ///< Input data to histogram
        int                 (&num_output_bins)[NUM_ACTIVE_CHANNELS],            ///< The number of bins per final output histogram
        int                 (&num_privatized_bins)[NUM_ACTIVE_CHANNELS],        ///< The number of bins per privatized histogram
        CounterT*           (&d_output_histograms)[NUM_ACTIVE_CHANNELS],        ///< Reference to final output histograms
        CounterT*           (&d_privatized_histograms)[NUM_ACTIVE_CHANNELS],    ///< Reference to privatized histograms
        OutputDecodeOpT     (&output_decode_op)[NUM_ACTIVE_CHANNELS],           ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel
        PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS])       ///< The transform operator for determining privatized counter indices from samples, one for each channel
    :
        temp_storage(temp_storage.Alias()),
        d_wrapped_samples(d_samples),
        num_output_bins(num_output_bins),
        num_privatized_bins(num_privatized_bins),
        d_output_histograms(d_output_histograms),
        privatized_decode_op(privatized_decode_op),
        output_decode_op(output_decode_op),
        d_native_samples(NativePointer(d_wrapped_samples)),
        prefer_smem((MEM_PREFERENCE == SMEM) ?
            true :                              // prefer smem privatized histograms
            (MEM_PREFERENCE == GMEM) ?
                false :                         // prefer gmem privatized histograms
                blockIdx.x & 1)                 // blended: alternate the preference across block ranks
    {
        int blockId = (blockIdx.y * gridDim.x) + blockIdx.x;

        // Initialize the locations of this block's privatized histograms
        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
            this->d_privatized_histograms[CHANNEL] = d_privatized_histograms[CHANNEL] + (blockId * num_privatized_bins[CHANNEL]);
    }


    /**
     * Consume image
     */
    __device__ __forceinline__ void ConsumeTiles(
        OffsetT             num_row_pixels,         ///< The number of multi-channel pixels per row in the region of interest
        OffsetT             num_rows,               ///< The number of rows in the region of interest
        OffsetT             row_stride_samples,     ///< The number of samples between starts of consecutive rows in the region of interest
        int                 tiles_per_row,          ///< Number of image tiles per row
        GridQueue<int>      tile_queue)             ///< Queue descriptor for assigning tiles of work to thread blocks
    {
        // Check whether all row starting offsets are quad-aligned (in single-channel) or pixel-aligned (in multi-channel)
        int     quad_mask   = AlignBytes<QuadT>::ALIGN_BYTES - 1;
        int     pixel_mask  = AlignBytes<PixelT>::ALIGN_BYTES - 1;
        size_t  row_bytes   = sizeof(SampleT) * row_stride_samples;

        bool quad_aligned_rows  = (NUM_CHANNELS == 1) && (SAMPLES_PER_THREAD % 4 == 0) &&   // Single channel
                                    ((size_t(d_native_samples) & quad_mask) == 0) &&        // ptr is quad-aligned
                                    ((num_rows == 1) || ((row_bytes & quad_mask) == 0));    // number of row-samples is a multiple of the alignment of the quad

        bool pixel_aligned_rows = (NUM_CHANNELS > 1) &&                                     // Multi channel
                                    ((size_t(d_native_samples) & pixel_mask) == 0) &&       // ptr is pixel-aligned
                                    ((row_bytes & pixel_mask) == 0);                        // number of row-samples is a multiple of the alignment of the pixel

        // Whether rows are aligned and can be vectorized
        if ((d_native_samples != NULL) && (quad_aligned_rows || pixel_aligned_rows))
            ConsumeTiles<true>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
        else
            ConsumeTiles<false>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
    }


    /**
     * Initialize privatized bin counters (dispatches to shared- or global-memory counters per this block's preference)
     */
    __device__ __forceinline__ void InitBinCounters()
    {
        if (prefer_smem)
            InitSmemBinCounters();
        else
            InitGmemBinCounters();
    }


    /**
     * Store privatized histogram to device-accessible memory (dispatches to shared- or global-memory counters per this block's preference)
     */
    __device__ __forceinline__ void StoreOutput()
    {
        if (prefer_smem)
            StoreSmemOutput();
        else
            StoreGmemOutput();
    }

};


}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)