1 2 /****************************************************************************** 3 * Copyright (c) 2011, Duane Merrill. All rights reserved. 4 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions are met: 8 * * Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * * Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * * Neither the name of the NVIDIA CORPORATION nor the 14 * names of its contributors may be used to endorse or promote products 15 * derived from this software without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 * 28 ******************************************************************************/ 29 30 /** 31 * \file 32 * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory. 33 */ 34 35 #pragma once 36 37 #include <stdio.h> 38 #include <iterator> 39 #include <limits> 40 41 #include "dispatch/dispatch_histogram.cuh" 42 #include "../util_namespace.cuh" 43 44 /// Optional outer namespace(s) 45 CUB_NS_PREFIX 46 47 /// CUB namespace 48 namespace cub { 49 50 51 /** 52 * \brief DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory. ![](histogram_logo.png) 53 * \ingroup SingleModule 54 * 55 * \par Overview 56 * A <a href="http://en.wikipedia.org/wiki/Histogram"><em>histogram</em></a> 57 * counts the number of observations that fall into each of the disjoint categories (known as <em>bins</em>). 58 * 59 * \par Usage Considerations 60 * \cdp_class{DeviceHistogram} 61 * 62 */ 63 struct DeviceHistogram 64 { 65 /******************************************************************//** 66 * \name Evenly-segmented bin ranges 67 *********************************************************************/ 68 //@{ 69 70 /** 71 * \brief Computes an intensity histogram from a sequence of data samples using equal-width bins. 72 * 73 * \par 74 * - The number of histogram bins is (\p num_levels - 1) 75 * - All bins comprise the same width of sample values: (\p upper_level - \p lower_level) / (\p num_levels - 1) 76 * - \devicestorage 77 * 78 * \par Snippet 79 * The code snippet below illustrates the computation of a six-bin histogram 80 * from a sequence of float samples 81 * 82 * \par 83 * \code 84 * #include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh> 85 * 86 * // Declare, allocate, and initialize device-accessible pointers for input samples and 87 * // output histogram 88 * int num_samples; // e.g., 10 89 * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5] 90 * int* d_histogram; // e.g., [ -, -, -, -, -, -, -, -] 91 * int num_levels; // e.g., 7 (seven level boundaries for six bins) 92 * float lower_level; // e.g., 0.0 (lower sample value boundary of lowest bin) 93 * float upper_level; // e.g., 12.0 (upper sample value boundary of upper bin) 94 * ... 95 * 96 * // Determine temporary device storage requirements 97 * void* d_temp_storage = NULL; 98 * size_t temp_storage_bytes = 0; 99 * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, 100 * d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples); 101 * 102 * // Allocate temporary storage 103 * cudaMalloc(&d_temp_storage, temp_storage_bytes); 104 * 105 * // Compute histograms 106 * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, 107 * d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples); 108 * 109 * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0]; 110 * 111 * \endcode 112 * 113 * \tparam SampleIteratorT <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator 114 * \tparam CounterT <b>[inferred]</b> Integer type for histogram bin counters 115 * \tparam LevelT <b>[inferred]</b> Type for specifying boundaries (levels) 116 * \tparam OffsetT <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 117 */ 118 template < 119 typename SampleIteratorT, 120 typename CounterT, 121 typename LevelT, 122 typename OffsetT> 123 CUB_RUNTIME_FUNCTION HistogramEvencub::DeviceHistogram124 static cudaError_t HistogramEven( 125 void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 126 size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation 127 SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. 128 CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1. 129 int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is <tt>num_levels</tt> - 1. 130 LevelT lower_level, ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin. 131 LevelT upper_level, ///< [in] The upper sample value bound (exclusive) for the highest histogram bin. 132 OffsetT num_samples, ///< [in] The number of input samples (i.e., the length of \p d_samples) 133 cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. 134 bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 135 { 136 /// The sample value type of the input iterator 137 typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT; 138 139 CounterT* d_histogram1[1] = {d_histogram}; 140 int num_levels1[1] = {num_levels}; 141 LevelT lower_level1[1] = {lower_level}; 142 LevelT upper_level1[1] = {upper_level}; 143 144 return MultiHistogramEven<1, 1>( 145 d_temp_storage, 146 temp_storage_bytes, 147 d_samples, 148 d_histogram1, 149 num_levels1, 150 lower_level1, 151 upper_level1, 152 num_samples, 153 1, 154 sizeof(SampleT) * num_samples, 155 stream, 156 debug_synchronous); 157 } 158 159 160 /** 161 * \brief Computes an intensity histogram from a sequence of data samples using equal-width bins. 162 * 163 * \par 164 * - A two-dimensional <em>region of interest</em> within \p d_samples can be specified 165 * using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters. 166 * - The row stride must be a whole multiple of the sample data type 167 * size, i.e., <tt>(row_stride_bytes % sizeof(SampleT)) == 0</tt>. 168 * - The number of histogram bins is (\p num_levels - 1) 169 * - All bins comprise the same width of sample values: (\p upper_level - \p lower_level) / (\p num_levels - 1) 170 * - \devicestorage 171 * 172 * \par Snippet 173 * The code snippet below illustrates the computation of a six-bin histogram 174 * from a 2x5 region of interest within a flattened 2x7 array of float samples. 175 * 176 * \par 177 * \code 178 * #include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh> 179 * 180 * // Declare, allocate, and initialize device-accessible pointers for input samples and 181 * // output histogram 182 * int num_row_samples; // e.g., 5 183 * int num_rows; // e.g., 2; 184 * size_t row_stride_bytes; // e.g., 7 * sizeof(float) 185 * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, -, -, 186 * // 0.3, 2.9, 2.0, 6.1, 999.5, -, -] 187 * int* d_histogram; // e.g., [ -, -, -, -, -, -, -, -] 188 * int num_levels; // e.g., 7 (seven level boundaries for six bins) 189 * float lower_level; // e.g., 0.0 (lower sample value boundary of lowest bin) 190 * float upper_level; // e.g., 12.0 (upper sample value boundary of upper bin) 191 * ... 192 * 193 * // Determine temporary device storage requirements 194 * void* d_temp_storage = NULL; 195 * size_t temp_storage_bytes = 0; 196 * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, 197 * d_samples, d_histogram, num_levels, lower_level, upper_level, 198 * num_row_samples, num_rows, row_stride_bytes); 199 * 200 * // Allocate temporary storage 201 * cudaMalloc(&d_temp_storage, temp_storage_bytes); 202 * 203 * // Compute histograms 204 * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, d_samples, d_histogram, 205 * d_samples, d_histogram, num_levels, lower_level, upper_level, 206 * num_row_samples, num_rows, row_stride_bytes); 207 * 208 * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0]; 209 * 210 * \endcode 211 * 212 * \tparam SampleIteratorT <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator 213 * \tparam CounterT <b>[inferred]</b> Integer type for histogram bin counters 214 * \tparam LevelT <b>[inferred]</b> Type for specifying boundaries (levels) 215 * \tparam OffsetT <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 216 */ 217 template < 218 typename SampleIteratorT, 219 typename CounterT, 220 typename LevelT, 221 typename OffsetT> 222 CUB_RUNTIME_FUNCTION HistogramEvencub::DeviceHistogram223 static cudaError_t HistogramEven( 224 void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 225 size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation 226 SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. 227 CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1. 228 int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is <tt>num_levels</tt> - 1. 229 LevelT lower_level, ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin. 230 LevelT upper_level, ///< [in] The upper sample value bound (exclusive) for the highest histogram bin. 231 OffsetT num_row_samples, ///< [in] The number of data samples per row in the region of interest 232 OffsetT num_rows, ///< [in] The number of rows in the region of interest 233 size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest 234 cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. 235 bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 236 { 237 CounterT* d_histogram1[1] = {d_histogram}; 238 int num_levels1[1] = {num_levels}; 239 LevelT lower_level1[1] = {lower_level}; 240 LevelT upper_level1[1] = {upper_level}; 241 242 return MultiHistogramEven<1, 1>( 243 d_temp_storage, 244 temp_storage_bytes, 245 d_samples, 246 d_histogram1, 247 num_levels1, 248 lower_level1, 249 upper_level1, 250 num_row_samples, 251 num_rows, 252 row_stride_bytes, 253 stream, 254 debug_synchronous); 255 } 256 257 /** 258 * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins. 259 * 260 * \par 261 * - The input is a sequence of <em>pixel</em> structures, where each pixel comprises 262 * a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel). 263 * - Of the \p NUM_CHANNELS specified, the function will only compute histograms 264 * for the first \p NUM_ACTIVE_CHANNELS (e.g., only <em>RGB</em> histograms from <em>RGBA</em> 265 * pixel samples). 266 * - The number of histogram bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1. 267 * - For channel<sub><em>i</em></sub>, the range of values for all histogram bins 268 * have the same width: (<tt>upper_level[i]</tt> - <tt>lower_level[i]</tt>) / (<tt> num_levels[i]</tt> - 1) 269 * - \devicestorage 270 * 271 * \par Snippet 272 * The code snippet below illustrates the computation of three 256-bin <em>RGB</em> histograms 273 * from a quad-channel sequence of <em>RGBA</em> pixels (8 bits per channel per pixel) 274 * 275 * \par 276 * \code 277 * #include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh> 278 * 279 * // Declare, allocate, and initialize device-accessible pointers for input samples 280 * // and output histograms 281 * int num_pixels; // e.g., 5 282 * unsigned char* d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), 283 * // (0, 6, 7, 5), (3, 0, 2, 6)] 284 * int* d_histogram[3]; // e.g., three device pointers to three device buffers, 285 * // each allocated with 256 integer counters 286 * int num_levels[3]; // e.g., {257, 257, 257}; 287 * unsigned int lower_level[3]; // e.g., {0, 0, 0}; 288 * unsigned int upper_level[3]; // e.g., {256, 256, 256}; 289 * ... 290 * 291 * // Determine temporary device storage requirements 292 * void* d_temp_storage = NULL; 293 * size_t temp_storage_bytes = 0; 294 * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, 295 * d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels); 296 * 297 * // Allocate temporary storage 298 * cudaMalloc(&d_temp_storage, temp_storage_bytes); 299 * 300 * // Compute histograms 301 * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, 302 * d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels); 303 * 304 * // d_histogram <-- [ [1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0], 305 * // [0, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0], 306 * // [0, 0, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ] 307 * 308 * \endcode 309 * 310 * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) 311 * \tparam NUM_ACTIVE_CHANNELS <b>[inferred]</b> Number of channels actively being histogrammed 312 * \tparam SampleIteratorT <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator 313 * \tparam CounterT <b>[inferred]</b> Integer type for histogram bin counters 314 * \tparam LevelT <b>[inferred]</b> Type for specifying boundaries (levels) 315 * \tparam OffsetT <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 316 */ 317 template < 318 int NUM_CHANNELS, 319 int NUM_ACTIVE_CHANNELS, 320 typename SampleIteratorT, 321 typename CounterT, 322 typename LevelT, 323 typename OffsetT> 324 CUB_RUNTIME_FUNCTION MultiHistogramEvencub::DeviceHistogram325 static cudaError_t MultiHistogramEven( 326 void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 327 size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation 328 SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four <em>RGBA</em> 8-bit samples). 329 CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> should be <tt>num_levels[i]</tt> - 1. 330 int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1. 331 LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. 332 LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. 333 OffsetT num_pixels, ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS) 334 cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. 335 bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 336 { 337 /// The sample value type of the input iterator 338 typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT; 339 340 return MultiHistogramEven<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>( 341 d_temp_storage, 342 temp_storage_bytes, 343 d_samples, 344 d_histogram, 345 num_levels, 346 lower_level, 347 upper_level, 348 num_pixels, 349 1, 350 sizeof(SampleT) * NUM_CHANNELS * num_pixels, 351 stream, 352 debug_synchronous); 353 } 354 355 356 /** 357 * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins. 358 * 359 * \par 360 * - The input is a sequence of <em>pixel</em> structures, where each pixel comprises 361 * a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel). 362 * - Of the \p NUM_CHANNELS specified, the function will only compute histograms 363 * for the first \p NUM_ACTIVE_CHANNELS (e.g., only <em>RGB</em> histograms from <em>RGBA</em> 364 * pixel samples). 365 * - A two-dimensional <em>region of interest</em> within \p d_samples can be specified 366 * using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters. 367 * - The row stride must be a whole multiple of the sample data type 368 * size, i.e., <tt>(row_stride_bytes % sizeof(SampleT)) == 0</tt>. 369 * - The number of histogram bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1. 370 * - For channel<sub><em>i</em></sub>, the range of values for all histogram bins 371 * have the same width: (<tt>upper_level[i]</tt> - <tt>lower_level[i]</tt>) / (<tt> num_levels[i]</tt> - 1) 372 * - \devicestorage 373 * 374 * \par Snippet 375 * The code snippet below illustrates the computation of three 256-bin <em>RGB</em> histograms from a 2x3 region of 376 * interest of within a flattened 2x4 array of quad-channel <em>RGBA</em> pixels (8 bits per channel per pixel). 377 * 378 * \par 379 * \code 380 * #include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh> 381 * 382 * // Declare, allocate, and initialize device-accessible pointers for input samples 383 * // and output histograms 384 * int num_row_pixels; // e.g., 3 385 * int num_rows; // e.g., 2 386 * size_t row_stride_bytes; // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS 387 * unsigned char* d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), (-, -, -, -), 388 * // (0, 6, 7, 5), (3, 0, 2, 6), (1, 1, 1, 1), (-, -, -, -)] 389 * int* d_histogram[3]; // e.g., three device pointers to three device buffers, 390 * // each allocated with 256 integer counters 391 * int num_levels[3]; // e.g., {257, 257, 257}; 392 * unsigned int lower_level[3]; // e.g., {0, 0, 0}; 393 * unsigned int upper_level[3]; // e.g., {256, 256, 256}; 394 * ... 395 * 396 * // Determine temporary device storage requirements 397 * void* d_temp_storage = NULL; 398 * size_t temp_storage_bytes = 0; 399 * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, 400 * d_samples, d_histogram, num_levels, lower_level, upper_level, 401 * num_row_pixels, num_rows, row_stride_bytes); 402 * 403 * // Allocate temporary storage 404 * cudaMalloc(&d_temp_storage, temp_storage_bytes); 405 * 406 * // Compute histograms 407 * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, 408 * d_samples, d_histogram, num_levels, lower_level, upper_level, 409 * num_row_pixels, num_rows, row_stride_bytes); 410 * 411 * // d_histogram <-- [ [1, 1, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0], 412 * // [0, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0], 413 * // [0, 1, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ] 414 * 415 * \endcode 416 * 417 * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) 418 * \tparam NUM_ACTIVE_CHANNELS <b>[inferred]</b> Number of channels actively being histogrammed 419 * \tparam SampleIteratorT <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator 420 * \tparam CounterT <b>[inferred]</b> Integer type for histogram bin counters 421 * \tparam LevelT <b>[inferred]</b> Type for specifying boundaries (levels) 422 * \tparam OffsetT <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 423 */ 424 template < 425 int NUM_CHANNELS, 426 int NUM_ACTIVE_CHANNELS, 427 typename SampleIteratorT, 428 typename CounterT, 429 typename LevelT, 430 typename OffsetT> 431 CUB_RUNTIME_FUNCTION MultiHistogramEvencub::DeviceHistogram432 static cudaError_t MultiHistogramEven( 433 void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 434 size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation 435 SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four <em>RGBA</em> 8-bit samples). 436 CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> should be <tt>num_levels[i]</tt> - 1. 437 int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1. 438 LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. 439 LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. 440 OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest 441 OffsetT num_rows, ///< [in] The number of rows in the region of interest 442 size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest 443 cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. 444 bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 445 { 446 /// The sample value type of the input iterator 447 typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT; 448 Int2Type<sizeof(SampleT) == 1> is_byte_sample; 449 450 if ((sizeof(OffsetT) > sizeof(int)) && 451 ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) std::numeric_limits<int>::max())) 452 { 453 // Down-convert OffsetT data type 454 455 456 return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, int>::DispatchEven( 457 d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, 458 (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)), 459 stream, debug_synchronous, is_byte_sample); 460 } 461 462 return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT>::DispatchEven( 463 d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, 464 num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)), 465 stream, debug_synchronous, is_byte_sample); 466 } 467 468 469 //@} end member group 470 /******************************************************************//** 471 * \name Custom bin ranges 472 *********************************************************************/ 473 //@{ 474 475 /** 476 * \brief Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels. 477 * 478 * \par 479 * - The number of histogram bins is (\p num_levels - 1) 480 * - The value range for bin<sub><em>i</em></sub> is [<tt>level[i]</tt>, <tt>level[i+1]</tt>) 481 * - \devicestorage 482 * 483 * \par Snippet 484 * The code snippet below illustrates the computation of an six-bin histogram 485 * from a sequence of float samples 486 * 487 * \par 488 * \code 489 * #include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh> 490 * 491 * // Declare, allocate, and initialize device-accessible pointers for input samples and 492 * // output histogram 493 * int num_samples; // e.g., 10 494 * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5] 495 * int* d_histogram; // e.g., [ -, -, -, -, -, -, -, -] 496 * int num_levels // e.g., 7 (seven level boundaries for six bins) 497 * float* d_levels; // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0] 498 * ... 499 * 500 * // Determine temporary device storage requirements 501 * void* d_temp_storage = NULL; 502 * size_t temp_storage_bytes = 0; 503 * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, 504 * d_samples, d_histogram, num_levels, d_levels, num_samples); 505 * 506 * // Allocate temporary storage 507 * cudaMalloc(&d_temp_storage, temp_storage_bytes); 508 * 509 * // Compute histograms 510 * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, 511 * d_samples, d_histogram, num_levels, d_levels, num_samples); 512 * 513 * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0]; 514 * 515 * \endcode 516 * 517 * \tparam SampleIteratorT <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator 518 * \tparam CounterT <b>[inferred]</b> Integer type for histogram bin counters 519 * \tparam LevelT <b>[inferred]</b> Type for specifying boundaries (levels) 520 * \tparam OffsetT <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 521 */ 522 template < 523 typename SampleIteratorT, 524 typename CounterT, 525 typename LevelT, 526 typename OffsetT> 527 CUB_RUNTIME_FUNCTION HistogramRangecub::DeviceHistogram528 static cudaError_t HistogramRange( 529 void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 530 size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation 531 SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. 532 CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1. 533 int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is <tt>num_levels</tt> - 1. 534 LevelT* d_levels, ///< [in] The pointer to the array of boundaries (levels). Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. 535 OffsetT num_samples, ///< [in] The number of data samples per row in the region of interest 536 cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. 537 bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 538 { 539 /// The sample value type of the input iterator 540 typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT; 541 542 CounterT* d_histogram1[1] = {d_histogram}; 543 int num_levels1[1] = {num_levels}; 544 LevelT* d_levels1[1] = {d_levels}; 545 546 return MultiHistogramRange<1, 1>( 547 d_temp_storage, 548 temp_storage_bytes, 549 d_samples, 550 d_histogram1, 551 num_levels1, 552 d_levels1, 553 num_samples, 554 1, 555 sizeof(SampleT) * num_samples, 556 stream, 557 debug_synchronous); 558 } 559 560 561 /** 562 * \brief Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels. 563 * 564 * \par 565 * - A two-dimensional <em>region of interest</em> within \p d_samples can be specified 566 * using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters. 567 * - The row stride must be a whole multiple of the sample data type 568 * size, i.e., <tt>(row_stride_bytes % sizeof(SampleT)) == 0</tt>. 569 * - The number of histogram bins is (\p num_levels - 1) 570 * - The value range for bin<sub><em>i</em></sub> is [<tt>level[i]</tt>, <tt>level[i+1]</tt>) 571 * - \devicestorage 572 * 573 * \par Snippet 574 * The code snippet below illustrates the computation of a six-bin histogram 575 * from a 2x5 region of interest within a flattened 2x7 array of float samples. 576 * 577 * \par 578 * \code 579 * #include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh> 580 * 581 * // Declare, allocate, and initialize device-accessible pointers for input samples and 582 * // output histogram 583 * int num_row_samples; // e.g., 5 584 * int num_rows; // e.g., 2; 585 * int row_stride_bytes; // e.g., 7 * sizeof(float) 586 * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, -, -, 587 * // 0.3, 2.9, 2.0, 6.1, 999.5, -, -] 588 * int* d_histogram; // e.g., [ , , , , , , , ] 589 * int num_levels // e.g., 7 (seven level boundaries for six bins) 590 * float *d_levels; // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0] 591 * ... 592 * 593 * // Determine temporary device storage requirements 594 * void* d_temp_storage = NULL; 595 * size_t temp_storage_bytes = 0; 596 * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, 597 * d_samples, d_histogram, num_levels, d_levels, 598 * num_row_samples, num_rows, row_stride_bytes); 599 * 600 * // Allocate temporary storage 601 * cudaMalloc(&d_temp_storage, temp_storage_bytes); 602 * 603 * // Compute histograms 604 * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, 605 * d_samples, d_histogram, num_levels, d_levels, 606 * num_row_samples, num_rows, row_stride_bytes); 607 * 608 * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0]; 609 * 610 * \endcode 611 * 612 * \tparam SampleIteratorT <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator 613 * \tparam CounterT <b>[inferred]</b> Integer type for histogram bin counters 614 * \tparam LevelT <b>[inferred]</b> Type for specifying boundaries (levels) 615 * \tparam OffsetT <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 616 */ 617 template < 618 typename SampleIteratorT, 619 typename CounterT, 620 typename LevelT, 621 typename OffsetT> 622 CUB_RUNTIME_FUNCTION HistogramRangecub::DeviceHistogram623 static cudaError_t HistogramRange( 624 void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 625 size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation 626 SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. 627 CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1. 628 int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is <tt>num_levels</tt> - 1. 629 LevelT* d_levels, ///< [in] The pointer to the array of boundaries (levels). Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. 630 OffsetT num_row_samples, ///< [in] The number of data samples per row in the region of interest 631 OffsetT num_rows, ///< [in] The number of rows in the region of interest 632 size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest 633 cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. 634 bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 635 { 636 CounterT* d_histogram1[1] = {d_histogram}; 637 int num_levels1[1] = {num_levels}; 638 LevelT* d_levels1[1] = {d_levels}; 639 640 return MultiHistogramRange<1, 1>( 641 d_temp_storage, 642 temp_storage_bytes, 643 d_samples, 644 d_histogram1, 645 num_levels1, 646 d_levels1, 647 num_row_samples, 648 num_rows, 649 row_stride_bytes, 650 stream, 651 debug_synchronous); 652 } 653 654 /** 655 * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels. 656 * 657 * \par 658 * - The input is a sequence of <em>pixel</em> structures, where each pixel comprises 659 * a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel). 660 * - Of the \p NUM_CHANNELS specified, the function will only compute histograms 661 * for the first \p NUM_ACTIVE_CHANNELS (e.g., <em>RGB</em> histograms from <em>RGBA</em> 662 * pixel samples). 663 * - The number of histogram bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1. 664 * - For channel<sub><em>i</em></sub>, the range of values for all histogram bins 665 * have the same width: (<tt>upper_level[i]</tt> - <tt>lower_level[i]</tt>) / (<tt> num_levels[i]</tt> - 1) 666 * - \devicestorage 667 * 668 * \par Snippet 669 * The code snippet below illustrates the computation of three 4-bin <em>RGB</em> histograms 670 * from a quad-channel sequence of <em>RGBA</em> pixels (8 bits per channel per pixel) 671 * 672 * \par 673 * \code 674 * #include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh> 675 * 676 * // Declare, allocate, and initialize device-accessible pointers for input samples 677 * // and output histograms 678 * int num_pixels; // e.g., 5 679 * unsigned char *d_samples; // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(7, 0, 6, 2), 680 * // (0, 6, 7, 5),(3, 0, 2, 6)] 681 * unsigned int *d_histogram[3]; // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]]; 682 * int num_levels[3]; // e.g., {5, 5, 5}; 683 * unsigned int *d_levels[3]; // e.g., [ [0, 2, 4, 6, 8], 684 * // [0, 2, 4, 6, 8], 685 * // [0, 2, 4, 6, 8] ]; 686 * ... 687 * 688 * // Determine temporary device storage requirements 689 * void* d_temp_storage = NULL; 690 * size_t temp_storage_bytes = 0; 691 * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, 692 * d_samples, d_histogram, num_levels, d_levels, num_pixels); 693 * 694 * // Allocate temporary storage 695 * cudaMalloc(&d_temp_storage, temp_storage_bytes); 696 * 697 * // Compute histograms 698 * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, 699 * d_samples, d_histogram, num_levels, d_levels, num_pixels); 700 * 701 * // d_histogram <-- [ [1, 3, 0, 1], 702 * // [3, 0, 0, 2], 703 * // [0, 2, 0, 3] ] 704 * 705 * \endcode 706 * 707 * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) 708 * \tparam NUM_ACTIVE_CHANNELS <b>[inferred]</b> Number of channels actively being histogrammed 709 * \tparam SampleIteratorT <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator 710 * \tparam CounterT <b>[inferred]</b> Integer type for histogram bin counters 711 * \tparam LevelT <b>[inferred]</b> Type for specifying boundaries (levels) 712 * \tparam OffsetT <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 713 */ 714 template < 715 int NUM_CHANNELS, 716 int NUM_ACTIVE_CHANNELS, 717 typename SampleIteratorT, 718 typename CounterT, 719 typename LevelT, 720 typename OffsetT> 721 CUB_RUNTIME_FUNCTION MultiHistogramRangecub::DeviceHistogram722 static cudaError_t MultiHistogramRange( 723 void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 724 size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation 725 SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four <em>RGBA</em> 8-bit samples). 726 CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> should be <tt>num_levels[i]</tt> - 1. 727 int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1. 728 LevelT* d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. 729 OffsetT num_pixels, ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS) 730 cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. 731 bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 732 { 733 /// The sample value type of the input iterator 734 typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT; 735 736 return MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>( 737 d_temp_storage, 738 temp_storage_bytes, 739 d_samples, 740 d_histogram, 741 num_levels, 742 d_levels, 743 num_pixels, 744 1, 745 sizeof(SampleT) * NUM_CHANNELS * num_pixels, 746 stream, 747 debug_synchronous); 748 } 749 750 751 /** 752 * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels. 753 * 754 * \par 755 * - The input is a sequence of <em>pixel</em> structures, where each pixel comprises 756 * a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel). 757 * - Of the \p NUM_CHANNELS specified, the function will only compute histograms 758 * for the first \p NUM_ACTIVE_CHANNELS (e.g., <em>RGB</em> histograms from <em>RGBA</em> 759 * pixel samples). 760 * - A two-dimensional <em>region of interest</em> within \p d_samples can be specified 761 * using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters. 762 * - The row stride must be a whole multiple of the sample data type 763 * size, i.e., <tt>(row_stride_bytes % sizeof(SampleT)) == 0</tt>. 764 * - The number of histogram bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1. 765 * - For channel<sub><em>i</em></sub>, the range of values for all histogram bins 766 * have the same width: (<tt>upper_level[i]</tt> - <tt>lower_level[i]</tt>) / (<tt> num_levels[i]</tt> - 1) 767 * - \devicestorage 768 * 769 * \par Snippet 770 * The code snippet below illustrates the computation of three 4-bin <em>RGB</em> histograms from a 2x3 region of 771 * interest of within a flattened 2x4 array of quad-channel <em>RGBA</em> pixels (8 bits per channel per pixel). 772 * 773 * \par 774 * \code 775 * #include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh> 776 * 777 * // Declare, allocate, and initialize device-accessible pointers for input samples 778 * // and output histograms 779 * int num_row_pixels; // e.g., 3 780 * int num_rows; // e.g., 2 781 * size_t row_stride_bytes; // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS 782 * unsigned char* d_samples; // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(1, 1, 1, 1),(-, -, -, -), 783 * // (7, 0, 6, 2),(0, 6, 7, 5),(3, 0, 2, 6),(-, -, -, -)] 784 * int* d_histogram[3]; // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]]; 785 * int num_levels[3]; // e.g., {5, 5, 5}; 786 * unsigned int* d_levels[3]; // e.g., [ [0, 2, 4, 6, 8], 787 * // [0, 2, 4, 6, 8], 788 * // [0, 2, 4, 6, 8] ]; 789 * ... 790 * 791 * // Determine temporary device storage requirements 792 * void* d_temp_storage = NULL; 793 * size_t temp_storage_bytes = 0; 794 * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, 795 * d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes); 796 * 797 * // Allocate temporary storage 798 * cudaMalloc(&d_temp_storage, temp_storage_bytes); 799 * 800 * // Compute histograms 801 * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, 802 * d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes); 803 * 804 * // d_histogram <-- [ [2, 3, 0, 1], 805 * // [3, 0, 0, 2], 806 * // [1, 2, 0, 3] ] 807 * 808 * \endcode 809 * 810 * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) 811 * \tparam NUM_ACTIVE_CHANNELS <b>[inferred]</b> Number of channels actively being histogrammed 812 * \tparam SampleIteratorT <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator 813 * \tparam CounterT <b>[inferred]</b> Integer type for histogram bin counters 814 * \tparam LevelT <b>[inferred]</b> Type for specifying boundaries (levels) 815 * \tparam OffsetT <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 816 */ 817 template < 818 int NUM_CHANNELS, 819 int NUM_ACTIVE_CHANNELS, 820 typename SampleIteratorT, 821 typename CounterT, 822 typename LevelT, 823 typename OffsetT> 824 CUB_RUNTIME_FUNCTION MultiHistogramRangecub::DeviceHistogram825 static cudaError_t MultiHistogramRange( 826 void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 827 size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation 828 SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four <em>RGBA</em> 8-bit samples). 829 CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> should be <tt>num_levels[i]</tt> - 1. 830 int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1. 831 LevelT* d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. 832 OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest 833 OffsetT num_rows, ///< [in] The number of rows in the region of interest 834 size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest 835 cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. 836 bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 837 { 838 /// The sample value type of the input iterator 839 typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT; 840 Int2Type<sizeof(SampleT) == 1> is_byte_sample; 841 842 if ((sizeof(OffsetT) > sizeof(int)) && 843 ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) std::numeric_limits<int>::max())) 844 { 845 // Down-convert OffsetT data type 846 return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, int>::DispatchRange( 847 d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, 848 (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)), 849 stream, debug_synchronous, is_byte_sample); 850 } 851 852 return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT>::DispatchRange( 853 d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, 854 num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)), 855 stream, debug_synchronous, is_byte_sample); 856 } 857 858 859 860 //@} end member group 861 }; 862 863 } // CUB namespace 864 CUB_NS_POSTFIX // Optional outer namespace(s) 865 866 867