1 /****************************************************************************** 2 * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are met: 7 * * Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * * Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * * Neither the name of the NVIDIA CORPORATION nor the 13 * names of its contributors may be used to endorse or promote products 14 * derived from this software without specific prior written permission. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 * 27 ******************************************************************************/ 28 29 /** 30 * \file 31 * Static architectural properties by SM version. 32 */ 33 34 #pragma once 35 36 #include "util_namespace.cuh" 37 38 /// Optional outer namespace(s) 39 CUB_NS_PREFIX 40 41 /// CUB namespace 42 namespace cub { 43 44 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 45 46 #if (__CUDACC_VER_MAJOR__ >= 9) && !defined(CUB_USE_COOPERATIVE_GROUPS) 47 #define CUB_USE_COOPERATIVE_GROUPS 48 #endif 49 50 /// CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass). 51 #ifndef CUB_PTX_ARCH 52 #ifndef __CUDA_ARCH__ 53 #define CUB_PTX_ARCH 0 54 #else 55 #define CUB_PTX_ARCH __CUDA_ARCH__ 56 #endif 57 #endif 58 59 60 /// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API. 61 #ifndef CUB_RUNTIME_FUNCTION 62 #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__>= 350 && defined(__CUDACC_RDC__)) 63 #define CUB_RUNTIME_ENABLED 64 #define CUB_RUNTIME_FUNCTION __host__ __device__ 65 #else 66 #define CUB_RUNTIME_FUNCTION __host__ 67 #endif 68 #endif 69 70 71 /// Number of threads per warp 72 #ifndef CUB_LOG_WARP_THREADS 73 #define CUB_LOG_WARP_THREADS(arch) \ 74 (5) 75 #define CUB_WARP_THREADS(arch) \ 76 (1 << CUB_LOG_WARP_THREADS(arch)) 77 78 #define CUB_PTX_WARP_THREADS CUB_WARP_THREADS(CUB_PTX_ARCH) 79 #define CUB_PTX_LOG_WARP_THREADS CUB_LOG_WARP_THREADS(CUB_PTX_ARCH) 80 #endif 81 82 83 /// Number of smem banks 84 #ifndef CUB_LOG_SMEM_BANKS 85 #define CUB_LOG_SMEM_BANKS(arch) \ 86 ((arch >= 200) ? \ 87 (5) : \ 88 (4)) 89 #define CUB_SMEM_BANKS(arch) \ 90 (1 << CUB_LOG_SMEM_BANKS(arch)) 91 92 #define CUB_PTX_LOG_SMEM_BANKS CUB_LOG_SMEM_BANKS(CUB_PTX_ARCH) 93 #define CUB_PTX_SMEM_BANKS CUB_SMEM_BANKS(CUB_PTX_ARCH) 94 #endif 95 96 97 /// Oversubscription factor 98 #ifndef CUB_SUBSCRIPTION_FACTOR 99 #define CUB_SUBSCRIPTION_FACTOR(arch) \ 100 ((arch >= 300) ? \ 101 (5) : \ 102 ((arch >= 200) ? \ 103 (3) : \ 104 (10))) 105 #define CUB_PTX_SUBSCRIPTION_FACTOR CUB_SUBSCRIPTION_FACTOR(CUB_PTX_ARCH) 106 #endif 107 108 109 /// Prefer padding overhead vs X-way conflicts greater than this threshold 110 #ifndef CUB_PREFER_CONFLICT_OVER_PADDING 111 #define CUB_PREFER_CONFLICT_OVER_PADDING(arch) \ 112 ((arch >= 300) ? \ 113 (1) : \ 114 (4)) 115 #define CUB_PTX_PREFER_CONFLICT_OVER_PADDING CUB_PREFER_CONFLICT_OVER_PADDING(CUB_PTX_ARCH) 116 #endif 117 118 119 /// Scale down the number of threads to keep same amount of scratch storage as the nominal configuration for 4B data. Minimum of two warps. 120 #ifndef CUB_SCALED_BLOCK_THREADS 121 #define CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) \ 122 (CUB_MIN( \ 123 NOMINAL_4B_BLOCK_THREADS, \ 124 CUB_WARP_THREADS(PTX_ARCH) * CUB_MAX( \ 125 2, \ 126 (NOMINAL_4B_BLOCK_THREADS / CUB_WARP_THREADS(PTX_ARCH)) * 4 / sizeof(T)))) 127 #endif 128 129 /// Scale down number of items per thread to keep the same amount of register storage as the nominal configuration for 4B data. Minimum 1 item per thread 130 #ifndef CUB_SCALED_ITEMS_PER_THREAD 131 #define CUB_SCALED_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) \ 132 CUB_MAX( \ 133 1, \ 134 (sizeof(T) < 4) ? \ 135 ((NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4) / CUB_MAX(4, sizeof(T))) / CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) / 2 : \ 136 ((NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4) / CUB_MAX(4, sizeof(T))) / CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH)) 137 #endif 138 139 /// Define both nominal threads-per-block and items-per-thread 140 #ifndef CUB_SCALED_GRANULARITIES 141 #define CUB_SCALED_GRANULARITIES(NOMINAL_4B_BLOCK_THREADS, NOMINAL_4B_ITEMS_PER_THREAD, T) \ 142 CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, 200), \ 143 CUB_SCALED_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, 200) 144 #endif 145 146 147 148 #endif // Do not document 149 150 } // CUB namespace 151 CUB_NS_POSTFIX // Optional outer namespace(s) 152