1 /****************************************************************************** 2 * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are met: 7 * * Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * * Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * * Neither the name of the NVIDIA CORPORATION nor the 13 * names of its contributors may be used to endorse or promote products 14 * derived from this software without specific prior written permission. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 * 27 ******************************************************************************/ 28 29 /** 30 * \file 31 * Static architectural properties by SM version. 32 */ 33 34 #pragma once 35 36 #include "util_cpp_dialect.cuh" 37 #include "util_namespace.cuh" 38 #include "util_macro.cuh" 39 40 /// Optional outer namespace(s) 41 CUB_NS_PREFIX 42 43 /// CUB namespace 44 namespace cub { 45 46 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 47 48 #if ((__CUDACC_VER_MAJOR__ >= 9) || defined(__NVCOMPILER_CUDA__)) && \ 49 !defined(CUB_USE_COOPERATIVE_GROUPS) 50 #define CUB_USE_COOPERATIVE_GROUPS 51 #endif 52 53 /// In device code, CUB_PTX_ARCH expands to the PTX version for which we are 54 /// compiling. In host code, CUB_PTX_ARCH's value is implementation defined. 55 #ifndef CUB_PTX_ARCH 56 #if defined(__NVCOMPILER_CUDA__) 57 // __NVCOMPILER_CUDA_ARCH__ is the target PTX version, and is defined 58 // when compiling both host code and device code. Currently, only one 59 // PTX version can be targeted. 60 #define CUB_PTX_ARCH __NVCOMPILER_CUDA_ARCH__ 61 #elif !defined(__CUDA_ARCH__) 62 #define CUB_PTX_ARCH 0 63 #else 64 #define CUB_PTX_ARCH __CUDA_ARCH__ 65 #endif 66 #endif 67 68 #ifndef CUB_IS_DEVICE_CODE 69 #if defined(__NVCOMPILER_CUDA__) 70 #define CUB_IS_DEVICE_CODE __builtin_is_device_code() 71 #define CUB_IS_HOST_CODE (!__builtin_is_device_code()) 72 #define CUB_INCLUDE_DEVICE_CODE 1 73 #define CUB_INCLUDE_HOST_CODE 1 74 #elif CUB_PTX_ARCH > 0 75 #define CUB_IS_DEVICE_CODE 1 76 #define CUB_IS_HOST_CODE 0 77 #define CUB_INCLUDE_DEVICE_CODE 1 78 #define CUB_INCLUDE_HOST_CODE 0 79 #else 80 #define CUB_IS_DEVICE_CODE 0 81 #define CUB_IS_HOST_CODE 1 82 #define CUB_INCLUDE_DEVICE_CODE 0 83 #define CUB_INCLUDE_HOST_CODE 1 84 #endif 85 #endif 86 87 /// Maximum number of devices supported. 88 #ifndef CUB_MAX_DEVICES 89 #define CUB_MAX_DEVICES 128 90 #endif 91 92 #if CUB_CPP_DIALECT >= 2011 93 static_assert(CUB_MAX_DEVICES > 0, "CUB_MAX_DEVICES must be greater than 0."); 94 #endif 95 96 /// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API. 97 #ifndef CUB_RUNTIME_FUNCTION 98 #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__>= 350 && defined(__CUDACC_RDC__)) 99 #define CUB_RUNTIME_ENABLED 100 #define CUB_RUNTIME_FUNCTION __host__ __device__ 101 #else 102 #define CUB_RUNTIME_FUNCTION __host__ 103 #endif 104 #endif 105 106 107 /// Number of threads per warp 108 #ifndef CUB_LOG_WARP_THREADS 109 #define CUB_LOG_WARP_THREADS(arch) \ 110 (5) 111 #define CUB_WARP_THREADS(arch) \ 112 (1 << CUB_LOG_WARP_THREADS(arch)) 113 114 #define CUB_PTX_WARP_THREADS CUB_WARP_THREADS(CUB_PTX_ARCH) 115 #define CUB_PTX_LOG_WARP_THREADS CUB_LOG_WARP_THREADS(CUB_PTX_ARCH) 116 #endif 117 118 119 /// Number of smem banks 120 #ifndef CUB_LOG_SMEM_BANKS 121 #define CUB_LOG_SMEM_BANKS(arch) \ 122 ((arch >= 200) ? \ 123 (5) : \ 124 (4)) 125 #define CUB_SMEM_BANKS(arch) \ 126 (1 << CUB_LOG_SMEM_BANKS(arch)) 127 128 #define CUB_PTX_LOG_SMEM_BANKS CUB_LOG_SMEM_BANKS(CUB_PTX_ARCH) 129 #define CUB_PTX_SMEM_BANKS CUB_SMEM_BANKS(CUB_PTX_ARCH) 130 #endif 131 132 133 /// Oversubscription factor 134 #ifndef CUB_SUBSCRIPTION_FACTOR 135 #define CUB_SUBSCRIPTION_FACTOR(arch) \ 136 ((arch >= 300) ? \ 137 (5) : \ 138 ((arch >= 200) ? \ 139 (3) : \ 140 (10))) 141 #define CUB_PTX_SUBSCRIPTION_FACTOR CUB_SUBSCRIPTION_FACTOR(CUB_PTX_ARCH) 142 #endif 143 144 145 /// Prefer padding overhead vs X-way conflicts greater than this threshold 146 #ifndef CUB_PREFER_CONFLICT_OVER_PADDING 147 #define CUB_PREFER_CONFLICT_OVER_PADDING(arch) \ 148 ((arch >= 300) ? \ 149 (1) : \ 150 (4)) 151 #define CUB_PTX_PREFER_CONFLICT_OVER_PADDING CUB_PREFER_CONFLICT_OVER_PADDING(CUB_PTX_ARCH) 152 #endif 153 154 155 template < 156 int NOMINAL_4B_BLOCK_THREADS, 157 int NOMINAL_4B_ITEMS_PER_THREAD, 158 typename T> 159 struct RegBoundScaling 160 { 161 enum { 162 ITEMS_PER_THREAD = CUB_MAX(1, NOMINAL_4B_ITEMS_PER_THREAD * 4 / CUB_MAX(4, sizeof(T))), 163 BLOCK_THREADS = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, (((1024 * 48) / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32), 164 }; 165 }; 166 167 168 template < 169 int NOMINAL_4B_BLOCK_THREADS, 170 int NOMINAL_4B_ITEMS_PER_THREAD, 171 typename T> 172 struct MemBoundScaling 173 { 174 enum { 175 ITEMS_PER_THREAD = CUB_MAX(1, CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T), NOMINAL_4B_ITEMS_PER_THREAD * 2)), 176 BLOCK_THREADS = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, (((1024 * 48) / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32), 177 }; 178 }; 179 180 181 182 183 #endif // Do not document 184 185 } // CUB namespace 186 CUB_NS_POSTFIX // Optional outer namespace(s) 187