1 /****************************************************************************** 2 * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are met: 7 * * Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * * Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * * Neither the name of the NVIDIA CORPORATION nor the 13 * names of its contributors may be used to endorse or promote products 14 * derived from this software without specific prior written permission. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 * 27 ******************************************************************************/ 28 29 /** 30 * \file 31 * cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid 32 */ 33 34 #pragma once 35 36 #include "../util_debug.cuh" 37 #include "../util_namespace.cuh" 38 #include "../thread/thread_load.cuh" 39 40 /// Optional outer namespace(s) 41 CUB_NS_PREFIX 42 43 /// CUB namespace 44 namespace cub { 45 46 47 /** 48 * \addtogroup GridModule 49 * @{ 50 */ 51 52 53 /** 54 * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid 55 */ 56 class GridBarrier 57 { 58 protected : 59 60 typedef unsigned int SyncFlag; 61 62 // Counters in global device memory 63 SyncFlag* d_sync; 64 65 public: 66 67 /** 68 * Constructor 69 */ GridBarrier()70 GridBarrier() : d_sync(NULL) {} 71 72 73 /** 74 * Synchronize 75 */ Sync() const76 __device__ __forceinline__ void Sync() const 77 { 78 volatile SyncFlag *d_vol_sync = d_sync; 79 80 // Threadfence and syncthreads to make sure global writes are visible before 81 // thread-0 reports in with its sync counter 82 __threadfence(); 83 CTA_SYNC(); 84 85 if (blockIdx.x == 0) 86 { 87 // Report in ourselves 88 if (threadIdx.x == 0) 89 { 90 d_vol_sync[blockIdx.x] = 1; 91 } 92 93 CTA_SYNC(); 94 95 // Wait for everyone else to report in 96 for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) 97 { 98 while (ThreadLoad<LOAD_CG>(d_sync + peer_block) == 0) 99 { 100 __threadfence_block(); 101 } 102 } 103 104 CTA_SYNC(); 105 106 // Let everyone know it's safe to proceed 107 for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) 108 { 109 d_vol_sync[peer_block] = 0; 110 } 111 } 112 else 113 { 114 if (threadIdx.x == 0) 115 { 116 // Report in 117 d_vol_sync[blockIdx.x] = 1; 118 119 // Wait for acknowledgment 120 while (ThreadLoad<LOAD_CG>(d_sync + blockIdx.x) == 1) 121 { 122 __threadfence_block(); 123 } 124 } 125 126 CTA_SYNC(); 127 } 128 } 129 }; 130 131 132 /** 133 * \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation. 134 * 135 * Uses RAII for lifetime, i.e., device resources are reclaimed when 136 * the destructor is called. 137 */ 138 class GridBarrierLifetime : public GridBarrier 139 { 140 protected: 141 142 // Number of bytes backed by d_sync 143 size_t sync_bytes; 144 145 public: 146 147 /** 148 * Constructor 149 */ GridBarrierLifetime()150 GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {} 151 152 153 /** 154 * DeviceFrees and resets the progress counters 155 */ HostReset()156 cudaError_t HostReset() 157 { 158 cudaError_t retval = cudaSuccess; 159 if (d_sync) 160 { 161 CubDebug(retval = cudaFree(d_sync)); 162 d_sync = NULL; 163 } 164 sync_bytes = 0; 165 return retval; 166 } 167 168 169 /** 170 * Destructor 171 */ ~GridBarrierLifetime()172 virtual ~GridBarrierLifetime() 173 { 174 HostReset(); 175 } 176 177 178 /** 179 * Sets up the progress counters for the next kernel launch (lazily 180 * allocating and initializing them if necessary) 181 */ Setup(int sweep_grid_size)182 cudaError_t Setup(int sweep_grid_size) 183 { 184 cudaError_t retval = cudaSuccess; 185 do { 186 size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag); 187 if (new_sync_bytes > sync_bytes) 188 { 189 if (d_sync) 190 { 191 if (CubDebug(retval = cudaFree(d_sync))) break; 192 } 193 194 sync_bytes = new_sync_bytes; 195 196 // Allocate and initialize to zero 197 if (CubDebug(retval = cudaMalloc((void**) &d_sync, sync_bytes))) break; 198 if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break; 199 } 200 } while (0); 201 202 return retval; 203 } 204 }; 205 206 207 /** @} */ // end group GridModule 208 209 } // CUB namespace 210 CUB_NS_POSTFIX // Optional outer namespace(s) 211 212