1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *     * Redistributions of source code must retain the above copyright
8  *       notice, this list of conditions and the following disclaimer.
9  *     * Redistributions in binary form must reproduce the above copyright
10  *       notice, this list of conditions and the following disclaimer in the
11  *       documentation and/or other materials provided with the distribution.
12  *     * Neither the name of the NVIDIA CORPORATION nor the
13  *       names of its contributors may be used to endorse or promote products
14  *       derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
29 /**
30  * \file
31  * cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid
32  */
33 
34 #pragma once
35 
36 #include "../util_debug.cuh"
37 #include "../util_namespace.cuh"
38 #include "../thread/thread_load.cuh"
39 
40 /// Optional outer namespace(s)
41 CUB_NS_PREFIX
42 
43 /// CUB namespace
44 namespace cub {
45 
46 
47 /**
48  * \addtogroup GridModule
49  * @{
50  */
51 
52 
53 /**
54  * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid
55  */
class GridBarrier
{
protected :

    // Flag type for the per-block sync counters (one per thread block).
    typedef unsigned int SyncFlag;

    // Counters in global device memory; d_sync[b] is thread block b's flag.
    // Storage is allocated/owned externally (see GridBarrierLifetime).
    SyncFlag* d_sync;

public:

    /**
     * Constructor
     */
    GridBarrier() : d_sync(NULL) {}


    /**
     * Synchronize
     *
     * Software barrier: blocks the calling thread block until every thread
     * block in the grid has reached this call.  Thread block 0 acts as the
     * coordinator; all other blocks report in and spin until acknowledged.
     *
     * Protocol (two phases, driven by d_sync flags):
     *   1. Every block sets d_sync[blockIdx.x] = 1 ("arrived").
     *   2. Block 0 waits for all flags to become 1, then resets each flag
     *      to 0 ("proceed"); peer blocks spin until their own flag returns
     *      to 0.
     *
     * NOTE(review): as a spin-wait barrier this can only complete when all
     * gridDim.x blocks are simultaneously resident on the device — confirm
     * launch configuration respects occupancy before relying on it.
     */
    __device__ __forceinline__ void Sync() const
    {
        // Volatile alias so flag stores bypass compiler caching and reach
        // global memory where peer blocks can observe them.
        volatile SyncFlag *d_vol_sync = d_sync;

        // Threadfence and syncthreads to make sure global writes are visible before
        // thread-0 reports in with its sync counter
        __threadfence();
        CTA_SYNC();

        if (blockIdx.x == 0)
        {
            // Report in ourselves
            if (threadIdx.x == 0)
            {
                d_vol_sync[blockIdx.x] = 1;
            }

            CTA_SYNC();

            // Wait for everyone else to report in.  Threads of block 0
            // divide the peer flags among themselves (strided by blockDim.x)
            // and spin on LOAD_CG reads (cache-global modifier, so stores by
            // peer blocks become visible to the spin loop).
            for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
            {
                while (ThreadLoad<LOAD_CG>(d_sync + peer_block) == 0)
                {
                    __threadfence_block();
                }
            }

            CTA_SYNC();

            // Let everyone know it's safe to proceed: clear each peer's flag,
            // which releases that peer from its spin loop below.
            for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
            {
                d_vol_sync[peer_block] = 0;
            }
        }
        else
        {
            if (threadIdx.x == 0)
            {
                // Report in
                d_vol_sync[blockIdx.x] = 1;

                // Wait for acknowledgment: block 0 resets our flag to 0 only
                // after it has seen every block arrive.
                while (ThreadLoad<LOAD_CG>(d_sync + blockIdx.x) == 1)
                {
                    __threadfence_block();
                }
            }

            // Remaining threads of this block wait for thread 0's release.
            CTA_SYNC();
        }
    }
};
130 
131 
132 /**
133  * \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation.
134  *
135  * Uses RAII for lifetime, i.e., device resources are reclaimed when
136  * the destructor is called.
137  */
class GridBarrierLifetime : public GridBarrier
{
protected:

    // Number of bytes backed by d_sync
    size_t sync_bytes;

public:

    /**
     * Constructor
     */
    GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {}


    /**
     * DeviceFrees and resets the progress counters.
     *
     * \return cudaSuccess, or the error reported by cudaFree.  Safe to call
     *         repeatedly; d_sync is nulled so a second call is a no-op.
     */
    cudaError_t HostReset()
    {
        cudaError_t retval = cudaSuccess;
        if (d_sync)
        {
            CubDebug(retval = cudaFree(d_sync));
            d_sync = NULL;
        }
        sync_bytes = 0;
        return retval;
    }


    /**
     * Destructor (RAII): reclaims the device storage via HostReset().
     */
    virtual ~GridBarrierLifetime()
    {
        HostReset();
    }


    /**
     * Sets up the progress counters for the next kernel launch (lazily
     * allocating and initializing them if necessary)
     *
     * \param sweep_grid_size Number of thread blocks in the grid that will
     *        call GridBarrier::Sync() (one SyncFlag is allocated per block).
     * \return cudaSuccess, or the first CUDA error encountered.
     */
    cudaError_t Setup(int sweep_grid_size)
    {
        cudaError_t retval = cudaSuccess;
        do {
            size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag);

            // Only grow; an existing allocation large enough is reused as-is.
            if (new_sync_bytes > sync_bytes)
            {
                if (d_sync)
                {
                    if (CubDebug(retval = cudaFree(d_sync))) break;

                    // Null out immediately: if the re-allocation below fails,
                    // a stale pointer here would be double-freed by
                    // HostReset() / the destructor.
                    d_sync = NULL;
                    sync_bytes = 0;
                }

                // Allocate and initialize to zero
                if (CubDebug(retval = cudaMalloc((void**) &d_sync, new_sync_bytes))) break;

                // Record the size only once the allocation exists; recording it
                // before a failed cudaMalloc would make a retried Setup() of the
                // same size skip allocation and falsely report success.
                sync_bytes = new_sync_bytes;

                if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break;
            }
        } while (0);

        return retval;
    }
};
205 
206 
207 /** @} */       // end group GridModule
208 
209 }               // CUB namespace
210 CUB_NS_POSTFIX  // Optional outer namespace(s)
211 
212