1 #pragma once
2
3 #include "cuda_graph.hpp"
4
5 /**
6 @file cuda_optimizer.hpp
7 @brief %cudaFlow capturing algorithms include file
8 */
9
10 namespace tf {
11
12 // ----------------------------------------------------------------------------
13 // cudaCapturingBase
14 // ----------------------------------------------------------------------------
15
16 /**
17 @private
18
19 @brief class to provide helper common methods for optimization algorithms
20 */
class cudaCapturingBase {

  protected:

    // returns the nodes of the given graph in a topological order
    // (every node appears after all of its dependents/predecessors)
    std::vector<cudaNode*> _toposort(cudaGraph&);

    // partitions the nodes of the given graph into dependency levels;
    // a node in level i depends only on nodes in levels smaller than i
    std::vector<std::vector<cudaNode*>> _levelize(cudaGraph&);
};
28
29 // Function: _toposort
_toposort(cudaGraph & graph)30 inline std::vector<cudaNode*> cudaCapturingBase::_toposort(cudaGraph& graph) {
31
32 std::vector<cudaNode*> res;
33 std::queue<cudaNode*> bfs;
34
35 res.reserve(graph._nodes.size());
36
37 // insert the first level of nodes into the queue
38 for(auto& u : graph._nodes) {
39
40 auto& hu = std::get<cudaNode::Capture>(u->_handle);
41 hu.level = u->_dependents.size();
42
43 if(hu.level == 0) {
44 bfs.push(u.get());
45 }
46 }
47
48 // levelize the graph using bfs
49 while(!bfs.empty()) {
50
51 auto u = bfs.front();
52 bfs.pop();
53
54 res.push_back(u);
55
56 auto& hu = std::get<cudaNode::Capture>(u->_handle);
57
58 for(auto v : u->_successors) {
59 auto& hv = std::get<cudaNode::Capture>(v->_handle);
60 if(--hv.level == 0) {
61 bfs.push(v);
62 }
63 }
64 }
65
66 return res;
67 }
68
// Function: _levelize
// Partitions the nodes of the given graph into levels such that each node
// depends only on nodes in strictly smaller levels, and assigns each node
// its index (lid) within its level.
inline std::vector<std::vector<cudaNode*>>
cudaCapturingBase::_levelize(cudaGraph& graph) {

  std::queue<cudaNode*> bfs;

  size_t max_level = 0;

  // insert the first level of nodes into the queue
  // (the level field temporarily holds each node's remaining-dependency
  //  count; source nodes keep their final level of zero)
  for(auto& u : graph._nodes) {

    auto& hu = std::get<cudaNode::Capture>(u->_handle);
    hu.level = u->_dependents.size();

    if(hu.level == 0) {
      bfs.push(u.get());
    }
  }

  // levelize the graph using bfs
  while(!bfs.empty()) {

    auto u = bfs.front();
    bfs.pop();

    auto& hu = std::get<cudaNode::Capture>(u->_handle);

    for(auto v : u->_successors) {
      auto& hv = std::get<cudaNode::Capture>(v->_handle);
      // once the counter hits zero, u is the last predecessor of v to be
      // popped; v's final level is one below u's (FIFO order pops nodes in
      // nondecreasing level, so u carries v's maximum predecessor level)
      if(--hv.level == 0) {
        hv.level = hu.level + 1;
        if(hv.level > max_level) {
          max_level = hv.level;
        }
        bfs.push(v);
      }
    }
  }

  // set level_graph and each node's idx
  std::vector<std::vector<cudaNode*>> level_graph(max_level+1);
  for(auto& u : graph._nodes) {
    auto& hu = std::get<cudaNode::Capture>(u->_handle);
    // lid is the node's position inside its own level
    hu.lid = level_graph[hu.level].size();
    level_graph[hu.level].emplace_back(u.get());

    //for(auto s : u->_successors) {
    //  assert(hu.level < std::get<cudaNode::Capture>(s->_handle).level);
    //}
  }

  return level_graph;
}
122
123 // ----------------------------------------------------------------------------
124 // class definition: cudaSequentialCapturing
125 // ----------------------------------------------------------------------------
126
127 /**
128 @class cudaSequentialCapturing
129
130 @brief class to capture a CUDA graph using a sequential stream
131
132 A sequential capturing algorithm finds a topological order of
133 the described graph and captures dependent GPU tasks using a single stream.
All GPU tasks run sequentially without breaking inter-task dependencies.
135 */
class cudaSequentialCapturing : public cudaCapturingBase {

  friend class cudaFlowCapturer;

  public:

    /**
    @brief constructs a sequential optimizer
    */
    cudaSequentialCapturing() = default;

  private:

    // captures the given graph into a native CUDA graph by replaying all
    // tasks on a single stream in topological order
    cudaGraph_t _optimize(cudaGraph& graph);
};
151
_optimize(cudaGraph & graph)152 inline cudaGraph_t cudaSequentialCapturing::_optimize(cudaGraph& graph) {
153 // acquire per-thread stream and turn it into capture mode
154 // we must use ThreadLocal mode to avoid clashing with CUDA global states
155 cudaScopedPerThreadStream stream;
156
157 cudaGraph_t native_g;
158
159 TF_CHECK_CUDA(
160 cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal),
161 "failed to turn stream into per-thread capture mode"
162 );
163
164 auto ordered = _toposort(graph);
165 for(auto& node : ordered) {
166 std::get<cudaNode::Capture>(node->_handle).work(stream);
167 }
168
169 TF_CHECK_CUDA(
170 cudaStreamEndCapture(stream, &native_g), "failed to end capture"
171 );
172
173 return native_g;
174 }
175
176 // ----------------------------------------------------------------------------
177 // class definition: cudaLinearCapturing
178 // ----------------------------------------------------------------------------
179
180 /**
181 @class cudaLinearCapturing
182
183 @brief class to capture a linear CUDA graph using a sequential stream
184
185 A linear capturing algorithm is a special case of tf::cudaSequentialCapturing
186 and assumes the input task graph to be a single linear chain of tasks
187 (i.e., a straight line).
188 This assumption allows faster optimization during the capturing process.
189 If the input task graph is not a linear chain, the behavior is undefined.
190 */
class cudaLinearCapturing : public cudaCapturingBase {

  friend class cudaFlowCapturer;

  public:

    /**
    @brief constructs a linear optimizer
    */
    cudaLinearCapturing() = default;

  private:

    // captures the given graph into a native CUDA graph by walking the
    // chain of tasks from its source node on a single stream;
    // assumes the graph is a linear chain (see class-level documentation)
    cudaGraph_t _optimize(cudaGraph& graph);
};
206
_optimize(cudaGraph & graph)207 inline cudaGraph_t cudaLinearCapturing::_optimize(cudaGraph& graph) {
208
209 // acquire per-thread stream and turn it into capture mode
210 // we must use ThreadLocal mode to avoid clashing with CUDA global states
211 cudaScopedPerThreadStream stream;
212
213 cudaGraph_t native_g;
214
215 TF_CHECK_CUDA(
216 cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal),
217 "failed to turn stream into per-thread capture mode"
218 );
219
220 // find the source node
221 cudaNode* src {nullptr};
222 for(auto& u : graph._nodes) {
223 if(u->_dependents.size() == 0) {
224 src = u.get();
225 while(src) {
226 std::get<cudaNode::Capture>(src->_handle).work(stream);
227 src = src->_successors.empty() ? nullptr : src->_successors[0];
228 }
229 break;
230 }
231 // ideally, there should be only one source
232 }
233
234 TF_CHECK_CUDA(
235 cudaStreamEndCapture(stream, &native_g), "failed to end capture"
236 );
237
238 return native_g;
239 }
240
241 // ----------------------------------------------------------------------------
242 // class definition: cudaRoundRobinCapturing
243 // ----------------------------------------------------------------------------
244
245 /**
246 @class cudaRoundRobinCapturing
247
248 @brief class to capture a CUDA graph using a round-robin algorithm
249
250 A round-robin capturing algorithm levelizes the user-described graph
and assigns streams to nodes in a round-robin order level by level.
252 The algorithm is based on the following paper published in Euro-Par 2021:
253 + Dian-Lun Lin and Tsung-Wei Huang, "Efficient GPU Computation using %Task Graph Parallelism," <i>European Conference on Parallel and Distributed Computing (Euro-Par)</i>, 2021
254
255 The round-robin optimization algorithm is best suited for large %cudaFlow graphs
256 that compose hundreds of or thousands of GPU operations
257 (e.g., kernels and memory copies) with many of them being able to run in parallel.
You can configure the number of streams for the optimizer to adjust the
maximum kernel concurrency in the captured CUDA graph.
260 */
class cudaRoundRobinCapturing : public cudaCapturingBase {

  friend class cudaFlowCapturer;

  public:

    /**
    @brief constructs a round-robin optimizer with 4 streams by default
    */
    cudaRoundRobinCapturing() = default;

    /**
    @brief constructs a round-robin optimizer with the given number of streams

    @throw the constructor throws if @c num_streams is zero
    */
    cudaRoundRobinCapturing(size_t num_streams);

    /**
    @brief queries the number of streams used by the optimizer
    */
    size_t num_streams() const;

    /**
    @brief sets the number of streams used by the optimizer

    @throw the setter throws if @c n is zero
    */
    void num_streams(size_t n);

  private:

    // number of streams used during capturing (must be at least one)
    size_t _num_streams {4};

    // captures the given graph into a native CUDA graph by assigning
    // levelized nodes to the streams in a round-robin manner
    cudaGraph_t _optimize(cudaGraph& graph);

    // resets the capture handles (global id, skip-stream id, and event)
    // of all nodes in the levelized graph before capturing
    void _reset(std::vector<std::vector<cudaNode*>>& graph);

};
296
297 // Constructor
cudaRoundRobinCapturing(size_t num_streams)298 inline cudaRoundRobinCapturing::cudaRoundRobinCapturing(size_t num_streams) :
299 _num_streams {num_streams} {
300
301 if(num_streams == 0) {
302 TF_THROW("number of streams must be at least one");
303 }
304 }
305
306 // Function: num_streams
num_streams() const307 inline size_t cudaRoundRobinCapturing::num_streams() const {
308 return _num_streams;
309 }
310
311 // Procedure: num_streams
num_streams(size_t n)312 inline void cudaRoundRobinCapturing::num_streams(size_t n) {
313 if(n == 0) {
314 TF_THROW("number of streams must be at least one");
315 }
316 _num_streams = n;
317 }
318
_reset(std::vector<std::vector<cudaNode * >> & graph)319 inline void cudaRoundRobinCapturing::_reset(
320 std::vector<std::vector<cudaNode*>>& graph
321 ) {
322 //level == global id
323 //idx == stream id we want to skip
324 size_t id{0};
325 for(auto& each_level: graph) {
326 for(auto& node: each_level) {
327 auto& hn = std::get<cudaNode::Capture>(node->_handle);
328 hn.level = id++;
329 hn.idx = _num_streams;
330 hn.event = nullptr;
331 }
332 }
333 }
334
335 // Function: _optimize
_optimize(cudaGraph & graph)336 inline cudaGraph_t cudaRoundRobinCapturing::_optimize(cudaGraph& graph) {
337
338 // levelize the graph
339 auto levelized = _levelize(graph);
340
341 // initialize the data structure
342 _reset(levelized);
343
344 // begin to capture
345 std::vector<cudaScopedPerThreadStream> streams(_num_streams);
346
347 TF_CHECK_CUDA(
348 cudaStreamBeginCapture(streams[0], cudaStreamCaptureModeThreadLocal),
349 "failed to turn stream into per-thread capture mode"
350 );
351
352 // reserve space for scoped events
353 std::vector<cudaScopedPerThreadEvent> events;
354 events.reserve((_num_streams >> 1) + levelized.size());
355
356 // fork
357 cudaEvent_t fork_event = events.emplace_back();
358 TF_CHECK_CUDA(
359 cudaEventRecord(fork_event, streams[0]), "faid to record fork"
360 );
361
362 for(size_t i = 1; i < streams.size(); ++i) {
363 TF_CHECK_CUDA(
364 cudaStreamWaitEvent(streams[i], fork_event, 0), "failed to wait on fork"
365 );
366 }
367
368 // assign streams to levelized nodes in a round-robin manner
369 for(auto& each_level: levelized) {
370 for(auto& node: each_level) {
371 auto& hn = std::get<cudaNode::Capture>(node->_handle);
372 size_t sid = hn.lid % _num_streams;
373
374 //wait events
375 cudaNode* wait_node{nullptr};
376 for(auto& pn: node->_dependents) {
377 auto& phn = std::get<cudaNode::Capture>(pn->_handle);
378 size_t psid = phn.lid % _num_streams;
379
380 //level == global id
381 //idx == stream id we want to skip
382 if(psid == hn.idx) {
383 if(wait_node == nullptr || std::get<cudaNode::Capture>(wait_node->_handle).level < phn.level) {
384 wait_node = pn;
385 }
386 }
387 else if(psid != sid) {
388 TF_CHECK_CUDA(
389 cudaStreamWaitEvent(streams[sid], phn.event, 0),
390 "failed to wait on node's stream"
391 );
392 }
393 }
394
395 if(wait_node != nullptr) {
396 assert(std::get<cudaNode::Capture>(wait_node->_handle).event);
397 TF_CHECK_CUDA(
398 cudaStreamWaitEvent(
399 streams[sid],
400 std::get<cudaNode::Capture>(wait_node->_handle).event,
401 0
402 ), "failed to wait on node's stream"
403 );
404 }
405
406 //capture
407 hn.work(streams[sid]);
408
409 //create/record stream
410 for(auto& sn: node->_successors) {
411 auto& shn = std::get<cudaNode::Capture>(sn->_handle);
412 size_t ssid = shn.lid % _num_streams;
413 if(ssid != sid) {
414 if(!hn.event) {
415 hn.event = events.emplace_back();
416 TF_CHECK_CUDA(
417 cudaEventRecord(hn.event, streams[sid]), "failed to record node's stream"
418 );
419 }
420 //idx == stream id we want to skip
421 shn.idx = sid;
422 }
423 }
424 }
425 }
426
427 // join
428 for(size_t i=1; i<_num_streams; ++i) {
429 cudaEvent_t join_event = events.emplace_back();
430 TF_CHECK_CUDA(
431 cudaEventRecord(join_event, streams[i]), "failed to record join"
432 );
433 TF_CHECK_CUDA(
434 cudaStreamWaitEvent(streams[0], join_event), "failed to wait on join"
435 );
436 }
437
438 cudaGraph_t native_g;
439
440 TF_CHECK_CUDA(
441 cudaStreamEndCapture(streams[0], &native_g), "failed to end capture"
442 );
443
444 //tf::cuda_dump_graph(std::cout, native_g);
445 //std::cout << '\n';
446
447 return native_g;
448 }
449
450
451 } // end of namespace tf -----------------------------------------------------
452
453