#pragma once

#include "cuda_graph.hpp"

/**
@file cuda_optimizer.hpp
@brief %cudaFlow capturing algorithms include file
*/

namespace tf {

// ----------------------------------------------------------------------------
// cudaCapturingBase
// ----------------------------------------------------------------------------

/**
@private

@brief class to provide common helper methods for optimization algorithms
*/
class cudaCapturingBase {

  protected:

    std::vector<cudaNode*> _toposort(cudaGraph&);
    std::vector<std::vector<cudaNode*>> _levelize(cudaGraph&);
};

// Function: _toposort
inline std::vector<cudaNode*> cudaCapturingBase::_toposort(cudaGraph& graph) {

  std::vector<cudaNode*> res;
  std::queue<cudaNode*> bfs;

  res.reserve(graph._nodes.size());

  // insert the first level of nodes into the queue
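  // (the level field is reused as the remaining in-degree counter for the bfs)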
  for(auto& u : graph._nodes) {

    auto& hu = std::get<cudaNode::Capture>(u->_handle);
    hu.level = u->_dependents.size();

    if(hu.level == 0) {
      bfs.push(u.get());
    }
  }

  // generate a topological order using bfs
  while(!bfs.empty()) {

    auto u = bfs.front();
    bfs.pop();

    res.push_back(u);

    for(auto v : u->_successors) {
      auto& hv = std::get<cudaNode::Capture>(v->_handle);
      if(--hv.level == 0) {
        bfs.push(v);
      }
    }
  }

  return res;
}

// Function: _levelize
inline std::vector<std::vector<cudaNode*>>
cudaCapturingBase::_levelize(cudaGraph& graph) {

  std::queue<cudaNode*> bfs;

  size_t max_level = 0;

  // insert the first level of nodes into the queue
  for(auto& u : graph._nodes) {

    auto& hu = std::get<cudaNode::Capture>(u->_handle);
    hu.level = u->_dependents.size();

    if(hu.level == 0) {
      bfs.push(u.get());
    }
  }

  // levelize the graph using bfs
  while(!bfs.empty()) {

    auto u = bfs.front();
    bfs.pop();

    auto& hu = std::get<cudaNode::Capture>(u->_handle);

    for(auto v : u->_successors) {
      auto& hv = std::get<cudaNode::Capture>(v->_handle);
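      // level counts down the remaining in-degree; once it reaches zero, it is
      // overwritten with the node's actual level (parent level + 1)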
      if(--hv.level == 0) {
        hv.level = hu.level + 1;
        if(hv.level > max_level) {
          max_level = hv.level;
        }
        bfs.push(v);
      }
    }
  }

  // set level_graph and each node's lid
  std::vector<std::vector<cudaNode*>> level_graph(max_level+1);
  for(auto& u : graph._nodes) {
    auto& hu = std::get<cudaNode::Capture>(u->_handle);
    hu.lid = level_graph[hu.level].size();
    level_graph[hu.level].emplace_back(u.get());

    //for(auto s : u->_successors) {
    //  assert(hu.level < std::get<cudaNode::Capture>(s->_handle).level);
    //}
  }

  return level_graph;
}

// ----------------------------------------------------------------------------
// class definition: cudaSequentialCapturing
// ----------------------------------------------------------------------------

/**
@class cudaSequentialCapturing

@brief class to capture a CUDA graph using a sequential stream

A sequential capturing algorithm finds a topological order of
the described graph and captures dependent GPU tasks using a single stream.
All GPU tasks run sequentially without breaking their dependencies.
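
For illustration, the sketch below selects this optimizer through a
tf::cudaFlowCapturer, assuming the capturer interface (on, make_optimizer),
which is not defined in this file:

@code{.cpp}
tf::cudaFlowCapturer capturer;
// capture two dependent GPU operations through the capturer interface
auto t1 = capturer.on([](cudaStream_t stream){
  // e.g., cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, stream)
});
auto t2 = capturer.on([](cudaStream_t stream){
  // e.g., my_kernel<<<grid, block, 0, stream>>>(dst)
});
t1.precede(t2);
// all captured tasks are replayed through one stream in a topological order
capturer.make_optimizer<tf::cudaSequentialCapturing>();
@endcode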
*/
class cudaSequentialCapturing : public cudaCapturingBase {

  friend class cudaFlowCapturer;

  public:

    /**
    @brief constructs a sequential optimizer
    */
    cudaSequentialCapturing() = default;

  private:

    cudaGraph_t _optimize(cudaGraph& graph);
};

inline cudaGraph_t cudaSequentialCapturing::_optimize(cudaGraph& graph) {
  // acquire per-thread stream and turn it into capture mode
  // we must use ThreadLocal mode to avoid clashing with CUDA global states
  cudaScopedPerThreadStream stream;

  cudaGraph_t native_g;

  TF_CHECK_CUDA(
    cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal),
    "failed to turn stream into per-thread capture mode"
  );

  auto ordered = _toposort(graph);
  for(auto& node : ordered) {
    std::get<cudaNode::Capture>(node->_handle).work(stream);
  }

  TF_CHECK_CUDA(
    cudaStreamEndCapture(stream, &native_g), "failed to end capture"
  );

  return native_g;
}

// ----------------------------------------------------------------------------
// class definition: cudaLinearCapturing
// ----------------------------------------------------------------------------

/**
@class cudaLinearCapturing

@brief class to capture a linear CUDA graph using a sequential stream

A linear capturing algorithm is a special case of tf::cudaSequentialCapturing
and assumes the input task graph to be a single linear chain of tasks
(i.e., a straight line).
This assumption allows faster optimization during the capturing process.
If the input task graph is not a linear chain, the behavior is undefined.
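
For illustration, a minimal sketch of a linear chain that fits this optimizer,
again assuming the tf::cudaFlowCapturer interface (on, make_optimizer), which
is not defined in this file:

@code{.cpp}
tf::cudaFlowCapturer capturer;
// three placeholder tasks forming a straight line: t1 -> t2 -> t3
auto t1 = capturer.on([](cudaStream_t stream){ });  // first GPU operation
auto t2 = capturer.on([](cudaStream_t stream){ });  // second GPU operation
auto t3 = capturer.on([](cudaStream_t stream){ });  // third GPU operation
t1.precede(t2);
t2.precede(t3);
// the linear optimizer captures the chain from its single source node
capturer.make_optimizer<tf::cudaLinearCapturing>();
@endcode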
*/
class cudaLinearCapturing : public cudaCapturingBase {

  friend class cudaFlowCapturer;

  public:

    /**
    @brief constructs a linear optimizer
    */
    cudaLinearCapturing() = default;

  private:

    cudaGraph_t _optimize(cudaGraph& graph);
};

inline cudaGraph_t cudaLinearCapturing::_optimize(cudaGraph& graph) {

  // acquire per-thread stream and turn it into capture mode
  // we must use ThreadLocal mode to avoid clashing with CUDA global states
  cudaScopedPerThreadStream stream;

  cudaGraph_t native_g;

  TF_CHECK_CUDA(
    cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal),
    "failed to turn stream into per-thread capture mode"
  );

  // find the source node
  cudaNode* src {nullptr};
  for(auto& u : graph._nodes) {
    if(u->_dependents.size() == 0) {
      src = u.get();
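      // walk the chain from the source, capturing each task in order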
      while(src) {
        std::get<cudaNode::Capture>(src->_handle).work(stream);
        src = src->_successors.empty() ? nullptr : src->_successors[0];
      }
      break;
    }
    // ideally, there should be only one source
  }

  TF_CHECK_CUDA(
    cudaStreamEndCapture(stream, &native_g), "failed to end capture"
  );

  return native_g;
}

// ----------------------------------------------------------------------------
// class definition: cudaRoundRobinCapturing
// ----------------------------------------------------------------------------

/**
@class cudaRoundRobinCapturing

@brief class to capture a CUDA graph using a round-robin algorithm

A round-robin capturing algorithm levelizes the user-described graph
and assigns streams to nodes in a round-robin order, level by level.
The algorithm is based on the following paper published in Euro-Par 2021:
  + Dian-Lun Lin and Tsung-Wei Huang, "Efficient GPU Computation using %Task Graph Parallelism," <i>European Conference on Parallel and Distributed Computing (Euro-Par)</i>, 2021

The round-robin optimization algorithm is best suited for large %cudaFlow graphs
that compose hundreds or thousands of GPU operations
(e.g., kernels and memory copies) with many of them being able to run in parallel.
You can configure the number of streams for the optimizer to adjust the
maximum kernel concurrency in the captured CUDA graph.
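
For illustration, a minimal sketch that configures this optimizer with eight
streams, again assuming the tf::cudaFlowCapturer interface (make_optimizer),
which is not defined in this file:

@code{.cpp}
tf::cudaFlowCapturer capturer;
// ... capture many GPU operations, many of which can run in parallel ...
// levelize the captured graph and distribute nodes across eight streams
capturer.make_optimizer<tf::cudaRoundRobinCapturing>(8);
@endcode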
*/
class cudaRoundRobinCapturing : public cudaCapturingBase {

  friend class cudaFlowCapturer;

  public:

    /**
    @brief constructs a round-robin optimizer with 4 streams by default
    */
    cudaRoundRobinCapturing() = default;

    /**
    @brief constructs a round-robin optimizer with the given number of streams
    */
    cudaRoundRobinCapturing(size_t num_streams);

    /**
    @brief queries the number of streams used by the optimizer
    */
    size_t num_streams() const;

    /**
    @brief sets the number of streams used by the optimizer
    */
    void num_streams(size_t n);

  private:

    size_t _num_streams {4};

    cudaGraph_t _optimize(cudaGraph& graph);

    void _reset(std::vector<std::vector<cudaNode*>>& graph);

};

// Constructor
inline cudaRoundRobinCapturing::cudaRoundRobinCapturing(size_t num_streams) :
  _num_streams {num_streams} {

  if(num_streams == 0) {
    TF_THROW("number of streams must be at least one");
  }
}

// Function: num_streams
inline size_t cudaRoundRobinCapturing::num_streams() const {
  return _num_streams;
}

// Procedure: num_streams
inline void cudaRoundRobinCapturing::num_streams(size_t n) {
  if(n == 0) {
    TF_THROW("number of streams must be at least one");
  }
  _num_streams = n;
}

inline void cudaRoundRobinCapturing::_reset(
  std::vector<std::vector<cudaNode*>>& graph
) {
  //level == global id
  //idx == stream id we want to skip
  size_t id{0};
  for(auto& each_level: graph) {
    for(auto& node: each_level) {
      auto& hn = std::get<cudaNode::Capture>(node->_handle);
      hn.level = id++;
      hn.idx = _num_streams;
      hn.event = nullptr;
    }
  }
}

// Function: _optimize
inline cudaGraph_t cudaRoundRobinCapturing::_optimize(cudaGraph& graph) {

  // levelize the graph
  auto levelized = _levelize(graph);

  // initialize the data structure
  _reset(levelized);

  // begin to capture
  std::vector<cudaScopedPerThreadStream> streams(_num_streams);

  TF_CHECK_CUDA(
    cudaStreamBeginCapture(streams[0], cudaStreamCaptureModeThreadLocal),
    "failed to turn stream into per-thread capture mode"
  );

  // reserve space for scoped events
  std::vector<cudaScopedPerThreadEvent> events;
  events.reserve((_num_streams >> 1) + levelized.size());

  // fork
  cudaEvent_t fork_event = events.emplace_back();
  TF_CHECK_CUDA(
    cudaEventRecord(fork_event, streams[0]), "failed to record fork"
  );

  for(size_t i = 1; i < streams.size(); ++i) {
    TF_CHECK_CUDA(
      cudaStreamWaitEvent(streams[i], fork_event, 0), "failed to wait on fork"
    );
  }

  // assign streams to levelized nodes in a round-robin manner
  for(auto& each_level: levelized) {
    for(auto& node: each_level) {
      auto& hn = std::get<cudaNode::Capture>(node->_handle);
      size_t sid = hn.lid % _num_streams;

      //wait events
      cudaNode* wait_node{nullptr};
      for(auto& pn: node->_dependents) {
        auto& phn = std::get<cudaNode::Capture>(pn->_handle);
        size_t psid = phn.lid % _num_streams;

        //level == global id
        //idx == stream id we want to skip
        if(psid == hn.idx) {
          if(wait_node == nullptr || std::get<cudaNode::Capture>(wait_node->_handle).level < phn.level) {
            wait_node = pn;
          }
        }
        else if(psid != sid) {
          TF_CHECK_CUDA(
            cudaStreamWaitEvent(streams[sid], phn.event, 0),
            "failed to wait on node's stream"
          );
        }
      }

      if(wait_node != nullptr) {
        assert(std::get<cudaNode::Capture>(wait_node->_handle).event);
        TF_CHECK_CUDA(
          cudaStreamWaitEvent(
            streams[sid],
            std::get<cudaNode::Capture>(wait_node->_handle).event,
            0
          ), "failed to wait on node's stream"
        );
      }

      //capture
      hn.work(streams[sid]);

      //create/record stream
      for(auto& sn: node->_successors) {
        auto& shn = std::get<cudaNode::Capture>(sn->_handle);
        size_t ssid = shn.lid % _num_streams;
        if(ssid != sid) {
          if(!hn.event) {
            hn.event = events.emplace_back();
            TF_CHECK_CUDA(
              cudaEventRecord(hn.event, streams[sid]), "failed to record node's stream"
            );
          }
          //idx == stream id we want to skip
          shn.idx = sid;
        }
      }
    }
  }

  // join
  for(size_t i=1; i<_num_streams; ++i) {
    cudaEvent_t join_event = events.emplace_back();
    TF_CHECK_CUDA(
      cudaEventRecord(join_event, streams[i]), "failed to record join"
    );
    TF_CHECK_CUDA(
      cudaStreamWaitEvent(streams[0], join_event, 0), "failed to wait on join"
    );
  }

  cudaGraph_t native_g;

  TF_CHECK_CUDA(
    cudaStreamEndCapture(streams[0], &native_g), "failed to end capture"
  );

  //tf::cuda_dump_graph(std::cout, native_g);
  //std::cout << '\n';

  return native_g;
}


}  // end of namespace tf -----------------------------------------------------