1 #pragma once
2 
3 #include "sycl_task.hpp"
4 
5 /**
6 @file sycl_flow.hpp
7 @brief syclFlow include file
8 */
9 
10 namespace tf {
11 
12 // ----------------------------------------------------------------------------
13 // class definition: syclFlow
14 // ----------------------------------------------------------------------------
15 
16 /**
17 @class syclFlow
18 
19 @brief class for building a SYCL task dependency graph
20 
21 */
22 class syclFlow {
23 
24   friend class Executor;
25 
26   struct External {
27     syclGraph graph;
28   };
29 
30   struct Internal {
31     Executor& executor;
Internaltf::syclFlow::Internal32     Internal(Executor& e) : executor {e} {}
33   };
34 
35   using handle_t = std::variant<External, Internal>;
36 
37   public:
38 
39     /**
40     @brief constructs a standalone %syclFlow from the default queue
41 
42     A standalone %syclFlow does not go through any taskflow and
43     can be run by the caller thread using explicit offload methods
44     (e.g., tf::syclFlow::offload).
45     */
46     syclFlow();
47 
48     /**
49     @brief constructs a standalone %syclFlow from the given queue
50 
51     A standalone %syclFlow does not go through any taskflow and
52     can be run by the caller thread using explicit offload methods
53     (e.g., tf::syclFlow::offload).
54     */
55     syclFlow(sycl::queue queue);
56 
57     /**
58     @brief destroys the %syclFlow
59      */
60     ~syclFlow() = default;
61 
62     /**
63     @brief queries the emptiness of the graph
64     */
65     bool empty() const;
66 
67     /**
68     @brief queries the number of tasks
69     */
70     size_t num_tasks() const;
71 
72     /**
73     @brief dumps the %syclFlow graph into a DOT format through an
74            output stream
75     */
76     void dump(std::ostream& os) const;
77 
78     /**
79     @brief clear the associated graph
80     */
81     void clear();
82 
83     // ------------------------------------------------------------------------
84     // Generic device operations
85     // ------------------------------------------------------------------------
86 
87     /**
88     @brief creates a task that launches the given command group function object
89 
90     @tparam F type of command group function object
91     @param func function object that is constructible from
92                 std::function<void(sycl::handler&)>
93 
94     Creates a task that is associated from the given command group.
95     In SYCL, each command group function object is given a unique
96     command group handler object to perform all the necessary work
97     required to correctly process data on a device using a kernel.
98     */
99     template <typename F, std::enable_if_t<
100       std::is_invocable_r_v<void, F, sycl::handler&>, void>* = nullptr
101     >
102     syclTask on(F&& func);
103 
104     /**
105     @brief updates the task to the given command group function object
106 
107     Similar to tf::syclFlow::on but operates on an existing task.
108     */
109     template <typename F, std::enable_if_t<
110       std::is_invocable_r_v<void, F, sycl::handler&>, void>* = nullptr
111     >
112     void on(syclTask task, F&& func);
113 
114     // TODO
115     template <typename F, std::enable_if_t<
116       std::is_invocable_r_v<
117         sycl::event, F, sycl::queue&, std::vector<sycl::event>>, void
118       >* = nullptr
119     >
120     syclTask on(F&& func);
121 
122     // TODO
123     template <typename F, std::enable_if_t<
124       std::is_invocable_r_v<
125         sycl::event, F, sycl::queue&, std::vector<sycl::event>>, void
126       >* = nullptr
127     >
128     void on(syclTask task, F&& func);
129 
130 
131     /**
132     @brief creates a memcpy task that copies untyped data in bytes
133 
134     @param tgt pointer to the target memory block
135     @param src pointer to the source memory block
136     @param bytes bytes to copy
137 
138     @return a tf::syclTask handle
139 
140     A memcpy task transfers @c bytes of data from a source locationA @c src
141     to a target location @c tgt. Both @c src and @c tgt may be either host
142     or USM pointers.
143     */
144     syclTask memcpy(void* tgt, const void* src, size_t bytes);
145 
146     /**
147     @brief creates a memset task that fills untyped data with a byte value
148 
149     @param ptr pointer to the destination device memory area
150     @param value value to set for each byte of specified memory
151     @param bytes number of bytes to set
152 
153     @return a tf::syclTask handle
154 
155     Fills @c bytes of memory beginning at address @c ptr with @c value.
156     @c ptr must be a USM allocation.
157     @c value is interpreted as an unsigned char.
158     */
159     syclTask memset(void* ptr, int value, size_t bytes);
160 
161     /**
162     @brief creates a fill task that fills typed data with the given value
163 
164     @tparam T trivially copyable value type
165 
166     @param ptr pointer to the memory to fill
167     @param pattern pattern value to fill into the memory
168     @param count number of items to fill the value
169 
170     Creates a task that fills the specified memory with the
171     specified value.
172     */
173     template <typename T>
174     syclTask fill(void* ptr, const T& pattern, size_t count);
175 
176     /**
177     @brief creates a copy task that copies typed data from a source to a target
178            memory block
179 
180     @tparam T trivially copyable value type
181 
182     @param target pointer to the memory to fill
183     @param source pointer to the pattern value to fill into the memory
184     @param count number of items to fill the value
185 
186     Creates a task that copies @c count items of type @c T from a source memory
187     location to a target memory location.
188     */
189     template <typename T,
190       std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr
191     >
192     syclTask copy(T* target, const T* source, size_t count);
193 
194     /**
195     @brief creates a kernel task
196 
197     @tparam ArgsT arguments types
198 
199     @param args arguments to forward to the parallel_for methods defined
200                 in the handler object
201 
202     Creates a kernel task from a parallel_for method through the handler
203     object associated with a command group.
204     */
205     template <typename...ArgsT>
206     syclTask parallel_for(ArgsT&&... args);
207 
208     // ------------------------------------------------------------------------
209     // algorithms
210     // ------------------------------------------------------------------------
211 
212     /**
213     @brief invokes a SYCL kernel function using only one thread
214 
215     @tparam F kernel function type
216     @param func kernel function
217 
218     Creates a task that launches the given function object using only one
219     kernel thread.
220     */
221     template <typename F>
222     syclTask single_task(F&& func);
223 
224     /**
225     @brief applies a callable to each dereferenced element of the data array
226 
227     @tparam I iterator type
228     @tparam C callable type
229 
230     @param first iterator to the beginning (inclusive)
231     @param last iterator to the end (exclusive)
232     @param callable a callable object to apply to the dereferenced iterator
233 
234     @return a tf::syclTask handle
235 
236     This method is equivalent to the parallel execution of the following loop on a GPU:
237 
238     @code{.cpp}
239     for(auto itr = first; itr != last; itr++) {
240       callable(*itr);
241     }
242     @endcode
243     */
244     template <typename I, typename C>
245     syclTask for_each(I first, I last, C&& callable);
246 
247     /**
248     @brief applies a callable to each index in the range with the step size
249 
250     @tparam I index type
251     @tparam C callable type
252 
253     @param first beginning index
254     @param last last index
255     @param step step size
256     @param callable the callable to apply to each element in the data array
257 
258     @return a tf::syclTask handle
259 
260     This method is equivalent to the parallel execution of the following loop on a GPU:
261 
262     @code{.cpp}
263     // step is positive [first, last)
264     for(auto i=first; i<last; i+=step) {
265       callable(i);
266     }
267 
268     // step is negative [first, last)
269     for(auto i=first; i>last; i+=step) {
270       callable(i);
271     }
272     @endcode
273     */
274     template <typename I, typename C>
275     syclTask for_each_index(I first, I last, I step, C&& callable);
276 
277     /**
278     @brief applies a callable to a source range and stores the result in a target range
279 
280     @tparam I iterator type
281     @tparam C callable type
282     @tparam S source types
283 
284     @param first iterator to the beginning (inclusive)
285     @param last iterator to the end (exclusive)
286     @param callable the callable to apply to each element in the range
287     @param srcs iterators to the source ranges
288 
289     @return a tf::syclTask handle
290 
291     This method is equivalent to the parallel execution of the following
292     loop on a SYCL device:
293 
294     @code{.cpp}
295     while (first != last) {
296       *first++ = callable(*src1++, *src2++, *src3++, ...);
297     }
298     @endcode
299     */
300     template <typename I, typename C, typename... S>
301     syclTask transform(I first, I last, C&& callable, S... srcs);
302 
303     /**
304     @brief performs parallel reduction over a range of items
305 
306     @tparam I input iterator type
307     @tparam T value type
308     @tparam C callable type
309 
310     @param first iterator to the beginning (inclusive)
311     @param last iterator to the end (exclusive)
312     @param result pointer to the result with an initialized value
313     @param op binary reduction operator
314 
315     @return a tf::syclTask handle
316 
317     This method is equivalent to the parallel execution of the following loop
318     on a SYCL device:
319 
320     @code{.cpp}
321     while (first != last) {
322       *result = op(*result, *first++);
323     }
324     @endcode
325     */
326     template <typename I, typename T, typename C>
327     syclTask reduce(I first, I last, T* result, C&& op);
328 
329     /**
330     @brief similar to tf::syclFlow::reduce but does not assume any initial
331            value to reduce
332 
333     This method is equivalent to the parallel execution of the following loop
334     on a SYCL device:
335 
336     @code{.cpp}
337     *result = *first++;  // no initial values partitipcate in the loop
338     while (first != last) {
339       *result = op(*result, *first++);
340     }
341     @endcode
342     */
343     template <typename I, typename T, typename C>
344     syclTask uninitialized_reduce(I first, I last, T* result, C&& op);
345 
346     // ------------------------------------------------------------------------
347     // offload methods
348     // ------------------------------------------------------------------------
349 
350     /**
351     @brief offloads the %syclFlow onto a GPU and repeatedly runs it until
352     the predicate becomes true
353 
354     @tparam P predicate type (a binary callable)
355 
356     @param predicate a binary predicate (returns @c true for stop)
357 
358     Repetitively executes the present %syclFlow through the given queue object
359     until the predicate returns @c true.
360 
361     By default, if users do not offload the %syclFlow,
362     the executor will offload it once.
363     */
364     template <typename P>
365     void offload_until(P&& predicate);
366 
367     /**
368     @brief offloads the %syclFlow and executes it by the given times
369 
370     @param N number of executions
371     */
372     void offload_n(size_t N);
373 
374     /**
375     @brief offloads the %syclFlow and executes it once
376     */
377     void offload();
378 
379     // ------------------------------------------------------------------------
380     // rebind methods
381     // ------------------------------------------------------------------------
382 
383 
384     /**
385     @brief rebinds the task to a memcpy task
386 
387     Similar to tf::syclFlow::memcpy but operates on an existing task.
388     */
389     void memcpy(syclTask task, void* tgt, const void* src, size_t bytes);
390 
391     /**
392     @brief rebinds the task to a memset task
393 
394     Similar to tf::syclFlow::memset but operates on an existing task.
395     */
396     void memset(syclTask task, void* ptr, int value, size_t bytes);
397 
398     /**
399     @brief rebinds the task to a fill task
400 
401     Similar to tf::syclFlow::fill but operates on an existing task.
402     */
403     template <typename T>
404     void fill(syclTask task, void* ptr, const T& pattern, size_t count);
405 
406     /**
407     @brief rebinds the task to a copy task
408 
409     Similar to tf::syclFlow::copy but operates on an existing task.
410     */
411     template <typename T,
412       std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr
413     >
414     void copy(syclTask task, T* target, const T* source, size_t count);
415 
416     /**
417     @brief rebinds the task to a parallel-for kernel task
418 
419     Similar to tf::syclFlow::parallel_for but operates on an existing task.
420     */
421     template <typename...ArgsT>
422     void parallel_for(syclTask task, ArgsT&&... args);
423 
424     /**
425     @brief rebinds the task to a single-threaded kernel task
426 
427     Similar to tf::syclFlow::single_task but operates on an existing task.
428     */
429     template <typename F>
430     void single_task(syclTask task, F&& func);
431 
432     /**
433     @brief rebinds the task to a for-each task
434 
435     Similar to tf::syclFlow::for_each but operates on an existing task.
436     */
437     template <typename I, typename C>
438     void for_each(syclTask task, I first, I last, C&& callable);
439 
440     /**
441     @brief rebinds the task to a for-each-index task
442 
443     Similar to tf::syclFlow::for_each_index but operates on an existing task.
444      */
445     template <typename I, typename C>
446     void for_each_index(
447       syclTask task, I first, I last, I step, C&& callable
448     );
449 
450     /**
451     @brief rebinds the task to a transform task
452 
453     Similar to tf::syclFlow::transform but operates on an existing task.
454      */
455     template <typename I, typename C, typename... S>
456     void transform(
457       syclTask task, I first, I last, C&& callable, S... srcs
458     );
459 
460     /**
461     @brief rebinds the task to a reduce task
462 
463     Similar to tf::syclFlow::reduce but operates on an existing task.
464     */
465     template <typename I, typename T, typename C>
466     void reduce(
467       syclTask task, I first, I last, T* result, C&& op
468     );
469 
470     /**
471     @brief rebinds the task to an unitialized reduce task
472 
473     Similar to tf::syclFlow::uninitialized_reduce but operates on an existing task.
474     */
475     template <typename I, typename T, typename C>
476     void uninitialized_reduce(
477       syclTask task, I first, I last, T* result, C&& op
478     );
479 
480   private:
481 
482     syclFlow(Executor&, syclGraph&, sycl::queue&);
483 
484     sycl::queue _queue;
485 
486     const size_t _MAX_WORK_GROUP_SIZE;
487 
488     handle_t _handle;
489 
490     syclGraph& _graph;
491 
492     std::vector<syclNode*> _tpg;
493     std::queue<syclNode*> _bfs;
494 
495     size_t _default_group_size(size_t N) const;
496 
497     template <typename I, typename C>
498     auto _for_each_cgh(I, I, C&&);
499 
500     template <typename I, typename C>
501     auto _for_each_index_cgh(I, I, I, C&&);
502 
503     template <typename I, typename T, typename C, bool>
504     auto _reduce_cgh(I, I, T*, C&&);
505 
506     template <typename I, typename C, typename... S>
507     auto _transform_cgh(I, I, C&&, S...);
508 };
509 
510 // constructor
syclFlow()511 inline syclFlow::syclFlow() :
512   _MAX_WORK_GROUP_SIZE {
513     _queue.get_device().get_info<sycl::info::device::max_work_group_size>()
514   },
515   _handle {std::in_place_type_t<External>{}},
516   _graph  {std::get<External>(_handle).graph} {
517 }
518 
519 // constructor
syclFlow(sycl::queue queue)520 inline syclFlow::syclFlow(sycl::queue queue) :
521   _queue  {std::move(queue)},
522   _MAX_WORK_GROUP_SIZE {
523     _queue.get_device().get_info<sycl::info::device::max_work_group_size>()
524   },
525   _handle {std::in_place_type_t<External>{}},
526   _graph  {std::get<External>(_handle).graph} {
527 }
528 
529 // Construct the syclFlow from executor (internal graph)
syclFlow(Executor & e,syclGraph & g,sycl::queue & queue)530 inline syclFlow::syclFlow(Executor& e, syclGraph& g, sycl::queue& queue) :
531   _queue  {queue},
532   _MAX_WORK_GROUP_SIZE {
533     _queue.get_device().get_info<sycl::info::device::max_work_group_size>()
534   } ,
535   _handle {std::in_place_type_t<Internal>{}, e},
536   _graph  {g} {
537 }
538 
539 // Function: _default_group_size
_default_group_size(size_t N) const540 inline size_t syclFlow::_default_group_size(size_t N) const {
541   return N <= 32u ? 32u : std::min(_MAX_WORK_GROUP_SIZE, next_pow2(N));
542 }
543 
544 // Function: empty
empty() const545 inline bool syclFlow::empty() const {
546   return _graph._nodes.empty();
547 }
548 
549 // Function: num_tasks
num_tasks() const550 inline size_t syclFlow::num_tasks() const {
551   return _graph._nodes.size();
552 }
553 
554 // Procedure: dump
dump(std::ostream & os) const555 inline void syclFlow::dump(std::ostream& os) const {
556   _graph.dump(os, nullptr, "");
557 }
558 
559 // Procedure: clear
clear()560 inline void syclFlow::clear() {
561   _graph.clear();
562 }
563 
564 // Function: memcpy
memcpy(void * tgt,const void * src,size_t bytes)565 inline syclTask syclFlow::memcpy(void* tgt, const void* src, size_t bytes) {
566   return on([=](sycl::handler& h){ h.memcpy(tgt, src, bytes); });
567 }
568 
569 // Function: memset
memset(void * ptr,int value,size_t bytes)570 inline syclTask syclFlow::memset(void* ptr, int value, size_t bytes) {
571   return on([=](sycl::handler& h){ h.memset(ptr, value, bytes); });
572 }
573 
574 // Function: fill
575 template <typename T>
fill(void * ptr,const T & pattern,size_t count)576 syclTask syclFlow::fill(void* ptr, const T& pattern, size_t count) {
577   return on([=](sycl::handler& h){ h.fill(ptr, pattern, count); });
578 }
579 
580 // Function: copy
581 template <typename T,
582   std::enable_if_t<!std::is_same_v<T, void>, void>*
583 >
copy(T * target,const T * source,size_t count)584 syclTask syclFlow::copy(T* target, const T* source, size_t count) {
585   return on([=](sycl::handler& h){ h.memcpy(target, source, count*sizeof(T)); });
586 }
587 
588 // Function: on
589 template <typename F, std::enable_if_t<
590   std::is_invocable_r_v<void, F, sycl::handler&>, void>*
591 >
on(F && f)592 syclTask syclFlow::on(F&& f) {
593   auto node = _graph.emplace_back(_graph,
594     std::in_place_type_t<syclNode::CommandGroupHandler>{}, std::forward<F>(f)
595   );
596   return syclTask(node);
597 }
598 
599 // Function: on
600 template <typename F, std::enable_if_t<std::is_invocable_r_v<
601   sycl::event, F, sycl::queue&, std::vector<sycl::event>>, void
602 >*>
on(F && f)603 syclTask syclFlow::on(F&& f) {
604   auto node = _graph.emplace_back(_graph,
605     std::in_place_type_t<syclNode::DependentSubmit>{}, std::forward<F>(f)
606   );
607   return syclTask(node);
608 }
609 
610 // Function: single_task
611 template <typename F>
single_task(F && func)612 syclTask syclFlow::single_task(F&& func) {
613   return on([f=std::forward<F>(func)] (sycl::handler& h) {
614     h.single_task(f);
615   });
616 }
617 
618 // Function: parallel_for
619 template <typename...ArgsT>
parallel_for(ArgsT &&...args)620 syclTask syclFlow::parallel_for(ArgsT&&... args) {
621   return on([args...] (sycl::handler& h) { h.parallel_for(args...); });
622 }
623 
624 // Procedure: offload_until
625 template <typename P>
offload_until(P && predicate)626 void syclFlow::offload_until(P&& predicate) {
627 
628   if(!(_graph._state & syclGraph::TOPOLOGY_CHANGED)) {
629     goto offload;
630   }
631 
632   // levelize the graph
633   _tpg.clear();
634 
635   // insert the first level of nodes into the queue
636   for(auto& u : _graph._nodes) {
637     u->_level = u->_dependents.size();
638     if(u->_level == 0) {
639       _bfs.push(u.get());
640     }
641   }
642 
643   while(!_bfs.empty()) {
644     auto u = _bfs.front();
645     _bfs.pop();
646     _tpg.push_back(u);
647     for(auto v : u->_successors) {
648       if(--(v->_level) == 0) {
649         v->_level = u->_level + 1;
650         _bfs.push(v);
651       }
652     }
653   }
654 
655   offload:
656 
657   // offload the syclFlow graph
658   bool in_order = _queue.is_in_order();
659 
660   while(!predicate()) {
661 
662     // traverse node in a topological order
663     for(auto u : _tpg) {
664 
665       switch(u->_handle.index()) {
666         // task type 1: command group handler
667         case syclNode::COMMAND_GROUP_HANDLER:
668           u->_event = _queue.submit([u, in_order](sycl::handler& h){
669             // wait on all predecessors
670             if(!in_order) {
671               for(auto p : u->_dependents) {
672                 h.depends_on(p->_event);
673               }
674             }
675             std::get<syclNode::CommandGroupHandler>(u->_handle).work(h);
676           });
677         break;
678 
679         // task type 2: dependent submit
680         case syclNode::DEPENDENT_SUBMIT:
681           std::vector<sycl::event> events;
682           if(!in_order) {
683             events.reserve(u->_dependents.size());
684             for(auto p : u->_dependents) {
685               events.push_back(p->_event);
686             }
687           }
688           u->_event = std::get<syclNode::DependentSubmit>(u->_handle).work(
689             _queue, std::move(events)
690           );
691         break;
692       }
693     }
694 
695     // synchronize the execution
696     _queue.wait();
697   }
698 
699   _graph._state = syclGraph::OFFLOADED;
700 }
701 
702 // Procedure: offload_n
offload_n(size_t n)703 inline void syclFlow::offload_n(size_t n) {
704   offload_until([repeat=n] () mutable { return repeat-- == 0; });
705 }
706 
707 // Procedure: offload
offload()708 inline void syclFlow::offload() {
709   offload_until([repeat=1] () mutable { return repeat-- == 0; });
710 }
711 
712 // Function: on
713 template <typename F, std::enable_if_t<
714   std::is_invocable_r_v<void, F, sycl::handler&>, void>*
715 >
on(syclTask task,F && f)716 void syclFlow::on(syclTask task, F&& f) {
717   std::get<syclNode::CommandGroupHandler>((task._node)->_handle).work =
718     std::forward<F>(f);
719 }
720 
721 // Function: on
722 template <typename F, std::enable_if_t<std::is_invocable_r_v<
723   sycl::event, F, sycl::queue&, std::vector<sycl::event>>, void
724 >*>
on(syclTask task,F && f)725 void syclFlow::on(syclTask task, F&& f) {
726   std::get<syclNode::DependentSubmit>((task._node)->_handle).work =
727     std::forward<F>(f);
728 }
729 
730 // Function: memcpy
memcpy(syclTask task,void * tgt,const void * src,size_t bytes)731 inline void syclFlow::memcpy(
732   syclTask task, void* tgt, const void* src, size_t bytes
733 ) {
734   on(task, [=](sycl::handler& h){ h.memcpy(tgt, src, bytes); });
735 }
736 
737 // Function: memset
memset(syclTask task,void * ptr,int value,size_t bytes)738 inline void syclFlow::memset(
739   syclTask task, void* ptr, int value, size_t bytes
740 ) {
741   on(task, [=](sycl::handler& h){ h.memset(ptr, value, bytes); });
742 }
743 
744 // Function: fill
745 template <typename T>
fill(syclTask task,void * ptr,const T & pattern,size_t count)746 void syclFlow::fill(
747   syclTask task, void* ptr, const T& pattern, size_t count
748 ) {
749   on(task, [=](sycl::handler& h){ h.fill(ptr, pattern, count); });
750 }
751 
752 // Function: copy
753 template <typename T,
754   std::enable_if_t<!std::is_same_v<T, void>, void>*
755 >
copy(syclTask task,T * target,const T * source,size_t count)756 void syclFlow::copy(
757   syclTask task, T* target, const T* source, size_t count
758 ) {
759   on(task, [=](sycl::handler& h){h.memcpy(target, source, count*sizeof(T));});
760 }
761 
762 // Function: parallel_for
763 template <typename...ArgsT>
parallel_for(syclTask task,ArgsT &&...args)764 void syclFlow::parallel_for(syclTask task, ArgsT&&... args) {
765   on(task, [args...] (sycl::handler& h) { h.parallel_for(args...); });
766 }
767 
768 // Function: single_task
769 template <typename F>
single_task(syclTask task,F && func)770 void syclFlow::single_task(syclTask task, F&& func) {
771   on(task, [f=std::forward<F>(func)] (sycl::handler& h) { h.single_task(f); });
772 }
773 
774 // ############################################################################
775 // Forward declaration: FlowBuilder
776 // ############################################################################
777 
778 // FlowBuilder::emplace_on
779 template <typename C, typename Q, std::enable_if_t<is_syclflow_task_v<C>, void>*>
emplace_on(C && callable,Q && q)780 Task FlowBuilder::emplace_on(C&& callable, Q&& q) {
781   auto n = _graph.emplace_back(
782     std::in_place_type_t<Node::syclFlow>{},
783     [c=std::forward<C>(callable), queue=std::forward<Q>(q)]
784     (Executor& e, Node* p) mutable {
785       e._invoke_syclflow_task_entry(p, c, queue);
786     },
787     std::make_unique<syclGraph>()
788   );
789   return Task(n);
790 }
791 
792 // FlowBuilder::emplace
793 template <typename C, std::enable_if_t<is_syclflow_task_v<C>, void>*>
emplace(C && callable)794 Task FlowBuilder::emplace(C&& callable) {
795   return emplace_on(std::forward<C>(callable), sycl::queue{});
796 }
797 
798 // ############################################################################
799 // Forward declaration: Executor
800 // ############################################################################
801 
802 // Procedure: _invoke_syclflow_task_entry (syclFlow)
803 template <typename C, typename Q,
804   std::enable_if_t<is_syclflow_task_v<C>, void>*
805 >
_invoke_syclflow_task_entry(Node * node,C && c,Q & queue)806 void Executor::_invoke_syclflow_task_entry(Node* node, C&& c, Q& queue) {
807 
808   auto& h = std::get<Node::syclFlow>(node->_handle);
809 
810   syclGraph* g = dynamic_cast<syclGraph*>(h.graph.get());
811 
812   g->clear();
813 
814   syclFlow sf(*this, *g, queue);
815 
816   c(sf);
817 
818   if(!(g->_state & syclGraph::OFFLOADED)) {
819     sf.offload();
820   }
821 }
822 
823 }  // end of namespace tf -----------------------------------------------------
824 
825 
826