1 /*
2     Copyright (c) 2005-2020 Intel Corporation
3 
4     Licensed under the Apache License, Version 2.0 (the "License");
5     you may not use this file except in compliance with the License.
6     You may obtain a copy of the License at
7 
8         http://www.apache.org/licenses/LICENSE-2.0
9 
10     Unless required by applicable law or agreed to in writing, software
11     distributed under the License is distributed on an "AS IS" BASIS,
12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13     See the License for the specific language governing permissions and
14     limitations under the License.
15 */
16 
17 #include "internal/_deprecated_header_message_guard.h"
18 
19 #if !defined(__TBB_show_deprecation_message_flow_graph_opencl_node_H) && defined(__TBB_show_deprecated_header_message)
20 #define  __TBB_show_deprecation_message_flow_graph_opencl_node_H
21 #pragma message("TBB Warning: tbb/flow_graph_opencl_node.h is deprecated. For details, please see Deprecated Features appendix in the TBB reference manual.")
22 #endif
23 
24 #if defined(__TBB_show_deprecated_header_message)
25 #undef __TBB_show_deprecated_header_message
26 #endif
27 
28 #ifndef __TBB_flow_graph_opencl_node_H
29 #define __TBB_flow_graph_opencl_node_H
30 
31 #define __TBB_flow_graph_opencl_node_H_include_area
32 #include "internal/_warning_suppress_enable_notice.h"
33 
34 #include "tbb/tbb_config.h"
35 #if __TBB_PREVIEW_OPENCL_NODE
36 
37 #include "flow_graph.h"
38 
39 #include <vector>
40 #include <string>
41 #include <algorithm>
42 #include <iostream>
43 #include <fstream>
44 #include <map>
45 #include <mutex>
46 
47 #ifdef __APPLE__
48 #include <OpenCL/opencl.h>
49 #else
50 #include <CL/cl.h>
51 #endif
52 
53 namespace tbb {
54 namespace flow {
55 
56 namespace interface11 {
57 
58 template <typename DeviceFilter>
59 class opencl_factory;
60 
61 namespace opencl_info {
62 class default_opencl_factory;
63 }
64 
65 template <typename Factory>
66 class opencl_program;
67 
enforce_cl_retcode(cl_int err,std::string msg)68 inline void enforce_cl_retcode(cl_int err, std::string msg) {
69     if (err != CL_SUCCESS) {
70         std::cerr << msg << "; error code: " << err << std::endl;
71         throw msg;
72     }
73 }
74 
75 template <typename T>
event_info(cl_event e,cl_event_info i)76 T event_info(cl_event e, cl_event_info i) {
77     T res;
78     enforce_cl_retcode(clGetEventInfo(e, i, sizeof(res), &res, NULL), "Failed to get OpenCL event information");
79     return res;
80 }
81 
82 template <typename T>
device_info(cl_device_id d,cl_device_info i)83 T device_info(cl_device_id d, cl_device_info i) {
84     T res;
85     enforce_cl_retcode(clGetDeviceInfo(d, i, sizeof(res), &res, NULL), "Failed to get OpenCL device information");
86     return res;
87 }
88 
89 template <>
90 inline std::string device_info<std::string>(cl_device_id d, cl_device_info i) {
91     size_t required;
92     enforce_cl_retcode(clGetDeviceInfo(d, i, 0, NULL, &required), "Failed to get OpenCL device information");
93 
94     char *buff = (char*)alloca(required);
95     enforce_cl_retcode(clGetDeviceInfo(d, i, required, buff, NULL), "Failed to get OpenCL device information");
96 
97     return buff;
98 }
99 
100 template <typename T>
platform_info(cl_platform_id p,cl_platform_info i)101 T platform_info(cl_platform_id p, cl_platform_info i) {
102     T res;
103     enforce_cl_retcode(clGetPlatformInfo(p, i, sizeof(res), &res, NULL), "Failed to get OpenCL platform information");
104     return res;
105 }
106 
107 template <>
108 inline std::string platform_info<std::string>(cl_platform_id p, cl_platform_info  i) {
109     size_t required;
110     enforce_cl_retcode(clGetPlatformInfo(p, i, 0, NULL, &required), "Failed to get OpenCL platform information");
111 
112     char *buff = (char*)alloca(required);
113     enforce_cl_retcode(clGetPlatformInfo(p, i, required, buff, NULL), "Failed to get OpenCL platform information");
114 
115     return buff;
116 }
117 
118 
119 class __TBB_DEPRECATED_IN_VERBOSE_MODE opencl_device {
120 public:
121     typedef size_t device_id_type;
122     enum : device_id_type {
123         unknown = device_id_type( -2 ),
124         host = device_id_type( -1 )
125     };
126 
opencl_device()127     opencl_device() : my_device_id( unknown ), my_cl_device_id( NULL ), my_cl_command_queue( NULL ) {}
128 
opencl_device(cl_device_id d_id)129     opencl_device( cl_device_id d_id ) : my_device_id( unknown ), my_cl_device_id( d_id ), my_cl_command_queue( NULL ) {}
130 
opencl_device(cl_device_id cl_d_id,device_id_type device_id)131     opencl_device( cl_device_id cl_d_id, device_id_type device_id ) : my_device_id( device_id ), my_cl_device_id( cl_d_id ), my_cl_command_queue( NULL ) {}
132 
platform_profile()133     std::string platform_profile() const {
134         return platform_info<std::string>( platform_id(), CL_PLATFORM_PROFILE );
135     }
platform_version()136     std::string platform_version() const {
137         return platform_info<std::string>( platform_id(), CL_PLATFORM_VERSION );
138     }
platform_name()139     std::string platform_name() const {
140         return platform_info<std::string>( platform_id(), CL_PLATFORM_NAME );
141     }
platform_vendor()142     std::string platform_vendor() const {
143         return platform_info<std::string>( platform_id(), CL_PLATFORM_VENDOR );
144     }
platform_extensions()145     std::string platform_extensions() const {
146         return platform_info<std::string>( platform_id(), CL_PLATFORM_EXTENSIONS );
147     }
148 
149     template <typename T>
info(cl_device_info i,T & t)150     void info( cl_device_info i, T &t ) const {
151         t = device_info<T>( my_cl_device_id, i );
152     }
version()153     std::string version() const {
154         // The version string format: OpenCL<space><major_version.minor_version><space><vendor-specific information>
155         return device_info<std::string>( my_cl_device_id, CL_DEVICE_VERSION );
156     }
major_version()157     int major_version() const {
158         int major;
159         std::sscanf( version().c_str(), "OpenCL %d", &major );
160         return major;
161     }
minor_version()162     int minor_version() const {
163         int major, minor;
164         std::sscanf( version().c_str(), "OpenCL %d.%d", &major, &minor );
165         return minor;
166     }
out_of_order_exec_mode_on_host_present()167     bool out_of_order_exec_mode_on_host_present() const {
168 #if CL_VERSION_2_0
169         if ( major_version() >= 2 )
170             return (device_info<cl_command_queue_properties>( my_cl_device_id, CL_DEVICE_QUEUE_ON_HOST_PROPERTIES ) & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) != 0;
171         else
172 #endif /* CL_VERSION_2_0 */
173             return (device_info<cl_command_queue_properties>( my_cl_device_id, CL_DEVICE_QUEUE_PROPERTIES ) & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) != 0;
174     }
out_of_order_exec_mode_on_device_present()175     bool out_of_order_exec_mode_on_device_present() const {
176 #if CL_VERSION_2_0
177         if ( major_version() >= 2 )
178             return (device_info<cl_command_queue_properties>( my_cl_device_id, CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES ) & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) != 0;
179         else
180 #endif /* CL_VERSION_2_0 */
181             return false;
182     }
max_work_item_sizes()183     std::array<size_t, 3> max_work_item_sizes() const {
184         return device_info<std::array<size_t, 3>>( my_cl_device_id, CL_DEVICE_MAX_WORK_ITEM_SIZES );
185     }
max_work_group_size()186     size_t max_work_group_size() const {
187         return device_info<size_t>( my_cl_device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE );
188     }
built_in_kernel_available(const std::string & k)189     bool built_in_kernel_available( const std::string& k ) const {
190         const std::string semi = ";";
191         // Added semicolumns to force an exact match (to avoid a partial match, e.g. "add" is partly matched with "madd").
192         return (semi + built_in_kernels() + semi).find( semi + k + semi ) != std::string::npos;
193     }
built_in_kernels()194     std::string built_in_kernels() const {
195         return device_info<std::string>( my_cl_device_id, CL_DEVICE_BUILT_IN_KERNELS );
196     }
name()197     std::string name() const {
198         return device_info<std::string>( my_cl_device_id, CL_DEVICE_NAME );
199     }
available()200     cl_bool available() const {
201         return device_info<cl_bool>( my_cl_device_id, CL_DEVICE_AVAILABLE );
202     }
compiler_available()203     cl_bool compiler_available() const {
204         return device_info<cl_bool>( my_cl_device_id, CL_DEVICE_COMPILER_AVAILABLE );
205     }
linker_available()206     cl_bool linker_available() const {
207         return device_info<cl_bool>( my_cl_device_id, CL_DEVICE_LINKER_AVAILABLE );
208     }
extension_available(const std::string & ext)209     bool extension_available( const std::string &ext ) const {
210         const std::string space = " ";
211         // Added space to force an exact match (to avoid a partial match, e.g. "ext" is partly matched with "ext2").
212         return (space + extensions() + space).find( space + ext + space ) != std::string::npos;
213     }
extensions()214     std::string extensions() const {
215         return device_info<std::string>( my_cl_device_id, CL_DEVICE_EXTENSIONS );
216     }
217 
type()218     cl_device_type type() const {
219         return device_info<cl_device_type>( my_cl_device_id, CL_DEVICE_TYPE );
220     }
221 
vendor()222     std::string vendor() const {
223         return device_info<std::string>( my_cl_device_id, CL_DEVICE_VENDOR );
224     }
225 
address_bits()226     cl_uint address_bits() const {
227         return device_info<cl_uint>( my_cl_device_id, CL_DEVICE_ADDRESS_BITS );
228     }
229 
device_id()230     cl_device_id device_id() const {
231         return my_cl_device_id;
232     }
233 
command_queue()234     cl_command_queue command_queue() const {
235         return my_cl_command_queue;
236     }
237 
set_command_queue(cl_command_queue cmd_queue)238     void set_command_queue( cl_command_queue cmd_queue ) {
239         my_cl_command_queue = cmd_queue;
240     }
241 
platform_id()242     cl_platform_id platform_id() const {
243         return device_info<cl_platform_id>( my_cl_device_id, CL_DEVICE_PLATFORM );
244     }
245 
246 private:
247 
248     device_id_type my_device_id;
249     cl_device_id my_cl_device_id;
250     cl_command_queue my_cl_command_queue;
251 
252     friend bool operator==(opencl_device d1, opencl_device d2) { return d1.my_cl_device_id == d2.my_cl_device_id; }
253 
254     template <typename DeviceFilter>
255     friend class opencl_factory;
256     template <typename Factory>
257     friend class opencl_memory;
258     template <typename Factory>
259     friend class opencl_program;
260 
261 #if TBB_USE_ASSERT
262     template <typename T, typename Factory>
263     friend class opencl_buffer;
264 #endif
265 };
266 
267 class __TBB_DEPRECATED_IN_VERBOSE_MODE opencl_device_list {
268     typedef std::vector<opencl_device> container_type;
269 public:
270     typedef container_type::iterator iterator;
271     typedef container_type::const_iterator const_iterator;
272     typedef container_type::size_type size_type;
273 
opencl_device_list()274     opencl_device_list() {}
opencl_device_list(std::initializer_list<opencl_device> il)275     opencl_device_list( std::initializer_list<opencl_device> il ) : my_container( il ) {}
276 
add(opencl_device d)277     void add( opencl_device d ) { my_container.push_back( d ); }
size()278     size_type size() const { return my_container.size(); }
empty()279     bool empty() const { return my_container.empty(); }
begin()280     iterator begin() { return my_container.begin(); }
end()281     iterator end() { return my_container.end(); }
begin()282     const_iterator begin() const { return my_container.begin(); }
end()283     const_iterator end() const { return my_container.end(); }
cbegin()284     const_iterator cbegin() const { return my_container.cbegin(); }
cend()285     const_iterator cend() const { return my_container.cend(); }
286 
287 private:
288     container_type my_container;
289 };
290 
291 namespace internal {
292 
293 // Retrieve all OpenCL devices from machine
find_available_devices()294 inline opencl_device_list find_available_devices() {
295     opencl_device_list opencl_devices;
296 
297     cl_uint num_platforms;
298     enforce_cl_retcode(clGetPlatformIDs(0, NULL, &num_platforms), "clGetPlatformIDs failed");
299 
300     std::vector<cl_platform_id> platforms(num_platforms);
301     enforce_cl_retcode(clGetPlatformIDs(num_platforms, platforms.data(), NULL), "clGetPlatformIDs failed");
302 
303     cl_uint num_devices;
304     std::vector<cl_platform_id>::iterator platforms_it = platforms.begin();
305     cl_uint num_all_devices = 0;
306     while (platforms_it != platforms.end()) {
307         cl_int err = clGetDeviceIDs(*platforms_it, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
308         if (err == CL_DEVICE_NOT_FOUND) {
309             platforms_it = platforms.erase(platforms_it);
310         }
311         else {
312             enforce_cl_retcode(err, "clGetDeviceIDs failed");
313             num_all_devices += num_devices;
314             ++platforms_it;
315         }
316     }
317 
318     std::vector<cl_device_id> devices(num_all_devices);
319     std::vector<cl_device_id>::iterator devices_it = devices.begin();
320     for (auto p = platforms.begin(); p != platforms.end(); ++p) {
321         enforce_cl_retcode(clGetDeviceIDs((*p), CL_DEVICE_TYPE_ALL, (cl_uint)std::distance(devices_it, devices.end()), &*devices_it, &num_devices), "clGetDeviceIDs failed");
322         devices_it += num_devices;
323     }
324 
325     for (auto d = devices.begin(); d != devices.end(); ++d) {
326         opencl_devices.add(opencl_device((*d)));
327     }
328 
329     return opencl_devices;
330 }
331 
332 } // namespace internal
333 
334 // TODO: consider this namespace as public API
335 namespace opencl_info {
336 
available_devices()337     inline const opencl_device_list& available_devices() {
338         // Static storage for all available OpenCL devices on machine
339         static const opencl_device_list my_devices = internal::find_available_devices();
340         return my_devices;
341     }
342 
343 } // namespace opencl_info
344 
345 
346 class callback_base : tbb::internal::no_copy {
347 public:
348     virtual void call() = 0;
~callback_base()349     virtual ~callback_base() {}
350 };
351 
352 template <typename Callback, typename T>
353 class callback : public callback_base {
354     Callback my_callback;
355     T my_data;
356 public:
callback(Callback c,const T & t)357     callback( Callback c, const T& t ) : my_callback( c ), my_data( t ) {}
358 
call()359     void call() __TBB_override {
360         my_callback( my_data );
361     }
362 };
363 
364 template <typename T, typename Factory = opencl_info::default_opencl_factory>
365 class __TBB_DEPRECATED_IN_VERBOSE_MODE opencl_async_msg : public async_msg<T> {
366 public:
367     typedef T value_type;
368 
opencl_async_msg()369     opencl_async_msg() : my_callback_flag_ptr( std::make_shared< tbb::atomic<bool>>() ) {
370         my_callback_flag_ptr->store<tbb::relaxed>(false);
371     }
372 
opencl_async_msg(const T & data)373     explicit opencl_async_msg( const T& data ) : my_data(data), my_callback_flag_ptr( std::make_shared<tbb::atomic<bool>>() ) {
374         my_callback_flag_ptr->store<tbb::relaxed>(false);
375     }
376 
opencl_async_msg(const T & data,cl_event event)377     opencl_async_msg( const T& data, cl_event event ) : my_data(data), my_event(event), my_is_event(true), my_callback_flag_ptr( std::make_shared<tbb::atomic<bool>>() ) {
378         my_callback_flag_ptr->store<tbb::relaxed>(false);
379         enforce_cl_retcode( clRetainEvent( my_event ), "Failed to retain an event" );
380     }
381 
382     T& data( bool wait = true ) {
383         if ( my_is_event && wait ) {
384             enforce_cl_retcode( clWaitForEvents( 1, &my_event ), "Failed to wait for an event" );
385             enforce_cl_retcode( clReleaseEvent( my_event ), "Failed to release an event" );
386             my_is_event = false;
387         }
388         return my_data;
389     }
390 
391     const T& data( bool wait = true ) const {
392         if ( my_is_event && wait ) {
393             enforce_cl_retcode( clWaitForEvents( 1, &my_event ), "Failed to wait for an event" );
394             enforce_cl_retcode( clReleaseEvent( my_event ), "Failed to release an event" );
395             my_is_event = false;
396         }
397         return my_data;
398     }
399 
opencl_async_msg(const opencl_async_msg & dmsg)400     opencl_async_msg( const opencl_async_msg &dmsg ) : async_msg<T>(dmsg),
401         my_data(dmsg.my_data), my_event(dmsg.my_event), my_is_event( dmsg.my_is_event ),
402         my_callback_flag_ptr(dmsg.my_callback_flag_ptr)
403     {
404         if ( my_is_event )
405             enforce_cl_retcode( clRetainEvent( my_event ), "Failed to retain an event" );
406     }
407 
opencl_async_msg(opencl_async_msg && dmsg)408     opencl_async_msg( opencl_async_msg &&dmsg ) : async_msg<T>(std::move(dmsg)),
409         my_data(std::move(dmsg.my_data)), my_event(dmsg.my_event), my_is_event(dmsg.my_is_event),
410         my_callback_flag_ptr( std::move(dmsg.my_callback_flag_ptr) )
411     {
412         dmsg.my_is_event = false;
413     }
414 
415     opencl_async_msg& operator=(const opencl_async_msg &dmsg) {
416         async_msg<T>::operator =(dmsg);
417 
418         // Release original event
419         if ( my_is_event )
420             enforce_cl_retcode( clReleaseEvent( my_event ), "Failed to retain an event" );
421 
422         my_data = dmsg.my_data;
423         my_event = dmsg.my_event;
424         my_is_event = dmsg.my_is_event;
425 
426         // Retain copied event
427         if ( my_is_event )
428             enforce_cl_retcode( clRetainEvent( my_event ), "Failed to retain an event" );
429 
430         my_callback_flag_ptr = dmsg.my_callback_flag_ptr;
431         return *this;
432     }
433 
~opencl_async_msg()434     ~opencl_async_msg() {
435         if ( my_is_event )
436             enforce_cl_retcode( clReleaseEvent( my_event ), "Failed to release an event" );
437     }
438 
get_event()439     cl_event const * get_event() const { return my_is_event ? &my_event : NULL; }
set_event(cl_event e)440     void set_event( cl_event e ) const {
441         if ( my_is_event ) {
442             cl_command_queue cq = event_info<cl_command_queue>( my_event, CL_EVENT_COMMAND_QUEUE );
443             if ( cq != event_info<cl_command_queue>( e, CL_EVENT_COMMAND_QUEUE ) )
444                 enforce_cl_retcode( clFlush( cq ), "Failed to flush an OpenCL command queue" );
445             enforce_cl_retcode( clReleaseEvent( my_event ), "Failed to release an event" );
446         }
447         my_is_event = true;
448         my_event = e;
449         clRetainEvent( my_event );
450     }
451 
clear_event()452     void clear_event() const {
453         if ( my_is_event ) {
454             enforce_cl_retcode( clFlush( event_info<cl_command_queue>( my_event, CL_EVENT_COMMAND_QUEUE ) ), "Failed to flush an OpenCL command queue" );
455             enforce_cl_retcode( clReleaseEvent( my_event ), "Failed to release an event" );
456         }
457         my_is_event = false;
458     }
459 
460     template <typename Callback>
register_callback(Callback c)461     void register_callback( Callback c ) const {
462         __TBB_ASSERT( my_is_event, "The OpenCL event is not set" );
463         enforce_cl_retcode( clSetEventCallback( my_event, CL_COMPLETE, register_callback_func, new callback<Callback, T>( c, my_data ) ), "Failed to set an OpenCL callback" );
464     }
465 
466     operator T&() { return data(); }
467     operator const T&() const { return data(); }
468 
469 protected:
470     // Overridden in this derived class to inform that
471     // async calculation chain is over
finalize()472     void finalize() const __TBB_override {
473         receive_if_memory_object(*this);
474         if (! my_callback_flag_ptr->fetch_and_store(true)) {
475             opencl_async_msg a(*this);
476             if (my_is_event) {
477                 register_callback([a](const T& t) mutable {
478                     a.set(t);
479                 });
480             }
481             else {
482                 a.set(my_data);
483             }
484         }
485         clear_event();
486     }
487 
488 private:
register_callback_func(cl_event,cl_int event_command_exec_status,void * data)489     static void CL_CALLBACK register_callback_func( cl_event, cl_int event_command_exec_status, void *data ) {
490         tbb::internal::suppress_unused_warning( event_command_exec_status );
491         __TBB_ASSERT( event_command_exec_status == CL_COMPLETE, NULL );
492         __TBB_ASSERT( data, NULL );
493         callback_base *c = static_cast<callback_base*>(data);
494         c->call();
495         delete c;
496     }
497 
498     T my_data;
499     mutable cl_event my_event;
500     mutable bool my_is_event = false;
501 
502     std::shared_ptr< tbb::atomic<bool> > my_callback_flag_ptr;
503 };
504 
505 template <typename K, typename T, typename Factory>
key_from_message(const opencl_async_msg<T,Factory> & dmsg)506 K key_from_message( const opencl_async_msg<T, Factory> &dmsg ) {
507     using tbb::flow::key_from_message;
508     const T &t = dmsg.data( false );
509     __TBB_STATIC_ASSERT( true, "" );
510     return key_from_message<K, T>( t );
511 }
512 
513 template <typename Factory>
514 class opencl_memory {
515 public:
opencl_memory()516     opencl_memory() {}
opencl_memory(Factory & f)517     opencl_memory( Factory &f ) : my_host_ptr( NULL ), my_factory( &f ), my_sending_event_present( false ) {
518         my_curr_device_id = my_factory->devices().begin()->my_device_id;
519     }
520 
~opencl_memory()521     virtual ~opencl_memory() {
522         if ( my_sending_event_present ) enforce_cl_retcode( clReleaseEvent( my_sending_event ), "Failed to release an event for the OpenCL buffer" );
523         enforce_cl_retcode( clReleaseMemObject( my_cl_mem ), "Failed to release an memory object" );
524     }
525 
get_cl_mem()526     cl_mem get_cl_mem() const {
527         return my_cl_mem;
528     }
529 
get_host_ptr()530     void* get_host_ptr() {
531         if ( !my_host_ptr ) {
532             opencl_async_msg<void*, Factory> d = receive( NULL );
533             d.data();
534             __TBB_ASSERT( d.data() == my_host_ptr, NULL );
535         }
536         return my_host_ptr;
537     }
538 
factory()539     Factory *factory() const { return my_factory; }
540 
receive(const cl_event * e)541     opencl_async_msg<void*, Factory> receive(const cl_event *e) {
542         opencl_async_msg<void*, Factory> d;
543         if (e) {
544             d = opencl_async_msg<void*, Factory>(my_host_ptr, *e);
545         } else {
546             d = opencl_async_msg<void*, Factory>(my_host_ptr);
547         }
548 
549         // Concurrent receives are prohibited so we do not worry about synchronization.
550         if (my_curr_device_id.load<tbb::relaxed>() != opencl_device::host) {
551             map_memory(*my_factory->devices().begin(), d);
552             my_curr_device_id.store<tbb::relaxed>(opencl_device::host);
553             my_host_ptr = d.data(false);
554         }
555         // Release the sending event
556         if (my_sending_event_present) {
557             enforce_cl_retcode(clReleaseEvent(my_sending_event), "Failed to release an event");
558             my_sending_event_present = false;
559         }
560         return d;
561     }
562 
send(opencl_device device,const cl_event * e)563     opencl_async_msg<void*, Factory> send(opencl_device device, const cl_event *e) {
564         opencl_device::device_id_type device_id = device.my_device_id;
565         if (!my_factory->is_same_context(my_curr_device_id.load<tbb::acquire>(), device_id)) {
566             {
567                 tbb::spin_mutex::scoped_lock lock(my_sending_lock);
568                 if (!my_factory->is_same_context(my_curr_device_id.load<tbb::relaxed>(), device_id)) {
569                     __TBB_ASSERT(my_host_ptr, "The buffer has not been mapped");
570                     opencl_async_msg<void*, Factory> d(my_host_ptr);
571                     my_factory->enqueue_unmap_buffer(device, *this, d);
572                     my_sending_event = *d.get_event();
573                     my_sending_event_present = true;
574                     enforce_cl_retcode(clRetainEvent(my_sending_event), "Failed to retain an event");
575                     my_host_ptr = NULL;
576                     my_curr_device_id.store<tbb::release>(device_id);
577                 }
578             }
579             __TBB_ASSERT(my_sending_event_present, NULL);
580         }
581 
582         // !e means that buffer has come from the host
583         if (!e && my_sending_event_present) e = &my_sending_event;
584 
585         __TBB_ASSERT(!my_host_ptr, "The buffer has not been unmapped");
586         return e ? opencl_async_msg<void*, Factory>(NULL, *e) : opencl_async_msg<void*, Factory>(NULL);
587     }
588 
589     virtual void map_memory( opencl_device, opencl_async_msg<void*, Factory> & ) = 0;
590 protected:
591     cl_mem my_cl_mem;
592     tbb::atomic<opencl_device::device_id_type> my_curr_device_id;
593     void* my_host_ptr;
594     Factory *my_factory;
595 
596     tbb::spin_mutex my_sending_lock;
597     bool my_sending_event_present;
598     cl_event my_sending_event;
599 };
600 
601 template <typename Factory>
602 class opencl_buffer_impl : public opencl_memory<Factory> {
603     size_t my_size;
604 public:
opencl_buffer_impl(size_t size,Factory & f)605     opencl_buffer_impl( size_t size, Factory& f ) : opencl_memory<Factory>( f ), my_size( size ) {
606         cl_int err;
607         this->my_cl_mem = clCreateBuffer( this->my_factory->context(), CL_MEM_ALLOC_HOST_PTR, size, NULL, &err );
608         enforce_cl_retcode( err, "Failed to create an OpenCL buffer" );
609     }
610 
611     // The constructor for subbuffers.
opencl_buffer_impl(cl_mem m,size_t index,size_t size,Factory & f)612     opencl_buffer_impl( cl_mem m, size_t index, size_t size, Factory& f ) : opencl_memory<Factory>( f ), my_size( size ) {
613         cl_int err;
614         cl_buffer_region region = { index, size };
615         this->my_cl_mem = clCreateSubBuffer( m, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err );
616         enforce_cl_retcode( err, "Failed to create an OpenCL subbuffer" );
617     }
618 
size()619     size_t size() const {
620         return my_size;
621     }
622 
map_memory(opencl_device device,opencl_async_msg<void *,Factory> & dmsg)623     void map_memory( opencl_device device, opencl_async_msg<void*, Factory> &dmsg ) __TBB_override {
624         this->my_factory->enqueue_map_buffer( device, *this, dmsg );
625     }
626 
627 #if TBB_USE_ASSERT
628     template <typename, typename>
629     friend class opencl_buffer;
630 #endif
631 };
632 
633 enum access_type {
634     read_write,
635     write_only,
636     read_only
637 };
638 
639 template <typename T, typename Factory = opencl_info::default_opencl_factory>
640 class __TBB_DEPRECATED_IN_VERBOSE_MODE
641 opencl_subbuffer;
642 
643 template <typename T, typename Factory = opencl_info::default_opencl_factory>
644 class __TBB_DEPRECATED_IN_VERBOSE_MODE
645 opencl_buffer {
646 public:
647     typedef cl_mem native_object_type;
648     typedef opencl_buffer memory_object_type;
649     typedef Factory opencl_factory_type;
650 
651     template<access_type a> using iterator = T*;
652 
653     template <access_type a>
access()654     iterator<a> access() const {
655         T* ptr = (T*)my_impl->get_host_ptr();
656         __TBB_ASSERT( ptr, NULL );
657         return iterator<a>( ptr );
658     }
659 
data()660     T* data() const { return &access<read_write>()[0]; }
661 
662     template <access_type a = read_write>
begin()663     iterator<a> begin() const { return access<a>(); }
664 
665     template <access_type a = read_write>
end()666     iterator<a> end() const { return access<a>()+my_impl->size()/sizeof(T); }
667 
size()668     size_t size() const { return my_impl->size()/sizeof(T); }
669 
670     T& operator[] ( ptrdiff_t k ) { return begin()[k]; }
671 
opencl_buffer()672     opencl_buffer() {}
673     opencl_buffer( size_t size );
opencl_buffer(Factory & f,size_t size)674     opencl_buffer( Factory &f, size_t size ) : my_impl( std::make_shared<impl_type>( size*sizeof(T), f ) ) {}
675 
native_object()676     cl_mem native_object() const {
677         return my_impl->get_cl_mem();
678     }
679 
memory_object()680     const opencl_buffer& memory_object() const {
681         return *this;
682     }
683 
send(opencl_device device,opencl_async_msg<opencl_buffer,Factory> & dependency)684     void send( opencl_device device, opencl_async_msg<opencl_buffer, Factory> &dependency ) const {
685         __TBB_ASSERT( dependency.data( /*wait = */false ) == *this, NULL );
686         opencl_async_msg<void*, Factory> d = my_impl->send( device, dependency.get_event() );
687         const cl_event *e = d.get_event();
688         if ( e ) dependency.set_event( *e );
689         else dependency.clear_event();
690     }
receive(const opencl_async_msg<opencl_buffer,Factory> & dependency)691     void receive( const opencl_async_msg<opencl_buffer, Factory> &dependency ) const {
692         __TBB_ASSERT( dependency.data( /*wait = */false ) == *this, NULL );
693         opencl_async_msg<void*, Factory> d = my_impl->receive( dependency.get_event() );
694         const cl_event *e = d.get_event();
695         if ( e ) dependency.set_event( *e );
696         else dependency.clear_event();
697     }
698 
699     opencl_subbuffer<T, Factory> subbuffer( size_t index, size_t size ) const;
700 private:
701     // The constructor for subbuffers.
opencl_buffer(Factory & f,cl_mem m,size_t index,size_t size)702     opencl_buffer( Factory &f, cl_mem m, size_t index, size_t size ) : my_impl( std::make_shared<impl_type>( m, index*sizeof(T), size*sizeof(T), f ) ) {}
703 
704     typedef opencl_buffer_impl<Factory> impl_type;
705 
706     std::shared_ptr<impl_type> my_impl;
707 
708     friend bool operator==(const opencl_buffer<T, Factory> &lhs, const opencl_buffer<T, Factory> &rhs) {
709         return lhs.my_impl == rhs.my_impl;
710     }
711 
712     template <typename>
713     friend class opencl_factory;
714     template <typename, typename>
715     friend class opencl_subbuffer;
716 };
717 
718 template <typename T, typename Factory>
719 class __TBB_DEPRECATED_IN_VERBOSE_MODE
720 opencl_subbuffer : public opencl_buffer<T, Factory> {
721     opencl_buffer<T, Factory> my_owner;
722 public:
opencl_subbuffer()723     opencl_subbuffer() {}
opencl_subbuffer(const opencl_buffer<T,Factory> & owner,size_t index,size_t size)724     opencl_subbuffer( const opencl_buffer<T, Factory> &owner, size_t index, size_t size ) :
725         opencl_buffer<T, Factory>( *owner.my_impl->factory(), owner.native_object(), index, size ), my_owner( owner ) {}
726 };
727 
728 template <typename T, typename Factory>
subbuffer(size_t index,size_t size)729 opencl_subbuffer<T, Factory> opencl_buffer<T, Factory>::subbuffer( size_t index, size_t size ) const {
730     return opencl_subbuffer<T, Factory>( *this, index, size );
731 }
732 
733 
734 #define is_typedef(type)                                                    \
735     template <typename T>                                                   \
736     struct is_##type {                                                      \
737         template <typename C>                                               \
738         static std::true_type check( typename C::type* );                   \
739         template <typename C>                                               \
740         static std::false_type check( ... );                                \
741                                                                             \
742         static const bool value = decltype(check<T>(0))::value;             \
743     }
744 
745 is_typedef( native_object_type );
746 is_typedef( memory_object_type );
747 
748 template <typename T>
get_native_object(const T & t)749 typename std::enable_if<is_native_object_type<T>::value, typename T::native_object_type>::type get_native_object( const T &t ) {
750     return t.native_object();
751 }
752 
753 template <typename T>
get_native_object(T t)754 typename std::enable_if<!is_native_object_type<T>::value, T>::type get_native_object( T t ) {
755     return t;
756 }
757 
758 // send_if_memory_object checks if the T type has memory_object_type and call the send method for the object.
759 template <typename T, typename Factory>
send_if_memory_object(opencl_device device,opencl_async_msg<T,Factory> & dmsg)760 typename std::enable_if<is_memory_object_type<T>::value>::type send_if_memory_object( opencl_device device, opencl_async_msg<T, Factory> &dmsg ) {
761     const T &t = dmsg.data( false );
762     typedef typename T::memory_object_type mem_obj_t;
763     mem_obj_t mem_obj = t.memory_object();
764     opencl_async_msg<mem_obj_t, Factory> d( mem_obj );
765     if ( dmsg.get_event() ) d.set_event( *dmsg.get_event() );
766     mem_obj.send( device, d );
767     if ( d.get_event() ) dmsg.set_event( *d.get_event() );
768 }
769 
770 template <typename T>
send_if_memory_object(opencl_device device,T & t)771 typename std::enable_if<is_memory_object_type<T>::value>::type send_if_memory_object( opencl_device device, T &t ) {
772     typedef typename T::memory_object_type mem_obj_t;
773     mem_obj_t mem_obj = t.memory_object();
774     opencl_async_msg<mem_obj_t, typename mem_obj_t::opencl_factory_type> dmsg( mem_obj );
775     mem_obj.send( device, dmsg );
776 }
777 
778 template <typename T>
send_if_memory_object(opencl_device,T &)779 typename std::enable_if<!is_memory_object_type<T>::value>::type send_if_memory_object( opencl_device, T& ) {};
780 
781 // receive_if_memory_object checks if the T type has memory_object_type and call the receive method for the object.
782 template <typename T, typename Factory>
receive_if_memory_object(const opencl_async_msg<T,Factory> & dmsg)783 typename std::enable_if<is_memory_object_type<T>::value>::type receive_if_memory_object( const opencl_async_msg<T, Factory> &dmsg ) {
784     const T &t = dmsg.data( false );
785     typedef typename T::memory_object_type mem_obj_t;
786     mem_obj_t mem_obj = t.memory_object();
787     opencl_async_msg<mem_obj_t, Factory> d( mem_obj );
788     if ( dmsg.get_event() ) d.set_event( *dmsg.get_event() );
789     mem_obj.receive( d );
790     if ( d.get_event() ) dmsg.set_event( *d.get_event() );
791 }
792 
793 template <typename T>
receive_if_memory_object(const T &)794 typename std::enable_if<!is_memory_object_type<T>::value>::type  receive_if_memory_object( const T& ) {}
795 
796 class __TBB_DEPRECATED_IN_VERBOSE_MODE opencl_range {
797 public:
798     typedef size_t range_index_type;
799     typedef std::array<range_index_type, 3> nd_range_type;
800 
801     template <typename G = std::initializer_list<int>, typename L = std::initializer_list<int>,
802         typename = typename std::enable_if<!std::is_same<typename std::decay<G>::type, opencl_range>::value>::type>
803     opencl_range(G&& global_work = std::initializer_list<int>({ 0 }), L&& local_work = std::initializer_list<int>({ 0, 0, 0 })) {
804         auto g_it = global_work.begin();
805         auto l_it = local_work.begin();
806         my_global_work_size = { {size_t(-1), size_t(-1), size_t(-1)} };
807         // my_local_work_size is still uninitialized
808         for (int s = 0; s < 3 && g_it != global_work.end(); ++g_it, ++l_it, ++s) {
809             __TBB_ASSERT(l_it != local_work.end(), "global_work & local_work must have same size");
810             my_global_work_size[s] = *g_it;
811             my_local_work_size[s] = *l_it;
812         }
813     }
814 
global_range()815     const nd_range_type& global_range() const { return my_global_work_size; }
local_range()816     const nd_range_type& local_range() const { return my_local_work_size; }
817 
818 private:
819     nd_range_type my_global_work_size;
820     nd_range_type my_local_work_size;
821 };
822 
823 template <typename DeviceFilter>
824 class __TBB_DEPRECATED_IN_VERBOSE_MODE opencl_factory {
825 public:
826     template<typename T> using async_msg_type = opencl_async_msg<T, opencl_factory<DeviceFilter>>;
827     typedef opencl_device device_type;
828 
829     class kernel : tbb::internal::no_assign {
830     public:
kernel(const kernel & k)831         kernel( const kernel& k ) : my_factory( k.my_factory ) {
832             // Clone my_cl_kernel via opencl_program
833             size_t ret_size = 0;
834 
835             std::vector<char> kernel_name;
836             for ( size_t curr_size = 32;; curr_size <<= 1 ) {
837                 kernel_name.resize( curr_size <<= 1 );
838                 enforce_cl_retcode( clGetKernelInfo( k.my_cl_kernel, CL_KERNEL_FUNCTION_NAME, curr_size, kernel_name.data(), &ret_size ), "Failed to get kernel info" );
839                 if ( ret_size < curr_size ) break;
840             }
841 
842             cl_program program;
843             enforce_cl_retcode( clGetKernelInfo( k.my_cl_kernel, CL_KERNEL_PROGRAM, sizeof(program), &program, &ret_size ), "Failed to get kernel info" );
844             __TBB_ASSERT( ret_size == sizeof(program), NULL );
845 
846             my_cl_kernel = opencl_program< factory_type >( my_factory, program ).get_cl_kernel( kernel_name.data() );
847         }
848 
~kernel()849         ~kernel() {
850             enforce_cl_retcode( clReleaseKernel( my_cl_kernel ), "Failed to release a kernel" );
851         }
852 
853     private:
854         typedef opencl_factory<DeviceFilter> factory_type;
855 
kernel(const cl_kernel & k,factory_type & f)856         kernel( const cl_kernel& k, factory_type& f ) : my_cl_kernel( k ), my_factory( f ) {}
857 
858         // Data
859         cl_kernel my_cl_kernel;
860         factory_type& my_factory;
861 
862         template <typename DeviceFilter_>
863         friend class opencl_factory;
864 
865         template <typename Factory>
866         friend class opencl_program;
867     };
868 
869     typedef kernel kernel_type;
870 
871     // 'range_type' enables kernel_executor with range support
872     // it affects expectations for enqueue_kernel(.....) interface method
873     typedef opencl_range range_type;
874 
opencl_factory()875     opencl_factory() {}
~opencl_factory()876     ~opencl_factory() {
877         if ( my_devices.size() ) {
878             for ( auto d = my_devices.begin(); d != my_devices.end(); ++d ) {
879                 enforce_cl_retcode( clReleaseCommandQueue( (*d).my_cl_command_queue ), "Failed to release a command queue" );
880             }
881             enforce_cl_retcode( clReleaseContext( my_cl_context ), "Failed to release a context" );
882         }
883     }
884 
init(const opencl_device_list & device_list)885     bool init( const opencl_device_list &device_list ) {
886         tbb::spin_mutex::scoped_lock lock( my_devices_mutex );
887         if ( !my_devices.size() ) {
888             my_devices = device_list;
889             return true;
890         }
891         return false;
892     }
893 
894 
895 private:
896     template <typename Factory>
enqueue_map_buffer(opencl_device device,opencl_buffer_impl<Factory> & buffer,opencl_async_msg<void *,Factory> & dmsg)897     void enqueue_map_buffer( opencl_device device, opencl_buffer_impl<Factory> &buffer, opencl_async_msg<void*, Factory>& dmsg ) {
898         cl_event const* e1 = dmsg.get_event();
899         cl_event e2;
900         cl_int err;
901         void *ptr = clEnqueueMapBuffer( device.my_cl_command_queue, buffer.get_cl_mem(), false, CL_MAP_READ | CL_MAP_WRITE, 0, buffer.size(),
902             e1 == NULL ? 0 : 1, e1, &e2, &err );
903         enforce_cl_retcode( err, "Failed to map a buffer" );
904         dmsg.data( false ) = ptr;
905         dmsg.set_event( e2 );
906         enforce_cl_retcode( clReleaseEvent( e2 ), "Failed to release an event" );
907     }
908 
909 
910     template <typename Factory>
enqueue_unmap_buffer(opencl_device device,opencl_memory<Factory> & memory,opencl_async_msg<void *,Factory> & dmsg)911     void enqueue_unmap_buffer( opencl_device device, opencl_memory<Factory> &memory, opencl_async_msg<void*, Factory>& dmsg ) {
912         cl_event const* e1 = dmsg.get_event();
913         cl_event e2;
914         enforce_cl_retcode(
915             clEnqueueUnmapMemObject( device.my_cl_command_queue, memory.get_cl_mem(), memory.get_host_ptr(), e1 == NULL ? 0 : 1, e1, &e2 ),
916            "Failed to unmap a buffer" );
917         dmsg.set_event( e2 );
918         enforce_cl_retcode( clReleaseEvent( e2 ), "Failed to release an event" );
919     }
920 
921     // --------- Kernel argument & event list helpers --------- //
922     template <size_t NUM_ARGS, typename T>
process_one_arg(const kernel_type & kernel,std::array<cl_event,NUM_ARGS> &,int &,int & place,const T & t)923     void process_one_arg( const kernel_type& kernel, std::array<cl_event, NUM_ARGS>&, int&, int& place, const T& t ) {
924         auto p = get_native_object(t);
925         enforce_cl_retcode( clSetKernelArg(kernel.my_cl_kernel, place++, sizeof(p), &p), "Failed to set a kernel argument" );
926     }
927 
928     template <size_t NUM_ARGS, typename T, typename F>
process_one_arg(const kernel_type & kernel,std::array<cl_event,NUM_ARGS> & events,int & num_events,int & place,const opencl_async_msg<T,F> & msg)929     void process_one_arg( const kernel_type& kernel, std::array<cl_event, NUM_ARGS>& events, int& num_events, int& place, const opencl_async_msg<T, F>& msg ) {
930         __TBB_ASSERT((static_cast<typename std::array<cl_event, NUM_ARGS>::size_type>(num_events) < events.size()), NULL);
931 
932         const cl_event * const e = msg.get_event();
933         if (e != NULL) {
934             events[num_events++] = *e;
935         }
936 
937         process_one_arg( kernel, events, num_events, place, msg.data(false) );
938     }
939 
940     template <size_t NUM_ARGS, typename T, typename ...Rest>
process_arg_list(const kernel_type & kernel,std::array<cl_event,NUM_ARGS> & events,int & num_events,int & place,const T & t,const Rest &...args)941     void process_arg_list( const kernel_type& kernel, std::array<cl_event, NUM_ARGS>& events, int& num_events, int& place, const T& t, const Rest&... args ) {
942         process_one_arg( kernel, events, num_events, place, t );
943         process_arg_list( kernel, events, num_events, place, args... );
944     }
945 
946     template <size_t NUM_ARGS>
process_arg_list(const kernel_type &,std::array<cl_event,NUM_ARGS> &,int &,int &)947     void process_arg_list( const kernel_type&, std::array<cl_event, NUM_ARGS>&, int&, int& ) {}
948     // ------------------------------------------- //
949     template <typename T>
update_one_arg(cl_event,T &)950     void update_one_arg( cl_event, T& ) {}
951 
952     template <typename T, typename F>
update_one_arg(cl_event e,opencl_async_msg<T,F> & msg)953     void update_one_arg( cl_event e, opencl_async_msg<T, F>& msg ) {
954         msg.set_event( e );
955     }
956 
957     template <typename T, typename ...Rest>
update_arg_list(cl_event e,T & t,Rest &...args)958     void update_arg_list( cl_event e, T& t, Rest&... args ) {
959         update_one_arg( e, t );
960         update_arg_list( e, args... );
961     }
962 
update_arg_list(cl_event)963     void update_arg_list( cl_event ) {}
964     // ------------------------------------------- //
965 public:
966     template <typename ...Args>
send_kernel(opencl_device device,const kernel_type & kernel,const range_type & work_size,Args &...args)967     void send_kernel( opencl_device device, const kernel_type& kernel, const range_type& work_size, Args&... args ) {
968         std::array<cl_event, sizeof...(Args)> events;
969         int num_events = 0;
970         int place = 0;
971         process_arg_list( kernel, events, num_events, place, args... );
972 
973         const cl_event e = send_kernel_impl( device, kernel.my_cl_kernel, work_size, num_events, events.data() );
974 
975         update_arg_list(e, args...);
976 
977         // Release our own reference to cl_event
978         enforce_cl_retcode( clReleaseEvent(e), "Failed to release an event" );
979     }
980 
981     // ------------------------------------------- //
982     template <typename T, typename ...Rest>
send_data(opencl_device device,T & t,Rest &...args)983     void send_data(opencl_device device, T& t, Rest&... args) {
984         send_if_memory_object( device, t );
985         send_data( device, args... );
986     }
987 
send_data(opencl_device)988     void send_data(opencl_device) {}
989     // ------------------------------------------- //
990 
991 private:
send_kernel_impl(opencl_device device,const cl_kernel & kernel,const range_type & work_size,cl_uint num_events,cl_event * event_list)992     cl_event send_kernel_impl( opencl_device device, const cl_kernel& kernel,
993         const range_type& work_size, cl_uint num_events, cl_event* event_list ) {
994         const typename range_type::nd_range_type g_offset = { { 0, 0, 0 } };
995         const typename range_type::nd_range_type& g_size = work_size.global_range();
996         const typename range_type::nd_range_type& l_size = work_size.local_range();
997         cl_uint s;
998         for ( s = 1; s < 3 && g_size[s] != size_t(-1); ++s) {}
999         cl_event event;
1000         enforce_cl_retcode(
1001             clEnqueueNDRangeKernel( device.my_cl_command_queue, kernel, s,
1002                 g_offset.data(), g_size.data(), l_size[0] ? l_size.data() : NULL, num_events, num_events ? event_list : NULL, &event ),
1003             "Failed to enqueue a kernel" );
1004         return event;
1005     }
1006 
1007     // ------------------------------------------- //
1008     template <typename T>
get_event_from_one_arg(cl_event &,const T &)1009     bool get_event_from_one_arg( cl_event&, const T& ) {
1010         return false;
1011     }
1012 
1013     template <typename T, typename F>
get_event_from_one_arg(cl_event & e,const opencl_async_msg<T,F> & msg)1014     bool get_event_from_one_arg( cl_event& e, const opencl_async_msg<T, F>& msg) {
1015         cl_event const *e_ptr = msg.get_event();
1016 
1017         if ( e_ptr != NULL ) {
1018             e = *e_ptr;
1019             return true;
1020         }
1021 
1022         return false;
1023     }
1024 
1025     template <typename T, typename ...Rest>
get_event_from_args(cl_event & e,const T & t,const Rest &...args)1026     bool get_event_from_args( cl_event& e, const T& t, const Rest&... args ) {
1027         if ( get_event_from_one_arg( e, t ) ) {
1028             return true;
1029         }
1030 
1031         return get_event_from_args( e, args... );
1032     }
1033 
get_event_from_args(cl_event &)1034     bool get_event_from_args( cl_event& ) {
1035         return false;
1036     }
1037     // ------------------------------------------- //
1038 
1039     struct finalize_fn : tbb::internal::no_assign {
~finalize_fnfinalize_fn1040         virtual ~finalize_fn() {}
operatorfinalize_fn1041         virtual void operator() () {}
1042     };
1043 
1044     template<typename Fn>
1045     struct finalize_fn_leaf : public finalize_fn {
1046         Fn my_fn;
finalize_fn_leaffinalize_fn_leaf1047         finalize_fn_leaf(Fn fn) : my_fn(fn) {}
operatorfinalize_fn_leaf1048         void operator() () __TBB_override { my_fn(); }
1049     };
1050 
finalize_callback(cl_event,cl_int event_command_exec_status,void * data)1051     static void CL_CALLBACK finalize_callback(cl_event, cl_int event_command_exec_status, void *data) {
1052         tbb::internal::suppress_unused_warning(event_command_exec_status);
1053         __TBB_ASSERT(event_command_exec_status == CL_COMPLETE, NULL);
1054 
1055         finalize_fn * const fn_ptr = static_cast<finalize_fn*>(data);
1056         __TBB_ASSERT(fn_ptr != NULL, "Invalid finalize function pointer");
1057         (*fn_ptr)();
1058 
1059         // Function pointer was created by 'new' & this callback must be called once only
1060         delete fn_ptr;
1061     }
1062 public:
1063     template <typename FinalizeFn, typename ...Args>
finalize(opencl_device device,FinalizeFn fn,Args &...args)1064     void finalize( opencl_device device, FinalizeFn fn, Args&... args ) {
1065         cl_event e;
1066 
1067         if ( get_event_from_args( e, args... ) ) {
1068             enforce_cl_retcode( clSetEventCallback( e, CL_COMPLETE, finalize_callback,
1069                 new finalize_fn_leaf<FinalizeFn>(fn) ), "Failed to set a callback" );
1070         }
1071 
1072         enforce_cl_retcode( clFlush( device.my_cl_command_queue ), "Failed to flush an OpenCL command queue" );
1073     }
1074 
devices()1075     const opencl_device_list& devices() {
1076         std::call_once( my_once_flag, &opencl_factory::init_once, this );
1077         return my_devices;
1078     }
1079 
1080 private:
is_same_context(opencl_device::device_id_type d1,opencl_device::device_id_type d2)1081     bool is_same_context( opencl_device::device_id_type d1, opencl_device::device_id_type d2 ) {
1082         __TBB_ASSERT( d1 != opencl_device::unknown && d2 != opencl_device::unknown, NULL );
1083         // Currently, factory supports only one context so if the both devices are not host it means the are in the same context.
1084         if ( d1 != opencl_device::host && d2 != opencl_device::host )
1085             return true;
1086         return d1 == d2;
1087     }
1088 private:
1089     opencl_factory( const opencl_factory& );
1090     opencl_factory& operator=(const opencl_factory&);
1091 
context()1092     cl_context context() {
1093         std::call_once( my_once_flag, &opencl_factory::init_once, this );
1094         return my_cl_context;
1095     }
1096 
init_once()1097     void init_once() {
1098         {
1099             tbb::spin_mutex::scoped_lock lock(my_devices_mutex);
1100             if (!my_devices.size())
1101                 my_devices = DeviceFilter()( opencl_info::available_devices() );
1102         }
1103 
1104         enforce_cl_retcode(my_devices.size() ? CL_SUCCESS : CL_INVALID_DEVICE, "No devices in the device list");
1105         cl_platform_id platform_id = my_devices.begin()->platform_id();
1106         for (opencl_device_list::iterator it = ++my_devices.begin(); it != my_devices.end(); ++it)
1107             enforce_cl_retcode(it->platform_id() == platform_id ? CL_SUCCESS : CL_INVALID_PLATFORM, "All devices should be in the same platform");
1108 
1109         std::vector<cl_device_id> cl_device_ids;
1110         for (auto d = my_devices.begin(); d != my_devices.end(); ++d) {
1111             cl_device_ids.push_back((*d).my_cl_device_id);
1112         }
1113 
1114         cl_context_properties context_properties[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id, (cl_context_properties)NULL };
1115         cl_int err;
1116         cl_context ctx = clCreateContext(context_properties,
1117             (cl_uint)cl_device_ids.size(),
1118             cl_device_ids.data(),
1119             NULL, NULL, &err);
1120         enforce_cl_retcode(err, "Failed to create context");
1121         my_cl_context = ctx;
1122 
1123         size_t device_counter = 0;
1124         for (auto d = my_devices.begin(); d != my_devices.end(); d++) {
1125             (*d).my_device_id = device_counter++;
1126             cl_int err2;
1127             cl_command_queue cq;
1128 #if CL_VERSION_2_0
1129             if ((*d).major_version() >= 2) {
1130                 if ((*d).out_of_order_exec_mode_on_host_present()) {
1131                     cl_queue_properties props[] = { CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, 0 };
1132                     cq = clCreateCommandQueueWithProperties(ctx, (*d).my_cl_device_id, props, &err2);
1133                 } else {
1134                     cl_queue_properties props[] = { 0 };
1135                     cq = clCreateCommandQueueWithProperties(ctx, (*d).my_cl_device_id, props, &err2);
1136                 }
1137             } else
1138 #endif
1139             {
1140                 cl_command_queue_properties props = (*d).out_of_order_exec_mode_on_host_present() ? CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE : 0;
1141                 // Suppress "declared deprecated" warning for the next line.
1142 #if __TBB_GCC_WARNING_SUPPRESSION_PRESENT
1143 #pragma GCC diagnostic push
1144 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
1145 #endif
1146 #if _MSC_VER || __INTEL_COMPILER
1147 #pragma warning( push )
1148 #if __INTEL_COMPILER
1149 #pragma warning (disable: 1478)
1150 #else
1151 #pragma warning (disable: 4996)
1152 #endif
1153 #endif
1154                 cq = clCreateCommandQueue(ctx, (*d).my_cl_device_id, props, &err2);
1155 #if _MSC_VER || __INTEL_COMPILER
1156 #pragma warning( pop )
1157 #endif
1158 #if __TBB_GCC_WARNING_SUPPRESSION_PRESENT
1159 #pragma GCC diagnostic pop
1160 #endif
1161             }
1162             enforce_cl_retcode(err2, "Failed to create command queue");
1163             (*d).my_cl_command_queue = cq;
1164         }
1165     }
1166 
1167     std::once_flag my_once_flag;
1168     opencl_device_list my_devices;
1169     cl_context my_cl_context;
1170 
1171     tbb::spin_mutex my_devices_mutex;
1172 
1173     template <typename Factory>
1174     friend class opencl_program;
1175     template <typename Factory>
1176     friend class opencl_buffer_impl;
1177     template <typename Factory>
1178     friend class opencl_memory;
1179 }; // class opencl_factory
1180 
1181 // TODO: consider this namespace as public API
1182 namespace opencl_info {
1183 
1184 // Default types
1185 
1186 template <typename Factory>
1187 struct default_device_selector {
operatordefault_device_selector1188     opencl_device operator()(Factory& f) {
1189         __TBB_ASSERT(!f.devices().empty(), "No available devices");
1190         return *(f.devices().begin());
1191     }
1192 };
1193 
1194 struct default_device_filter {
operatordefault_device_filter1195     opencl_device_list operator()(const opencl_device_list &devices) {
1196         opencl_device_list dl;
1197         cl_platform_id platform_id = devices.begin()->platform_id();
1198         for (opencl_device_list::const_iterator it = devices.cbegin(); it != devices.cend(); ++it) {
1199             if (it->platform_id() == platform_id) {
1200                 dl.add(*it);
1201             }
1202         }
1203         return dl;
1204     }
1205 };
1206 
1207 class default_opencl_factory : public opencl_factory < default_device_filter >, tbb::internal::no_copy {
1208 public:
1209     template<typename T> using async_msg_type = opencl_async_msg<T, default_opencl_factory>;
1210 
1211     friend default_opencl_factory& default_factory();
1212 
1213 private:
1214     default_opencl_factory() = default;
1215 };
1216 
default_factory()1217 inline default_opencl_factory& default_factory() {
1218     static default_opencl_factory default_factory;
1219     return default_factory;
1220 }
1221 
1222 } // namespace opencl_info
1223 
1224 template <typename T, typename Factory>
opencl_buffer(size_t size)1225 opencl_buffer<T, Factory>::opencl_buffer( size_t size ) : my_impl( std::make_shared<impl_type>( size*sizeof(T), opencl_info::default_factory() ) ) {}
1226 
1227 
1228 enum class opencl_program_type {
1229     SOURCE,
1230     PRECOMPILED,
1231     SPIR
1232 };
1233 
1234 template <typename Factory = opencl_info::default_opencl_factory>
1235 class __TBB_DEPRECATED_IN_VERBOSE_MODE opencl_program : tbb::internal::no_assign {
1236 public:
1237     typedef typename Factory::kernel_type kernel_type;
1238 
opencl_program(Factory & factory,opencl_program_type type,const std::string & program_name)1239     opencl_program( Factory& factory, opencl_program_type type, const std::string& program_name ) : my_factory( factory ), my_type(type) , my_arg_str( program_name) {}
opencl_program(Factory & factory,const char * program_name)1240     opencl_program( Factory& factory, const char* program_name ) : opencl_program( factory, std::string( program_name ) ) {}
opencl_program(Factory & factory,const std::string & program_name)1241     opencl_program( Factory& factory, const std::string& program_name ) : opencl_program( factory, opencl_program_type::SOURCE, program_name ) {}
1242 
opencl_program(opencl_program_type type,const std::string & program_name)1243     opencl_program( opencl_program_type type, const std::string& program_name ) : opencl_program( opencl_info::default_factory(), type, program_name ) {}
opencl_program(const char * program_name)1244     opencl_program( const char* program_name ) : opencl_program( opencl_info::default_factory(), program_name ) {}
opencl_program(const std::string & program_name)1245     opencl_program( const std::string& program_name ) : opencl_program( opencl_info::default_factory(), program_name ) {}
opencl_program(opencl_program_type type)1246     opencl_program( opencl_program_type type ) : opencl_program( opencl_info::default_factory(), type ) {}
1247 
opencl_program(const opencl_program & src)1248     opencl_program( const opencl_program &src ) : my_factory( src.my_factory ), my_type( src.type ), my_arg_str( src.my_arg_str ), my_cl_program( src.my_cl_program ) {
1249         // Set my_do_once_flag to the called state.
1250         std::call_once( my_do_once_flag, [](){} );
1251     }
1252 
get_kernel(const std::string & k)1253     kernel_type get_kernel( const std::string& k ) const {
1254         return kernel_type( get_cl_kernel(k), my_factory );
1255     }
1256 
1257 private:
opencl_program(Factory & factory,cl_program program)1258     opencl_program( Factory& factory, cl_program program ) : my_factory( factory ), my_cl_program( program ) {
1259         // Set my_do_once_flag to the called state.
1260         std::call_once( my_do_once_flag, [](){} );
1261     }
1262 
get_cl_kernel(const std::string & k)1263     cl_kernel get_cl_kernel( const std::string& k ) const {
1264         std::call_once( my_do_once_flag, [this, &k](){ this->init( k ); } );
1265         cl_int err;
1266         cl_kernel kernel = clCreateKernel( my_cl_program, k.c_str(), &err );
1267         enforce_cl_retcode( err, std::string( "Failed to create kernel: " ) + k );
1268         return kernel;
1269     }
1270 
1271     class file_reader {
1272     public:
file_reader(const std::string & filepath)1273         file_reader( const std::string& filepath ) {
1274             std::ifstream file_descriptor( filepath, std::ifstream::binary );
1275             if ( !file_descriptor.is_open() ) {
1276                 std::string str = std::string( "Could not open file: " ) + filepath;
1277                 std::cerr << str << std::endl;
1278                 throw str;
1279             }
1280             file_descriptor.seekg( 0, file_descriptor.end );
1281             size_t length = size_t( file_descriptor.tellg() );
1282             file_descriptor.seekg( 0, file_descriptor.beg );
1283             my_content.resize( length );
1284             char* begin = &*my_content.begin();
1285             file_descriptor.read( begin, length );
1286             file_descriptor.close();
1287         }
content()1288         const char* content() { return &*my_content.cbegin(); }
length()1289         size_t length() { return my_content.length(); }
1290     private:
1291         std::string my_content;
1292     };
1293 
1294     class opencl_program_builder {
1295     public:
1296         typedef void (CL_CALLBACK *cl_callback_type)(cl_program, void*);
opencl_program_builder(Factory & f,const std::string & name,cl_program program,cl_uint num_devices,cl_device_id * device_list,const char * options,cl_callback_type callback,void * user_data)1297         opencl_program_builder( Factory& f, const std::string& name, cl_program program,
1298                                 cl_uint num_devices, cl_device_id* device_list,
1299                                 const char* options, cl_callback_type callback,
1300                                 void* user_data ) {
1301             cl_int err = clBuildProgram( program, num_devices, device_list, options,
1302                                          callback, user_data );
1303             if( err == CL_SUCCESS )
1304                 return;
1305             std::string str = std::string( "Failed to build program: " ) + name;
1306             if ( err == CL_BUILD_PROGRAM_FAILURE ) {
1307                 const opencl_device_list &devices = f.devices();
1308                 for ( auto d = devices.begin(); d != devices.end(); ++d ) {
1309                     std::cerr << "Build log for device: " << (*d).name() << std::endl;
1310                     size_t log_size;
1311                     cl_int query_err = clGetProgramBuildInfo(
1312                         program, (*d).my_cl_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL,
1313                         &log_size );
1314                     enforce_cl_retcode( query_err, "Failed to get build log size" );
1315                     if( log_size ) {
1316                         std::vector<char> output;
1317                         output.resize( log_size );
1318                         query_err = clGetProgramBuildInfo(
1319                             program, (*d).my_cl_device_id, CL_PROGRAM_BUILD_LOG,
1320                             output.size(), output.data(), NULL );
1321                         enforce_cl_retcode( query_err, "Failed to get build output" );
1322                         std::cerr << output.data() << std::endl;
1323                     } else {
1324                         std::cerr << "No build log available" << std::endl;
1325                     }
1326                 }
1327             }
1328             enforce_cl_retcode( err, str );
1329         }
1330     };
1331 
1332     class opencl_device_filter {
1333     public:
1334         template<typename Filter>
opencl_device_filter(cl_uint & num_devices,cl_device_id * device_list,Filter filter,const char * message)1335         opencl_device_filter( cl_uint& num_devices, cl_device_id* device_list,
1336                               Filter filter, const char* message ) {
1337             for ( cl_uint i = 0; i < num_devices; ++i )
1338                 if ( filter(device_list[i]) ) {
1339                     device_list[i--] = device_list[--num_devices];
1340                 }
1341             if ( !num_devices )
1342                 enforce_cl_retcode( CL_DEVICE_NOT_AVAILABLE, message );
1343         }
1344     };
1345 
init(const std::string &)1346     void init( const std::string& ) const {
1347         cl_uint num_devices;
1348         enforce_cl_retcode( clGetContextInfo( my_factory.context(), CL_CONTEXT_NUM_DEVICES, sizeof( num_devices ), &num_devices, NULL ),
1349             "Failed to get OpenCL context info" );
1350         if ( !num_devices )
1351             enforce_cl_retcode( CL_DEVICE_NOT_FOUND, "No supported devices found" );
1352         cl_device_id *device_list = (cl_device_id *)alloca( num_devices*sizeof( cl_device_id ) );
1353         enforce_cl_retcode( clGetContextInfo( my_factory.context(), CL_CONTEXT_DEVICES, num_devices*sizeof( cl_device_id ), device_list, NULL ),
1354             "Failed to get OpenCL context info" );
1355         const char *options = NULL;
1356         switch ( my_type ) {
1357         case opencl_program_type::SOURCE: {
1358             file_reader fr( my_arg_str );
1359             const char *s[] = { fr.content() };
1360             const size_t l[] = { fr.length() };
1361             cl_int err;
1362             my_cl_program = clCreateProgramWithSource( my_factory.context(), 1, s, l, &err );
1363             enforce_cl_retcode( err, std::string( "Failed to create program: " ) + my_arg_str );
1364             opencl_device_filter(
1365                 num_devices, device_list,
1366                 []( const opencl_device& d ) -> bool {
1367                     return !d.compiler_available() || !d.linker_available();
1368                 }, "No one device supports building program from sources" );
1369             opencl_program_builder(
1370                 my_factory, my_arg_str, my_cl_program, num_devices, device_list,
1371                 options, /*callback*/ NULL, /*user data*/NULL );
1372             break;
1373         }
1374         case opencl_program_type::SPIR:
1375             options = "-x spir";
1376         case opencl_program_type::PRECOMPILED: {
1377             file_reader fr( my_arg_str );
1378             std::vector<const unsigned char*> s(
1379                 num_devices, reinterpret_cast<const unsigned char*>(fr.content()) );
1380             std::vector<size_t> l( num_devices, fr.length() );
1381             std::vector<cl_int> bin_statuses( num_devices, -1 );
1382             cl_int err;
1383             my_cl_program = clCreateProgramWithBinary( my_factory.context(), num_devices,
1384                                                        device_list, l.data(), s.data(),
1385                                                        bin_statuses.data(), &err );
1386             if( err != CL_SUCCESS ) {
1387                 std::string statuses_str;
1388                 for (auto st = bin_statuses.begin(); st != bin_statuses.end(); ++st) {
1389                     statuses_str += std::to_string((*st));
1390                 }
1391 
1392                 enforce_cl_retcode( err, std::string( "Failed to create program, error " + std::to_string( err ) + " : " ) + my_arg_str +
1393                                     std::string( ", binary_statuses = " ) + statuses_str );
1394             }
1395             opencl_program_builder(
1396                 my_factory, my_arg_str, my_cl_program, num_devices, device_list,
1397                 options, /*callback*/ NULL, /*user data*/NULL );
1398             break;
1399         }
1400         default:
1401             __TBB_ASSERT( false, "Unsupported program type" );
1402         }
1403     }
1404 
1405     Factory& my_factory;
1406     opencl_program_type my_type;
1407     std::string my_arg_str;
1408     mutable cl_program my_cl_program;
1409     mutable std::once_flag my_do_once_flag;
1410 
1411     template <typename DeviceFilter>
1412     friend class opencl_factory;
1413 
1414     friend class Factory::kernel;
1415 };
1416 
1417 template<typename... Args>
1418 class __TBB_DEPRECATED_IN_VERBOSE_MODE opencl_node;
1419 
1420 template<typename JP, typename Factory, typename... Ports>
1421 class __TBB_DEPRECATED_IN_VERBOSE_MODE
1422 opencl_node< tuple<Ports...>, JP, Factory > : public streaming_node< tuple<Ports...>, JP, Factory > {
1423     typedef streaming_node < tuple<Ports...>, JP, Factory > base_type;
1424 public:
1425     typedef typename base_type::kernel_type kernel_type;
1426 
opencl_node(graph & g,const kernel_type & kernel)1427     opencl_node( graph &g, const kernel_type& kernel )
1428         : base_type( g, kernel, opencl_info::default_device_selector< opencl_info::default_opencl_factory >(), opencl_info::default_factory() )
1429     {
1430         tbb::internal::fgt_multiinput_multioutput_node( CODEPTR(), tbb::internal::FLOW_OPENCL_NODE, this, &this->my_graph );
1431     }
1432 
opencl_node(graph & g,const kernel_type & kernel,Factory & f)1433     opencl_node( graph &g, const kernel_type& kernel, Factory &f )
1434         : base_type( g, kernel, opencl_info::default_device_selector <Factory >(), f )
1435     {
1436         tbb::internal::fgt_multiinput_multioutput_node( CODEPTR(), tbb::internal::FLOW_OPENCL_NODE, this, &this->my_graph );
1437     }
1438 
1439     template <typename DeviceSelector>
opencl_node(graph & g,const kernel_type & kernel,DeviceSelector d,Factory & f)1440     opencl_node( graph &g, const kernel_type& kernel, DeviceSelector d, Factory &f)
1441         : base_type( g, kernel, d, f)
1442     {
1443         tbb::internal::fgt_multiinput_multioutput_node( CODEPTR(), tbb::internal::FLOW_OPENCL_NODE, this, &this->my_graph );
1444     }
1445 };
1446 
1447 template<typename JP, typename... Ports>
1448 class __TBB_DEPRECATED_IN_VERBOSE_MODE
1449 opencl_node< tuple<Ports...>, JP > : public opencl_node < tuple<Ports...>, JP, opencl_info::default_opencl_factory > {
1450     typedef opencl_node < tuple<Ports...>, JP, opencl_info::default_opencl_factory > base_type;
1451 public:
1452     typedef typename base_type::kernel_type kernel_type;
1453 
opencl_node(graph & g,const kernel_type & kernel)1454     opencl_node( graph &g, const kernel_type& kernel )
1455         : base_type( g, kernel, opencl_info::default_device_selector< opencl_info::default_opencl_factory >(), opencl_info::default_factory() )
1456     {}
1457 
1458     template <typename DeviceSelector>
opencl_node(graph & g,const kernel_type & kernel,DeviceSelector d)1459     opencl_node( graph &g, const kernel_type& kernel, DeviceSelector d )
1460         : base_type( g, kernel, d, opencl_info::default_factory() )
1461     {}
1462 };
1463 
1464 template<typename... Ports>
1465 class __TBB_DEPRECATED_IN_VERBOSE_MODE
1466 opencl_node< tuple<Ports...> > : public opencl_node < tuple<Ports...>, queueing, opencl_info::default_opencl_factory > {
1467     typedef opencl_node < tuple<Ports...>, queueing, opencl_info::default_opencl_factory > base_type;
1468 public:
1469     typedef typename base_type::kernel_type kernel_type;
1470 
opencl_node(graph & g,const kernel_type & kernel)1471     opencl_node( graph &g, const kernel_type& kernel )
1472         : base_type( g, kernel, opencl_info::default_device_selector< opencl_info::default_opencl_factory >(), opencl_info::default_factory() )
1473     {}
1474 
1475     template <typename DeviceSelector>
opencl_node(graph & g,const kernel_type & kernel,DeviceSelector d)1476     opencl_node( graph &g, const kernel_type& kernel, DeviceSelector d )
1477         : base_type( g, kernel, d, opencl_info::default_factory() )
1478     {}
1479 };
1480 
1481 } // namespace interfaceX
1482 
1483 using interface11::opencl_node;
1484 using interface11::read_only;
1485 using interface11::read_write;
1486 using interface11::write_only;
1487 using interface11::opencl_buffer;
1488 using interface11::opencl_subbuffer;
1489 using interface11::opencl_device;
1490 using interface11::opencl_device_list;
1491 using interface11::opencl_program;
1492 using interface11::opencl_program_type;
1493 using interface11::opencl_async_msg;
1494 using interface11::opencl_factory;
1495 using interface11::opencl_range;
1496 
1497 } // namespace flow
1498 } // namespace tbb
1499 #endif /* __TBB_PREVIEW_OPENCL_NODE */
1500 
1501 #include "internal/_warning_suppress_disable_notice.h"
1502 #undef __TBB_flow_graph_opencl_node_H_include_area
1503 
1504 #endif // __TBB_flow_graph_opencl_node_H
1505