1 /*
2 Copyright (c) 2005-2020 Intel Corporation
3
4 Licensed under the Apache License, Version 2.0 (the "License");
5 you may not use this file except in compliance with the License.
6 You may obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10 Unless required by applicable law or agreed to in writing, software
11 distributed under the License is distributed on an "AS IS" BASIS,
12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 See the License for the specific language governing permissions and
14 limitations under the License.
15 */
16
17 #include "internal/_deprecated_header_message_guard.h"
18
19 #if !defined(__TBB_show_deprecation_message_flow_graph_opencl_node_H) && defined(__TBB_show_deprecated_header_message)
20 #define __TBB_show_deprecation_message_flow_graph_opencl_node_H
21 #pragma message("TBB Warning: tbb/flow_graph_opencl_node.h is deprecated. For details, please see Deprecated Features appendix in the TBB reference manual.")
22 #endif
23
24 #if defined(__TBB_show_deprecated_header_message)
25 #undef __TBB_show_deprecated_header_message
26 #endif
27
28 #ifndef __TBB_flow_graph_opencl_node_H
29 #define __TBB_flow_graph_opencl_node_H
30
31 #define __TBB_flow_graph_opencl_node_H_include_area
32 #include "internal/_warning_suppress_enable_notice.h"
33
34 #include "tbb/tbb_config.h"
35 #if __TBB_PREVIEW_OPENCL_NODE
36
37 #include "flow_graph.h"
38
39 #include <vector>
40 #include <string>
41 #include <algorithm>
42 #include <iostream>
43 #include <fstream>
44 #include <map>
45 #include <mutex>
46
47 #ifdef __APPLE__
48 #include <OpenCL/opencl.h>
49 #else
50 #include <CL/cl.h>
51 #endif
52
53 namespace tbb {
54 namespace flow {
55
56 namespace interface11 {
57
58 template <typename DeviceFilter>
59 class opencl_factory;
60
61 namespace opencl_info {
62 class default_opencl_factory;
63 }
64
65 template <typename Factory>
66 class opencl_program;
67
enforce_cl_retcode(cl_int err,std::string msg)68 inline void enforce_cl_retcode(cl_int err, std::string msg) {
69 if (err != CL_SUCCESS) {
70 std::cerr << msg << "; error code: " << err << std::endl;
71 throw msg;
72 }
73 }
74
75 template <typename T>
event_info(cl_event e,cl_event_info i)76 T event_info(cl_event e, cl_event_info i) {
77 T res;
78 enforce_cl_retcode(clGetEventInfo(e, i, sizeof(res), &res, NULL), "Failed to get OpenCL event information");
79 return res;
80 }
81
82 template <typename T>
device_info(cl_device_id d,cl_device_info i)83 T device_info(cl_device_id d, cl_device_info i) {
84 T res;
85 enforce_cl_retcode(clGetDeviceInfo(d, i, sizeof(res), &res, NULL), "Failed to get OpenCL device information");
86 return res;
87 }
88
89 template <>
90 inline std::string device_info<std::string>(cl_device_id d, cl_device_info i) {
91 size_t required;
92 enforce_cl_retcode(clGetDeviceInfo(d, i, 0, NULL, &required), "Failed to get OpenCL device information");
93
94 char *buff = (char*)alloca(required);
95 enforce_cl_retcode(clGetDeviceInfo(d, i, required, buff, NULL), "Failed to get OpenCL device information");
96
97 return buff;
98 }
99
100 template <typename T>
platform_info(cl_platform_id p,cl_platform_info i)101 T platform_info(cl_platform_id p, cl_platform_info i) {
102 T res;
103 enforce_cl_retcode(clGetPlatformInfo(p, i, sizeof(res), &res, NULL), "Failed to get OpenCL platform information");
104 return res;
105 }
106
107 template <>
108 inline std::string platform_info<std::string>(cl_platform_id p, cl_platform_info i) {
109 size_t required;
110 enforce_cl_retcode(clGetPlatformInfo(p, i, 0, NULL, &required), "Failed to get OpenCL platform information");
111
112 char *buff = (char*)alloca(required);
113 enforce_cl_retcode(clGetPlatformInfo(p, i, required, buff, NULL), "Failed to get OpenCL platform information");
114
115 return buff;
116 }
117
118
119 class __TBB_DEPRECATED_IN_VERBOSE_MODE opencl_device {
120 public:
121 typedef size_t device_id_type;
122 enum : device_id_type {
123 unknown = device_id_type( -2 ),
124 host = device_id_type( -1 )
125 };
126
opencl_device()127 opencl_device() : my_device_id( unknown ), my_cl_device_id( NULL ), my_cl_command_queue( NULL ) {}
128
opencl_device(cl_device_id d_id)129 opencl_device( cl_device_id d_id ) : my_device_id( unknown ), my_cl_device_id( d_id ), my_cl_command_queue( NULL ) {}
130
opencl_device(cl_device_id cl_d_id,device_id_type device_id)131 opencl_device( cl_device_id cl_d_id, device_id_type device_id ) : my_device_id( device_id ), my_cl_device_id( cl_d_id ), my_cl_command_queue( NULL ) {}
132
platform_profile()133 std::string platform_profile() const {
134 return platform_info<std::string>( platform_id(), CL_PLATFORM_PROFILE );
135 }
platform_version()136 std::string platform_version() const {
137 return platform_info<std::string>( platform_id(), CL_PLATFORM_VERSION );
138 }
platform_name()139 std::string platform_name() const {
140 return platform_info<std::string>( platform_id(), CL_PLATFORM_NAME );
141 }
platform_vendor()142 std::string platform_vendor() const {
143 return platform_info<std::string>( platform_id(), CL_PLATFORM_VENDOR );
144 }
platform_extensions()145 std::string platform_extensions() const {
146 return platform_info<std::string>( platform_id(), CL_PLATFORM_EXTENSIONS );
147 }
148
149 template <typename T>
info(cl_device_info i,T & t)150 void info( cl_device_info i, T &t ) const {
151 t = device_info<T>( my_cl_device_id, i );
152 }
version()153 std::string version() const {
154 // The version string format: OpenCL<space><major_version.minor_version><space><vendor-specific information>
155 return device_info<std::string>( my_cl_device_id, CL_DEVICE_VERSION );
156 }
major_version()157 int major_version() const {
158 int major;
159 std::sscanf( version().c_str(), "OpenCL %d", &major );
160 return major;
161 }
minor_version()162 int minor_version() const {
163 int major, minor;
164 std::sscanf( version().c_str(), "OpenCL %d.%d", &major, &minor );
165 return minor;
166 }
out_of_order_exec_mode_on_host_present()167 bool out_of_order_exec_mode_on_host_present() const {
168 #if CL_VERSION_2_0
169 if ( major_version() >= 2 )
170 return (device_info<cl_command_queue_properties>( my_cl_device_id, CL_DEVICE_QUEUE_ON_HOST_PROPERTIES ) & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) != 0;
171 else
172 #endif /* CL_VERSION_2_0 */
173 return (device_info<cl_command_queue_properties>( my_cl_device_id, CL_DEVICE_QUEUE_PROPERTIES ) & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) != 0;
174 }
out_of_order_exec_mode_on_device_present()175 bool out_of_order_exec_mode_on_device_present() const {
176 #if CL_VERSION_2_0
177 if ( major_version() >= 2 )
178 return (device_info<cl_command_queue_properties>( my_cl_device_id, CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES ) & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) != 0;
179 else
180 #endif /* CL_VERSION_2_0 */
181 return false;
182 }
max_work_item_sizes()183 std::array<size_t, 3> max_work_item_sizes() const {
184 return device_info<std::array<size_t, 3>>( my_cl_device_id, CL_DEVICE_MAX_WORK_ITEM_SIZES );
185 }
max_work_group_size()186 size_t max_work_group_size() const {
187 return device_info<size_t>( my_cl_device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE );
188 }
built_in_kernel_available(const std::string & k)189 bool built_in_kernel_available( const std::string& k ) const {
190 const std::string semi = ";";
191 // Added semicolumns to force an exact match (to avoid a partial match, e.g. "add" is partly matched with "madd").
192 return (semi + built_in_kernels() + semi).find( semi + k + semi ) != std::string::npos;
193 }
built_in_kernels()194 std::string built_in_kernels() const {
195 return device_info<std::string>( my_cl_device_id, CL_DEVICE_BUILT_IN_KERNELS );
196 }
name()197 std::string name() const {
198 return device_info<std::string>( my_cl_device_id, CL_DEVICE_NAME );
199 }
available()200 cl_bool available() const {
201 return device_info<cl_bool>( my_cl_device_id, CL_DEVICE_AVAILABLE );
202 }
compiler_available()203 cl_bool compiler_available() const {
204 return device_info<cl_bool>( my_cl_device_id, CL_DEVICE_COMPILER_AVAILABLE );
205 }
linker_available()206 cl_bool linker_available() const {
207 return device_info<cl_bool>( my_cl_device_id, CL_DEVICE_LINKER_AVAILABLE );
208 }
extension_available(const std::string & ext)209 bool extension_available( const std::string &ext ) const {
210 const std::string space = " ";
211 // Added space to force an exact match (to avoid a partial match, e.g. "ext" is partly matched with "ext2").
212 return (space + extensions() + space).find( space + ext + space ) != std::string::npos;
213 }
extensions()214 std::string extensions() const {
215 return device_info<std::string>( my_cl_device_id, CL_DEVICE_EXTENSIONS );
216 }
217
type()218 cl_device_type type() const {
219 return device_info<cl_device_type>( my_cl_device_id, CL_DEVICE_TYPE );
220 }
221
vendor()222 std::string vendor() const {
223 return device_info<std::string>( my_cl_device_id, CL_DEVICE_VENDOR );
224 }
225
address_bits()226 cl_uint address_bits() const {
227 return device_info<cl_uint>( my_cl_device_id, CL_DEVICE_ADDRESS_BITS );
228 }
229
device_id()230 cl_device_id device_id() const {
231 return my_cl_device_id;
232 }
233
command_queue()234 cl_command_queue command_queue() const {
235 return my_cl_command_queue;
236 }
237
set_command_queue(cl_command_queue cmd_queue)238 void set_command_queue( cl_command_queue cmd_queue ) {
239 my_cl_command_queue = cmd_queue;
240 }
241
platform_id()242 cl_platform_id platform_id() const {
243 return device_info<cl_platform_id>( my_cl_device_id, CL_DEVICE_PLATFORM );
244 }
245
246 private:
247
248 device_id_type my_device_id;
249 cl_device_id my_cl_device_id;
250 cl_command_queue my_cl_command_queue;
251
252 friend bool operator==(opencl_device d1, opencl_device d2) { return d1.my_cl_device_id == d2.my_cl_device_id; }
253
254 template <typename DeviceFilter>
255 friend class opencl_factory;
256 template <typename Factory>
257 friend class opencl_memory;
258 template <typename Factory>
259 friend class opencl_program;
260
261 #if TBB_USE_ASSERT
262 template <typename T, typename Factory>
263 friend class opencl_buffer;
264 #endif
265 };
266
267 class __TBB_DEPRECATED_IN_VERBOSE_MODE opencl_device_list {
268 typedef std::vector<opencl_device> container_type;
269 public:
270 typedef container_type::iterator iterator;
271 typedef container_type::const_iterator const_iterator;
272 typedef container_type::size_type size_type;
273
opencl_device_list()274 opencl_device_list() {}
opencl_device_list(std::initializer_list<opencl_device> il)275 opencl_device_list( std::initializer_list<opencl_device> il ) : my_container( il ) {}
276
add(opencl_device d)277 void add( opencl_device d ) { my_container.push_back( d ); }
size()278 size_type size() const { return my_container.size(); }
empty()279 bool empty() const { return my_container.empty(); }
begin()280 iterator begin() { return my_container.begin(); }
end()281 iterator end() { return my_container.end(); }
begin()282 const_iterator begin() const { return my_container.begin(); }
end()283 const_iterator end() const { return my_container.end(); }
cbegin()284 const_iterator cbegin() const { return my_container.cbegin(); }
cend()285 const_iterator cend() const { return my_container.cend(); }
286
287 private:
288 container_type my_container;
289 };
290
291 namespace internal {
292
293 // Retrieve all OpenCL devices from machine
find_available_devices()294 inline opencl_device_list find_available_devices() {
295 opencl_device_list opencl_devices;
296
297 cl_uint num_platforms;
298 enforce_cl_retcode(clGetPlatformIDs(0, NULL, &num_platforms), "clGetPlatformIDs failed");
299
300 std::vector<cl_platform_id> platforms(num_platforms);
301 enforce_cl_retcode(clGetPlatformIDs(num_platforms, platforms.data(), NULL), "clGetPlatformIDs failed");
302
303 cl_uint num_devices;
304 std::vector<cl_platform_id>::iterator platforms_it = platforms.begin();
305 cl_uint num_all_devices = 0;
306 while (platforms_it != platforms.end()) {
307 cl_int err = clGetDeviceIDs(*platforms_it, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
308 if (err == CL_DEVICE_NOT_FOUND) {
309 platforms_it = platforms.erase(platforms_it);
310 }
311 else {
312 enforce_cl_retcode(err, "clGetDeviceIDs failed");
313 num_all_devices += num_devices;
314 ++platforms_it;
315 }
316 }
317
318 std::vector<cl_device_id> devices(num_all_devices);
319 std::vector<cl_device_id>::iterator devices_it = devices.begin();
320 for (auto p = platforms.begin(); p != platforms.end(); ++p) {
321 enforce_cl_retcode(clGetDeviceIDs((*p), CL_DEVICE_TYPE_ALL, (cl_uint)std::distance(devices_it, devices.end()), &*devices_it, &num_devices), "clGetDeviceIDs failed");
322 devices_it += num_devices;
323 }
324
325 for (auto d = devices.begin(); d != devices.end(); ++d) {
326 opencl_devices.add(opencl_device((*d)));
327 }
328
329 return opencl_devices;
330 }
331
332 } // namespace internal
333
334 // TODO: consider this namespace as public API
335 namespace opencl_info {
336
available_devices()337 inline const opencl_device_list& available_devices() {
338 // Static storage for all available OpenCL devices on machine
339 static const opencl_device_list my_devices = internal::find_available_devices();
340 return my_devices;
341 }
342
343 } // namespace opencl_info
344
345
346 class callback_base : tbb::internal::no_copy {
347 public:
348 virtual void call() = 0;
~callback_base()349 virtual ~callback_base() {}
350 };
351
352 template <typename Callback, typename T>
353 class callback : public callback_base {
354 Callback my_callback;
355 T my_data;
356 public:
callback(Callback c,const T & t)357 callback( Callback c, const T& t ) : my_callback( c ), my_data( t ) {}
358
call()359 void call() __TBB_override {
360 my_callback( my_data );
361 }
362 };
363
364 template <typename T, typename Factory = opencl_info::default_opencl_factory>
365 class __TBB_DEPRECATED_IN_VERBOSE_MODE opencl_async_msg : public async_msg<T> {
366 public:
367 typedef T value_type;
368
opencl_async_msg()369 opencl_async_msg() : my_callback_flag_ptr( std::make_shared< tbb::atomic<bool>>() ) {
370 my_callback_flag_ptr->store<tbb::relaxed>(false);
371 }
372
opencl_async_msg(const T & data)373 explicit opencl_async_msg( const T& data ) : my_data(data), my_callback_flag_ptr( std::make_shared<tbb::atomic<bool>>() ) {
374 my_callback_flag_ptr->store<tbb::relaxed>(false);
375 }
376
opencl_async_msg(const T & data,cl_event event)377 opencl_async_msg( const T& data, cl_event event ) : my_data(data), my_event(event), my_is_event(true), my_callback_flag_ptr( std::make_shared<tbb::atomic<bool>>() ) {
378 my_callback_flag_ptr->store<tbb::relaxed>(false);
379 enforce_cl_retcode( clRetainEvent( my_event ), "Failed to retain an event" );
380 }
381
382 T& data( bool wait = true ) {
383 if ( my_is_event && wait ) {
384 enforce_cl_retcode( clWaitForEvents( 1, &my_event ), "Failed to wait for an event" );
385 enforce_cl_retcode( clReleaseEvent( my_event ), "Failed to release an event" );
386 my_is_event = false;
387 }
388 return my_data;
389 }
390
391 const T& data( bool wait = true ) const {
392 if ( my_is_event && wait ) {
393 enforce_cl_retcode( clWaitForEvents( 1, &my_event ), "Failed to wait for an event" );
394 enforce_cl_retcode( clReleaseEvent( my_event ), "Failed to release an event" );
395 my_is_event = false;
396 }
397 return my_data;
398 }
399
opencl_async_msg(const opencl_async_msg & dmsg)400 opencl_async_msg( const opencl_async_msg &dmsg ) : async_msg<T>(dmsg),
401 my_data(dmsg.my_data), my_event(dmsg.my_event), my_is_event( dmsg.my_is_event ),
402 my_callback_flag_ptr(dmsg.my_callback_flag_ptr)
403 {
404 if ( my_is_event )
405 enforce_cl_retcode( clRetainEvent( my_event ), "Failed to retain an event" );
406 }
407
opencl_async_msg(opencl_async_msg && dmsg)408 opencl_async_msg( opencl_async_msg &&dmsg ) : async_msg<T>(std::move(dmsg)),
409 my_data(std::move(dmsg.my_data)), my_event(dmsg.my_event), my_is_event(dmsg.my_is_event),
410 my_callback_flag_ptr( std::move(dmsg.my_callback_flag_ptr) )
411 {
412 dmsg.my_is_event = false;
413 }
414
415 opencl_async_msg& operator=(const opencl_async_msg &dmsg) {
416 async_msg<T>::operator =(dmsg);
417
418 // Release original event
419 if ( my_is_event )
420 enforce_cl_retcode( clReleaseEvent( my_event ), "Failed to retain an event" );
421
422 my_data = dmsg.my_data;
423 my_event = dmsg.my_event;
424 my_is_event = dmsg.my_is_event;
425
426 // Retain copied event
427 if ( my_is_event )
428 enforce_cl_retcode( clRetainEvent( my_event ), "Failed to retain an event" );
429
430 my_callback_flag_ptr = dmsg.my_callback_flag_ptr;
431 return *this;
432 }
433
~opencl_async_msg()434 ~opencl_async_msg() {
435 if ( my_is_event )
436 enforce_cl_retcode( clReleaseEvent( my_event ), "Failed to release an event" );
437 }
438
get_event()439 cl_event const * get_event() const { return my_is_event ? &my_event : NULL; }
set_event(cl_event e)440 void set_event( cl_event e ) const {
441 if ( my_is_event ) {
442 cl_command_queue cq = event_info<cl_command_queue>( my_event, CL_EVENT_COMMAND_QUEUE );
443 if ( cq != event_info<cl_command_queue>( e, CL_EVENT_COMMAND_QUEUE ) )
444 enforce_cl_retcode( clFlush( cq ), "Failed to flush an OpenCL command queue" );
445 enforce_cl_retcode( clReleaseEvent( my_event ), "Failed to release an event" );
446 }
447 my_is_event = true;
448 my_event = e;
449 clRetainEvent( my_event );
450 }
451
clear_event()452 void clear_event() const {
453 if ( my_is_event ) {
454 enforce_cl_retcode( clFlush( event_info<cl_command_queue>( my_event, CL_EVENT_COMMAND_QUEUE ) ), "Failed to flush an OpenCL command queue" );
455 enforce_cl_retcode( clReleaseEvent( my_event ), "Failed to release an event" );
456 }
457 my_is_event = false;
458 }
459
460 template <typename Callback>
register_callback(Callback c)461 void register_callback( Callback c ) const {
462 __TBB_ASSERT( my_is_event, "The OpenCL event is not set" );
463 enforce_cl_retcode( clSetEventCallback( my_event, CL_COMPLETE, register_callback_func, new callback<Callback, T>( c, my_data ) ), "Failed to set an OpenCL callback" );
464 }
465
466 operator T&() { return data(); }
467 operator const T&() const { return data(); }
468
469 protected:
470 // Overridden in this derived class to inform that
471 // async calculation chain is over
finalize()472 void finalize() const __TBB_override {
473 receive_if_memory_object(*this);
474 if (! my_callback_flag_ptr->fetch_and_store(true)) {
475 opencl_async_msg a(*this);
476 if (my_is_event) {
477 register_callback([a](const T& t) mutable {
478 a.set(t);
479 });
480 }
481 else {
482 a.set(my_data);
483 }
484 }
485 clear_event();
486 }
487
488 private:
register_callback_func(cl_event,cl_int event_command_exec_status,void * data)489 static void CL_CALLBACK register_callback_func( cl_event, cl_int event_command_exec_status, void *data ) {
490 tbb::internal::suppress_unused_warning( event_command_exec_status );
491 __TBB_ASSERT( event_command_exec_status == CL_COMPLETE, NULL );
492 __TBB_ASSERT( data, NULL );
493 callback_base *c = static_cast<callback_base*>(data);
494 c->call();
495 delete c;
496 }
497
498 T my_data;
499 mutable cl_event my_event;
500 mutable bool my_is_event = false;
501
502 std::shared_ptr< tbb::atomic<bool> > my_callback_flag_ptr;
503 };
504
505 template <typename K, typename T, typename Factory>
key_from_message(const opencl_async_msg<T,Factory> & dmsg)506 K key_from_message( const opencl_async_msg<T, Factory> &dmsg ) {
507 using tbb::flow::key_from_message;
508 const T &t = dmsg.data( false );
509 __TBB_STATIC_ASSERT( true, "" );
510 return key_from_message<K, T>( t );
511 }
512
513 template <typename Factory>
514 class opencl_memory {
515 public:
opencl_memory()516 opencl_memory() {}
opencl_memory(Factory & f)517 opencl_memory( Factory &f ) : my_host_ptr( NULL ), my_factory( &f ), my_sending_event_present( false ) {
518 my_curr_device_id = my_factory->devices().begin()->my_device_id;
519 }
520
~opencl_memory()521 virtual ~opencl_memory() {
522 if ( my_sending_event_present ) enforce_cl_retcode( clReleaseEvent( my_sending_event ), "Failed to release an event for the OpenCL buffer" );
523 enforce_cl_retcode( clReleaseMemObject( my_cl_mem ), "Failed to release an memory object" );
524 }
525
get_cl_mem()526 cl_mem get_cl_mem() const {
527 return my_cl_mem;
528 }
529
get_host_ptr()530 void* get_host_ptr() {
531 if ( !my_host_ptr ) {
532 opencl_async_msg<void*, Factory> d = receive( NULL );
533 d.data();
534 __TBB_ASSERT( d.data() == my_host_ptr, NULL );
535 }
536 return my_host_ptr;
537 }
538
factory()539 Factory *factory() const { return my_factory; }
540
receive(const cl_event * e)541 opencl_async_msg<void*, Factory> receive(const cl_event *e) {
542 opencl_async_msg<void*, Factory> d;
543 if (e) {
544 d = opencl_async_msg<void*, Factory>(my_host_ptr, *e);
545 } else {
546 d = opencl_async_msg<void*, Factory>(my_host_ptr);
547 }
548
549 // Concurrent receives are prohibited so we do not worry about synchronization.
550 if (my_curr_device_id.load<tbb::relaxed>() != opencl_device::host) {
551 map_memory(*my_factory->devices().begin(), d);
552 my_curr_device_id.store<tbb::relaxed>(opencl_device::host);
553 my_host_ptr = d.data(false);
554 }
555 // Release the sending event
556 if (my_sending_event_present) {
557 enforce_cl_retcode(clReleaseEvent(my_sending_event), "Failed to release an event");
558 my_sending_event_present = false;
559 }
560 return d;
561 }
562
send(opencl_device device,const cl_event * e)563 opencl_async_msg<void*, Factory> send(opencl_device device, const cl_event *e) {
564 opencl_device::device_id_type device_id = device.my_device_id;
565 if (!my_factory->is_same_context(my_curr_device_id.load<tbb::acquire>(), device_id)) {
566 {
567 tbb::spin_mutex::scoped_lock lock(my_sending_lock);
568 if (!my_factory->is_same_context(my_curr_device_id.load<tbb::relaxed>(), device_id)) {
569 __TBB_ASSERT(my_host_ptr, "The buffer has not been mapped");
570 opencl_async_msg<void*, Factory> d(my_host_ptr);
571 my_factory->enqueue_unmap_buffer(device, *this, d);
572 my_sending_event = *d.get_event();
573 my_sending_event_present = true;
574 enforce_cl_retcode(clRetainEvent(my_sending_event), "Failed to retain an event");
575 my_host_ptr = NULL;
576 my_curr_device_id.store<tbb::release>(device_id);
577 }
578 }
579 __TBB_ASSERT(my_sending_event_present, NULL);
580 }
581
582 // !e means that buffer has come from the host
583 if (!e && my_sending_event_present) e = &my_sending_event;
584
585 __TBB_ASSERT(!my_host_ptr, "The buffer has not been unmapped");
586 return e ? opencl_async_msg<void*, Factory>(NULL, *e) : opencl_async_msg<void*, Factory>(NULL);
587 }
588
589 virtual void map_memory( opencl_device, opencl_async_msg<void*, Factory> & ) = 0;
590 protected:
591 cl_mem my_cl_mem;
592 tbb::atomic<opencl_device::device_id_type> my_curr_device_id;
593 void* my_host_ptr;
594 Factory *my_factory;
595
596 tbb::spin_mutex my_sending_lock;
597 bool my_sending_event_present;
598 cl_event my_sending_event;
599 };
600
601 template <typename Factory>
602 class opencl_buffer_impl : public opencl_memory<Factory> {
603 size_t my_size;
604 public:
opencl_buffer_impl(size_t size,Factory & f)605 opencl_buffer_impl( size_t size, Factory& f ) : opencl_memory<Factory>( f ), my_size( size ) {
606 cl_int err;
607 this->my_cl_mem = clCreateBuffer( this->my_factory->context(), CL_MEM_ALLOC_HOST_PTR, size, NULL, &err );
608 enforce_cl_retcode( err, "Failed to create an OpenCL buffer" );
609 }
610
611 // The constructor for subbuffers.
opencl_buffer_impl(cl_mem m,size_t index,size_t size,Factory & f)612 opencl_buffer_impl( cl_mem m, size_t index, size_t size, Factory& f ) : opencl_memory<Factory>( f ), my_size( size ) {
613 cl_int err;
614 cl_buffer_region region = { index, size };
615 this->my_cl_mem = clCreateSubBuffer( m, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err );
616 enforce_cl_retcode( err, "Failed to create an OpenCL subbuffer" );
617 }
618
size()619 size_t size() const {
620 return my_size;
621 }
622
map_memory(opencl_device device,opencl_async_msg<void *,Factory> & dmsg)623 void map_memory( opencl_device device, opencl_async_msg<void*, Factory> &dmsg ) __TBB_override {
624 this->my_factory->enqueue_map_buffer( device, *this, dmsg );
625 }
626
627 #if TBB_USE_ASSERT
628 template <typename, typename>
629 friend class opencl_buffer;
630 #endif
631 };
632
633 enum access_type {
634 read_write,
635 write_only,
636 read_only
637 };
638
639 template <typename T, typename Factory = opencl_info::default_opencl_factory>
640 class __TBB_DEPRECATED_IN_VERBOSE_MODE
641 opencl_subbuffer;
642
643 template <typename T, typename Factory = opencl_info::default_opencl_factory>
644 class __TBB_DEPRECATED_IN_VERBOSE_MODE
645 opencl_buffer {
646 public:
647 typedef cl_mem native_object_type;
648 typedef opencl_buffer memory_object_type;
649 typedef Factory opencl_factory_type;
650
651 template<access_type a> using iterator = T*;
652
653 template <access_type a>
access()654 iterator<a> access() const {
655 T* ptr = (T*)my_impl->get_host_ptr();
656 __TBB_ASSERT( ptr, NULL );
657 return iterator<a>( ptr );
658 }
659
data()660 T* data() const { return &access<read_write>()[0]; }
661
662 template <access_type a = read_write>
begin()663 iterator<a> begin() const { return access<a>(); }
664
665 template <access_type a = read_write>
end()666 iterator<a> end() const { return access<a>()+my_impl->size()/sizeof(T); }
667
size()668 size_t size() const { return my_impl->size()/sizeof(T); }
669
670 T& operator[] ( ptrdiff_t k ) { return begin()[k]; }
671
opencl_buffer()672 opencl_buffer() {}
673 opencl_buffer( size_t size );
opencl_buffer(Factory & f,size_t size)674 opencl_buffer( Factory &f, size_t size ) : my_impl( std::make_shared<impl_type>( size*sizeof(T), f ) ) {}
675
native_object()676 cl_mem native_object() const {
677 return my_impl->get_cl_mem();
678 }
679
memory_object()680 const opencl_buffer& memory_object() const {
681 return *this;
682 }
683
send(opencl_device device,opencl_async_msg<opencl_buffer,Factory> & dependency)684 void send( opencl_device device, opencl_async_msg<opencl_buffer, Factory> &dependency ) const {
685 __TBB_ASSERT( dependency.data( /*wait = */false ) == *this, NULL );
686 opencl_async_msg<void*, Factory> d = my_impl->send( device, dependency.get_event() );
687 const cl_event *e = d.get_event();
688 if ( e ) dependency.set_event( *e );
689 else dependency.clear_event();
690 }
receive(const opencl_async_msg<opencl_buffer,Factory> & dependency)691 void receive( const opencl_async_msg<opencl_buffer, Factory> &dependency ) const {
692 __TBB_ASSERT( dependency.data( /*wait = */false ) == *this, NULL );
693 opencl_async_msg<void*, Factory> d = my_impl->receive( dependency.get_event() );
694 const cl_event *e = d.get_event();
695 if ( e ) dependency.set_event( *e );
696 else dependency.clear_event();
697 }
698
699 opencl_subbuffer<T, Factory> subbuffer( size_t index, size_t size ) const;
700 private:
701 // The constructor for subbuffers.
opencl_buffer(Factory & f,cl_mem m,size_t index,size_t size)702 opencl_buffer( Factory &f, cl_mem m, size_t index, size_t size ) : my_impl( std::make_shared<impl_type>( m, index*sizeof(T), size*sizeof(T), f ) ) {}
703
704 typedef opencl_buffer_impl<Factory> impl_type;
705
706 std::shared_ptr<impl_type> my_impl;
707
708 friend bool operator==(const opencl_buffer<T, Factory> &lhs, const opencl_buffer<T, Factory> &rhs) {
709 return lhs.my_impl == rhs.my_impl;
710 }
711
712 template <typename>
713 friend class opencl_factory;
714 template <typename, typename>
715 friend class opencl_subbuffer;
716 };
717
718 template <typename T, typename Factory>
719 class __TBB_DEPRECATED_IN_VERBOSE_MODE
720 opencl_subbuffer : public opencl_buffer<T, Factory> {
721 opencl_buffer<T, Factory> my_owner;
722 public:
opencl_subbuffer()723 opencl_subbuffer() {}
opencl_subbuffer(const opencl_buffer<T,Factory> & owner,size_t index,size_t size)724 opencl_subbuffer( const opencl_buffer<T, Factory> &owner, size_t index, size_t size ) :
725 opencl_buffer<T, Factory>( *owner.my_impl->factory(), owner.native_object(), index, size ), my_owner( owner ) {}
726 };
727
728 template <typename T, typename Factory>
subbuffer(size_t index,size_t size)729 opencl_subbuffer<T, Factory> opencl_buffer<T, Factory>::subbuffer( size_t index, size_t size ) const {
730 return opencl_subbuffer<T, Factory>( *this, index, size );
731 }
732
733
734 #define is_typedef(type) \
735 template <typename T> \
736 struct is_##type { \
737 template <typename C> \
738 static std::true_type check( typename C::type* ); \
739 template <typename C> \
740 static std::false_type check( ... ); \
741 \
742 static const bool value = decltype(check<T>(0))::value; \
743 }
744
745 is_typedef( native_object_type );
746 is_typedef( memory_object_type );
747
748 template <typename T>
get_native_object(const T & t)749 typename std::enable_if<is_native_object_type<T>::value, typename T::native_object_type>::type get_native_object( const T &t ) {
750 return t.native_object();
751 }
752
753 template <typename T>
get_native_object(T t)754 typename std::enable_if<!is_native_object_type<T>::value, T>::type get_native_object( T t ) {
755 return t;
756 }
757
758 // send_if_memory_object checks if the T type has memory_object_type and call the send method for the object.
759 template <typename T, typename Factory>
send_if_memory_object(opencl_device device,opencl_async_msg<T,Factory> & dmsg)760 typename std::enable_if<is_memory_object_type<T>::value>::type send_if_memory_object( opencl_device device, opencl_async_msg<T, Factory> &dmsg ) {
761 const T &t = dmsg.data( false );
762 typedef typename T::memory_object_type mem_obj_t;
763 mem_obj_t mem_obj = t.memory_object();
764 opencl_async_msg<mem_obj_t, Factory> d( mem_obj );
765 if ( dmsg.get_event() ) d.set_event( *dmsg.get_event() );
766 mem_obj.send( device, d );
767 if ( d.get_event() ) dmsg.set_event( *d.get_event() );
768 }
769
770 template <typename T>
send_if_memory_object(opencl_device device,T & t)771 typename std::enable_if<is_memory_object_type<T>::value>::type send_if_memory_object( opencl_device device, T &t ) {
772 typedef typename T::memory_object_type mem_obj_t;
773 mem_obj_t mem_obj = t.memory_object();
774 opencl_async_msg<mem_obj_t, typename mem_obj_t::opencl_factory_type> dmsg( mem_obj );
775 mem_obj.send( device, dmsg );
776 }
777
778 template <typename T>
send_if_memory_object(opencl_device,T &)779 typename std::enable_if<!is_memory_object_type<T>::value>::type send_if_memory_object( opencl_device, T& ) {};
780
781 // receive_if_memory_object checks if the T type has memory_object_type and call the receive method for the object.
782 template <typename T, typename Factory>
receive_if_memory_object(const opencl_async_msg<T,Factory> & dmsg)783 typename std::enable_if<is_memory_object_type<T>::value>::type receive_if_memory_object( const opencl_async_msg<T, Factory> &dmsg ) {
784 const T &t = dmsg.data( false );
785 typedef typename T::memory_object_type mem_obj_t;
786 mem_obj_t mem_obj = t.memory_object();
787 opencl_async_msg<mem_obj_t, Factory> d( mem_obj );
788 if ( dmsg.get_event() ) d.set_event( *dmsg.get_event() );
789 mem_obj.receive( d );
790 if ( d.get_event() ) dmsg.set_event( *d.get_event() );
791 }
792
793 template <typename T>
receive_if_memory_object(const T &)794 typename std::enable_if<!is_memory_object_type<T>::value>::type receive_if_memory_object( const T& ) {}
795
796 class __TBB_DEPRECATED_IN_VERBOSE_MODE opencl_range {
797 public:
798 typedef size_t range_index_type;
799 typedef std::array<range_index_type, 3> nd_range_type;
800
801 template <typename G = std::initializer_list<int>, typename L = std::initializer_list<int>,
802 typename = typename std::enable_if<!std::is_same<typename std::decay<G>::type, opencl_range>::value>::type>
803 opencl_range(G&& global_work = std::initializer_list<int>({ 0 }), L&& local_work = std::initializer_list<int>({ 0, 0, 0 })) {
804 auto g_it = global_work.begin();
805 auto l_it = local_work.begin();
806 my_global_work_size = { {size_t(-1), size_t(-1), size_t(-1)} };
807 // my_local_work_size is still uninitialized
808 for (int s = 0; s < 3 && g_it != global_work.end(); ++g_it, ++l_it, ++s) {
809 __TBB_ASSERT(l_it != local_work.end(), "global_work & local_work must have same size");
810 my_global_work_size[s] = *g_it;
811 my_local_work_size[s] = *l_it;
812 }
813 }
814
global_range()815 const nd_range_type& global_range() const { return my_global_work_size; }
local_range()816 const nd_range_type& local_range() const { return my_local_work_size; }
817
818 private:
819 nd_range_type my_global_work_size;
820 nd_range_type my_local_work_size;
821 };
822
823 template <typename DeviceFilter>
824 class __TBB_DEPRECATED_IN_VERBOSE_MODE opencl_factory {
825 public:
826 template<typename T> using async_msg_type = opencl_async_msg<T, opencl_factory<DeviceFilter>>;
827 typedef opencl_device device_type;
828
829 class kernel : tbb::internal::no_assign {
830 public:
kernel(const kernel & k)831 kernel( const kernel& k ) : my_factory( k.my_factory ) {
832 // Clone my_cl_kernel via opencl_program
833 size_t ret_size = 0;
834
835 std::vector<char> kernel_name;
836 for ( size_t curr_size = 32;; curr_size <<= 1 ) {
837 kernel_name.resize( curr_size <<= 1 );
838 enforce_cl_retcode( clGetKernelInfo( k.my_cl_kernel, CL_KERNEL_FUNCTION_NAME, curr_size, kernel_name.data(), &ret_size ), "Failed to get kernel info" );
839 if ( ret_size < curr_size ) break;
840 }
841
842 cl_program program;
843 enforce_cl_retcode( clGetKernelInfo( k.my_cl_kernel, CL_KERNEL_PROGRAM, sizeof(program), &program, &ret_size ), "Failed to get kernel info" );
844 __TBB_ASSERT( ret_size == sizeof(program), NULL );
845
846 my_cl_kernel = opencl_program< factory_type >( my_factory, program ).get_cl_kernel( kernel_name.data() );
847 }
848
~kernel()849 ~kernel() {
850 enforce_cl_retcode( clReleaseKernel( my_cl_kernel ), "Failed to release a kernel" );
851 }
852
853 private:
854 typedef opencl_factory<DeviceFilter> factory_type;
855
kernel(const cl_kernel & k,factory_type & f)856 kernel( const cl_kernel& k, factory_type& f ) : my_cl_kernel( k ), my_factory( f ) {}
857
858 // Data
859 cl_kernel my_cl_kernel;
860 factory_type& my_factory;
861
862 template <typename DeviceFilter_>
863 friend class opencl_factory;
864
865 template <typename Factory>
866 friend class opencl_program;
867 };
868
869 typedef kernel kernel_type;
870
871 // 'range_type' enables kernel_executor with range support
872 // it affects expectations for enqueue_kernel(.....) interface method
873 typedef opencl_range range_type;
874
opencl_factory()875 opencl_factory() {}
~opencl_factory()876 ~opencl_factory() {
877 if ( my_devices.size() ) {
878 for ( auto d = my_devices.begin(); d != my_devices.end(); ++d ) {
879 enforce_cl_retcode( clReleaseCommandQueue( (*d).my_cl_command_queue ), "Failed to release a command queue" );
880 }
881 enforce_cl_retcode( clReleaseContext( my_cl_context ), "Failed to release a context" );
882 }
883 }
884
init(const opencl_device_list & device_list)885 bool init( const opencl_device_list &device_list ) {
886 tbb::spin_mutex::scoped_lock lock( my_devices_mutex );
887 if ( !my_devices.size() ) {
888 my_devices = device_list;
889 return true;
890 }
891 return false;
892 }
893
894
895 private:
896 template <typename Factory>
enqueue_map_buffer(opencl_device device,opencl_buffer_impl<Factory> & buffer,opencl_async_msg<void *,Factory> & dmsg)897 void enqueue_map_buffer( opencl_device device, opencl_buffer_impl<Factory> &buffer, opencl_async_msg<void*, Factory>& dmsg ) {
898 cl_event const* e1 = dmsg.get_event();
899 cl_event e2;
900 cl_int err;
901 void *ptr = clEnqueueMapBuffer( device.my_cl_command_queue, buffer.get_cl_mem(), false, CL_MAP_READ | CL_MAP_WRITE, 0, buffer.size(),
902 e1 == NULL ? 0 : 1, e1, &e2, &err );
903 enforce_cl_retcode( err, "Failed to map a buffer" );
904 dmsg.data( false ) = ptr;
905 dmsg.set_event( e2 );
906 enforce_cl_retcode( clReleaseEvent( e2 ), "Failed to release an event" );
907 }
908
909
910 template <typename Factory>
enqueue_unmap_buffer(opencl_device device,opencl_memory<Factory> & memory,opencl_async_msg<void *,Factory> & dmsg)911 void enqueue_unmap_buffer( opencl_device device, opencl_memory<Factory> &memory, opencl_async_msg<void*, Factory>& dmsg ) {
912 cl_event const* e1 = dmsg.get_event();
913 cl_event e2;
914 enforce_cl_retcode(
915 clEnqueueUnmapMemObject( device.my_cl_command_queue, memory.get_cl_mem(), memory.get_host_ptr(), e1 == NULL ? 0 : 1, e1, &e2 ),
916 "Failed to unmap a buffer" );
917 dmsg.set_event( e2 );
918 enforce_cl_retcode( clReleaseEvent( e2 ), "Failed to release an event" );
919 }
920
921 // --------- Kernel argument & event list helpers --------- //
922 template <size_t NUM_ARGS, typename T>
process_one_arg(const kernel_type & kernel,std::array<cl_event,NUM_ARGS> &,int &,int & place,const T & t)923 void process_one_arg( const kernel_type& kernel, std::array<cl_event, NUM_ARGS>&, int&, int& place, const T& t ) {
924 auto p = get_native_object(t);
925 enforce_cl_retcode( clSetKernelArg(kernel.my_cl_kernel, place++, sizeof(p), &p), "Failed to set a kernel argument" );
926 }
927
928 template <size_t NUM_ARGS, typename T, typename F>
process_one_arg(const kernel_type & kernel,std::array<cl_event,NUM_ARGS> & events,int & num_events,int & place,const opencl_async_msg<T,F> & msg)929 void process_one_arg( const kernel_type& kernel, std::array<cl_event, NUM_ARGS>& events, int& num_events, int& place, const opencl_async_msg<T, F>& msg ) {
930 __TBB_ASSERT((static_cast<typename std::array<cl_event, NUM_ARGS>::size_type>(num_events) < events.size()), NULL);
931
932 const cl_event * const e = msg.get_event();
933 if (e != NULL) {
934 events[num_events++] = *e;
935 }
936
937 process_one_arg( kernel, events, num_events, place, msg.data(false) );
938 }
939
940 template <size_t NUM_ARGS, typename T, typename ...Rest>
process_arg_list(const kernel_type & kernel,std::array<cl_event,NUM_ARGS> & events,int & num_events,int & place,const T & t,const Rest &...args)941 void process_arg_list( const kernel_type& kernel, std::array<cl_event, NUM_ARGS>& events, int& num_events, int& place, const T& t, const Rest&... args ) {
942 process_one_arg( kernel, events, num_events, place, t );
943 process_arg_list( kernel, events, num_events, place, args... );
944 }
945
946 template <size_t NUM_ARGS>
process_arg_list(const kernel_type &,std::array<cl_event,NUM_ARGS> &,int &,int &)947 void process_arg_list( const kernel_type&, std::array<cl_event, NUM_ARGS>&, int&, int& ) {}
948 // ------------------------------------------- //
949 template <typename T>
update_one_arg(cl_event,T &)950 void update_one_arg( cl_event, T& ) {}
951
952 template <typename T, typename F>
update_one_arg(cl_event e,opencl_async_msg<T,F> & msg)953 void update_one_arg( cl_event e, opencl_async_msg<T, F>& msg ) {
954 msg.set_event( e );
955 }
956
957 template <typename T, typename ...Rest>
update_arg_list(cl_event e,T & t,Rest &...args)958 void update_arg_list( cl_event e, T& t, Rest&... args ) {
959 update_one_arg( e, t );
960 update_arg_list( e, args... );
961 }
962
update_arg_list(cl_event)963 void update_arg_list( cl_event ) {}
964 // ------------------------------------------- //
965 public:
966 template <typename ...Args>
send_kernel(opencl_device device,const kernel_type & kernel,const range_type & work_size,Args &...args)967 void send_kernel( opencl_device device, const kernel_type& kernel, const range_type& work_size, Args&... args ) {
968 std::array<cl_event, sizeof...(Args)> events;
969 int num_events = 0;
970 int place = 0;
971 process_arg_list( kernel, events, num_events, place, args... );
972
973 const cl_event e = send_kernel_impl( device, kernel.my_cl_kernel, work_size, num_events, events.data() );
974
975 update_arg_list(e, args...);
976
977 // Release our own reference to cl_event
978 enforce_cl_retcode( clReleaseEvent(e), "Failed to release an event" );
979 }
980
981 // ------------------------------------------- //
982 template <typename T, typename ...Rest>
send_data(opencl_device device,T & t,Rest &...args)983 void send_data(opencl_device device, T& t, Rest&... args) {
984 send_if_memory_object( device, t );
985 send_data( device, args... );
986 }
987
send_data(opencl_device)988 void send_data(opencl_device) {}
989 // ------------------------------------------- //
990
991 private:
send_kernel_impl(opencl_device device,const cl_kernel & kernel,const range_type & work_size,cl_uint num_events,cl_event * event_list)992 cl_event send_kernel_impl( opencl_device device, const cl_kernel& kernel,
993 const range_type& work_size, cl_uint num_events, cl_event* event_list ) {
994 const typename range_type::nd_range_type g_offset = { { 0, 0, 0 } };
995 const typename range_type::nd_range_type& g_size = work_size.global_range();
996 const typename range_type::nd_range_type& l_size = work_size.local_range();
997 cl_uint s;
998 for ( s = 1; s < 3 && g_size[s] != size_t(-1); ++s) {}
999 cl_event event;
1000 enforce_cl_retcode(
1001 clEnqueueNDRangeKernel( device.my_cl_command_queue, kernel, s,
1002 g_offset.data(), g_size.data(), l_size[0] ? l_size.data() : NULL, num_events, num_events ? event_list : NULL, &event ),
1003 "Failed to enqueue a kernel" );
1004 return event;
1005 }
1006
1007 // ------------------------------------------- //
1008 template <typename T>
get_event_from_one_arg(cl_event &,const T &)1009 bool get_event_from_one_arg( cl_event&, const T& ) {
1010 return false;
1011 }
1012
1013 template <typename T, typename F>
get_event_from_one_arg(cl_event & e,const opencl_async_msg<T,F> & msg)1014 bool get_event_from_one_arg( cl_event& e, const opencl_async_msg<T, F>& msg) {
1015 cl_event const *e_ptr = msg.get_event();
1016
1017 if ( e_ptr != NULL ) {
1018 e = *e_ptr;
1019 return true;
1020 }
1021
1022 return false;
1023 }
1024
1025 template <typename T, typename ...Rest>
get_event_from_args(cl_event & e,const T & t,const Rest &...args)1026 bool get_event_from_args( cl_event& e, const T& t, const Rest&... args ) {
1027 if ( get_event_from_one_arg( e, t ) ) {
1028 return true;
1029 }
1030
1031 return get_event_from_args( e, args... );
1032 }
1033
get_event_from_args(cl_event &)1034 bool get_event_from_args( cl_event& ) {
1035 return false;
1036 }
1037 // ------------------------------------------- //
1038
1039 struct finalize_fn : tbb::internal::no_assign {
~finalize_fnfinalize_fn1040 virtual ~finalize_fn() {}
operatorfinalize_fn1041 virtual void operator() () {}
1042 };
1043
1044 template<typename Fn>
1045 struct finalize_fn_leaf : public finalize_fn {
1046 Fn my_fn;
finalize_fn_leaffinalize_fn_leaf1047 finalize_fn_leaf(Fn fn) : my_fn(fn) {}
operatorfinalize_fn_leaf1048 void operator() () __TBB_override { my_fn(); }
1049 };
1050
finalize_callback(cl_event,cl_int event_command_exec_status,void * data)1051 static void CL_CALLBACK finalize_callback(cl_event, cl_int event_command_exec_status, void *data) {
1052 tbb::internal::suppress_unused_warning(event_command_exec_status);
1053 __TBB_ASSERT(event_command_exec_status == CL_COMPLETE, NULL);
1054
1055 finalize_fn * const fn_ptr = static_cast<finalize_fn*>(data);
1056 __TBB_ASSERT(fn_ptr != NULL, "Invalid finalize function pointer");
1057 (*fn_ptr)();
1058
1059 // Function pointer was created by 'new' & this callback must be called once only
1060 delete fn_ptr;
1061 }
1062 public:
1063 template <typename FinalizeFn, typename ...Args>
finalize(opencl_device device,FinalizeFn fn,Args &...args)1064 void finalize( opencl_device device, FinalizeFn fn, Args&... args ) {
1065 cl_event e;
1066
1067 if ( get_event_from_args( e, args... ) ) {
1068 enforce_cl_retcode( clSetEventCallback( e, CL_COMPLETE, finalize_callback,
1069 new finalize_fn_leaf<FinalizeFn>(fn) ), "Failed to set a callback" );
1070 }
1071
1072 enforce_cl_retcode( clFlush( device.my_cl_command_queue ), "Failed to flush an OpenCL command queue" );
1073 }
1074
devices()1075 const opencl_device_list& devices() {
1076 std::call_once( my_once_flag, &opencl_factory::init_once, this );
1077 return my_devices;
1078 }
1079
1080 private:
is_same_context(opencl_device::device_id_type d1,opencl_device::device_id_type d2)1081 bool is_same_context( opencl_device::device_id_type d1, opencl_device::device_id_type d2 ) {
1082 __TBB_ASSERT( d1 != opencl_device::unknown && d2 != opencl_device::unknown, NULL );
1083 // Currently, factory supports only one context so if the both devices are not host it means the are in the same context.
1084 if ( d1 != opencl_device::host && d2 != opencl_device::host )
1085 return true;
1086 return d1 == d2;
1087 }
1088 private:
1089 opencl_factory( const opencl_factory& );
1090 opencl_factory& operator=(const opencl_factory&);
1091
context()1092 cl_context context() {
1093 std::call_once( my_once_flag, &opencl_factory::init_once, this );
1094 return my_cl_context;
1095 }
1096
init_once()1097 void init_once() {
1098 {
1099 tbb::spin_mutex::scoped_lock lock(my_devices_mutex);
1100 if (!my_devices.size())
1101 my_devices = DeviceFilter()( opencl_info::available_devices() );
1102 }
1103
1104 enforce_cl_retcode(my_devices.size() ? CL_SUCCESS : CL_INVALID_DEVICE, "No devices in the device list");
1105 cl_platform_id platform_id = my_devices.begin()->platform_id();
1106 for (opencl_device_list::iterator it = ++my_devices.begin(); it != my_devices.end(); ++it)
1107 enforce_cl_retcode(it->platform_id() == platform_id ? CL_SUCCESS : CL_INVALID_PLATFORM, "All devices should be in the same platform");
1108
1109 std::vector<cl_device_id> cl_device_ids;
1110 for (auto d = my_devices.begin(); d != my_devices.end(); ++d) {
1111 cl_device_ids.push_back((*d).my_cl_device_id);
1112 }
1113
1114 cl_context_properties context_properties[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id, (cl_context_properties)NULL };
1115 cl_int err;
1116 cl_context ctx = clCreateContext(context_properties,
1117 (cl_uint)cl_device_ids.size(),
1118 cl_device_ids.data(),
1119 NULL, NULL, &err);
1120 enforce_cl_retcode(err, "Failed to create context");
1121 my_cl_context = ctx;
1122
1123 size_t device_counter = 0;
1124 for (auto d = my_devices.begin(); d != my_devices.end(); d++) {
1125 (*d).my_device_id = device_counter++;
1126 cl_int err2;
1127 cl_command_queue cq;
1128 #if CL_VERSION_2_0
1129 if ((*d).major_version() >= 2) {
1130 if ((*d).out_of_order_exec_mode_on_host_present()) {
1131 cl_queue_properties props[] = { CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, 0 };
1132 cq = clCreateCommandQueueWithProperties(ctx, (*d).my_cl_device_id, props, &err2);
1133 } else {
1134 cl_queue_properties props[] = { 0 };
1135 cq = clCreateCommandQueueWithProperties(ctx, (*d).my_cl_device_id, props, &err2);
1136 }
1137 } else
1138 #endif
1139 {
1140 cl_command_queue_properties props = (*d).out_of_order_exec_mode_on_host_present() ? CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE : 0;
1141 // Suppress "declared deprecated" warning for the next line.
1142 #if __TBB_GCC_WARNING_SUPPRESSION_PRESENT
1143 #pragma GCC diagnostic push
1144 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
1145 #endif
1146 #if _MSC_VER || __INTEL_COMPILER
1147 #pragma warning( push )
1148 #if __INTEL_COMPILER
1149 #pragma warning (disable: 1478)
1150 #else
1151 #pragma warning (disable: 4996)
1152 #endif
1153 #endif
1154 cq = clCreateCommandQueue(ctx, (*d).my_cl_device_id, props, &err2);
1155 #if _MSC_VER || __INTEL_COMPILER
1156 #pragma warning( pop )
1157 #endif
1158 #if __TBB_GCC_WARNING_SUPPRESSION_PRESENT
1159 #pragma GCC diagnostic pop
1160 #endif
1161 }
1162 enforce_cl_retcode(err2, "Failed to create command queue");
1163 (*d).my_cl_command_queue = cq;
1164 }
1165 }
1166
1167 std::once_flag my_once_flag;
1168 opencl_device_list my_devices;
1169 cl_context my_cl_context;
1170
1171 tbb::spin_mutex my_devices_mutex;
1172
1173 template <typename Factory>
1174 friend class opencl_program;
1175 template <typename Factory>
1176 friend class opencl_buffer_impl;
1177 template <typename Factory>
1178 friend class opencl_memory;
1179 }; // class opencl_factory
1180
1181 // TODO: consider this namespace as public API
1182 namespace opencl_info {
1183
1184 // Default types
1185
1186 template <typename Factory>
1187 struct default_device_selector {
operatordefault_device_selector1188 opencl_device operator()(Factory& f) {
1189 __TBB_ASSERT(!f.devices().empty(), "No available devices");
1190 return *(f.devices().begin());
1191 }
1192 };
1193
1194 struct default_device_filter {
operatordefault_device_filter1195 opencl_device_list operator()(const opencl_device_list &devices) {
1196 opencl_device_list dl;
1197 cl_platform_id platform_id = devices.begin()->platform_id();
1198 for (opencl_device_list::const_iterator it = devices.cbegin(); it != devices.cend(); ++it) {
1199 if (it->platform_id() == platform_id) {
1200 dl.add(*it);
1201 }
1202 }
1203 return dl;
1204 }
1205 };
1206
1207 class default_opencl_factory : public opencl_factory < default_device_filter >, tbb::internal::no_copy {
1208 public:
1209 template<typename T> using async_msg_type = opencl_async_msg<T, default_opencl_factory>;
1210
1211 friend default_opencl_factory& default_factory();
1212
1213 private:
1214 default_opencl_factory() = default;
1215 };
1216
default_factory()1217 inline default_opencl_factory& default_factory() {
1218 static default_opencl_factory default_factory;
1219 return default_factory;
1220 }
1221
1222 } // namespace opencl_info
1223
1224 template <typename T, typename Factory>
opencl_buffer(size_t size)1225 opencl_buffer<T, Factory>::opencl_buffer( size_t size ) : my_impl( std::make_shared<impl_type>( size*sizeof(T), opencl_info::default_factory() ) ) {}
1226
1227
1228 enum class opencl_program_type {
1229 SOURCE,
1230 PRECOMPILED,
1231 SPIR
1232 };
1233
1234 template <typename Factory = opencl_info::default_opencl_factory>
1235 class __TBB_DEPRECATED_IN_VERBOSE_MODE opencl_program : tbb::internal::no_assign {
1236 public:
1237 typedef typename Factory::kernel_type kernel_type;
1238
opencl_program(Factory & factory,opencl_program_type type,const std::string & program_name)1239 opencl_program( Factory& factory, opencl_program_type type, const std::string& program_name ) : my_factory( factory ), my_type(type) , my_arg_str( program_name) {}
opencl_program(Factory & factory,const char * program_name)1240 opencl_program( Factory& factory, const char* program_name ) : opencl_program( factory, std::string( program_name ) ) {}
opencl_program(Factory & factory,const std::string & program_name)1241 opencl_program( Factory& factory, const std::string& program_name ) : opencl_program( factory, opencl_program_type::SOURCE, program_name ) {}
1242
opencl_program(opencl_program_type type,const std::string & program_name)1243 opencl_program( opencl_program_type type, const std::string& program_name ) : opencl_program( opencl_info::default_factory(), type, program_name ) {}
opencl_program(const char * program_name)1244 opencl_program( const char* program_name ) : opencl_program( opencl_info::default_factory(), program_name ) {}
opencl_program(const std::string & program_name)1245 opencl_program( const std::string& program_name ) : opencl_program( opencl_info::default_factory(), program_name ) {}
opencl_program(opencl_program_type type)1246 opencl_program( opencl_program_type type ) : opencl_program( opencl_info::default_factory(), type ) {}
1247
opencl_program(const opencl_program & src)1248 opencl_program( const opencl_program &src ) : my_factory( src.my_factory ), my_type( src.type ), my_arg_str( src.my_arg_str ), my_cl_program( src.my_cl_program ) {
1249 // Set my_do_once_flag to the called state.
1250 std::call_once( my_do_once_flag, [](){} );
1251 }
1252
get_kernel(const std::string & k)1253 kernel_type get_kernel( const std::string& k ) const {
1254 return kernel_type( get_cl_kernel(k), my_factory );
1255 }
1256
1257 private:
opencl_program(Factory & factory,cl_program program)1258 opencl_program( Factory& factory, cl_program program ) : my_factory( factory ), my_cl_program( program ) {
1259 // Set my_do_once_flag to the called state.
1260 std::call_once( my_do_once_flag, [](){} );
1261 }
1262
get_cl_kernel(const std::string & k)1263 cl_kernel get_cl_kernel( const std::string& k ) const {
1264 std::call_once( my_do_once_flag, [this, &k](){ this->init( k ); } );
1265 cl_int err;
1266 cl_kernel kernel = clCreateKernel( my_cl_program, k.c_str(), &err );
1267 enforce_cl_retcode( err, std::string( "Failed to create kernel: " ) + k );
1268 return kernel;
1269 }
1270
1271 class file_reader {
1272 public:
file_reader(const std::string & filepath)1273 file_reader( const std::string& filepath ) {
1274 std::ifstream file_descriptor( filepath, std::ifstream::binary );
1275 if ( !file_descriptor.is_open() ) {
1276 std::string str = std::string( "Could not open file: " ) + filepath;
1277 std::cerr << str << std::endl;
1278 throw str;
1279 }
1280 file_descriptor.seekg( 0, file_descriptor.end );
1281 size_t length = size_t( file_descriptor.tellg() );
1282 file_descriptor.seekg( 0, file_descriptor.beg );
1283 my_content.resize( length );
1284 char* begin = &*my_content.begin();
1285 file_descriptor.read( begin, length );
1286 file_descriptor.close();
1287 }
content()1288 const char* content() { return &*my_content.cbegin(); }
length()1289 size_t length() { return my_content.length(); }
1290 private:
1291 std::string my_content;
1292 };
1293
1294 class opencl_program_builder {
1295 public:
1296 typedef void (CL_CALLBACK *cl_callback_type)(cl_program, void*);
opencl_program_builder(Factory & f,const std::string & name,cl_program program,cl_uint num_devices,cl_device_id * device_list,const char * options,cl_callback_type callback,void * user_data)1297 opencl_program_builder( Factory& f, const std::string& name, cl_program program,
1298 cl_uint num_devices, cl_device_id* device_list,
1299 const char* options, cl_callback_type callback,
1300 void* user_data ) {
1301 cl_int err = clBuildProgram( program, num_devices, device_list, options,
1302 callback, user_data );
1303 if( err == CL_SUCCESS )
1304 return;
1305 std::string str = std::string( "Failed to build program: " ) + name;
1306 if ( err == CL_BUILD_PROGRAM_FAILURE ) {
1307 const opencl_device_list &devices = f.devices();
1308 for ( auto d = devices.begin(); d != devices.end(); ++d ) {
1309 std::cerr << "Build log for device: " << (*d).name() << std::endl;
1310 size_t log_size;
1311 cl_int query_err = clGetProgramBuildInfo(
1312 program, (*d).my_cl_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL,
1313 &log_size );
1314 enforce_cl_retcode( query_err, "Failed to get build log size" );
1315 if( log_size ) {
1316 std::vector<char> output;
1317 output.resize( log_size );
1318 query_err = clGetProgramBuildInfo(
1319 program, (*d).my_cl_device_id, CL_PROGRAM_BUILD_LOG,
1320 output.size(), output.data(), NULL );
1321 enforce_cl_retcode( query_err, "Failed to get build output" );
1322 std::cerr << output.data() << std::endl;
1323 } else {
1324 std::cerr << "No build log available" << std::endl;
1325 }
1326 }
1327 }
1328 enforce_cl_retcode( err, str );
1329 }
1330 };
1331
1332 class opencl_device_filter {
1333 public:
1334 template<typename Filter>
opencl_device_filter(cl_uint & num_devices,cl_device_id * device_list,Filter filter,const char * message)1335 opencl_device_filter( cl_uint& num_devices, cl_device_id* device_list,
1336 Filter filter, const char* message ) {
1337 for ( cl_uint i = 0; i < num_devices; ++i )
1338 if ( filter(device_list[i]) ) {
1339 device_list[i--] = device_list[--num_devices];
1340 }
1341 if ( !num_devices )
1342 enforce_cl_retcode( CL_DEVICE_NOT_AVAILABLE, message );
1343 }
1344 };
1345
init(const std::string &)1346 void init( const std::string& ) const {
1347 cl_uint num_devices;
1348 enforce_cl_retcode( clGetContextInfo( my_factory.context(), CL_CONTEXT_NUM_DEVICES, sizeof( num_devices ), &num_devices, NULL ),
1349 "Failed to get OpenCL context info" );
1350 if ( !num_devices )
1351 enforce_cl_retcode( CL_DEVICE_NOT_FOUND, "No supported devices found" );
1352 cl_device_id *device_list = (cl_device_id *)alloca( num_devices*sizeof( cl_device_id ) );
1353 enforce_cl_retcode( clGetContextInfo( my_factory.context(), CL_CONTEXT_DEVICES, num_devices*sizeof( cl_device_id ), device_list, NULL ),
1354 "Failed to get OpenCL context info" );
1355 const char *options = NULL;
1356 switch ( my_type ) {
1357 case opencl_program_type::SOURCE: {
1358 file_reader fr( my_arg_str );
1359 const char *s[] = { fr.content() };
1360 const size_t l[] = { fr.length() };
1361 cl_int err;
1362 my_cl_program = clCreateProgramWithSource( my_factory.context(), 1, s, l, &err );
1363 enforce_cl_retcode( err, std::string( "Failed to create program: " ) + my_arg_str );
1364 opencl_device_filter(
1365 num_devices, device_list,
1366 []( const opencl_device& d ) -> bool {
1367 return !d.compiler_available() || !d.linker_available();
1368 }, "No one device supports building program from sources" );
1369 opencl_program_builder(
1370 my_factory, my_arg_str, my_cl_program, num_devices, device_list,
1371 options, /*callback*/ NULL, /*user data*/NULL );
1372 break;
1373 }
1374 case opencl_program_type::SPIR:
1375 options = "-x spir";
1376 case opencl_program_type::PRECOMPILED: {
1377 file_reader fr( my_arg_str );
1378 std::vector<const unsigned char*> s(
1379 num_devices, reinterpret_cast<const unsigned char*>(fr.content()) );
1380 std::vector<size_t> l( num_devices, fr.length() );
1381 std::vector<cl_int> bin_statuses( num_devices, -1 );
1382 cl_int err;
1383 my_cl_program = clCreateProgramWithBinary( my_factory.context(), num_devices,
1384 device_list, l.data(), s.data(),
1385 bin_statuses.data(), &err );
1386 if( err != CL_SUCCESS ) {
1387 std::string statuses_str;
1388 for (auto st = bin_statuses.begin(); st != bin_statuses.end(); ++st) {
1389 statuses_str += std::to_string((*st));
1390 }
1391
1392 enforce_cl_retcode( err, std::string( "Failed to create program, error " + std::to_string( err ) + " : " ) + my_arg_str +
1393 std::string( ", binary_statuses = " ) + statuses_str );
1394 }
1395 opencl_program_builder(
1396 my_factory, my_arg_str, my_cl_program, num_devices, device_list,
1397 options, /*callback*/ NULL, /*user data*/NULL );
1398 break;
1399 }
1400 default:
1401 __TBB_ASSERT( false, "Unsupported program type" );
1402 }
1403 }
1404
1405 Factory& my_factory;
1406 opencl_program_type my_type;
1407 std::string my_arg_str;
1408 mutable cl_program my_cl_program;
1409 mutable std::once_flag my_do_once_flag;
1410
1411 template <typename DeviceFilter>
1412 friend class opencl_factory;
1413
1414 friend class Factory::kernel;
1415 };
1416
1417 template<typename... Args>
1418 class __TBB_DEPRECATED_IN_VERBOSE_MODE opencl_node;
1419
1420 template<typename JP, typename Factory, typename... Ports>
1421 class __TBB_DEPRECATED_IN_VERBOSE_MODE
1422 opencl_node< tuple<Ports...>, JP, Factory > : public streaming_node< tuple<Ports...>, JP, Factory > {
1423 typedef streaming_node < tuple<Ports...>, JP, Factory > base_type;
1424 public:
1425 typedef typename base_type::kernel_type kernel_type;
1426
opencl_node(graph & g,const kernel_type & kernel)1427 opencl_node( graph &g, const kernel_type& kernel )
1428 : base_type( g, kernel, opencl_info::default_device_selector< opencl_info::default_opencl_factory >(), opencl_info::default_factory() )
1429 {
1430 tbb::internal::fgt_multiinput_multioutput_node( CODEPTR(), tbb::internal::FLOW_OPENCL_NODE, this, &this->my_graph );
1431 }
1432
opencl_node(graph & g,const kernel_type & kernel,Factory & f)1433 opencl_node( graph &g, const kernel_type& kernel, Factory &f )
1434 : base_type( g, kernel, opencl_info::default_device_selector <Factory >(), f )
1435 {
1436 tbb::internal::fgt_multiinput_multioutput_node( CODEPTR(), tbb::internal::FLOW_OPENCL_NODE, this, &this->my_graph );
1437 }
1438
1439 template <typename DeviceSelector>
opencl_node(graph & g,const kernel_type & kernel,DeviceSelector d,Factory & f)1440 opencl_node( graph &g, const kernel_type& kernel, DeviceSelector d, Factory &f)
1441 : base_type( g, kernel, d, f)
1442 {
1443 tbb::internal::fgt_multiinput_multioutput_node( CODEPTR(), tbb::internal::FLOW_OPENCL_NODE, this, &this->my_graph );
1444 }
1445 };
1446
1447 template<typename JP, typename... Ports>
1448 class __TBB_DEPRECATED_IN_VERBOSE_MODE
1449 opencl_node< tuple<Ports...>, JP > : public opencl_node < tuple<Ports...>, JP, opencl_info::default_opencl_factory > {
1450 typedef opencl_node < tuple<Ports...>, JP, opencl_info::default_opencl_factory > base_type;
1451 public:
1452 typedef typename base_type::kernel_type kernel_type;
1453
opencl_node(graph & g,const kernel_type & kernel)1454 opencl_node( graph &g, const kernel_type& kernel )
1455 : base_type( g, kernel, opencl_info::default_device_selector< opencl_info::default_opencl_factory >(), opencl_info::default_factory() )
1456 {}
1457
1458 template <typename DeviceSelector>
opencl_node(graph & g,const kernel_type & kernel,DeviceSelector d)1459 opencl_node( graph &g, const kernel_type& kernel, DeviceSelector d )
1460 : base_type( g, kernel, d, opencl_info::default_factory() )
1461 {}
1462 };
1463
1464 template<typename... Ports>
1465 class __TBB_DEPRECATED_IN_VERBOSE_MODE
1466 opencl_node< tuple<Ports...> > : public opencl_node < tuple<Ports...>, queueing, opencl_info::default_opencl_factory > {
1467 typedef opencl_node < tuple<Ports...>, queueing, opencl_info::default_opencl_factory > base_type;
1468 public:
1469 typedef typename base_type::kernel_type kernel_type;
1470
opencl_node(graph & g,const kernel_type & kernel)1471 opencl_node( graph &g, const kernel_type& kernel )
1472 : base_type( g, kernel, opencl_info::default_device_selector< opencl_info::default_opencl_factory >(), opencl_info::default_factory() )
1473 {}
1474
1475 template <typename DeviceSelector>
opencl_node(graph & g,const kernel_type & kernel,DeviceSelector d)1476 opencl_node( graph &g, const kernel_type& kernel, DeviceSelector d )
1477 : base_type( g, kernel, d, opencl_info::default_factory() )
1478 {}
1479 };
1480
1481 } // namespace interfaceX
1482
1483 using interface11::opencl_node;
1484 using interface11::read_only;
1485 using interface11::read_write;
1486 using interface11::write_only;
1487 using interface11::opencl_buffer;
1488 using interface11::opencl_subbuffer;
1489 using interface11::opencl_device;
1490 using interface11::opencl_device_list;
1491 using interface11::opencl_program;
1492 using interface11::opencl_program_type;
1493 using interface11::opencl_async_msg;
1494 using interface11::opencl_factory;
1495 using interface11::opencl_range;
1496
1497 } // namespace flow
1498 } // namespace tbb
1499 #endif /* __TBB_PREVIEW_OPENCL_NODE */
1500
1501 #include "internal/_warning_suppress_disable_notice.h"
1502 #undef __TBB_flow_graph_opencl_node_H_include_area
1503
1504 #endif // __TBB_flow_graph_opencl_node_H
1505