1 
2 // =================================================================================================
3 // This file is part of the CLTune project, which loosely follows the Google C++ styleguide and uses
4 // a tab-size of two spaces and a max-width of 100 characters per line.
5 //
6 // Author(s):
7 //   Cedric Nugteren <www.cedricnugteren.nl>
8 //
9 // This file implements a bunch of C++11 classes that act as wrappers around OpenCL objects and API
10 // calls. The main benefits are increased abstraction, automatic memory management, and portability.
11 // Portability here means that a similar header exists for CUDA with the same classes and
12 // interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change.
13 //
14 // This file is taken from the CLCudaAPI project <https://github.com/CNugteren/CLCudaAPI> and
15 // therefore contains the following header copyright notice:
16 //
17 // =================================================================================================
18 //
19 // Copyright 2015 SURFsara
20 //
21 // Licensed under the Apache License, Version 2.0 (the "License");
22 // you may not use this file except in compliance with the License.
23 // You may obtain a copy of the License at
24 //
25 //  http://www.apache.org/licenses/LICENSE-2.0
26 //
27 // Unless required by applicable law or agreed to in writing, software
28 // distributed under the License is distributed on an "AS IS" BASIS,
29 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
30 // See the License for the specific language governing permissions and
31 // limitations under the License.
32 //
33 // =================================================================================================
34 
35 #ifndef CLTUNE_CLPP11_H_
36 #define CLTUNE_CLPP11_H_
37 
38 // C++
39 #include <algorithm> // std::copy
40 #include <string>    // std::string
41 #include <vector>    // std::vector
42 #include <memory>    // std::shared_ptr
43 #include <stdexcept> // std::runtime_error
44 #include <numeric>   // std::accumulate
45 
46 // OpenCL
47 #if defined(__APPLE__) || defined(__MACOSX)
48   #include <OpenCL/opencl.h>
49 #else
50   #include <CL/opencl.h>
51 #endif
52 
53 namespace cltune {
54 // =================================================================================================
55 
56 // Error occurred in the C++11 OpenCL header (this file)
Error(const std::string & message)57 inline void Error(const std::string &message) {
58   throw std::runtime_error("Internal OpenCL error: "+message);
59 }
60 
61 // Error occurred in OpenCL
CheckError(const cl_int status)62 inline void CheckError(const cl_int status) {
63   if (status != CL_SUCCESS) {
64     throw std::runtime_error("Internal OpenCL error: "+std::to_string(status));
65   }
66 }
67 
68 // =================================================================================================
69 
70 // C++11 version of 'cl_event'
71 class Event {
72  public:
73 
74   // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere
Event(const cl_event event)75   explicit Event(const cl_event event):
76       event_(new cl_event) {
77     *event_ = event;
78   }
79 
80   // Regular constructor with memory management
Event()81   explicit Event():
82       event_(new cl_event, [](cl_event* e) {
83         if (*e) { CheckError(clReleaseEvent(*e)); }
84         delete e;
85       }) {
86     *event_ = nullptr;
87   }
88 
89   // Waits for completion of this event
WaitForCompletion()90   void WaitForCompletion() const {
91     CheckError(clWaitForEvents(1, &(*event_)));
92   }
93 
94   // Retrieves the elapsed time of the last recorded event. Note that no error checking is done on
95   // the 'clGetEventProfilingInfo' function, since there is a bug in Apple's OpenCL implementation:
96   // http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx
GetElapsedTime()97   float GetElapsedTime() const {
98     WaitForCompletion();
99     const auto bytes = sizeof(cl_ulong);
100     auto time_start = cl_ulong{0};
101     clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr);
102     auto time_end = cl_ulong{0};
103     clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr);
104     return static_cast<float>(time_end - time_start) * 1.0e-6f;
105   }
106 
107   // Accessor to the private data-member
operator()108   cl_event& operator()() { return *event_; }
operator()109   const cl_event& operator()() const { return *event_; }
pointer()110   cl_event* pointer() { return &(*event_); }
pointer()111   const cl_event* pointer() const { return &(*event_); }
112  private:
113   std::shared_ptr<cl_event> event_;
114 };
115 
116 // Pointer to an OpenCL event
117 using EventPointer = cl_event*;
118 
119 // =================================================================================================
120 
121 // C++11 version of 'cl_platform_id'
122 class Platform {
123  public:
124 
125   // Constructor based on the regular OpenCL data-type
Platform(const cl_platform_id platform)126   explicit Platform(const cl_platform_id platform): platform_(platform) { }
127 
128   // Initializes the platform
Platform(const size_t platform_id)129   explicit Platform(const size_t platform_id) {
130     auto num_platforms = cl_uint{0};
131     CheckError(clGetPlatformIDs(0, nullptr, &num_platforms));
132     if (num_platforms == 0) { Error("no platforms found"); }
133     auto platforms = std::vector<cl_platform_id>(num_platforms);
134     CheckError(clGetPlatformIDs(num_platforms, platforms.data(), nullptr));
135     if (platform_id >= num_platforms) { Error("invalid platform ID "+std::to_string(platform_id)); }
136     platform_ = platforms[platform_id];
137   }
138 
139   // Returns the number of devices on this platform
NumDevices()140   size_t NumDevices() const {
141     auto result = cl_uint{0};
142     CheckError(clGetDeviceIDs(platform_, CL_DEVICE_TYPE_ALL, 0, nullptr, &result));
143     return static_cast<size_t>(result);
144   }
145 
146   // Accessor to the private data-member
operator()147   const cl_platform_id& operator()() const { return platform_; }
148  private:
149   cl_platform_id platform_;
150 };
151 
152 // Retrieves a vector with all platforms
GetAllPlatforms()153 inline std::vector<Platform> GetAllPlatforms() {
154   auto num_platforms = cl_uint{0};
155   CheckError(clGetPlatformIDs(0, nullptr, &num_platforms));
156   auto all_platforms = std::vector<Platform>();
157   for (size_t platform_id = 0; platform_id < static_cast<size_t>(num_platforms); ++platform_id) {
158     all_platforms.push_back(Platform(platform_id));
159   }
160   return all_platforms;
161 }
162 
163 // =================================================================================================
164 
165 // C++11 version of 'cl_device_id'
166 class Device {
167  public:
168 
169   // Constructor based on the regular OpenCL data-type
Device(const cl_device_id device)170   explicit Device(const cl_device_id device): device_(device) { }
171 
172   // Initialize the device. Note that this constructor can throw exceptions!
Device(const Platform & platform,const size_t device_id)173   explicit Device(const Platform &platform, const size_t device_id) {
174     auto num_devices = platform.NumDevices();
175     if (num_devices == 0) { Error("no devices found"); }
176     auto devices = std::vector<cl_device_id>(num_devices);
177     CheckError(clGetDeviceIDs(platform(), CL_DEVICE_TYPE_ALL, static_cast<cl_uint>(num_devices),
178                               devices.data(), nullptr));
179     if (device_id >= num_devices) { Error("invalid device ID "+std::to_string(device_id)); }
180     device_ = devices[device_id];
181   }
182 
183   // Methods to retrieve device information
Version()184   std::string Version() const { return GetInfoString(CL_DEVICE_VERSION); }
VersionNumber()185   size_t VersionNumber() const
186   {
187     std::string version_string = Version().substr(7);
188     // Space separates the end of the OpenCL version number from the beginning of the
189     // vendor-specific information.
190     size_t next_whitespace = version_string.find(' ');
191     size_t version = (size_t) (100.0 * std::stod(version_string.substr(0, next_whitespace)));
192     return version;
193   }
Vendor()194   std::string Vendor() const { return GetInfoString(CL_DEVICE_VENDOR); }
Name()195   std::string Name() const { return GetInfoString(CL_DEVICE_NAME); }
Type()196   std::string Type() const {
197     auto type = GetInfo<cl_device_type>(CL_DEVICE_TYPE);
198     switch(type) {
199       case CL_DEVICE_TYPE_CPU: return "CPU";
200       case CL_DEVICE_TYPE_GPU: return "GPU";
201       case CL_DEVICE_TYPE_ACCELERATOR: return "accelerator";
202       default: return "default";
203     }
204   }
MaxWorkGroupSize()205   size_t MaxWorkGroupSize() const { return GetInfo<size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE); }
MaxWorkItemDimensions()206   size_t MaxWorkItemDimensions() const {
207     return static_cast<size_t>(GetInfo<cl_uint>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS));
208   }
MaxWorkItemSizes()209   std::vector<size_t> MaxWorkItemSizes() const {
210     return GetInfoVector<size_t>(CL_DEVICE_MAX_WORK_ITEM_SIZES);
211   }
LocalMemSize()212   unsigned long LocalMemSize() const {
213     return static_cast<unsigned long>(GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE));
214   }
Capabilities()215   std::string Capabilities() const { return GetInfoString(CL_DEVICE_EXTENSIONS); }
CoreClock()216   size_t CoreClock() const {
217     return static_cast<size_t>(GetInfo<cl_uint>(CL_DEVICE_MAX_CLOCK_FREQUENCY));
218   }
ComputeUnits()219   size_t ComputeUnits() const {
220     return static_cast<size_t>(GetInfo<cl_uint>(CL_DEVICE_MAX_COMPUTE_UNITS));
221   }
MemorySize()222   unsigned long MemorySize() const {
223     return static_cast<unsigned long>(GetInfo<cl_ulong>(CL_DEVICE_GLOBAL_MEM_SIZE));
224   }
MaxAllocSize()225   unsigned long MaxAllocSize() const {
226     return static_cast<unsigned long>(GetInfo<cl_ulong>(CL_DEVICE_MAX_MEM_ALLOC_SIZE));
227   }
MemoryClock()228   size_t MemoryClock() const { return 0; } // Not exposed in OpenCL
MemoryBusWidth()229   size_t MemoryBusWidth() const { return 0; } // Not exposed in OpenCL
230 
231   // Configuration-validity checks
IsLocalMemoryValid(const cl_ulong local_mem_usage)232   bool IsLocalMemoryValid(const cl_ulong local_mem_usage) const {
233     return (local_mem_usage <= LocalMemSize());
234   }
IsThreadConfigValid(const std::vector<size_t> & local)235   bool IsThreadConfigValid(const std::vector<size_t> &local) const {
236     auto local_size = size_t{1};
237     for (const auto &item: local) { local_size *= item; }
238     for (auto i=size_t{0}; i<local.size(); ++i) {
239       if (local[i] > MaxWorkItemSizes()[i]) { return false; }
240     }
241     if (local_size > MaxWorkGroupSize()) { return false; }
242     if (local.size() > MaxWorkItemDimensions()) { return false; }
243     return true;
244   }
245 
246   // Query for a specific type of device or brand
IsCPU()247   bool IsCPU() const { return Type() == "CPU"; }
IsGPU()248   bool IsGPU() const { return Type() == "GPU"; }
IsAMD()249   bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc." ||
250                               Vendor() == "AuthenticAMD";; }
IsNVIDIA()251   bool IsNVIDIA() const { return Vendor() == "NVIDIA" || Vendor() == "NVIDIA Corporation"; }
IsIntel()252   bool IsIntel() const { return Vendor() == "INTEL" || Vendor() == "Intel" ||
253                                 Vendor() == "GenuineIntel"; }
IsARM()254   bool IsARM() const { return Vendor() == "ARM"; }
255 
256   // Accessor to the private data-member
operator()257   const cl_device_id& operator()() const { return device_; }
258  private:
259   cl_device_id device_;
260 
261   // Private helper functions
262   template <typename T>
GetInfo(const cl_device_info info)263   T GetInfo(const cl_device_info info) const {
264     auto bytes = size_t{0};
265     CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes));
266     auto result = T(0);
267     CheckError(clGetDeviceInfo(device_, info, bytes, &result, nullptr));
268     return result;
269   }
270   template <typename T>
GetInfoVector(const cl_device_info info)271   std::vector<T> GetInfoVector(const cl_device_info info) const {
272     auto bytes = size_t{0};
273     CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes));
274     auto result = std::vector<T>(bytes/sizeof(T));
275     CheckError(clGetDeviceInfo(device_, info, bytes, result.data(), nullptr));
276     return result;
277   }
GetInfoString(const cl_device_info info)278   std::string GetInfoString(const cl_device_info info) const {
279     auto bytes = size_t{0};
280     CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes));
281     auto result = std::string{};
282     result.resize(bytes);
283     CheckError(clGetDeviceInfo(device_, info, bytes, &result[0], nullptr));
284     return std::string{result.c_str()}; // Removes any trailing '\0'-characters
285   }
286 };
287 
288 // =================================================================================================
289 
290 // C++11 version of 'cl_context'
291 class Context {
292  public:
293 
294   // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere
Context(const cl_context context)295   explicit Context(const cl_context context):
296       context_(new cl_context) {
297     *context_ = context;
298   }
299 
300   // Regular constructor with memory management
Context(const Device & device)301   explicit Context(const Device &device):
302       context_(new cl_context, [](cl_context* c) { CheckError(clReleaseContext(*c)); delete c; }) {
303     auto status = CL_SUCCESS;
304     const cl_device_id dev = device();
305     *context_ = clCreateContext(nullptr, 1, &dev, nullptr, nullptr, &status);
306     CheckError(status);
307   }
308 
309   // Accessor to the private data-member
operator()310   const cl_context& operator()() const { return *context_; }
pointer()311   cl_context* pointer() const { return &(*context_); }
312  private:
313   std::shared_ptr<cl_context> context_;
314 };
315 
316 // Pointer to an OpenCL context
317 using ContextPointer = cl_context*;
318 
319 // =================================================================================================
320 
321 // Enumeration of build statuses of the run-time compilation process
322 enum class BuildStatus { kSuccess, kError, kInvalid };
323 
324 // C++11 version of 'cl_program'. Additionally holds the program's source code.
325 class Program {
326  public:
327   // Note that there is no constructor based on the regular OpenCL data-type because of extra state
328 
329   // Source-based constructor with memory management
Program(const Context & context,std::string source)330   explicit Program(const Context &context, std::string source):
331       program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }),
332       length_(source.length()),
333       source_(std::move(source)),
334       source_ptr_(&source_[0]) {
335     auto status = CL_SUCCESS;
336     *program_ = clCreateProgramWithSource(context(), 1, &source_ptr_, &length_, &status);
337     CheckError(status);
338   }
339 
340   // Binary-based constructor with memory management
Program(const Device & device,const Context & context,const std::string & binary)341   explicit Program(const Device &device, const Context &context, const std::string& binary):
342       program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }),
343       length_(binary.length()),
344       source_(binary),
345       source_ptr_(&source_[0]) {
346     auto status1 = CL_SUCCESS;
347     auto status2 = CL_SUCCESS;
348     const cl_device_id dev = device();
349     *program_ = clCreateProgramWithBinary(context(), 1, &dev, &length_,
350                                           reinterpret_cast<const unsigned char**>(&source_ptr_),
351                                           &status1, &status2);
352     CheckError(status1);
353     CheckError(status2);
354   }
355 
356   // Compiles the device program and returns whether or not there where any warnings/errors
Build(const Device & device,std::vector<std::string> & options)357   BuildStatus Build(const Device &device, std::vector<std::string> &options) {
358     auto options_string = std::accumulate(options.begin(), options.end(), std::string{" "});
359     const cl_device_id dev = device();
360     auto status = clBuildProgram(*program_, 1, &dev, options_string.c_str(), nullptr, nullptr);
361     if (status == CL_BUILD_PROGRAM_FAILURE) {
362       return BuildStatus::kError;
363     }
364     else if (status == CL_INVALID_BINARY) {
365       return BuildStatus::kInvalid;
366     }
367     else {
368       CheckError(status);
369       return BuildStatus::kSuccess;
370     }
371   }
372 
373   // Retrieves the warning/error message from the compiler (if any)
GetBuildInfo(const Device & device)374   std::string GetBuildInfo(const Device &device) const {
375     auto bytes = size_t{0};
376     auto query = cl_program_build_info{CL_PROGRAM_BUILD_LOG};
377     CheckError(clGetProgramBuildInfo(*program_, device(), query, 0, nullptr, &bytes));
378     auto result = std::string{};
379     result.resize(bytes);
380     CheckError(clGetProgramBuildInfo(*program_, device(), query, bytes, &result[0], nullptr));
381     return result;
382   }
383 
384   // Retrieves a binary or an intermediate representation of the compiled program
GetIR()385   std::string GetIR() const {
386     auto bytes = size_t{0};
387     CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bytes, nullptr));
388     auto result = std::string{};
389     result.resize(bytes);
390     auto result_ptr = result.data();
391     CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARIES, sizeof(char*), &result_ptr, nullptr));
392     return result;
393   }
394 
395   // Accessor to the private data-member
operator()396   const cl_program& operator()() const { return *program_; }
397  private:
398   std::shared_ptr<cl_program> program_;
399   size_t length_;
400   std::string source_; // Note: the source can also be a binary or IR
401   const char* source_ptr_;
402 };
403 
404 // =================================================================================================
405 
406 // C++11 version of 'cl_command_queue'
407 class Queue {
408  public:
409 
410   // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere
Queue(const cl_command_queue queue)411   explicit Queue(const cl_command_queue queue):
412       queue_(new cl_command_queue) {
413     *queue_ = queue;
414   }
415 
416   // Regular constructor with memory management
Queue(const Context & context,const Device & device)417   explicit Queue(const Context &context, const Device &device):
418       queue_(new cl_command_queue, [](cl_command_queue* s) { CheckError(clReleaseCommandQueue(*s));
419                                                              delete s; }) {
420     auto status = CL_SUCCESS;
421     #ifdef CL_VERSION_2_0
422       size_t ocl_version = device.VersionNumber();
423       if (ocl_version >= 200)
424       {
425         cl_queue_properties properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
426         *queue_ = clCreateCommandQueueWithProperties(context(), device(), properties, &status);
427       }
428       else
429       {
430         *queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
431       }
432     #else
433       *queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
434     #endif
435     CheckError(status);
436   }
437 
438   // Synchronizes the queue
Finish(Event &)439   void Finish(Event &) const {
440     Finish();
441   }
Finish()442   void Finish() const {
443     CheckError(clFinish(*queue_));
444   }
445 
446   // Retrieves the corresponding context or device
GetContext()447   Context GetContext() const {
448     auto bytes = size_t{0};
449     CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_CONTEXT, 0, nullptr, &bytes));
450     cl_context result;
451     CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_CONTEXT, bytes, &result, nullptr));
452     return Context(result);
453   }
GetDevice()454   Device GetDevice() const {
455     auto bytes = size_t{0};
456     CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_DEVICE, 0, nullptr, &bytes));
457     cl_device_id result;
458     CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_DEVICE, bytes, &result, nullptr));
459     return Device(result);
460   }
461 
462   // Accessor to the private data-member
operator()463   const cl_command_queue& operator()() const { return *queue_; }
464  private:
465   std::shared_ptr<cl_command_queue> queue_;
466 };
467 
468 // =================================================================================================
469 
470 // C++11 version of host memory
471 template <typename T>
472 class BufferHost {
473  public:
474 
475   // Regular constructor with memory management
BufferHost(const Context &,const size_t size)476   explicit BufferHost(const Context &, const size_t size):
477       buffer_(new std::vector<T>(size)) {
478   }
479 
480   // Retrieves the actual allocated size in bytes
GetSize()481   size_t GetSize() const {
482     return buffer_->size()*sizeof(T);
483   }
484 
485   // Compatibility with std::vector
size()486   size_t size() const { return buffer_->size(); }
begin()487   T* begin() { return &(*buffer_)[0]; }
end()488   T* end() { return &(*buffer_)[buffer_->size()-1]; }
489   T& operator[](const size_t i) { return (*buffer_)[i]; }
data()490   T* data() { return buffer_->data(); }
data()491   const T* data() const { return buffer_->data(); }
492 
493  private:
494   std::shared_ptr<std::vector<T>> buffer_;
495 };
496 
497 // =================================================================================================
498 
499 // Enumeration of buffer access types
500 enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite, kNotOwned };
501 
502 // C++11 version of 'cl_mem'
503 template <typename T>
504 class Buffer {
505  public:
506 
507   // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere
Buffer(const cl_mem buffer)508   explicit Buffer(const cl_mem buffer):
509       buffer_(new cl_mem),
510       access_(BufferAccess::kNotOwned) {
511     *buffer_ = buffer;
512   }
513 
514   // Regular constructor with memory management. If this class does not own the buffer object, then
515   // the memory will not be freed automatically afterwards.
Buffer(const Context & context,const BufferAccess access,const size_t size)516   explicit Buffer(const Context &context, const BufferAccess access, const size_t size):
517       buffer_(new cl_mem, [access](cl_mem* m) {
518         if (access != BufferAccess::kNotOwned) { CheckError(clReleaseMemObject(*m)); }
519         delete m;
520       }),
521       access_(access) {
522     auto flags = cl_mem_flags{CL_MEM_READ_WRITE};
523     if (access_ == BufferAccess::kReadOnly) { flags = CL_MEM_READ_ONLY; }
524     if (access_ == BufferAccess::kWriteOnly) { flags = CL_MEM_WRITE_ONLY; }
525     auto status = CL_SUCCESS;
526     *buffer_ = clCreateBuffer(context(), flags, size*sizeof(T), nullptr, &status);
527     CheckError(status);
528   }
529 
530   // As above, but now with read/write access as a default
Buffer(const Context & context,const size_t size)531   explicit Buffer(const Context &context, const size_t size):
532     Buffer<T>(context, BufferAccess::kReadWrite, size) {
533   }
534 
535   // Constructs a new buffer based on an existing host-container
536   template <typename Iterator>
Buffer(const Context & context,const Queue & queue,Iterator start,Iterator end)537   explicit Buffer(const Context &context, const Queue &queue, Iterator start, Iterator end):
538     Buffer(context, BufferAccess::kReadWrite, static_cast<size_t>(end - start)) {
539     auto size = static_cast<size_t>(end - start);
540     auto pointer = &*start;
541     CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, 0, size*sizeof(T), pointer, 0,
542                                     nullptr, nullptr));
543     queue.Finish();
544   }
545 
546   // Copies from device to host: reading the device buffer a-synchronously
547   void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
548     if (access_ == BufferAccess::kWriteOnly) { Error("reading from a write-only buffer"); }
549     CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
550                                    host, 0, nullptr, nullptr));
551   }
552   void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host,
553                  const size_t offset = 0) const {
554     if (host.size() < size) { Error("target host buffer is too small"); }
555     ReadAsync(queue, size, host.data(), offset);
556   }
557   void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host,
558                  const size_t offset = 0) const {
559     if (host.size() < size) { Error("target host buffer is too small"); }
560     ReadAsync(queue, size, host.data(), offset);
561   }
562 
563   // Copies from device to host: reading the device buffer
564   void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
565     ReadAsync(queue, size, host, offset);
566     queue.Finish();
567   }
568   void Read(const Queue &queue, const size_t size, std::vector<T> &host,
569             const size_t offset = 0) const {
570     Read(queue, size, host.data(), offset);
571   }
572   void Read(const Queue &queue, const size_t size, BufferHost<T> &host,
573             const size_t offset = 0) const {
574     Read(queue, size, host.data(), offset);
575   }
576 
577   // Copies from host to device: writing the device buffer a-synchronously
578   void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
579     if (access_ == BufferAccess::kReadOnly) { Error("writing to a read-only buffer"); }
580     if (GetSize() < (offset+size)*sizeof(T)) { Error("target device buffer is too small"); }
581     CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
582                                     host, 0, nullptr, nullptr));
583   }
584   void WriteAsync(const Queue &queue, const size_t size, const std::vector<T> &host,
585                   const size_t offset = 0) {
586     WriteAsync(queue, size, host.data(), offset);
587   }
588   void WriteAsync(const Queue &queue, const size_t size, const BufferHost<T> &host,
589                   const size_t offset = 0) {
590     WriteAsync(queue, size, host.data(), offset);
591   }
592 
593   // Copies from host to device: writing the device buffer
594   void Write(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
595     WriteAsync(queue, size, host, offset);
596     queue.Finish();
597   }
598   void Write(const Queue &queue, const size_t size, const std::vector<T> &host,
599              const size_t offset = 0) {
600     Write(queue, size, host.data(), offset);
601   }
602   void Write(const Queue &queue, const size_t size, const BufferHost<T> &host,
603              const size_t offset = 0) {
604     Write(queue, size, host.data(), offset);
605   }
606 
607   // Copies the contents of this buffer into another device buffer
CopyToAsync(const Queue & queue,const size_t size,const Buffer<T> & destination)608   void CopyToAsync(const Queue &queue, const size_t size, const Buffer<T> &destination) const {
609     CheckError(clEnqueueCopyBuffer(queue(), *buffer_, destination(), 0, 0, size*sizeof(T), 0,
610                                    nullptr, nullptr));
611   }
CopyTo(const Queue & queue,const size_t size,const Buffer<T> & destination)612   void CopyTo(const Queue &queue, const size_t size, const Buffer<T> &destination) const {
613     CopyToAsync(queue, size, destination);
614     queue.Finish();
615   }
616 
617   // Retrieves the actual allocated size in bytes
GetSize()618   size_t GetSize() const {
619     const auto bytes = sizeof(size_t);
620     auto result = size_t{0};
621     CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, bytes, &result, nullptr));
622     return result;
623   }
624 
625   // Accessor to the private data-member
operator()626   const cl_mem& operator()() const { return *buffer_; }
627  private:
628   std::shared_ptr<cl_mem> buffer_;
629   const BufferAccess access_;
630 };
631 
632 // =================================================================================================
633 
634 // C++11 version of 'cl_kernel'
635 class Kernel {
636  public:
637 
638   // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere
Kernel(const cl_kernel kernel)639   explicit Kernel(const cl_kernel kernel):
640       kernel_(new cl_kernel) {
641     *kernel_ = kernel;
642   }
643 
644   // Regular constructor with memory management
Kernel(const Program & program,const std::string & name)645   explicit Kernel(const Program &program, const std::string &name):
646       kernel_(new cl_kernel, [](cl_kernel* k) { CheckError(clReleaseKernel(*k)); delete k; }) {
647     auto status = CL_SUCCESS;
648     *kernel_ = clCreateKernel(program(), name.c_str(), &status);
649     CheckError(status);
650   }
651 
652   // Sets a kernel argument at the indicated position
653   template <typename T>
SetArgument(const size_t index,const T & value)654   void SetArgument(const size_t index, const T &value) {
655     CheckError(clSetKernelArg(*kernel_, static_cast<cl_uint>(index), sizeof(T), &value));
656   }
657   template <typename T>
SetArgument(const size_t index,Buffer<T> & value)658   void SetArgument(const size_t index, Buffer<T> &value) {
659     SetArgument(index, value());
660   }
661 
662   // Sets all arguments in one go using parameter packs. Note that this overwrites previously set
663   // arguments using 'SetArgument' or 'SetArguments'.
664   template <typename... Args>
SetArguments(Args &...args)665   void SetArguments(Args&... args) {
666     SetArgumentsRecursive(0, args...);
667   }
668 
669   // Retrieves the amount of local memory used per work-group for this kernel
LocalMemUsage(const Device & device)670   unsigned long LocalMemUsage(const Device &device) const {
671     const auto bytes = sizeof(cl_ulong);
672     auto query = cl_kernel_work_group_info{CL_KERNEL_LOCAL_MEM_SIZE};
673     auto result = cl_ulong{0};
674     CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, bytes, &result, nullptr));
675     return static_cast<unsigned long>(result);
676   }
677 
678   // Retrieves the name of the kernel
GetFunctionName()679   std::string GetFunctionName() const {
680     auto bytes = size_t{0};
681     CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, 0, nullptr, &bytes));
682     auto result = std::string{};
683     result.resize(bytes);
684     CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, bytes, &result[0], nullptr));
685     return std::string{result.c_str()}; // Removes any trailing '\0'-characters
686   }
687 
688   // Launches a kernel onto the specified queue
Launch(const Queue & queue,const std::vector<size_t> & global,const std::vector<size_t> & local,EventPointer event)689   void Launch(const Queue &queue, const std::vector<size_t> &global,
690               const std::vector<size_t> &local, EventPointer event) {
691     CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
692                                       nullptr, global.data(), local.data(),
693                                       0, nullptr, event));
694   }
695 
696   // As above, but with an event waiting list
Launch(const Queue & queue,const std::vector<size_t> & global,const std::vector<size_t> & local,EventPointer event,const std::vector<Event> & waitForEvents)697   void Launch(const Queue &queue, const std::vector<size_t> &global,
698               const std::vector<size_t> &local, EventPointer event,
699               const std::vector<Event> &waitForEvents) {
700 
701     // Builds a plain version of the events waiting list
702     auto waitForEventsPlain = std::vector<cl_event>();
703     for (auto &waitEvent : waitForEvents) {
704       if (waitEvent()) { waitForEventsPlain.push_back(waitEvent()); }
705     }
706 
707     // Launches the kernel while waiting for other events
708     CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
709                                       nullptr, global.data(), !local.empty() ? local.data() : nullptr,
710                                       static_cast<cl_uint>(waitForEventsPlain.size()),
711                                       !waitForEventsPlain.empty() ? waitForEventsPlain.data() : nullptr,
712                                       event));
713   }
714 
715   // Accessor to the private data-member
operator()716   const cl_kernel& operator()() const { return *kernel_; }
717  private:
718   std::shared_ptr<cl_kernel> kernel_;
719 
720   // Internal implementation for the recursive SetArguments function.
721   template <typename T>
SetArgumentsRecursive(const size_t index,T & first)722   void SetArgumentsRecursive(const size_t index, T &first) {
723     SetArgument(index, first);
724   }
725   template <typename T, typename... Args>
SetArgumentsRecursive(const size_t index,T & first,Args &...args)726   void SetArgumentsRecursive(const size_t index, T &first, Args&... args) {
727     SetArgument(index, first);
728     SetArgumentsRecursive(index+1, args...);
729   }
730 };
731 
732 // =================================================================================================
733 } // namespace cltune
734 
735 // CLTUNE_CLPP11_H_
736 #endif
737