1 
2 // =================================================================================================
3 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
4 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
5 // width of 100 characters per line.
6 //
7 // Author(s):
8 //   Cedric Nugteren <www.cedricnugteren.nl>
9 //
10 // This file implements a bunch of C++11 classes that act as wrappers around OpenCL objects and API
11 // calls. The main benefits are increased abstraction, automatic memory management, and portability.
12 // Portability here means that a similar header exists for CUDA with the same classes and
13 // interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change.
14 //
15 // This file is taken from the CLCudaAPI project <https://github.com/CNugteren/CLCudaAPI> and
16 // therefore contains the following header copyright notice:
17 //
18 // =================================================================================================
19 //
20 // Copyright 2015 SURFsara
21 //
22 // Licensed under the Apache License, Version 2.0 (the "License");
23 // you may not use this file except in compliance with the License.
24 // You may obtain a copy of the License at
25 //
26 //  http://www.apache.org/licenses/LICENSE-2.0
27 //
28 // Unless required by applicable law or agreed to in writing, software
29 // distributed under the License is distributed on an "AS IS" BASIS,
30 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
31 // See the License for the specific language governing permissions and
32 // limitations under the License.
33 //
34 // =================================================================================================
35 
36 #ifndef CLBLAST_CLPP11_H_
37 #define CLBLAST_CLPP11_H_
38 
39 // C++
40 #include <algorithm> // std::copy
41 #include <string>    // std::string
42 #include <vector>    // std::vector
43 #include <memory>    // std::shared_ptr
44 #include <numeric>   // std::accumulate
45 #include <cstring>   // std::strlen
46 
47 // OpenCL
48 #define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
49 #if defined(__APPLE__) || defined(__MACOSX)
50   #include <OpenCL/opencl.h>
51 #else
52   #include <CL/opencl.h>
53 #endif
54 
55 // Exception classes
56 #include "cxpp11_common.hpp"
57 
58 namespace clblast {
59 // =================================================================================================
60 
61 // Represents a runtime error returned by an OpenCL API function
62 class CLError : public ErrorCode<DeviceError, cl_int> {
63  public:
CLError(cl_int status,const std::string & where)64   explicit CLError(cl_int status, const std::string &where):
65       ErrorCode(status,
66                 where,
67                 "OpenCL error: " + where + ": " + std::to_string(static_cast<int>(status))) {
68   }
69 
Check(const cl_int status,const std::string & where)70   static void Check(const cl_int status, const std::string &where) {
71     if (status != CL_SUCCESS) {
72       throw CLError(status, where);
73     }
74   }
75 
CheckDtor(const cl_int status,const std::string & where)76   static void CheckDtor(const cl_int status, const std::string &where) {
77     if (status != CL_SUCCESS) {
78       fprintf(stderr, "CLBlast: %s (ignoring)\n", CLError(status, where).what());
79     }
80   }
81 };
82 
83 // =================================================================================================
84 
85 // Error occurred in OpenCL
86 #define CheckError(call) CLError::Check(call, CLError::TrimCallString(#call))
87 
88 // Error occured in OpenCL (no-exception version for destructors)
89 #define CheckErrorDtor(call) CLError::CheckDtor(call, CLError::TrimCallString(#call))
90 
91 // =================================================================================================
92 
93 // C++11 version of 'cl_event'
94 class Event {
95  public:
96 
97   // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere
Event(const cl_event event)98   explicit Event(const cl_event event):
99       event_(new cl_event) {
100     *event_ = event;
101   }
102 
103   // Regular constructor with memory management
Event()104   explicit Event():
105       event_(new cl_event, [](cl_event* e) {
106         if (*e) { CheckErrorDtor(clReleaseEvent(*e)); }
107         delete e;
108       }) {
109     *event_ = nullptr;
110   }
111 
112   // Waits for completion of this event
WaitForCompletion() const113   void WaitForCompletion() const {
114     CheckError(clWaitForEvents(1, &(*event_)));
115   }
116 
117   // Retrieves the elapsed time of the last recorded event.
118   // (Note that there is a bug in Apple's OpenCL implementation of the 'clGetEventProfilingInfo' function:
119   //  http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx)
120   // However, in our case the reply size is fixed to be cl_ulong, so we are not affected.
GetElapsedTime() const121   float GetElapsedTime() const {
122     WaitForCompletion();
123     const auto bytes = sizeof(cl_ulong);
124     auto time_start = cl_ulong{0};
125     CheckError(clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr));
126     auto time_end = cl_ulong{0};
127     CheckError(clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr));
128     return static_cast<float>(time_end - time_start) * 1.0e-6f;
129   }
130 
131   // Accessor to the private data-member
operator ()()132   cl_event& operator()() { return *event_; }
operator ()() const133   const cl_event& operator()() const { return *event_; }
pointer()134   cl_event* pointer() { return &(*event_); }
pointer() const135   const cl_event* pointer() const { return &(*event_); }
136  private:
137   std::shared_ptr<cl_event> event_;
138 };
139 
140 // Pointer to an OpenCL event
141 using EventPointer = cl_event*;
142 
143 // =================================================================================================
144 
145 // C++11 version of 'cl_platform_id'
146 class Platform {
147  public:
148 
149   // Constructor based on the regular OpenCL data-type
Platform(const cl_platform_id platform)150   explicit Platform(const cl_platform_id platform): platform_(platform) { }
151 
152   // Initializes the platform
Platform(const size_t platform_id)153   explicit Platform(const size_t platform_id) {
154     auto num_platforms = cl_uint{0};
155     CheckError(clGetPlatformIDs(0, nullptr, &num_platforms));
156     if (num_platforms == 0) {
157       throw RuntimeError("Platform: no platforms found");
158     }
159     if (platform_id >= num_platforms) {
160       throw RuntimeError("Platform: invalid platform ID "+std::to_string(platform_id));
161     }
162     auto platforms = std::vector<cl_platform_id>(num_platforms);
163     CheckError(clGetPlatformIDs(num_platforms, platforms.data(), nullptr));
164     platform_ = platforms[platform_id];
165   }
166 
167   // Methods to retrieve platform information
Name() const168   std::string Name() const { return GetInfoString(CL_PLATFORM_NAME); }
Vendor() const169   std::string Vendor() const { return GetInfoString(CL_PLATFORM_VENDOR); }
Version() const170   std::string Version() const { return GetInfoString(CL_PLATFORM_VERSION); }
171 
172   // Returns the number of devices on this platform
NumDevices() const173   size_t NumDevices() const {
174     auto result = cl_uint{0};
175     CheckError(clGetDeviceIDs(platform_, CL_DEVICE_TYPE_ALL, 0, nullptr, &result));
176     return static_cast<size_t>(result);
177   }
178 
179   // Accessor to the private data-member
operator ()() const180   const cl_platform_id& operator()() const { return platform_; }
181  private:
182   cl_platform_id platform_;
183 
184   // Private helper functions
GetInfoString(const cl_device_info info) const185   std::string GetInfoString(const cl_device_info info) const {
186     auto bytes = size_t{0};
187     CheckError(clGetPlatformInfo(platform_, info, 0, nullptr, &bytes));
188     auto result = std::string{};
189     result.resize(bytes);
190     CheckError(clGetPlatformInfo(platform_, info, bytes, &result[0], nullptr));
191     result.resize(strlen(result.c_str())); // Removes any trailing '\0'-characters
192     return result;
193   }
194 };
195 
196 // Retrieves a vector with all platforms
GetAllPlatforms()197 inline std::vector<Platform> GetAllPlatforms() {
198   auto num_platforms = cl_uint{0};
199   CheckError(clGetPlatformIDs(0, nullptr, &num_platforms));
200   auto all_platforms = std::vector<Platform>();
201   for (size_t platform_id = 0; platform_id < static_cast<size_t>(num_platforms); ++platform_id) {
202     all_platforms.push_back(Platform(platform_id));
203   }
204   return all_platforms;
205 }
206 
207 // =================================================================================================
208 
209 // C++11 version of 'cl_device_id'
210 class Device {
211  public:
212 
213   // Constructor based on the regular OpenCL data-type
Device(const cl_device_id device)214   explicit Device(const cl_device_id device): device_(device) { }
215 
216   // Initialize the device. Note that this constructor can throw exceptions!
Device(const Platform & platform,const size_t device_id)217   explicit Device(const Platform &platform, const size_t device_id) {
218     auto num_devices = platform.NumDevices();
219     if (num_devices == 0) {
220       throw RuntimeError("Device: no devices found");
221     }
222     if (device_id >= num_devices) {
223       throw RuntimeError("Device: invalid device ID "+std::to_string(device_id));
224     }
225 
226     auto devices = std::vector<cl_device_id>(num_devices);
227     CheckError(clGetDeviceIDs(platform(), CL_DEVICE_TYPE_ALL, static_cast<cl_uint>(num_devices),
228                               devices.data(), nullptr));
229     device_ = devices[device_id];
230   }
231 
232   // Methods to retrieve device information
Platform() const233   cl_platform_id Platform() const { return GetInfo<cl_platform_id>(CL_DEVICE_PLATFORM); }
Version() const234   std::string Version() const { return GetInfoString(CL_DEVICE_VERSION); }
VersionNumber() const235   size_t VersionNumber() const
236   {
237     std::string version_string = Version().substr(7);
238     // Space separates the end of the OpenCL version number from the beginning of the
239     // vendor-specific information.
240     size_t next_whitespace = version_string.find(' ');
241     size_t version = (size_t) (100.0 * std::stod(version_string.substr(0, next_whitespace)));
242     return version;
243   }
Vendor() const244   std::string Vendor() const { return GetInfoString(CL_DEVICE_VENDOR); }
Name() const245   std::string Name() const { return GetInfoString(CL_DEVICE_NAME); }
Type() const246   std::string Type() const {
247     auto type = GetInfo<cl_device_type>(CL_DEVICE_TYPE);
248     switch(type) {
249       case CL_DEVICE_TYPE_CPU: return "CPU";
250       case CL_DEVICE_TYPE_GPU: return "GPU";
251       case CL_DEVICE_TYPE_ACCELERATOR: return "accelerator";
252       default: return "default";
253     }
254   }
MaxWorkGroupSize() const255   size_t MaxWorkGroupSize() const { return GetInfo<size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE); }
MaxWorkItemDimensions() const256   size_t MaxWorkItemDimensions() const {
257     return static_cast<size_t>(GetInfo<cl_uint>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS));
258   }
MaxWorkItemSizes() const259   std::vector<size_t> MaxWorkItemSizes() const {
260     return GetInfoVector<size_t>(CL_DEVICE_MAX_WORK_ITEM_SIZES);
261   }
LocalMemSize() const262   unsigned long LocalMemSize() const {
263     return static_cast<unsigned long>(GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE));
264   }
Capabilities() const265   std::string Capabilities() const { return GetInfoString(CL_DEVICE_EXTENSIONS); }
HasExtension(const std::string & extension) const266   bool HasExtension(const std::string &extension) const {
267     const auto extensions = Capabilities();
268     return extensions.find(extension) != std::string::npos;
269   }
270 
CoreClock() const271   size_t CoreClock() const {
272     return static_cast<size_t>(GetInfo<cl_uint>(CL_DEVICE_MAX_CLOCK_FREQUENCY));
273   }
ComputeUnits() const274   size_t ComputeUnits() const {
275     return static_cast<size_t>(GetInfo<cl_uint>(CL_DEVICE_MAX_COMPUTE_UNITS));
276   }
MemorySize() const277   unsigned long MemorySize() const {
278     return static_cast<unsigned long>(GetInfo<cl_ulong>(CL_DEVICE_GLOBAL_MEM_SIZE));
279   }
MaxAllocSize() const280   unsigned long MaxAllocSize() const {
281     return static_cast<unsigned long>(GetInfo<cl_ulong>(CL_DEVICE_MAX_MEM_ALLOC_SIZE));
282   }
MemoryClock() const283   size_t MemoryClock() const { return 0; } // Not exposed in OpenCL
MemoryBusWidth() const284   size_t MemoryBusWidth() const { return 0; } // Not exposed in OpenCL
285 
286   // Configuration-validity checks
IsLocalMemoryValid(const cl_ulong local_mem_usage) const287   bool IsLocalMemoryValid(const cl_ulong local_mem_usage) const {
288     return (local_mem_usage <= LocalMemSize());
289   }
IsThreadConfigValid(const std::vector<size_t> & local) const290   bool IsThreadConfigValid(const std::vector<size_t> &local) const {
291     auto local_size = size_t{1};
292     for (const auto &item: local) { local_size *= item; }
293     for (auto i=size_t{0}; i<local.size(); ++i) {
294       if (local[i] > MaxWorkItemSizes()[i]) { return false; }
295     }
296     if (local_size > MaxWorkGroupSize()) { return false; }
297     if (local.size() > MaxWorkItemDimensions()) { return false; }
298     return true;
299   }
300 
301   // Query for a specific type of device or brand
IsCPU() const302   bool IsCPU() const { return Type() == "CPU"; }
IsGPU() const303   bool IsGPU() const { return Type() == "GPU"; }
IsAMD() const304   bool IsAMD() const { return Vendor() == "AMD" ||
305                               Vendor() == "Advanced Micro Devices, Inc." ||
306                               Vendor() == "AuthenticAMD"; }
IsNVIDIA() const307   bool IsNVIDIA() const { return Vendor() == "NVIDIA" ||
308                                  Vendor() == "NVIDIA Corporation"; }
IsIntel() const309   bool IsIntel() const { return Vendor() == "INTEL" ||
310                                 Vendor() == "Intel" ||
311                                 Vendor() == "GenuineIntel" ||
312                                 Vendor() == "Intel(R) Corporation"; }
IsARM() const313   bool IsARM() const { return Vendor() == "ARM"; }
314 
315   // Platform specific extensions
AMDBoardName() const316   std::string AMDBoardName() const { // check for 'cl_amd_device_attribute_query' first
317     #ifndef CL_DEVICE_BOARD_NAME_AMD
318       #define CL_DEVICE_BOARD_NAME_AMD 0x4038
319     #endif
320     return GetInfoString(CL_DEVICE_BOARD_NAME_AMD);
321   }
NVIDIAComputeCapability() const322   std::string NVIDIAComputeCapability() const { // check for 'cl_nv_device_attribute_query' first
323     #ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
324        #define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
325     #endif
326     #ifndef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV
327       #define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
328     #endif
329     return std::string{"SM"} + std::to_string(GetInfo<cl_uint>(CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV)) +
330            std::string{"."} + std::to_string(GetInfo<cl_uint>(CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV));
331   }
332 
333 
334   // Accessor to the private data-member
operator ()() const335   const cl_device_id& operator()() const { return device_; }
336  private:
337   cl_device_id device_;
338 
339   // Private helper functions
340   template <typename T>
GetInfo(const cl_device_info info) const341   T GetInfo(const cl_device_info info) const {
342     auto bytes = size_t{0};
343     CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes));
344     auto result = T(0);
345     CheckError(clGetDeviceInfo(device_, info, bytes, &result, nullptr));
346     return result;
347   }
348   template <typename T>
GetInfoVector(const cl_device_info info) const349   std::vector<T> GetInfoVector(const cl_device_info info) const {
350     auto bytes = size_t{0};
351     CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes));
352     auto result = std::vector<T>(bytes/sizeof(T));
353     CheckError(clGetDeviceInfo(device_, info, bytes, result.data(), nullptr));
354     return result;
355   }
GetInfoString(const cl_device_info info) const356   std::string GetInfoString(const cl_device_info info) const {
357     auto bytes = size_t{0};
358     CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes));
359     auto result = std::string{};
360     result.resize(bytes);
361     CheckError(clGetDeviceInfo(device_, info, bytes, &result[0], nullptr));
362     result.resize(strlen(result.c_str())); // Removes any trailing '\0'-characters
363     return result;
364   }
365 };
366 
367 // =================================================================================================
368 
369 // C++11 version of 'cl_context'
370 class Context {
371  public:
372 
373   // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere
Context(const cl_context context)374   explicit Context(const cl_context context):
375       context_(new cl_context) {
376     *context_ = context;
377   }
378 
379   // Regular constructor with memory management
Context(const Device & device)380   explicit Context(const Device &device):
381       context_(new cl_context, [](cl_context* c) {
382         if (*c) { CheckErrorDtor(clReleaseContext(*c)); }
383         delete c;
384       }) {
385     auto status = CL_SUCCESS;
386     const cl_device_id dev = device();
387     *context_ = clCreateContext(nullptr, 1, &dev, nullptr, nullptr, &status);
388     CLError::Check(status, "clCreateContext");
389   }
390 
391   // Accessor to the private data-member
operator ()() const392   const cl_context& operator()() const { return *context_; }
pointer() const393   cl_context* pointer() const { return &(*context_); }
394  private:
395   std::shared_ptr<cl_context> context_;
396 };
397 
398 // Pointer to an OpenCL context
399 using ContextPointer = cl_context*;
400 
401 // =================================================================================================
402 
403 // Enumeration of build statuses of the run-time compilation process
404 enum class BuildStatus { kSuccess, kError, kInvalid };
405 
406 // C++11 version of 'cl_program'.
407 class Program {
408  public:
409   Program() = default;
410 
411   // Source-based constructor with memory management
Program(const Context & context,const std::string & source)412   explicit Program(const Context &context, const std::string &source):
413       program_(new cl_program, [](cl_program* p) {
414         if (*p) { CheckErrorDtor(clReleaseProgram(*p)); }
415         delete p;
416       }) {
417     const char *source_ptr = &source[0];
418     size_t length = source.length();
419     auto status = CL_SUCCESS;
420     *program_ = clCreateProgramWithSource(context(), 1, &source_ptr, &length, &status);
421     CLError::Check(status, "clCreateProgramWithSource");
422   }
423 
424   // Binary-based constructor with memory management
Program(const Device & device,const Context & context,const std::string & binary)425   explicit Program(const Device &device, const Context &context, const std::string &binary):
426       program_(new cl_program, [](cl_program* p) {
427         if (*p) { CheckErrorDtor(clReleaseProgram(*p)); }
428         delete p;
429       }) {
430     const char *binary_ptr = &binary[0];
431     size_t length = binary.length();
432     auto status1 = CL_SUCCESS;
433     auto status2 = CL_SUCCESS;
434     const cl_device_id dev = device();
435     *program_ = clCreateProgramWithBinary(context(), 1, &dev, &length,
436                                           reinterpret_cast<const unsigned char**>(&binary_ptr),
437                                           &status1, &status2);
438     CLError::Check(status1, "clCreateProgramWithBinary (binary status)");
439     CLError::Check(status2, "clCreateProgramWithBinary");
440   }
441 
442   // Compiles the device program and returns whether or not there where any warnings/errors
Build(const Device & device,std::vector<std::string> & options)443   void Build(const Device &device, std::vector<std::string> &options) {
444     options.push_back("-cl-std=CL1.1");
445     auto options_string = std::accumulate(options.begin(), options.end(), std::string{" "});
446     const cl_device_id dev = device();
447     CheckError(clBuildProgram(*program_, 1, &dev, options_string.c_str(), nullptr, nullptr));
448   }
449 
450   // Retrieves the warning/error message from the compiler (if any)
GetBuildInfo(const Device & device) const451   std::string GetBuildInfo(const Device &device) const {
452     auto bytes = size_t{0};
453     auto query = cl_program_build_info{CL_PROGRAM_BUILD_LOG};
454     CheckError(clGetProgramBuildInfo(*program_, device(), query, 0, nullptr, &bytes));
455     auto result = std::string{};
456     result.resize(bytes);
457     CheckError(clGetProgramBuildInfo(*program_, device(), query, bytes, &result[0], nullptr));
458     return result;
459   }
460 
461   // Retrieves a binary or an intermediate representation of the compiled program
GetIR() const462   std::string GetIR() const {
463     auto bytes = size_t{0};
464     CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bytes, nullptr));
465     auto result = std::string{};
466     result.resize(bytes);
467     auto result_ptr = result.data();
468     CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARIES, sizeof(char*), &result_ptr, nullptr));
469     return result;
470   }
471 
472   // Accessor to the private data-member
operator ()() const473   const cl_program& operator()() const { return *program_; }
474  private:
475   std::shared_ptr<cl_program> program_;
476 };
477 
478 // =================================================================================================
479 
480 // C++11 version of 'cl_command_queue'
481 class Queue {
482  public:
483 
484   // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere
Queue(const cl_command_queue queue)485   explicit Queue(const cl_command_queue queue):
486       queue_(new cl_command_queue) {
487     *queue_ = queue;
488   }
489 
490   // Regular constructor with memory management
Queue(const Context & context,const Device & device)491   explicit Queue(const Context &context, const Device &device):
492       queue_(new cl_command_queue, [](cl_command_queue* s) {
493         if (*s) { CheckErrorDtor(clReleaseCommandQueue(*s)); }
494         delete s;
495       }) {
496     auto status = CL_SUCCESS;
497     *queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
498     CLError::Check(status, "clCreateCommandQueue");
499   }
500 
501   // Synchronizes the queue
Finish(Event &) const502   void Finish(Event &) const {
503     Finish();
504   }
Finish() const505   void Finish() const {
506     CheckError(clFinish(*queue_));
507   }
508 
509   // Retrieves the corresponding context or device
GetContext() const510   Context GetContext() const {
511     auto bytes = size_t{0};
512     CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_CONTEXT, 0, nullptr, &bytes));
513     cl_context result;
514     CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_CONTEXT, bytes, &result, nullptr));
515     return Context(result);
516   }
GetDevice() const517   Device GetDevice() const {
518     auto bytes = size_t{0};
519     CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_DEVICE, 0, nullptr, &bytes));
520     cl_device_id result;
521     CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_DEVICE, bytes, &result, nullptr));
522     return Device(result);
523   }
524 
525   // Accessor to the private data-member
operator ()() const526   const cl_command_queue& operator()() const { return *queue_; }
527  private:
528   std::shared_ptr<cl_command_queue> queue_;
529 };
530 
531 // =================================================================================================
532 
533 // C++11 version of host memory
534 template <typename T>
535 class BufferHost {
536  public:
537 
538   // Regular constructor with memory management
BufferHost(const Context &,const size_t size)539   explicit BufferHost(const Context &, const size_t size):
540       buffer_(new std::vector<T>(size)) {
541   }
542 
543   // Retrieves the actual allocated size in bytes
GetSize() const544   size_t GetSize() const {
545     return buffer_->size()*sizeof(T);
546   }
547 
548   // Compatibility with std::vector
size() const549   size_t size() const { return buffer_->size(); }
begin()550   T* begin() { return &(*buffer_)[0]; }
end()551   T* end() { return &(*buffer_)[buffer_->size()-1]; }
operator [](const size_t i)552   T& operator[](const size_t i) { return (*buffer_)[i]; }
data()553   T* data() { return buffer_->data(); }
data() const554   const T* data() const { return buffer_->data(); }
555 
556  private:
557   std::shared_ptr<std::vector<T>> buffer_;
558 };
559 
560 // =================================================================================================
561 
562 // Enumeration of buffer access types
563 enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite, kNotOwned };
564 
565 // C++11 version of 'cl_mem'
566 template <typename T>
567 class Buffer {
568  public:
569 
570   // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere
Buffer(const cl_mem buffer)571   explicit Buffer(const cl_mem buffer):
572       buffer_(new cl_mem),
573       access_(BufferAccess::kNotOwned) {
574     *buffer_ = buffer;
575   }
576 
577   // Regular constructor with memory management. If this class does not own the buffer object, then
578   // the memory will not be freed automatically afterwards.
Buffer(const Context & context,const BufferAccess access,const size_t size)579   explicit Buffer(const Context &context, const BufferAccess access, const size_t size):
580       buffer_(new cl_mem, [access](cl_mem* m) {
581         if (access != BufferAccess::kNotOwned) { CheckError(clReleaseMemObject(*m)); }
582         delete m;
583       }),
584       access_(access) {
585     auto flags = cl_mem_flags{CL_MEM_READ_WRITE};
586     if (access_ == BufferAccess::kReadOnly) { flags = CL_MEM_READ_ONLY; }
587     if (access_ == BufferAccess::kWriteOnly) { flags = CL_MEM_WRITE_ONLY; }
588     auto status = CL_SUCCESS;
589     *buffer_ = clCreateBuffer(context(), flags, size*sizeof(T), nullptr, &status);
590     CLError::Check(status, "clCreateBuffer");
591   }
592 
593   // As above, but now with read/write access as a default
Buffer(const Context & context,const size_t size)594   explicit Buffer(const Context &context, const size_t size):
595     Buffer<T>(context, BufferAccess::kReadWrite, size) {
596   }
597 
598   // Constructs a new buffer based on an existing host-container
599   template <typename Iterator>
Buffer(const Context & context,const Queue & queue,Iterator start,Iterator end)600   explicit Buffer(const Context &context, const Queue &queue, Iterator start, Iterator end):
601     Buffer(context, BufferAccess::kReadWrite, static_cast<size_t>(end - start)) {
602     auto size = static_cast<size_t>(end - start);
603     auto pointer = &*start;
604     CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, 0, size*sizeof(T), pointer, 0,
605                                     nullptr, nullptr));
606     queue.Finish();
607   }
608 
609   // Copies from device to host: reading the device buffer a-synchronously
ReadAsync(const Queue & queue,const size_t size,T * host,const size_t offset=0) const610   void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
611     if (access_ == BufferAccess::kWriteOnly) {
612       throw LogicError("Buffer: reading from a write-only buffer");
613     }
614     CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
615                                    host, 0, nullptr, nullptr));
616   }
ReadAsync(const Queue & queue,const size_t size,std::vector<T> & host,const size_t offset=0) const617   void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host,
618                  const size_t offset = 0) const {
619     if (host.size() < size) {
620       throw LogicError("Buffer: target host buffer is too small");
621     }
622     ReadAsync(queue, size, host.data(), offset);
623   }
ReadAsync(const Queue & queue,const size_t size,BufferHost<T> & host,const size_t offset=0) const624   void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host,
625                  const size_t offset = 0) const {
626     if (host.size() < size) {
627       throw LogicError("Buffer: target host buffer is too small");
628     }
629     ReadAsync(queue, size, host.data(), offset);
630   }
631 
632   // Copies from device to host: reading the device buffer
Read(const Queue & queue,const size_t size,T * host,const size_t offset=0) const633   void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
634     ReadAsync(queue, size, host, offset);
635     queue.Finish();
636   }
Read(const Queue & queue,const size_t size,std::vector<T> & host,const size_t offset=0) const637   void Read(const Queue &queue, const size_t size, std::vector<T> &host,
638             const size_t offset = 0) const {
639     Read(queue, size, host.data(), offset);
640   }
Read(const Queue & queue,const size_t size,BufferHost<T> & host,const size_t offset=0) const641   void Read(const Queue &queue, const size_t size, BufferHost<T> &host,
642             const size_t offset = 0) const {
643     Read(queue, size, host.data(), offset);
644   }
645 
646   // Copies from host to device: writing the device buffer a-synchronously
WriteAsync(const Queue & queue,const size_t size,const T * host,const size_t offset=0)647   void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
648     if (GetSize() < (offset+size)*sizeof(T)) {
649       throw LogicError("Buffer: target device buffer is too small");
650     }
651     CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
652                                     host, 0, nullptr, nullptr));
653   }
WriteAsync(const Queue & queue,const size_t size,const std::vector<T> & host,const size_t offset=0)654   void WriteAsync(const Queue &queue, const size_t size, const std::vector<T> &host,
655                   const size_t offset = 0) {
656     WriteAsync(queue, size, host.data(), offset);
657   }
WriteAsync(const Queue & queue,const size_t size,const BufferHost<T> & host,const size_t offset=0)658   void WriteAsync(const Queue &queue, const size_t size, const BufferHost<T> &host,
659                   const size_t offset = 0) {
660     WriteAsync(queue, size, host.data(), offset);
661   }
662 
663   // Copies from host to device: writing the device buffer
Write(const Queue & queue,const size_t size,const T * host,const size_t offset=0)664   void Write(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
665     WriteAsync(queue, size, host, offset);
666     queue.Finish();
667   }
Write(const Queue & queue,const size_t size,const std::vector<T> & host,const size_t offset=0)668   void Write(const Queue &queue, const size_t size, const std::vector<T> &host,
669              const size_t offset = 0) {
670     Write(queue, size, host.data(), offset);
671   }
Write(const Queue & queue,const size_t size,const BufferHost<T> & host,const size_t offset=0)672   void Write(const Queue &queue, const size_t size, const BufferHost<T> &host,
673              const size_t offset = 0) {
674     Write(queue, size, host.data(), offset);
675   }
676 
677   // Copies the contents of this buffer into another device buffer
CopyToAsync(const Queue & queue,const size_t size,const Buffer<T> & destination) const678   void CopyToAsync(const Queue &queue, const size_t size, const Buffer<T> &destination) const {
679     CheckError(clEnqueueCopyBuffer(queue(), *buffer_, destination(), 0, 0, size*sizeof(T), 0,
680                                    nullptr, nullptr));
681   }
CopyTo(const Queue & queue,const size_t size,const Buffer<T> & destination) const682   void CopyTo(const Queue &queue, const size_t size, const Buffer<T> &destination) const {
683     CopyToAsync(queue, size, destination);
684     queue.Finish();
685   }
686 
687   // Retrieves the actual allocated size in bytes
GetSize() const688   size_t GetSize() const {
689     const auto bytes = sizeof(size_t);
690     auto result = size_t{0};
691     CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, bytes, &result, nullptr));
692     return result;
693   }
694 
695   // Accessor to the private data-member
operator ()() const696   const cl_mem& operator()() const { return *buffer_; }
697  private:
698   std::shared_ptr<cl_mem> buffer_;
699   const BufferAccess access_;
700 };
701 
702 // =================================================================================================
703 
704 // C++11 version of 'cl_kernel'
705 class Kernel {
706  public:
707 
708   // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere
Kernel(const cl_kernel kernel)709   explicit Kernel(const cl_kernel kernel):
710       kernel_(new cl_kernel) {
711     *kernel_ = kernel;
712   }
713 
714   // Regular constructor with memory management
Kernel(const Program & program,const std::string & name)715   explicit Kernel(const Program &program, const std::string &name):
716       kernel_(new cl_kernel, [](cl_kernel* k) {
717         if (*k) { CheckErrorDtor(clReleaseKernel(*k)); }
718         delete k;
719       }) {
720     auto status = CL_SUCCESS;
721     *kernel_ = clCreateKernel(program(), name.c_str(), &status);
722     CLError::Check(status, "clCreateKernel");
723   }
724 
725   // Sets a kernel argument at the indicated position
726   template <typename T>
SetArgument(const size_t index,const T & value)727   void SetArgument(const size_t index, const T &value) {
728     CheckError(clSetKernelArg(*kernel_, static_cast<cl_uint>(index), sizeof(T), &value));
729   }
730   template <typename T>
SetArgument(const size_t index,Buffer<T> & value)731   void SetArgument(const size_t index, Buffer<T> &value) {
732     SetArgument(index, value());
733   }
734 
735   // Sets all arguments in one go using parameter packs. Note that this overwrites previously set
736   // arguments using 'SetArgument' or 'SetArguments'.
737   template <typename... Args>
SetArguments(Args &...args)738   void SetArguments(Args&... args) {
739     SetArgumentsRecursive(0, args...);
740   }
741 
742   // Retrieves the amount of local memory used per work-group for this kernel
LocalMemUsage(const Device & device) const743   unsigned long LocalMemUsage(const Device &device) const {
744     const auto bytes = sizeof(cl_ulong);
745     auto query = cl_kernel_work_group_info{CL_KERNEL_LOCAL_MEM_SIZE};
746     auto result = cl_ulong{0};
747     CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, bytes, &result, nullptr));
748     return static_cast<unsigned long>(result);
749   }
750 
751   // Retrieves the name of the kernel
GetFunctionName() const752   std::string GetFunctionName() const {
753     auto bytes = size_t{0};
754     CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, 0, nullptr, &bytes));
755     auto result = std::string{};
756     result.resize(bytes);
757     CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, bytes, &result[0], nullptr));
758     return std::string{result.c_str()}; // Removes any trailing '\0'-characters
759   }
760 
761   // Launches a kernel onto the specified queue
Launch(const Queue & queue,const std::vector<size_t> & global,const std::vector<size_t> & local,EventPointer event)762   void Launch(const Queue &queue, const std::vector<size_t> &global,
763               const std::vector<size_t> &local, EventPointer event) {
764     CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
765                                       nullptr, global.data(), local.data(),
766                                       0, nullptr, event));
767   }
768 
769   // As above, but with an event waiting list
Launch(const Queue & queue,const std::vector<size_t> & global,const std::vector<size_t> & local,EventPointer event,const std::vector<Event> & waitForEvents)770   void Launch(const Queue &queue, const std::vector<size_t> &global,
771               const std::vector<size_t> &local, EventPointer event,
772               const std::vector<Event> &waitForEvents) {
773 
774     // Builds a plain version of the events waiting list
775     auto waitForEventsPlain = std::vector<cl_event>();
776     for (auto &waitEvent : waitForEvents) {
777       if (waitEvent()) { waitForEventsPlain.push_back(waitEvent()); }
778     }
779 
780     // Launches the kernel while waiting for other events
781     CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
782                                       nullptr, global.data(), !local.empty() ? local.data() : nullptr,
783                                       static_cast<cl_uint>(waitForEventsPlain.size()),
784                                       !waitForEventsPlain.empty() ? waitForEventsPlain.data() : nullptr,
785                                       event));
786   }
787 
788   // Accessor to the private data-member
operator ()() const789   const cl_kernel& operator()() const { return *kernel_; }
790  private:
791   std::shared_ptr<cl_kernel> kernel_;
792 
793   // Internal implementation for the recursive SetArguments function.
794   template <typename T>
SetArgumentsRecursive(const size_t index,T & first)795   void SetArgumentsRecursive(const size_t index, T &first) {
796     SetArgument(index, first);
797   }
798   template <typename T, typename... Args>
SetArgumentsRecursive(const size_t index,T & first,Args &...args)799   void SetArgumentsRecursive(const size_t index, T &first, Args&... args) {
800     SetArgument(index, first);
801     SetArgumentsRecursive(index+1, args...);
802   }
803 };
804 
805 // =================================================================================================
806 } // namespace clblast
807 
808 // CLBLAST_CLPP11_H_
809 #endif
810