1 #ifndef VEXCL_BACKEND_OPENCL_KERNEL_HPP
2 #define VEXCL_BACKEND_OPENCL_KERNEL_HPP
3 
4 /*
5 The MIT License
6 
7 Copyright (c) 2012-2018 Denis Demidov <dennis.demidov@gmail.com>
8 
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
15 
16 The above copyright notice and this permission notice shall be included in
17 all copies or substantial portions of the Software.
18 
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
22 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 THE SOFTWARE.
26 */
27 
28 /**
29  * \file   vexcl/backend/opencl/kernel.hpp
30  * \author Denis Demidov <dennis.demidov@gmail.com>
31  * \brief  An abstraction over OpenCL compute kernel.
32  */
33 
34 #include <functional>
35 
36 #include <vexcl/backend/opencl/defines.hpp>
37 #ifdef VEXCL_HAVE_OPENCL_HPP
38 #  include <CL/opencl.hpp>
39 #else
40 #  include <CL/cl2.hpp>
41 #endif
42 
43 #include <vexcl/backend/opencl/compiler.hpp>
44 
45 namespace vex {
46 namespace backend {
47 namespace opencl {
48 
49 template <typename T>
50 struct kernel_arg_pusher {
setvex::backend::opencl::kernel_arg_pusher51     static void set(cl::Kernel &k, unsigned argpos, const T &arg) {
52         k.setArg(argpos, arg);
53     }
54 };
55 
56 /// An abstraction over OpenCL compute kernel.
57 class kernel {
58     public:
kernel()59         kernel() : argpos(0), w_size(0), g_size(0) {}
60 
61         /// Constructor. Creates a backend::kernel instance from source.
kernel(const cl::CommandQueue & queue,const std::string & src,const std::string & name,size_t smem_per_thread=0,const std::string & options="")62         kernel(const cl::CommandQueue &queue,
63                const std::string &src,
64                const std::string &name,
65                size_t smem_per_thread = 0,
66                const std::string &options = ""
67                ) : argpos(0)
68         {
69             auto P = build_sources(queue, src, options);
70 
71             K = cl::Kernel(P, name.c_str());
72 #ifdef VEXCL_AMD_SI_WORKAROUND
73             N = cl::Kernel(P, "__null_kernel");
74 #endif
75 
76             config(queue,
77                     [smem_per_thread](size_t wgs){ return wgs * smem_per_thread; });
78         }
79 
80         /// Constructor. Creates a backend::kernel instance from source.
kernel(const cl::CommandQueue & queue,const std::string & src,const std::string & name,std::function<size_t (size_t)> smem,const std::string & options="")81         kernel(const cl::CommandQueue &queue,
82                const std::string &src, const std::string &name,
83                std::function<size_t(size_t)> smem,
84                const std::string &options = ""
85                )
86             : argpos(0), K(build_sources(queue, src, options), name.c_str())
87         {
88             config(queue, smem);
89         }
90 
91         /// Constructor. Extracts a backend::kernel instance from backend::program.
kernel(const cl::CommandQueue & queue,const cl::Program & program,const std::string & name,size_t smem_per_thread=0)92         kernel(const cl::CommandQueue &queue,
93                const cl::Program &program,
94                const std::string &name,
95                size_t smem_per_thread = 0
96                )
97             : argpos(0), K(program, name.c_str())
98         {
99             config(queue,
100                     [smem_per_thread](size_t wgs){ return wgs * smem_per_thread; });
101         }
102 
103         /// Constructor. Extracts a backend::kernel instance from backend::program.
kernel(const cl::CommandQueue & queue,const cl::Program & program,const std::string & name,std::function<size_t (size_t)> smem)104         kernel(const cl::CommandQueue &queue, const cl::Program &program,
105                const std::string &name,
106                std::function<size_t(size_t)> smem
107                )
108             : argpos(0), K(program, name.c_str())
109         {
110             config(queue, smem);
111         }
112 
113         /// Adds an argument to the kernel.
114         template <class Arg>
push_arg(const Arg & arg)115         void push_arg(const Arg &arg) {
116             kernel_arg_pusher<Arg>::set(K, argpos++, arg);
117         }
118 
119         /// Adds an argument to the kernel.
120         template <typename T>
push_arg(device_vector<T> && arg)121         void push_arg(device_vector<T> &&arg) {
122             K.setArg(argpos++, arg.raw());
123         }
124 
125         /// Adds local memory to the kernel.
set_smem(size_t smem_per_thread)126         void set_smem(size_t smem_per_thread) {
127             cl::LocalSpaceArg smem = { smem_per_thread * workgroup_size() };
128             K.setArg(argpos++, smem);
129         }
130 
131         /// Adds local memory to the kernel.
132         template <class F>
set_smem(F && f)133         void set_smem(F &&f) {
134             cl::LocalSpaceArg smem = { f(workgroup_size()) };
135             K.setArg(argpos++, smem);
136         }
137 
138         /// Enqueue the kernel to the specified command queue.
operator ()(const cl::CommandQueue & q)139         void operator()(const cl::CommandQueue &q) {
140             q.enqueueNDRangeKernel(K, cl::NullRange, g_size, w_size);
141 #ifdef VEXCL_AMD_SI_WORKAROUND
142             q.enqueueNDRangeKernel(N, cl::NullRange, 1, cl::NullRange);
143 #endif
144             argpos = 0;
145         }
146 
147 #ifndef BOOST_NO_VARIADIC_TEMPLATES
148         /// Enqueue the kernel to the specified command queue with the given arguments
149         template <class Arg1, class... OtherArgs>
operator ()(const cl::CommandQueue & q,Arg1 && arg1,OtherArgs &&...other_args)150         void operator()(const cl::CommandQueue &q, Arg1 &&arg1, OtherArgs&&... other_args) {
151             push_arg(std::forward<Arg1>(arg1));
152 
153             (*this)(q, std::forward<OtherArgs>(other_args)...);
154         }
155 #endif
156 
157         /// Workgroup size.
workgroup_size() const158         size_t workgroup_size() const {
159             size_t threads = 1;
160             for(size_t i = 0; i < w_size.dimensions(); ++i)
161                 threads *= static_cast<const size_t*>(w_size)[i];
162             return threads;
163         }
164 
165         /// Standard number of workgroups to launch on a device.
num_workgroups(const cl::CommandQueue & q)166         static inline size_t num_workgroups(const cl::CommandQueue &q) {
167             // This is a simple heuristic-based estimate. More advanced technique may
168             // be employed later.
169             cl::Device d = q.getInfo<CL_QUEUE_DEVICE>();
170             return 8 * d.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
171         }
172 
173         /// The maximum number of threads per block, beyond which a launch of the kernel would fail.
max_threads_per_block(const cl::CommandQueue & q) const174         size_t max_threads_per_block(const cl::CommandQueue &q) const {
175             cl::Device d = q.getInfo<CL_QUEUE_DEVICE>();
176             return K.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(d);
177         }
178 
179         /// The size in bytes of shared memory per block available for this kernel.
max_shared_memory_per_block(const cl::CommandQueue & q) const180         size_t max_shared_memory_per_block(const cl::CommandQueue &q) const {
181             cl::Device d = q.getInfo<CL_QUEUE_DEVICE>();
182 
183             return static_cast<size_t>(d.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>())
184                  - static_cast<size_t>(K.getWorkGroupInfo<CL_KERNEL_LOCAL_MEM_SIZE>(d));
185         }
186 
187         /// Select best launch configuration for the given shared memory requirements.
config(const cl::CommandQueue & queue,std::function<size_t (size_t)> smem)188         kernel& config(const cl::CommandQueue &queue, std::function<size_t(size_t)> smem) {
189             cl::Device dev = queue.getInfo<CL_QUEUE_DEVICE>();
190 
191             size_t ws;
192 
193             if ( is_cpu(queue) ) {
194                 ws = 1;
195             } else {
196                 // Select workgroup size that would fit into the device.
197                 ws = dev.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0] / 2;
198 
199                 size_t max_ws   = max_threads_per_block(queue);
200                 size_t max_smem = max_shared_memory_per_block(queue);
201 
202                 // Reduce workgroup size until it satisfies resource requirements:
203                 while( (ws > max_ws) || (smem(ws) > max_smem) )
204                     ws /= 2;
205             }
206 
207             return config(num_workgroups(queue), ws);
208         }
209 
210         /// Set launch configuration.
config(ndrange blocks,ndrange threads,size_t shared_memory=0)211         kernel& config(ndrange blocks, ndrange threads, size_t shared_memory = 0) {
212             size_t dim = std::max(blocks.dimensions(), threads.dimensions());
213 
214             const size_t *b = blocks;
215             const size_t *t = threads;
216 
217             switch(dim) {
218                 case 3:
219                     g_size = ndrange(b[0] * t[0], b[1] * t[1], b[2] * t[2]);
220                     break;
221                 case 2:
222                     g_size = ndrange(b[0] * t[0], b[1] * t[1]);
223                     break;
224                 case 1:
225                 default:
226                     g_size = ndrange(b[0] * t[0]);
227                     break;
228             }
229 
230             w_size = threads;
231 
232             if (shared_memory) {
233                 cl::LocalSpaceArg smem = { shared_memory };
234                 K.setArg(argpos++, smem);
235             }
236 
237             return *this;
238         }
239 
240         /// Set launch configuration.
config(size_t blocks,size_t threads,size_t shared_memory=0)241         kernel& config(size_t blocks, size_t threads, size_t shared_memory = 0) {
242             return config(ndrange(blocks), ndrange(threads), shared_memory);
243         }
244 
preferred_work_group_size_multiple(const backend::command_queue & q) const245         size_t preferred_work_group_size_multiple(const backend::command_queue &q) const {
246             return K.getWorkGroupInfo<CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(
247                     q.getInfo<CL_QUEUE_DEVICE>()
248                     );
249         }
250 
251         /// Get reference to the underlying object.
get() const252         const cl::Kernel& get() const { return K; }
get()253         cl::Kernel& get() { return K; }
254 
255         /// Reset argument counter.
reset()256         void reset() {
257             argpos = 0;
258         }
259     private:
260         unsigned argpos;
261 
262         cl::Kernel K;
263 #ifdef VEXCL_AMD_SI_WORKAROUND
264         cl::Kernel N;
265 #endif
266 
267         backend::ndrange w_size;
268         backend::ndrange g_size;
269 };
270 
271 } // namespace opencl
272 } // namespace backend
273 } // namespace vex
274 
275 #endif
276