1 #ifndef VEXCL_BACKEND_OPENCL_KERNEL_HPP 2 #define VEXCL_BACKEND_OPENCL_KERNEL_HPP 3 4 /* 5 The MIT License 6 7 Copyright (c) 2012-2018 Denis Demidov <dennis.demidov@gmail.com> 8 9 Permission is hereby granted, free of charge, to any person obtaining a copy 10 of this software and associated documentation files (the "Software"), to deal 11 in the Software without restriction, including without limitation the rights 12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 copies of the Software, and to permit persons to whom the Software is 14 furnished to do so, subject to the following conditions: 15 16 The above copyright notice and this permission notice shall be included in 17 all copies or substantial portions of the Software. 18 19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 25 THE SOFTWARE. 26 */ 27 28 /** 29 * \file vexcl/backend/opencl/kernel.hpp 30 * \author Denis Demidov <dennis.demidov@gmail.com> 31 * \brief An abstraction over OpenCL compute kernel. 32 */ 33 34 #include <functional> 35 36 #include <vexcl/backend/opencl/defines.hpp> 37 #ifdef VEXCL_HAVE_OPENCL_HPP 38 # include <CL/opencl.hpp> 39 #else 40 # include <CL/cl2.hpp> 41 #endif 42 43 #include <vexcl/backend/opencl/compiler.hpp> 44 45 namespace vex { 46 namespace backend { 47 namespace opencl { 48 49 template <typename T> 50 struct kernel_arg_pusher { setvex::backend::opencl::kernel_arg_pusher51 static void set(cl::Kernel &k, unsigned argpos, const T &arg) { 52 k.setArg(argpos, arg); 53 } 54 }; 55 56 /// An abstraction over OpenCL compute kernel. 57 class kernel { 58 public: kernel()59 kernel() : argpos(0), w_size(0), g_size(0) {} 60 61 /// Constructor. Creates a backend::kernel instance from source. kernel(const cl::CommandQueue & queue,const std::string & src,const std::string & name,size_t smem_per_thread=0,const std::string & options="")62 kernel(const cl::CommandQueue &queue, 63 const std::string &src, 64 const std::string &name, 65 size_t smem_per_thread = 0, 66 const std::string &options = "" 67 ) : argpos(0) 68 { 69 auto P = build_sources(queue, src, options); 70 71 K = cl::Kernel(P, name.c_str()); 72 #ifdef VEXCL_AMD_SI_WORKAROUND 73 N = cl::Kernel(P, "__null_kernel"); 74 #endif 75 76 config(queue, 77 [smem_per_thread](size_t wgs){ return wgs * smem_per_thread; }); 78 } 79 80 /// Constructor. Creates a backend::kernel instance from source. kernel(const cl::CommandQueue & queue,const std::string & src,const std::string & name,std::function<size_t (size_t)> smem,const std::string & options="")81 kernel(const cl::CommandQueue &queue, 82 const std::string &src, const std::string &name, 83 std::function<size_t(size_t)> smem, 84 const std::string &options = "" 85 ) 86 : argpos(0), K(build_sources(queue, src, options), name.c_str()) 87 { 88 config(queue, smem); 89 } 90 91 /// Constructor. Extracts a backend::kernel instance from backend::program. kernel(const cl::CommandQueue & queue,const cl::Program & program,const std::string & name,size_t smem_per_thread=0)92 kernel(const cl::CommandQueue &queue, 93 const cl::Program &program, 94 const std::string &name, 95 size_t smem_per_thread = 0 96 ) 97 : argpos(0), K(program, name.c_str()) 98 { 99 config(queue, 100 [smem_per_thread](size_t wgs){ return wgs * smem_per_thread; }); 101 } 102 103 /// Constructor. Extracts a backend::kernel instance from backend::program. kernel(const cl::CommandQueue & queue,const cl::Program & program,const std::string & name,std::function<size_t (size_t)> smem)104 kernel(const cl::CommandQueue &queue, const cl::Program &program, 105 const std::string &name, 106 std::function<size_t(size_t)> smem 107 ) 108 : argpos(0), K(program, name.c_str()) 109 { 110 config(queue, smem); 111 } 112 113 /// Adds an argument to the kernel. 114 template <class Arg> push_arg(const Arg & arg)115 void push_arg(const Arg &arg) { 116 kernel_arg_pusher<Arg>::set(K, argpos++, arg); 117 } 118 119 /// Adds an argument to the kernel. 120 template <typename T> push_arg(device_vector<T> && arg)121 void push_arg(device_vector<T> &&arg) { 122 K.setArg(argpos++, arg.raw()); 123 } 124 125 /// Adds local memory to the kernel. set_smem(size_t smem_per_thread)126 void set_smem(size_t smem_per_thread) { 127 cl::LocalSpaceArg smem = { smem_per_thread * workgroup_size() }; 128 K.setArg(argpos++, smem); 129 } 130 131 /// Adds local memory to the kernel. 132 template <class F> set_smem(F && f)133 void set_smem(F &&f) { 134 cl::LocalSpaceArg smem = { f(workgroup_size()) }; 135 K.setArg(argpos++, smem); 136 } 137 138 /// Enqueue the kernel to the specified command queue. operator ()(const cl::CommandQueue & q)139 void operator()(const cl::CommandQueue &q) { 140 q.enqueueNDRangeKernel(K, cl::NullRange, g_size, w_size); 141 #ifdef VEXCL_AMD_SI_WORKAROUND 142 q.enqueueNDRangeKernel(N, cl::NullRange, 1, cl::NullRange); 143 #endif 144 argpos = 0; 145 } 146 147 #ifndef BOOST_NO_VARIADIC_TEMPLATES 148 /// Enqueue the kernel to the specified command queue with the given arguments 149 template <class Arg1, class... OtherArgs> operator ()(const cl::CommandQueue & q,Arg1 && arg1,OtherArgs &&...other_args)150 void operator()(const cl::CommandQueue &q, Arg1 &&arg1, OtherArgs&&... other_args) { 151 push_arg(std::forward<Arg1>(arg1)); 152 153 (*this)(q, std::forward<OtherArgs>(other_args)...); 154 } 155 #endif 156 157 /// Workgroup size. workgroup_size() const158 size_t workgroup_size() const { 159 size_t threads = 1; 160 for(size_t i = 0; i < w_size.dimensions(); ++i) 161 threads *= static_cast<const size_t*>(w_size)[i]; 162 return threads; 163 } 164 165 /// Standard number of workgroups to launch on a device. num_workgroups(const cl::CommandQueue & q)166 static inline size_t num_workgroups(const cl::CommandQueue &q) { 167 // This is a simple heuristic-based estimate. More advanced technique may 168 // be employed later. 169 cl::Device d = q.getInfo<CL_QUEUE_DEVICE>(); 170 return 8 * d.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(); 171 } 172 173 /// The maximum number of threads per block, beyond which a launch of the kernel would fail. max_threads_per_block(const cl::CommandQueue & q) const174 size_t max_threads_per_block(const cl::CommandQueue &q) const { 175 cl::Device d = q.getInfo<CL_QUEUE_DEVICE>(); 176 return K.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(d); 177 } 178 179 /// The size in bytes of shared memory per block available for this kernel. max_shared_memory_per_block(const cl::CommandQueue & q) const180 size_t max_shared_memory_per_block(const cl::CommandQueue &q) const { 181 cl::Device d = q.getInfo<CL_QUEUE_DEVICE>(); 182 183 return static_cast<size_t>(d.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>()) 184 - static_cast<size_t>(K.getWorkGroupInfo<CL_KERNEL_LOCAL_MEM_SIZE>(d)); 185 } 186 187 /// Select best launch configuration for the given shared memory requirements. config(const cl::CommandQueue & queue,std::function<size_t (size_t)> smem)188 kernel& config(const cl::CommandQueue &queue, std::function<size_t(size_t)> smem) { 189 cl::Device dev = queue.getInfo<CL_QUEUE_DEVICE>(); 190 191 size_t ws; 192 193 if ( is_cpu(queue) ) { 194 ws = 1; 195 } else { 196 // Select workgroup size that would fit into the device. 197 ws = dev.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0] / 2; 198 199 size_t max_ws = max_threads_per_block(queue); 200 size_t max_smem = max_shared_memory_per_block(queue); 201 202 // Reduce workgroup size until it satisfies resource requirements: 203 while( (ws > max_ws) || (smem(ws) > max_smem) ) 204 ws /= 2; 205 } 206 207 return config(num_workgroups(queue), ws); 208 } 209 210 /// Set launch configuration. config(ndrange blocks,ndrange threads,size_t shared_memory=0)211 kernel& config(ndrange blocks, ndrange threads, size_t shared_memory = 0) { 212 size_t dim = std::max(blocks.dimensions(), threads.dimensions()); 213 214 const size_t *b = blocks; 215 const size_t *t = threads; 216 217 switch(dim) { 218 case 3: 219 g_size = ndrange(b[0] * t[0], b[1] * t[1], b[2] * t[2]); 220 break; 221 case 2: 222 g_size = ndrange(b[0] * t[0], b[1] * t[1]); 223 break; 224 case 1: 225 default: 226 g_size = ndrange(b[0] * t[0]); 227 break; 228 } 229 230 w_size = threads; 231 232 if (shared_memory) { 233 cl::LocalSpaceArg smem = { shared_memory }; 234 K.setArg(argpos++, smem); 235 } 236 237 return *this; 238 } 239 240 /// Set launch configuration. config(size_t blocks,size_t threads,size_t shared_memory=0)241 kernel& config(size_t blocks, size_t threads, size_t shared_memory = 0) { 242 return config(ndrange(blocks), ndrange(threads), shared_memory); 243 } 244 preferred_work_group_size_multiple(const backend::command_queue & q) const245 size_t preferred_work_group_size_multiple(const backend::command_queue &q) const { 246 return K.getWorkGroupInfo<CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>( 247 q.getInfo<CL_QUEUE_DEVICE>() 248 ); 249 } 250 251 /// Get reference to the underlying object. get() const252 const cl::Kernel& get() const { return K; } get()253 cl::Kernel& get() { return K; } 254 255 /// Reset argument counter. reset()256 void reset() { 257 argpos = 0; 258 } 259 private: 260 unsigned argpos; 261 262 cl::Kernel K; 263 #ifdef VEXCL_AMD_SI_WORKAROUND 264 cl::Kernel N; 265 #endif 266 267 backend::ndrange w_size; 268 backend::ndrange g_size; 269 }; 270 271 } // namespace opencl 272 } // namespace backend 273 } // namespace vex 274 275 #endif 276