/**
 * Copyright (c) Glow Contributors. See CONTRIBUTORS file.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef GLOW_BACKENDS_OPENCL_OPENCL_H
#define GLOW_BACKENDS_OPENCL_OPENCL_H

#include "glow/Backend/Backend.h"
#include "glow/Backend/BackendUtils.h"
#include "glow/Backend/CompiledFunction.h"
#include "glow/Base/Tensor.h"
#include "glow/Base/Traits.h"
#include "glow/ExecutionContext/ExecutionContext.h"
#include "glow/Graph/Node.h"
26 #include "glow/IR/IR.h"
27 
28 #include "llvm/ADT/ArrayRef.h"
29 
30 #include <unordered_map>
31 
32 #if defined(__APPLE__) || defined(__MACOSX)
33 #include "OpenCL/opencl.h"
34 #else
35 #include <CL/cl.h>
36 #endif

namespace glow {

class ConvolutionInst;
class Value;
namespace runtime {
struct OpenCLDeviceBindings;
}

/// A helper struct with information about kernel launches.
struct KernelLaunch {
  /// Kernel that was launched.
  cl_kernel kernel_;
  /// The name of the kernel that was launched.
  std::string name_;
  /// The type of the kernel that was launched.
  std::string type_;
  /// Event associated with the start of the kernel.
  /// Used only when profiling is enabled.
  cl_event event_;
  /// Constructor to be used when launching one of Glow's CL kernels.
  KernelLaunch(cl_kernel kernel, std::string name, std::string type,
               cl_event event)
      : kernel_(kernel), name_(name), type_(type), event_(event) {}
  /// Constructor to be used when launching an "external" CL kernel, e.g. one
  /// provided by a library such as CLBlast.
  KernelLaunch(const std::string &name, std::string type, cl_event event)
      : kernel_(nullptr), name_(name), type_(type), event_(event) {}
};
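
// Illustrative usage (the kernel name and type strings here are arbitrary):
// after a kernel is enqueued with profiling enabled, a record such as
//   kernelLaunches.emplace_back(kernel, "conv_fwd", "ConvolutionInst", event);
// is kept so that the cl_event can later be queried with
// clGetEventProfilingInfo() to recover start/end timestamps for TraceEvents.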

/// Add a macro definition with an integer value to the set of options.
template <typename T>
static void addIntOption(std::vector<std::string> &options,
                         const std::string &name, const T value) {
  options.push_back("-D" + name + "=" + std::to_string(value));
}
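
// Illustrative usage (macro name chosen arbitrarily):
//   std::vector<std::string> options;
//   addIntOption(options, "KERNEL_SIZE", 3); // appends "-DKERNEL_SIZE=3"
// The collected options are later joined into a single string and passed to
// the OpenCL compiler when the program is built.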

using ManualEventMap =
    std::map<std::string, std::pair<Placeholder *, const TraceInfo::Event *>>;

/// A Glow IR function compiled for OpenCL.
class OpenCLFunction final : public CompiledFunction {
  /// A helper type representing a key for the program's cache.
  /// Each compiled program is uniquely identified by its source code, the set
  /// of compiler options it was built with, and the device and context it was
  /// compiled for.
  using ProgramKey = std::tuple<const std::string, const std::string,
                                const cl_device_id, const cl_context>;
  struct ProgramKeyHash {
    std::size_t operator()(const ProgramKey &K) const noexcept {
      return llvm::hash_combine(std::get<0>(K), std::get<1>(K), std::get<2>(K));
    }
  };
  /// The IR to be executed.
  std::unique_ptr<IRFunction> F_;

  /// Cache of compiled programs.
  /// The same source code can be compiled with different options (e.g. with a
  /// different set of macro definitions) and/or for a different device, and
  /// each combination results in a distinct program.
  std::unordered_map<ProgramKey, cl_program, ProgramKeyHash> programsCache_;
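
  // For example, the same kernel source built once with "-DSIZE=8" and once
  // with "-DSIZE=16" produces two distinct ProgramKey entries, and thus two
  // separately cached cl_program objects, even on the same device and
  // context. Note that ProgramKeyHash above omits the cl_context element from
  // the hash; key equality still compares all four tuple elements.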

  /// Whether kernel-level profiling (autoInstrumentation) is enabled.
  bool kernelProfiling_{false};
  /// Manual trace events.
  ManualEventMap manualTraceEvents_;

public:
  /// Ctor.
  explicit OpenCLFunction(std::unique_ptr<IRFunction> F,
                          runtime::RuntimeBundle &&bundle, TraceInfo traceInfo);

  /// @name CompiledFunction interface
  ///@{
  ~OpenCLFunction() override;

  Error execute(ExecutionContext *context) override;

  void freeCompilationResources() override;

  /// Collects constants for runtime.
  void collectConstants(const Module *module) override;

  /// \returns the backend used to compile this function.
  std::string getCompileBackendName() const override { return "OpenCL"; }
  ///@}

  /// \returns the IR function pointer.
  IRFunction *getIR() { return F_.get(); }

  /// Create a program from the \p source using the provided \p options.
  cl_program createProgram(const std::string &source,
                           const std::vector<std::string> &options,
                           cl_command_queue queue);

  /// \returns metadata about manual TraceEvents defined in this function.
  ManualEventMap &getManualTraceEvents() { return manualTraceEvents_; }

private:
  /// \returns the directory of cached pre-built programs for the given
  /// device, as configured by the user.
  std::string deviceProgramCacheDir(cl_device_id deviceId);

  /// \returns the (hashed) file name (without directory) of a cached
  /// pre-built program for the given source and set of build options.
  std::string diskCacheProgramFileName(cl_device_id deviceId,
                                       const std::string &source,
                                       const std::string &options);

  /// Tries to load a program with the given (hashed) filename from the disk
  /// cache.
  /// \returns a pointer to the program if found, nullptr otherwise.
  cl_program loadProgramFromDiskCache(std::string cacheDirectory,
                                      std::string programFileName,
                                      cl_context ctx, cl_device_id device);

  /// Save the given program to the disk cache.
  void saveProgramToDiskCache(std::string cacheDirectory,
                              std::string programFilename, cl_program program,
                              cl_context ctx, cl_device_id deviceId);

  /// Copy the value from a device to a provided buffer.
  /// \returns the number of copied bytes.
  uint64_t copyValueFromDevice(const Value *v,
                               runtime::OpenCLDeviceBindings *devBindings,
                               void *buf = nullptr);
  /// Copy a value from the provided buffer to the device.
  /// \returns the number of copied bytes.
  uint64_t copyValueToDevice(const Value *v,
                             runtime::OpenCLDeviceBindings *devBindings,
                             void *buf = nullptr);
  /// Fill the device \p buffer with a given \p value.
  /// \param len number of buffer elements to be filled by the \p value.
  /// Elements are considered to be of the type described by \p elemKind.
  void fillBuffer(cl_mem buffer, uint64_t start, uint64_t len, float value,
                  ElemKind elemKind,
                  runtime::OpenCLDeviceBindings *devBindings);

  /// Execute a convolution instruction which uses the NCHW format.
  void executeNCHWConvolution(const ConvolutionInst *CC,
                              ExecutionContext *executionContext,
                              runtime::OpenCLDeviceBindings *devBindings);
  /// Allocate a device buffer of the required \p size.
  cl_mem allocDeviceBuffer(uint64_t size, cl_context clContext);
  /// Frees a device buffer.
  void freeDeviceBuffer(cl_mem buf);

  /// Create a kernel with a given \p name from a \p program.
  /// If \p program is nullptr, try to find the kernel with the given \p name
  /// in any of the compiled programs.
  cl_kernel createKernel(const std::string &name, cl_program program);

  /// Enqueue a \p kernel on a provided \p commands queue.
  void enqueueKernel(llvm::StringRef name, cl_command_queue commands,
                     cl_kernel kernel, cl_device_id device,
                     llvm::ArrayRef<size_t> global,
                     std::vector<KernelLaunch> &kernelLaunches);
  /// Enqueue a \p kernel on a provided \p commands queue using the specified
  /// \p global and \p local work sizes.
  void enqueueKernel(llvm::StringRef name, cl_command_queue commands,
                     cl_kernel kernel, cl_device_id device,
                     llvm::ArrayRef<size_t> global,
                     llvm::ArrayRef<size_t> local,
                     std::vector<KernelLaunch> &kernelLaunches);

  /// Load outputs from the device into \p bindings.
  void updatePlaceholders(PlaceholderBindings *bindings,
                          runtime::OpenCLDeviceBindings *devBindings);

  /// Read trace events out of this function and write them into \p bindings.
  void translateTraceEventsCL(ExecutionContext *context,
                              runtime::OpenCLDeviceBindings *devBindings);
};

/// This is the OpenCL backend.
class OCLBackend final : public BackendUsingGlowIR {
public:
  /// Ctor.
  OCLBackend() = default;

  /// @name Backend methods.
  /// This is the implementation of the Backend interface.
  ///@{
  ~OCLBackend() override = default;

  std::string getBackendName() const override {
    return Named::getName().empty() ? getName() : Named::getName().str();
  }
  static std::string getName() { return "OpenCL"; }
  static unsigned numDevices() { return 1; }

  std::unique_ptr<CompiledFunction>
  compileIR(std::unique_ptr<IRFunction> IR) const override;

  Expected<std::unique_ptr<CompiledFunction>>
  compile(Function *F, const BackendOptions &opts) const override;

  Expected<bool> transformPostLowering(
      Function *F, CompilationContext &cctx,
      const glow::runtime::DeviceInfo *devInfo) const override;

  bool isOpSupported(const NodeInfo &NI) const override;

  bool verify(const Function &F, bool verbose = true) const override;
  bool verify(const IRFunction &IR) const override;

  TensorLayoutCommon &getTensorLayoutRequirements() const override;

  bool shouldLower(const Node *N) const override {
    // Grouped convolution is supported by the OpenCL slow convolution kernel,
    // so convolution nodes are not lowered.
    if (N->getKind() == Kinded::Kind::ConvolutionNodeKind) {
      return false;
    }
    // Do not lower ReLU to max, but let it pass to the backend where we
    // can implement it with a unary max(0, x) kernel. This also enables fusing
    // convolution with ReLU.
    if (N->getKind() == Kinded::Kind::ReluNodeKind) {
      return false;
    }
    return true;
  }

  /// Size of each TraceEvent (for manual events).
  size_t getTraceEventDataSize() const override { return sizeof(uint64_t); }

  runtime::DeviceManager *
  createDeviceManager(const runtime::DeviceConfig &deviceConfig) override;

  /// \returns whether the backend supports fusing \p activation into \p parent.
  bool supportsFusedActivation(Node *parent, Node *activation) const override {
    // Only support convolution+relu fusions for now.
    bool V = parent->getKind() == Kinded::Kind::ConvolutionNodeKind &&
             activation->getKind() == Kinded::Kind::ReluNodeKind;
    return V;
  }

private:
  /// Parses the graph \p F and builds a TraceInfo structure from any
  /// TraceEventNodes found.
  TraceInfo buildManualTraceInfo(Function *F) const;

  /// Enables kernel profiling to generate TraceEvents after the run.
  void autoInstrument(TraceInfo &traceInfo, IRFunction *IR) const;

  /// @}
};

namespace runtime {
/// OpenCLDeviceBindings inherits from DeviceBindings; it contains the per-run,
/// device-specific information used to run a compiled function on a specific
/// device.
struct OpenCLDeviceBindings : DeviceBindings {
  OpenCLDeviceBindings(cl_mem buffer, cl_command_queue commands,
                       cl_device_id device, cl_context ctx, cl_program prog)
      : DeviceBindings(OCLBackend::getName()), deviceBuffer{buffer},
        commandQueue{commands}, deviceId{device}, context{ctx}, program{prog} {}

  /// CL memory buffer. Currently this contains both mutable and immutable
  /// weights; the buffer is allocated once when the network is added.
  cl_mem deviceBuffer;

  /// CL compute command queue. A per-run queue for the specific device.
  cl_command_queue commandQueue;

  /// CL compute device id. Identifies the CL device to be used.
  cl_device_id deviceId;

  /// CL compute context. Identifies the context on the CL device in which the
  /// computation will take place.
  cl_context context;

  /// CL program which was compiled at addNetwork.
  cl_program program;

  /// A list of kernels and their associated events.
  std::vector<KernelLaunch> kernelLaunches;
};
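
// Illustrative sketch (variable names are placeholders): a device manager
// typically constructs one bindings object per run, e.g.
//   auto clBindings = std::make_unique<OpenCLDeviceBindings>(
//       deviceBuffer, commandQueue, deviceId, context, program);
// and attaches it to the ExecutionContext so that OpenCLFunction::execute()
// can look up the CL buffer, queue, device, context, and program for that run.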
} // namespace runtime
} // namespace glow

#endif // GLOW_BACKENDS_OPENCL_OPENCL_H