1 /**
2 * Copyright (c) Glow Contributors. See CONTRIBUTORS file.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #ifndef GLOW_BACKENDS_OPENCL_OPENCL_H
17 #define GLOW_BACKENDS_OPENCL_OPENCL_H
18
19 #include "glow/Backend/Backend.h"
20 #include "glow/Backend/BackendUtils.h"
21 #include "glow/Backend/CompiledFunction.h"
22 #include "glow/Base/Tensor.h"
23 #include "glow/Base/Traits.h"
24 #include "glow/ExecutionContext/ExecutionContext.h"
25 #include "glow/Graph/Node.h"
26 #include "glow/IR/IR.h"
27
#include "llvm/ADT/ArrayRef.h"

#include <cstddef>
#include <cstdint>
#include <map>
#include <memory>
#include <string>
#include <tuple>
#include <unordered_map>
#include <utility>
#include <vector>
31
32 #if defined(__APPLE__) || defined(__MACOSX)
33 #include "OpenCL/opencl.h"
34 #else
35 #include <CL/cl.h>
36 #endif
37
38 namespace glow {
39
40 class ConvolutionInst;
41 class Value;
42 namespace runtime {
43 struct OpenCLDeviceBindings;
44 }
45
46 /// A helper struct with information about kernels launches.
47 struct KernelLaunch {
48 /// Kernel that was launched.
49 cl_kernel kernel_;
50 /// The name of the kernel that was launched.
51 std::string name_;
52 /// The type of the kernel that was launched.
53 std::string type_;
54 /// Event associated with the start of the kernel.
55 /// Used only when profiling is enabled.
56 cl_event event_;
57 /// Constructor to be used by launching Glow's CL kernels.
KernelLaunchKernelLaunch58 KernelLaunch(cl_kernel kernel, std::string name, std::string type,
59 cl_event event)
60 : kernel_(kernel), name_(name), type_(type), event_(event) {}
61 /// Constructor to be used when launching an "external" CL kernel, e.g.
62 /// provided by such libraries like CLBlast, etc.
KernelLaunchKernelLaunch63 KernelLaunch(const std::string &name, std::string type, cl_event event)
64 : kernel_(nullptr), name_(name), type_(type), event_(event) {}
65 };
66
/// Adds a macro definition with an integer \p value under the given \p name
/// to the set of compiler \p options, in the form "-Dname=value".
template <typename T>
static void addIntOption(std::vector<std::string> &options,
                         const std::string &name, const T value) {
  options.emplace_back("-D" + name + "=" + std::to_string(value));
}
73
74 using ManualEventMap =
75 std::map<std::string, std::pair<Placeholder *, const TraceInfo::Event *>>;
76
/// A Glow IR function compiled for OpenCL.
class OpenCLFunction final : public CompiledFunction {
  /// A helper type representing a key for the program's cache.
  /// Each compiled program is uniquely identified by its source code, the set
  /// of compiler options that were used and the device/context it was
  /// compiled for.
  using ProgramKey = std::tuple<const std::string, const std::string,
                                const cl_device_id, const cl_context>;
  struct ProgramKeyHash {
    /// NOTE: only source, options and device (elements 0..2) are folded into
    /// the hash; the context (element 3) is left to the key equality check,
    /// so keys differing only by context land in the same bucket.
    std::size_t operator()(const ProgramKey &K) const noexcept {
      return llvm::hash_combine(std::get<0>(K), std::get<1>(K), std::get<2>(K));
    }
  };
  /// The IR to be executed (owned by this function).
  std::unique_ptr<IRFunction> F_;

  /// Cache of compiled programs.
  /// The same source code can be compiled with different options (e.g. with a
  /// different set of macro definitions) and/or for a different device and
  /// would result in different programs.
  std::unordered_map<ProgramKey, cl_program, ProgramKeyHash> programsCache_;

  /// Whether kernel-level profiling (autoInstrumentation) is enabled.
  bool kernelProfiling_{false};
  /// Manual trace events, keyed by event name.
  ManualEventMap manualTraceEvents_;

public:
  /// Ctor.
  explicit OpenCLFunction(std::unique_ptr<IRFunction> F,
                          runtime::RuntimeBundle &&bundle, TraceInfo traceInfo);

  /// @name CompiledFunction interface
  ///@{
  ~OpenCLFunction() override;

  /// Executes the compiled function using the given execution \p context.
  Error execute(ExecutionContext *context) override;

  /// Frees resources needed only for compilation.
  void freeCompilationResources() override;

  /// Collects constants for runtime.
  void collectConstants(const Module *module) override;

  /// \returns the backend used to compile this function.
  std::string getCompileBackendName() const override { return "OpenCL"; }
  ///@}

  /// \returns a non-owning pointer to the IR function.
  IRFunction *getIR() { return F_.get(); }

  /// Create a program from the \p source using provided \p options.
  cl_program createProgram(const std::string &source,
                           const std::vector<std::string> &options,
                           cl_command_queue queue);

  /// \returns metadata about manual TraceEvents defined in this function.
  ManualEventMap &getManualTraceEvents() { return manualTraceEvents_; }

private:
  /// Returns the directory of cached pre-built programs for the given device.
  /// \returns the directory as given by the user.
  std::string deviceProgramCacheDir(cl_device_id deviceId);

  /// Returns the (hashed) file name of a cached pre-built program for the
  /// given source and set of build options.
  /// \returns the filename (without directory).
  std::string diskCacheProgramFileName(cl_device_id deviceId,
                                       const std::string &source,
                                       const std::string &options);

  /// (Tries to) load a program with the given (hashed) filename
  /// from the disk cache.
  /// \returns pointer to the program, if found, nullptr otherwise.
  cl_program loadProgramFromDiskCache(std::string cacheDirectory,
                                      std::string programFileName,
                                      cl_context ctx, cl_device_id device);

  /// Save the given program to the disk cache.
  void saveProgramToDiskCache(std::string cacheDirectory,
                              std::string programFilename, cl_program program,
                              cl_context ctx, cl_device_id deviceId);

  /// Copy the value from a device to a provided buffer \p buf (optional).
  /// \returns number of copied bytes.
  uint64_t copyValueFromDevice(const Value *v,
                               runtime::OpenCLDeviceBindings *devBindings,
                               void *buf = nullptr);
  /// Copy value from the provided buffer \p buf (optional) to the device.
  /// \returns number of copied bytes.
  uint64_t copyValueToDevice(const Value *v,
                             runtime::OpenCLDeviceBindings *devBindings,
                             void *buf = nullptr);
  /// Fill the device \p buffer with a given \p value, starting at element
  /// \p start.
  /// \param len number of buffer elements to be filled by the \p value.
  /// Elements are considered to be of the type described by \p elemKind.
  void fillBuffer(cl_mem buffer, uint64_t start, uint64_t len, float value,
                  ElemKind elemKind,
                  runtime::OpenCLDeviceBindings *devBindings);

  /// Execute a convolution instruction \p CC which uses the NCHW format.
  void executeNCHWConvolution(const ConvolutionInst *CC,
                              ExecutionContext *executionContext,
                              runtime::OpenCLDeviceBindings *devBindings);
  /// Allocate a device buffer of required \p size.
  cl_mem allocDeviceBuffer(uint64_t size, cl_context clContext);
  /// Frees a device buffer.
  void freeDeviceBuffer(cl_mem buf);

  /// Create kernel with a given \p name from a \p program.
  /// If \p program is nullptr, try to find the kernel with a given \p name
  /// in any of the compiled programs.
  cl_kernel createKernel(const std::string &name, cl_program program);

  /// Enqueue a \p kernel on a provided \p commands queue.
  void enqueueKernel(llvm::StringRef name, cl_command_queue commands,
                     cl_kernel kernel, cl_device_id device,
                     llvm::ArrayRef<size_t> global,
                     std::vector<KernelLaunch> &kernelLaunches);
  /// Enqueue a \p kernel on a provided \p commands queue using specified \p
  /// global and \p local work sizes.
  void enqueueKernel(llvm::StringRef name, cl_command_queue commands,
                     cl_kernel kernel, cl_device_id device,
                     llvm::ArrayRef<size_t> global,
                     llvm::ArrayRef<size_t> local,
                     std::vector<KernelLaunch> &kernelLaunches);

  /// Load outputs from the device into \p bindings.
  void updatePlaceholders(PlaceholderBindings *bindings,
                          runtime::OpenCLDeviceBindings *devBindings);

  /// Read trace events out of this function and write them into \p context.
  void translateTraceEventsCL(ExecutionContext *context,
                              runtime::OpenCLDeviceBindings *devBindings);
};
210
211 /// This is the OpenCL backend.
212 class OCLBackend final : public BackendUsingGlowIR {
213 public:
214 /// Ctor.
215 OCLBackend() = default;
216
217 /// @name Backend methods.
218 /// This is the implementation of the Backend interface.
219 ///@{
220 ~OCLBackend() override = default;
221
getBackendName()222 std::string getBackendName() const override {
223 return Named::getName().empty() ? getName() : Named::getName().str();
224 }
getName()225 static std::string getName() { return "OpenCL"; }
numDevices()226 static unsigned numDevices() { return 1; }
227
228 std::unique_ptr<CompiledFunction>
229 compileIR(std::unique_ptr<IRFunction> IR) const override;
230
231 Expected<std::unique_ptr<CompiledFunction>>
232 compile(Function *F, const BackendOptions &opts) const override;
233
234 Expected<bool> transformPostLowering(
235 Function *F, CompilationContext &cctx,
236 const glow::runtime::DeviceInfo *devInfo) const override;
237
238 bool isOpSupported(const NodeInfo &NI) const override;
239
240 bool verify(const Function &F, bool verbose = true) const override;
241 bool verify(const IRFunction &IR) const override;
242
243 TensorLayoutCommon &getTensorLayoutRequirements() const override;
244
shouldLower(const Node * N)245 bool shouldLower(const Node *N) const override {
246 // The group convolution is supported in OpenCL slow convolution kernel.
247 if (N->getKind() == Kinded::Kind::ConvolutionNodeKind) {
248 return false;
249 }
250 // Do not lower ReLU to max, but let it pass to the backend where we
251 // can implement it with a unary max(0, x) kernel. This also enables fusing
252 // convolution with ReLU.
253 if (N->getKind() == Kinded::Kind::ReluNodeKind) {
254 return false;
255 }
256 return true;
257 }
258
259 /// Size of each TraceEvent (for manual events).
getTraceEventDataSize()260 size_t getTraceEventDataSize() const override { return sizeof(uint64_t); }
261
262 runtime::DeviceManager *
263 createDeviceManager(const runtime::DeviceConfig &deviceConfig) override;
264
265 /// \returns whether the backend supports fusing \p activation into \p parent.
supportsFusedActivation(Node * parent,Node * activation)266 bool supportsFusedActivation(Node *parent, Node *activation) const override {
267 // Only support convolution+relu fusions for now.
268 bool V = parent->getKind() == Kinded::Kind::ConvolutionNodeKind &&
269 activation->getKind() == Kinded::Kind::ReluNodeKind;
270 return V;
271 }
272
273 private:
274 /// Parses the graph \F and builds a TraceInfo structure from any found
275 /// TraceEventNodes.
276 TraceInfo buildManualTraceInfo(Function *F) const;
277
278 /// Enables kernel profiling to generate TraceEvents after run.
279 void autoInstrument(TraceInfo &traceInfo, IRFunction *IR) const;
280
281 /// @}
282 };
283
284 namespace runtime {
/// OpenCLDeviceBindings inherits from DeviceBindings; it contains per-run
/// device specific information used to run a compiled function on a specific
/// device.
struct OpenCLDeviceBindings : DeviceBindings {
  /// Ctor. Binds the per-run CL resources (memory buffer, command queue,
  /// device, context and program) to this execution.
  OpenCLDeviceBindings(cl_mem buffer, cl_command_queue commands,
                       cl_device_id device, cl_context ctx, cl_program prog)
      : DeviceBindings(OCLBackend::getName()), deviceBuffer{buffer},
        commandQueue{commands}, deviceId{device}, context{ctx}, program{prog} {}

  /// CL memory buffer. Currently this contains both mutable and immutable
  /// weights; the buffer is allocated once when the network is added.
  cl_mem deviceBuffer;

  /// CL compute command queue. A per-run queue for the specific device.
  cl_command_queue commandQueue;

  /// CL compute device id. Identifies the CL device to be used.
  cl_device_id deviceId;

  /// CL compute context. Identifies a context on the CL device the
  /// computation will take place in.
  cl_context context;

  /// CL program which was compiled at addNetwork.
  cl_program program;

  /// A list of kernels and their associated events.
  std::vector<KernelLaunch> kernelLaunches;
};
317 } // namespace runtime
318 } // namespace glow
319
320 #endif // GLOW_BACKENDS_OPENCL_OPENCL_H
321