1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20 #include "utvm_graph_runtime.h"
21
22 #include <dlfcn.h>
23
24 #include <cassert>
25 #include <string>
26
27 #include "picojson.h"
28
29 namespace tvm {
30 namespace micro {
31 namespace {
32
// String-to-int helper for platforms (e.g. older NDK versions) where
// std::stoi(...) is not available. Mirrors strtol semantics: parse failure
// yields 0, leading whitespace is skipped, trailing junk is ignored.
int TVMSToI(const std::string& str) {
  return static_cast<int>(std::strtol(str.c_str(), nullptr, 10));
}
38
ParseOutputs(const picojson::array & joutputs,DynArray<NodeEntry> * outputs)39 void ParseOutputs(const picojson::array& joutputs, DynArray<NodeEntry>* outputs) {
40 outputs->resize(joutputs.size());
41 for (size_t i = 0; i < joutputs.size(); ++i) {
42 const auto& joutput_i = joutputs[i].get<picojson::array>();
43 (*outputs)[i] = NodeEntry{static_cast<uint32_t>(joutput_i[0].get<double>()),
44 static_cast<uint32_t>(joutput_i[1].get<double>()),
45 static_cast<uint32_t>(joutput_i[2].get<double>())};
46 }
47 }
48
// Parses the graph "attrs" object into `attr`: per-entry dtype strings,
// storage ids, and shapes.
//
// Each attribute value in the JSON is an array whose elements are either a
// type-tag string or the payload array itself; the `is<std::string>()`
// checks below skip the tag entries and only the array payloads are parsed.
// (Presumably the tags are picojson encodings like "list_str"/"list_int" —
// confirm against the graph JSON producer.)
void ParseAttrs(const picojson::object& jattr, GraphAttr* attr) {
  // parse dltype: one dtype string per node entry.
  for (const auto& jdltype_ : jattr.at("dltype").get<picojson::array>()) {
    if (jdltype_.is<std::string>()) {
      continue;  // skip the type-tag element
    }
    const auto& jdltype = jdltype_.get<picojson::array>();

    attr->dltype.resize(jdltype.size());
    for (size_t i = 0; i < jdltype.size(); ++i) {
      attr->dltype[i] = jdltype[i].get<std::string>();
    }
  }
  // parse storage_id: pool assignment per node entry (entries sharing an id
  // share a buffer — see SetupStorage).
  for (const auto& jstorage_id_ : jattr.at("storage_id").get<picojson::array>()) {
    if (jstorage_id_.is<std::string>()) {
      continue;  // skip the type-tag element
    }
    const auto& jstorage_id = jstorage_id_.get<picojson::array>();

    attr->storage_id.resize(jstorage_id.size());
    for (size_t i = 0; i < jstorage_id.size(); ++i) {
      attr->storage_id[i] = static_cast<int>(jstorage_id[i].get<double>());
    }
  }
  // parse shape: a list of int64 dimension lists, one per node entry.
  for (const auto& jshape_ : jattr.at("shape").get<picojson::array>()) {
    if (jshape_.is<std::string>()) {
      continue;  // skip the type-tag element
    }
    const auto& jshape = jshape_.get<picojson::array>();
    attr->shape.resize(jshape.size());
    for (size_t i = 0; i < jshape.size(); ++i) {
      const auto& jshape_i = jshape[i].get<picojson::array>();
      attr->shape[i].resize(jshape_i.size());
      for (size_t j = 0; j < jshape_i.size(); ++j) {
        attr->shape[i][j] = static_cast<int64_t>(jshape_i[j].get<double>());
      }
    }
  }
}
88
ParseNodes(const picojson::array & jnodes,DynArray<Node> * nodes)89 void ParseNodes(const picojson::array& jnodes, DynArray<Node>* nodes) {
90 nodes->resize(jnodes.size());
91 for (size_t i = 0; i < nodes->size(); ++i) {
92 auto* n = &(*nodes)[i];
93 const auto& jn = jnodes[i].get<picojson::object>();
94 n->op_type = jn.at("op").get<std::string>();
95 n->name = jn.at("name").get<std::string>();
96 const auto jinputs = jn.at("inputs").get<picojson::array>();
97 n->inputs.resize(jinputs.size());
98 for (size_t i = 0; i < jinputs.size(); ++i) {
99 const auto& jinput_i = jinputs[i].get<picojson::array>();
100 n->inputs[i] = NodeEntry{static_cast<uint32_t>(jinput_i[0].get<double>()),
101 static_cast<uint32_t>(jinput_i[1].get<double>()),
102 static_cast<uint32_t>(jinput_i[2].get<double>())};
103 }
104 const auto& jattrs_ = jn.find("attrs");
105 if (jattrs_ != jn.end()) {
106 const auto& jattrs = jattrs_->second.get<picojson::object>();
107 n->param.func_name = jattrs.at("func_name").get<std::string>();
108 n->param.num_inputs = TVMSToI(jattrs.at("num_inputs").get<std::string>());
109 n->param.num_outputs = TVMSToI(jattrs.at("num_outputs").get<std::string>());
110 n->param.flatten_data = TVMSToI(jattrs.at("flatten_data").get<std::string>());
111 }
112 }
113 }
114
ParseArgNodes(const picojson::array & jinput_nodes,DynArray<uint32_t> * input_nodes)115 void ParseArgNodes(const picojson::array& jinput_nodes, DynArray<uint32_t>* input_nodes) {
116 input_nodes->resize(jinput_nodes.size());
117 for (size_t i = 0; i < jinput_nodes.size(); ++i) {
118 (*input_nodes)[i] = static_cast<uint32_t>(jinput_nodes[i].get<double>());
119 }
120 }
121 } // namespace
122
~NDArray()123 NDArray::~NDArray() {}
124
Empty(const DynArray<int64_t> & shape,DLDataType dtype,DLContext ctx)125 NDArray NDArray::Empty(const DynArray<int64_t>& shape, DLDataType dtype, DLContext ctx) {
126 NDArray r;
127 int64_t nbytes = (dtype.bits * dtype.lanes + 7) / 8;
128 for (const auto& s : shape) {
129 nbytes *= s;
130 }
131
132 r.storage_ = std::shared_ptr<void>(
133 TVMBackendAllocWorkspace(static_cast<int>(ctx.device_type), static_cast<int>(ctx.device_id),
134 nbytes, dtype.code, dtype.bits),
135 [=](void* ptr) {
136 if (ptr) {
137 TVMBackendFreeWorkspace(ctx.device_type, ctx.device_id, ptr);
138 }
139 });
140 r.shape_ = shape;
141 r.dtype_ = dtype;
142 r.ctx_ = ctx;
143 return r;
144 }
145
CreateView(const DynArray<int64_t> & shape,DLDataType dtype)146 NDArray NDArray::CreateView(const DynArray<int64_t>& shape, DLDataType dtype) {
147 NDArray r;
148 r.storage_ = storage_;
149 r.shape_ = shape;
150 r.dtype_ = dtype;
151 r.ctx_ = ctx_;
152 return r;
153 }
154
ToDLTensor()155 DLTensor NDArray::ToDLTensor() {
156 DLTensor r;
157 r.data = storage_.get();
158 assert(r.data != nullptr);
159 r.ctx = ctx_;
160 r.ndim = shape_.size();
161 r.dtype = dtype_;
162 r.shape = shape_.data();
163 r.strides = nullptr;
164 r.byte_offset = 0;
165 return r;
166 }
167
GetDataSize(const DLTensor & arr)168 size_t GetDataSize(const DLTensor& arr) {
169 size_t size = 1;
170 for (size_t i = 0; i < static_cast<size_t>(arr.ndim); ++i) {
171 size *= static_cast<size_t>(arr.shape[i]);
172 }
173 size *= (arr.dtype.bits * arr.dtype.lanes + 7) / 8;
174 return size;
175 }
176
CopyFrom(DLTensor * src)177 void NDArray::CopyFrom(DLTensor* src) {
178 std::memcpy(storage_.get(),
179 reinterpret_cast<const uint8_t*>(src->data) + static_cast<size_t>(src->byte_offset),
180 GetDataSize(*src));
181 }
182
CopyTo(DLTensor * dst) const183 void NDArray::CopyTo(DLTensor* dst) const {
184 std::memcpy(reinterpret_cast<uint8_t*>(dst->data) + static_cast<size_t>(dst->byte_offset),
185 storage_.get(), GetDataSize(*dst));
186 }
187
// Loads the compiled operator library at `name` via dlopen and patches the
// library's TVM runtime-API hooks.
//
// Generated TVM modules expose writable function-pointer slots named
// "__<FuncName>"; each slot found is filled with the address of this
// process's implementation so the library can call back into the runtime.
// NOTE(review): the `assert(!dlerror())` checks compile out under NDEBUG,
// leaving dlopen failures undiagnosed in release builds.
DSOModule::DSOModule(const std::string& name) {
  dlerror();  // clear any stale dl error state before the call
  lib_handle_ = dlopen(name.c_str(), RTLD_LAZY | RTLD_LOCAL);
  assert(!dlerror());
  assert(lib_handle_ != nullptr);

// Look up the "__FuncName" slot in the library; if present, point it at the
// runtime's own FuncName. Missing slots are silently skipped.
#define TVM_INIT_CONTEXT_FUNC(FuncName) \
  if (auto* fp = reinterpret_cast<decltype(&FuncName)*>(GetSymbol("__" #FuncName))) { \
    *fp = FuncName; \
  }
  // Initialize the functions
  TVM_INIT_CONTEXT_FUNC(TVMAPISetLastError);
  TVM_INIT_CONTEXT_FUNC(TVMBackendAllocWorkspace);
  TVM_INIT_CONTEXT_FUNC(TVMBackendFreeWorkspace);
  TVM_INIT_CONTEXT_FUNC(TVMBackendParallelLaunch);
  // TODO(tulloch): implement these functions?
  // TVM_INIT_CONTEXT_FUNC(TVMFuncCall);
  // TVM_INIT_CONTEXT_FUNC(TVMBackendGetFuncFromEnv);
  // TVM_INIT_CONTEXT_FUNC(TVMBackendParallelBarrier);
#undef TVM_INIT_CONTEXT_FUNC
}
209
~DSOModule()210 DSOModule::~DSOModule() {
211 if (lib_handle_) {
212 dlclose(lib_handle_);
213 }
214 }
215
GetFunction(const std::string & name) const216 BackendPackedCFunc DSOModule::GetFunction(const std::string& name) const {
217 auto faddr = reinterpret_cast<BackendPackedCFunc>(GetSymbol(name.c_str()));
218 assert(faddr);
219 return faddr;
220 }
221
GetSymbol(const char * name) const222 void* DSOModule::GetSymbol(const char* name) const {
223 dlerror();
224 auto* f = dlsym(lib_handle_, name);
225 assert(!dlerror());
226 return f;
227 }
228
MicroGraphRuntime(const std::string & graph_json,DSOModule * module)229 MicroGraphRuntime::MicroGraphRuntime(const std::string& graph_json, DSOModule* module) {
230 assert(module);
231 module_ = module;
232 picojson::value v;
233 picojson::parse(v, graph_json);
234 ParseNodes(v.get<picojson::object>()["nodes"].get<picojson::array>(), &nodes_);
235 ParseArgNodes(v.get<picojson::object>()["arg_nodes"].get<picojson::array>(), &input_nodes_);
236 ParseArgNodes(v.get<picojson::object>()["node_row_ptr"].get<picojson::array>(), &node_row_ptr_);
237 ParseOutputs(v.get<picojson::object>()["heads"].get<picojson::array>(), &outputs_);
238 ParseAttrs(v.get<picojson::object>()["attrs"].get<picojson::object>(), &attrs_);
239 SetupStorage();
240 SetupOpExecs();
241 }
242
~MicroGraphRuntime()243 MicroGraphRuntime::~MicroGraphRuntime() {}
244
Run()245 void MicroGraphRuntime::Run() {
246 for (size_t i = 0; i < op_execs_.size(); ++i) {
247 if (op_execs_[i]) op_execs_[i]();
248 }
249 }
250
SetInput(int index,DLTensor * data_in)251 void MicroGraphRuntime::SetInput(int index, DLTensor* data_in) {
252 assert(static_cast<size_t>(index) < input_nodes_.size());
253 uint32_t eid = this->entry_id(input_nodes_[index], 0);
254 data_entry_[eid].CopyFrom(data_in);
255 }
256
CopyOutputTo(int index,DLTensor * data_out)257 void MicroGraphRuntime::CopyOutputTo(int index, DLTensor* data_out) {
258 assert(static_cast<size_t>(index) < outputs_.size());
259 uint32_t eid = this->entry_id(outputs_[index]);
260 const NDArray& data = data_entry_[eid];
261 data.CopyTo(data_out);
262 }
263
// Plans and allocates backing storage for every node entry.
//
// Entries sharing a storage_id (from the graph's optimization plan) reuse a
// single pool buffer sized to the largest tenant; each data entry is then a
// zero-copy view into its pool buffer. Only float32 entries are supported.
void MicroGraphRuntime::SetupStorage() {
  // Grab saved optimization plan from graph. Only "float32" is accepted;
  // every entry gets a fixed {kDLFloat, 32, 1} dtype.
  DynArray<DLDataType> vtype(attrs_.dltype.size());
  for (size_t i = 0; i < attrs_.dltype.size(); ++i) {
    assert(attrs_.dltype[i] == "float32");
    DLDataType ty;
    ty.bits = 32;
    ty.lanes = 1;
    ty.code = kDLFloat;
    vtype[i] = ty;
  }

  // Size and device type of each storage pool entry.
  std::vector<PoolEntry> pool_entry;
  // Find the maximum space size required by any entry assigned to each pool.
  for (size_t i = 0; i < attrs_.shape.size(); ++i) {
    int storage_id = attrs_.storage_id[i];
    // Use the fallback device if no device index is available.
    int device_type = static_cast<int>(ctx_.device_type);
    size_t size = 1;
    for (int64_t sz : attrs_.shape[i]) {
      size *= static_cast<size_t>(sz);
    }
    assert(storage_id >= 0);
    DLDataType t = vtype[i];
    size_t bits = t.bits * t.lanes;
    assert(bits % 8U == 0U || bits == 1U);
    size_t bytes = ((bits + 7U) / 8U) * size;

    uint32_t sid = static_cast<uint32_t>(storage_id);
    if (sid >= pool_entry.size()) {
      // New pool slot(s): size 0, device_type -1 means "not yet assigned".
      pool_entry.resize(sid + 1, {0, -1});
    } else {
      // All tenants of one pool must agree on the device.
      assert(pool_entry[sid].device_type == -1 || pool_entry[sid].device_type == device_type);
    }
    pool_entry[sid].size = std::max(pool_entry[sid].size, bytes);
    pool_entry[sid].device_type = device_type;
  }

  // Allocate the space. Pools are allocated as 1-D float32 arrays, so the
  // byte size is rounded up to a whole number of 4-byte elements.
  storage_pool_.resize(pool_entry.size());
  for (size_t i = 0; i < pool_entry.size(); ++i) {
    const auto& pit = pool_entry[i];
    DynArray<int64_t> shape(1);
    shape[0] = static_cast<int64_t>(pit.size + 3) / 4;
    storage_pool_[i] = NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx_);
  }

  // Assign the pooled entries. A unified memory pool is used to simplify
  // memory assignment for each node entry. The allocated memory on each device
  // is mapped to this pool. Each data entry becomes a shaped view into its
  // assigned pool buffer.
  data_entry_.resize(num_node_entries());
  for (size_t i = 0; i < data_entry_.size(); ++i) {
    int storage_id = attrs_.storage_id[i];
    assert(static_cast<size_t>(storage_id) < storage_pool_.size());
    data_entry_[i] = storage_pool_[storage_id].CreateView(attrs_.shape[i], vtype[i]);
  }
}
322
// Builds a callable that invokes the packed function for one graph node.
//
// `args` holds the node's input tensors followed by its outputs; the arrays
// of TVMValue/type-code handles passed to the packed function are built once
// here and captured (via shared_ptr) by the returned closure.
// NOTE(review): `num_inputs` is accepted but never used in this body.
std::function<void()> CreateTVMOp(const DSOModule& module, const TVMOpParam& param,
                                  const DynArray<DLTensor>& args, size_t num_inputs) {
  // Minimal local mirrors of the TVM runtime ABI types (see the commented-out
  // typedef markers): only the handle member / tensor-handle code are needed.
  typedef union {
    void* v_handle;
  } TVMValue;
  /*typedef*/ enum {
    kTVMDLTensorHandle = 7U,
  } /*TVMArgTypeCode*/;
  // Bundle everything the closure needs so one shared_ptr keeps it alive.
  struct OpArgs {
    DynArray<DLTensor> args;
    DynArray<TVMValue> arg_values;
    DynArray<int> arg_tcodes;
    DynArray<int64_t> shape_data;
  };

  std::shared_ptr<OpArgs> arg_ptr = std::make_shared<OpArgs>();
  arg_ptr->args = args;
  if (param.flatten_data) {
    arg_ptr->shape_data.resize(arg_ptr->args.size());
  }
  arg_ptr->arg_values.resize(arg_ptr->args.size());
  arg_ptr->arg_tcodes.resize(arg_ptr->args.size());
  for (size_t i = 0; i < arg_ptr->args.size(); ++i) {
    TVMValue v;
    // Handle points at OpArgs' own copy of the tensor, so it stays valid for
    // the closure's lifetime.
    DLTensor* t = &(arg_ptr->args[i]);
    v.v_handle = t;
    arg_ptr->arg_values[i] = v;
    arg_ptr->arg_tcodes[i] = kTVMDLTensorHandle;
    if (param.flatten_data) {
      // Collapse the tensor to 1-D: total element count becomes the single
      // dimension, and t->shape is re-pointed at shape_data[i].
      arg_ptr->shape_data[i] =
          std::accumulate(t->shape, t->shape + t->ndim, 1, std::multiplies<int64_t>());
      t->ndim = 1;
      t->shape = &(arg_ptr->shape_data[i]);
    }
  }

  if (param.func_name == "__nop") {
    return []() {};  // no-op node: nothing to execute
  } else if (param.func_name == "__copy") {
    assert(false);  // __copy nodes are not supported by this runtime
  }

  // Resolve the operator's packed function from the loaded DSO once, here.
  BackendPackedCFunc pf = module.GetFunction(param.func_name);
  assert(pf != nullptr);

  auto fexec = [arg_ptr, pf]() {
    assert(pf);
    (pf)(arg_ptr->arg_values.data(), arg_ptr->arg_tcodes.data(),
         static_cast<int>(arg_ptr->arg_values.size()));
  };
  return fexec;
}
375
// Binds one executor closure per operator node in op_execs_.
//
// Placeholder ("null") nodes get no executor; every other node must be a
// "tvm_op". The argument list handed to CreateTVMOp is the node's input
// tensors followed by its output tensors.
void MicroGraphRuntime::SetupOpExecs() {
  op_execs_.resize(nodes_.size());
  // setup the array and requirements.
  for (uint32_t nid = 0; nid < nodes_.size(); ++nid) {
    const auto& inode = nodes_[nid];
    if (inode.op_type == "null") continue;
    // inputs first, then num_outputs output slots.
    DynArray<DLTensor> args(inode.inputs.size() + inode.param.num_outputs);
    for (size_t i = 0; i < inode.inputs.size(); ++i) {
      const auto& e = inode.inputs[i];
      args[i] = data_entry_[this->entry_id(e)].ToDLTensor();
    }
    for (size_t index = 0; index < inode.param.num_outputs; ++index) {
      uint32_t eid = this->entry_id(nid, index);
      args[index + inode.inputs.size()] = data_entry_[eid].ToDLTensor();
    }
    assert(inode.op_type == "tvm_op");
    op_execs_[nid] = CreateTVMOp(*module_, inode.param, args, inode.inputs.size());
  }
}
395
396 } // namespace micro
397 } // namespace tvm
398