/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
19 
#include "utvm_graph_runtime.h"

#include <dlfcn.h>

#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <cstring>
#include <functional>
#include <memory>
#include <numeric>
#include <string>
#include <vector>

#include "picojson.h"
28 
29 namespace tvm {
30 namespace micro {
31 namespace {
32 
// Locale-free decimal string-to-int conversion.
// Fallback for platforms (e.g. older NDK versions) where std::stoi(...) is
// not available.
int TVMSToI(const std::string& str) {
  char* unused_end = nullptr;  // required by strtol's signature, not inspected
  const long parsed = std::strtol(str.c_str(), &unused_end, 10);
  return static_cast<int>(parsed);
}
38 
ParseOutputs(const picojson::array & joutputs,DynArray<NodeEntry> * outputs)39 void ParseOutputs(const picojson::array& joutputs, DynArray<NodeEntry>* outputs) {
40   outputs->resize(joutputs.size());
41   for (size_t i = 0; i < joutputs.size(); ++i) {
42     const auto& joutput_i = joutputs[i].get<picojson::array>();
43     (*outputs)[i] = NodeEntry{static_cast<uint32_t>(joutput_i[0].get<double>()),
44                               static_cast<uint32_t>(joutput_i[1].get<double>()),
45                               static_cast<uint32_t>(joutput_i[2].get<double>())};
46   }
47 }
48 
ParseAttrs(const picojson::object & jattr,GraphAttr * attr)49 void ParseAttrs(const picojson::object& jattr, GraphAttr* attr) {
50   // parse dltype
51   for (const auto& jdltype_ : jattr.at("dltype").get<picojson::array>()) {
52     if (jdltype_.is<std::string>()) {
53       continue;
54     }
55     const auto& jdltype = jdltype_.get<picojson::array>();
56 
57     attr->dltype.resize(jdltype.size());
58     for (size_t i = 0; i < jdltype.size(); ++i) {
59       attr->dltype[i] = jdltype[i].get<std::string>();
60     }
61   }
62   for (const auto& jstorage_id_ : jattr.at("storage_id").get<picojson::array>()) {
63     if (jstorage_id_.is<std::string>()) {
64       continue;
65     }
66     const auto& jstorage_id = jstorage_id_.get<picojson::array>();
67 
68     attr->storage_id.resize(jstorage_id.size());
69     for (size_t i = 0; i < jstorage_id.size(); ++i) {
70       attr->storage_id[i] = static_cast<int>(jstorage_id[i].get<double>());
71     }
72   }
73   for (const auto& jshape_ : jattr.at("shape").get<picojson::array>()) {
74     if (jshape_.is<std::string>()) {
75       continue;
76     }
77     const auto& jshape = jshape_.get<picojson::array>();
78     attr->shape.resize(jshape.size());
79     for (size_t i = 0; i < jshape.size(); ++i) {
80       const auto& jshape_i = jshape[i].get<picojson::array>();
81       attr->shape[i].resize(jshape_i.size());
82       for (size_t j = 0; j < jshape_i.size(); ++j) {
83         attr->shape[i][j] = static_cast<int64_t>(jshape_i[j].get<double>());
84       }
85     }
86   }
87 }
88 
ParseNodes(const picojson::array & jnodes,DynArray<Node> * nodes)89 void ParseNodes(const picojson::array& jnodes, DynArray<Node>* nodes) {
90   nodes->resize(jnodes.size());
91   for (size_t i = 0; i < nodes->size(); ++i) {
92     auto* n = &(*nodes)[i];
93     const auto& jn = jnodes[i].get<picojson::object>();
94     n->op_type = jn.at("op").get<std::string>();
95     n->name = jn.at("name").get<std::string>();
96     const auto jinputs = jn.at("inputs").get<picojson::array>();
97     n->inputs.resize(jinputs.size());
98     for (size_t i = 0; i < jinputs.size(); ++i) {
99       const auto& jinput_i = jinputs[i].get<picojson::array>();
100       n->inputs[i] = NodeEntry{static_cast<uint32_t>(jinput_i[0].get<double>()),
101                                static_cast<uint32_t>(jinput_i[1].get<double>()),
102                                static_cast<uint32_t>(jinput_i[2].get<double>())};
103     }
104     const auto& jattrs_ = jn.find("attrs");
105     if (jattrs_ != jn.end()) {
106       const auto& jattrs = jattrs_->second.get<picojson::object>();
107       n->param.func_name = jattrs.at("func_name").get<std::string>();
108       n->param.num_inputs = TVMSToI(jattrs.at("num_inputs").get<std::string>());
109       n->param.num_outputs = TVMSToI(jattrs.at("num_outputs").get<std::string>());
110       n->param.flatten_data = TVMSToI(jattrs.at("flatten_data").get<std::string>());
111     }
112   }
113 }
114 
ParseArgNodes(const picojson::array & jinput_nodes,DynArray<uint32_t> * input_nodes)115 void ParseArgNodes(const picojson::array& jinput_nodes, DynArray<uint32_t>* input_nodes) {
116   input_nodes->resize(jinput_nodes.size());
117   for (size_t i = 0; i < jinput_nodes.size(); ++i) {
118     (*input_nodes)[i] = static_cast<uint32_t>(jinput_nodes[i].get<double>());
119   }
120 }
121 }  // namespace
122 
~NDArray()123 NDArray::~NDArray() {}
124 
Empty(const DynArray<int64_t> & shape,DLDataType dtype,DLContext ctx)125 NDArray NDArray::Empty(const DynArray<int64_t>& shape, DLDataType dtype, DLContext ctx) {
126   NDArray r;
127   int64_t nbytes = (dtype.bits * dtype.lanes + 7) / 8;
128   for (const auto& s : shape) {
129     nbytes *= s;
130   }
131 
132   r.storage_ = std::shared_ptr<void>(
133       TVMBackendAllocWorkspace(static_cast<int>(ctx.device_type), static_cast<int>(ctx.device_id),
134                                nbytes, dtype.code, dtype.bits),
135       [=](void* ptr) {
136         if (ptr) {
137           TVMBackendFreeWorkspace(ctx.device_type, ctx.device_id, ptr);
138         }
139       });
140   r.shape_ = shape;
141   r.dtype_ = dtype;
142   r.ctx_ = ctx;
143   return r;
144 }
145 
CreateView(const DynArray<int64_t> & shape,DLDataType dtype)146 NDArray NDArray::CreateView(const DynArray<int64_t>& shape, DLDataType dtype) {
147   NDArray r;
148   r.storage_ = storage_;
149   r.shape_ = shape;
150   r.dtype_ = dtype;
151   r.ctx_ = ctx_;
152   return r;
153 }
154 
ToDLTensor()155 DLTensor NDArray::ToDLTensor() {
156   DLTensor r;
157   r.data = storage_.get();
158   assert(r.data != nullptr);
159   r.ctx = ctx_;
160   r.ndim = shape_.size();
161   r.dtype = dtype_;
162   r.shape = shape_.data();
163   r.strides = nullptr;
164   r.byte_offset = 0;
165   return r;
166 }
167 
GetDataSize(const DLTensor & arr)168 size_t GetDataSize(const DLTensor& arr) {
169   size_t size = 1;
170   for (size_t i = 0; i < static_cast<size_t>(arr.ndim); ++i) {
171     size *= static_cast<size_t>(arr.shape[i]);
172   }
173   size *= (arr.dtype.bits * arr.dtype.lanes + 7) / 8;
174   return size;
175 }
176 
CopyFrom(DLTensor * src)177 void NDArray::CopyFrom(DLTensor* src) {
178   std::memcpy(storage_.get(),
179               reinterpret_cast<const uint8_t*>(src->data) + static_cast<size_t>(src->byte_offset),
180               GetDataSize(*src));
181 }
182 
CopyTo(DLTensor * dst) const183 void NDArray::CopyTo(DLTensor* dst) const {
184   std::memcpy(reinterpret_cast<uint8_t*>(dst->data) + static_cast<size_t>(dst->byte_offset),
185               storage_.get(), GetDataSize(*dst));
186 }
187 
// Load the shared object at |name| and wire up its TVM runtime entry points.
// Generated operator libraries reference the runtime API through per-library
// function pointers named "__<FuncName>"; for each such symbol found via
// dlsym, point it at this process's implementation.
DSOModule::DSOModule(const std::string& name) {
  dlerror();  // clear any stale dl error state before dlopen
  lib_handle_ = dlopen(name.c_str(), RTLD_LAZY | RTLD_LOCAL);
  assert(!dlerror());
  assert(lib_handle_ != nullptr);

// Look up the library's "__FuncName" pointer slot; if present, fill it with
// the address of FuncName from this binary. Absent slots are silently skipped
// (GetSymbol returns nullptr for missing symbols).
#define TVM_INIT_CONTEXT_FUNC(FuncName)                                               \
  if (auto* fp = reinterpret_cast<decltype(&FuncName)*>(GetSymbol("__" #FuncName))) { \
    *fp = FuncName;                                                                   \
  }
  // Initialize the functions
  TVM_INIT_CONTEXT_FUNC(TVMAPISetLastError);
  TVM_INIT_CONTEXT_FUNC(TVMBackendAllocWorkspace);
  TVM_INIT_CONTEXT_FUNC(TVMBackendFreeWorkspace);
  TVM_INIT_CONTEXT_FUNC(TVMBackendParallelLaunch);
// TODO(tulloch): implement these functions?
// TVM_INIT_CONTEXT_FUNC(TVMFuncCall);
// TVM_INIT_CONTEXT_FUNC(TVMBackendGetFuncFromEnv);
// TVM_INIT_CONTEXT_FUNC(TVMBackendParallelBarrier);
#undef TVM_INIT_CONTEXT_FUNC
}
209 
~DSOModule()210 DSOModule::~DSOModule() {
211   if (lib_handle_) {
212     dlclose(lib_handle_);
213   }
214 }
215 
GetFunction(const std::string & name) const216 BackendPackedCFunc DSOModule::GetFunction(const std::string& name) const {
217   auto faddr = reinterpret_cast<BackendPackedCFunc>(GetSymbol(name.c_str()));
218   assert(faddr);
219   return faddr;
220 }
221 
GetSymbol(const char * name) const222 void* DSOModule::GetSymbol(const char* name) const {
223   dlerror();
224   auto* f = dlsym(lib_handle_, name);
225   assert(!dlerror());
226   return f;
227 }
228 
MicroGraphRuntime(const std::string & graph_json,DSOModule * module)229 MicroGraphRuntime::MicroGraphRuntime(const std::string& graph_json, DSOModule* module) {
230   assert(module);
231   module_ = module;
232   picojson::value v;
233   picojson::parse(v, graph_json);
234   ParseNodes(v.get<picojson::object>()["nodes"].get<picojson::array>(), &nodes_);
235   ParseArgNodes(v.get<picojson::object>()["arg_nodes"].get<picojson::array>(), &input_nodes_);
236   ParseArgNodes(v.get<picojson::object>()["node_row_ptr"].get<picojson::array>(), &node_row_ptr_);
237   ParseOutputs(v.get<picojson::object>()["heads"].get<picojson::array>(), &outputs_);
238   ParseAttrs(v.get<picojson::object>()["attrs"].get<picojson::object>(), &attrs_);
239   SetupStorage();
240   SetupOpExecs();
241 }
242 
~MicroGraphRuntime()243 MicroGraphRuntime::~MicroGraphRuntime() {}
244 
Run()245 void MicroGraphRuntime::Run() {
246   for (size_t i = 0; i < op_execs_.size(); ++i) {
247     if (op_execs_[i]) op_execs_[i]();
248   }
249 }
250 
SetInput(int index,DLTensor * data_in)251 void MicroGraphRuntime::SetInput(int index, DLTensor* data_in) {
252   assert(static_cast<size_t>(index) < input_nodes_.size());
253   uint32_t eid = this->entry_id(input_nodes_[index], 0);
254   data_entry_[eid].CopyFrom(data_in);
255 }
256 
CopyOutputTo(int index,DLTensor * data_out)257 void MicroGraphRuntime::CopyOutputTo(int index, DLTensor* data_out) {
258   assert(static_cast<size_t>(index) < outputs_.size());
259   uint32_t eid = this->entry_id(outputs_[index]);
260   const NDArray& data = data_entry_[eid];
261   data.CopyTo(data_out);
262 }
263 
// Plan and allocate the memory pools for every node entry in the graph, then
// bind each entry to a view into its assigned pool.
void MicroGraphRuntime::SetupStorage() {
  // Grab saved optimization plan from graph.
  // This runtime only supports float32 tensors (asserted below).
  DynArray<DLDataType> vtype(attrs_.dltype.size());
  for (size_t i = 0; i < attrs_.dltype.size(); ++i) {
    assert(attrs_.dltype[i] == "float32");
    DLDataType ty;
    ty.bits = 32;
    ty.lanes = 1;
    ty.code = kDLFloat;
    vtype[i] = ty;
  }

  // Size and device type of each storage pool entry.
  std::vector<PoolEntry> pool_entry;
  // Find the maximum space size.
  for (size_t i = 0; i < attrs_.shape.size(); ++i) {
    int storage_id = attrs_.storage_id[i];
    // Use the fallback device if no device index is available.
    int device_type = static_cast<int>(ctx_.device_type);
    // Element count of entry i.
    size_t size = 1;
    for (int64_t sz : attrs_.shape[i]) {
      size *= static_cast<size_t>(sz);
    }
    assert(storage_id >= 0);
    DLDataType t = vtype[i];
    size_t bits = t.bits * t.lanes;
    assert(bits % 8U == 0U || bits == 1U);
    size_t bytes = ((bits + 7U) / 8U) * size;

    // Entries sharing a storage_id reuse one pool slot, sized to the largest
    // tensor assigned to it; the slot's device type must stay consistent.
    uint32_t sid = static_cast<uint32_t>(storage_id);
    if (sid >= pool_entry.size()) {
      pool_entry.resize(sid + 1, {0, -1});
    } else {
      assert(pool_entry[sid].device_type == -1 || pool_entry[sid].device_type == device_type);
    }
    pool_entry[sid].size = std::max(pool_entry[sid].size, bytes);
    pool_entry[sid].device_type = device_type;
  }

  // Allocate the space.
  storage_pool_.resize(pool_entry.size());
  for (size_t i = 0; i < pool_entry.size(); ++i) {
    const auto& pit = pool_entry[i];
    DynArray<int64_t> shape(1);
    // Pools are allocated as 1-D float32 arrays, so convert the byte size to
    // a count of 4-byte elements, rounding up.
    shape[0] = static_cast<int64_t>(pit.size + 3) / 4;
    storage_pool_[i] = NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx_);
  }

  // Assign the pooled entries. A unified memory pool is used to simplify
  // memory assignment for each node entry. The allocated memory on each device
  // is mapped to this pool.
  data_entry_.resize(num_node_entries());
  for (size_t i = 0; i < data_entry_.size(); ++i) {
    int storage_id = attrs_.storage_id[i];
    assert(static_cast<size_t>(storage_id) < storage_pool_.size());
    data_entry_[i] = storage_pool_[storage_id].CreateView(attrs_.shape[i], vtype[i]);
  }
}
322 
CreateTVMOp(const DSOModule & module,const TVMOpParam & param,const DynArray<DLTensor> & args,size_t num_inputs)323 std::function<void()> CreateTVMOp(const DSOModule& module, const TVMOpParam& param,
324                                   const DynArray<DLTensor>& args, size_t num_inputs) {
325   typedef union {
326     void* v_handle;
327   } TVMValue;
328   /*typedef*/ enum {
329     kTVMDLTensorHandle = 7U,
330   } /*TVMArgTypeCode*/;
331   struct OpArgs {
332     DynArray<DLTensor> args;
333     DynArray<TVMValue> arg_values;
334     DynArray<int> arg_tcodes;
335     DynArray<int64_t> shape_data;
336   };
337 
338   std::shared_ptr<OpArgs> arg_ptr = std::make_shared<OpArgs>();
339   arg_ptr->args = args;
340   if (param.flatten_data) {
341     arg_ptr->shape_data.resize(arg_ptr->args.size());
342   }
343   arg_ptr->arg_values.resize(arg_ptr->args.size());
344   arg_ptr->arg_tcodes.resize(arg_ptr->args.size());
345   for (size_t i = 0; i < arg_ptr->args.size(); ++i) {
346     TVMValue v;
347     DLTensor* t = &(arg_ptr->args[i]);
348     v.v_handle = t;
349     arg_ptr->arg_values[i] = v;
350     arg_ptr->arg_tcodes[i] = kTVMDLTensorHandle;
351     if (param.flatten_data) {
352       arg_ptr->shape_data[i] =
353           std::accumulate(t->shape, t->shape + t->ndim, 1, std::multiplies<int64_t>());
354       t->ndim = 1;
355       t->shape = &(arg_ptr->shape_data[i]);
356     }
357   }
358 
359   if (param.func_name == "__nop") {
360     return []() {};
361   } else if (param.func_name == "__copy") {
362     assert(false);
363   }
364 
365   BackendPackedCFunc pf = module.GetFunction(param.func_name);
366   assert(pf != nullptr);
367 
368   auto fexec = [arg_ptr, pf]() {
369     assert(pf);
370     (pf)(arg_ptr->arg_values.data(), arg_ptr->arg_tcodes.data(),
371          static_cast<int>(arg_ptr->arg_values.size()));
372   };
373   return fexec;
374 }
375 
SetupOpExecs()376 void MicroGraphRuntime::SetupOpExecs() {
377   op_execs_.resize(nodes_.size());
378   // setup the array and requirements.
379   for (uint32_t nid = 0; nid < nodes_.size(); ++nid) {
380     const auto& inode = nodes_[nid];
381     if (inode.op_type == "null") continue;
382     DynArray<DLTensor> args(inode.inputs.size() + inode.param.num_outputs);
383     for (size_t i = 0; i < inode.inputs.size(); ++i) {
384       const auto& e = inode.inputs[i];
385       args[i] = data_entry_[this->entry_id(e)].ToDLTensor();
386     }
387     for (size_t index = 0; index < inode.param.num_outputs; ++index) {
388       uint32_t eid = this->entry_id(nid, index);
389       args[index + inode.inputs.size()] = data_entry_[eid].ToDLTensor();
390     }
391     assert(inode.op_type == "tvm_op");
392     op_execs_[nid] = CreateTVMOp(*module_, inode.param, args, inode.inputs.size());
393   }
394 }
395 
396 }  // namespace micro
397 }  // namespace tvm
398