# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
from shutil import which
import json
import pytest
import sys
import numpy as np

import tvm
from tvm import relay
from tvm import module as _tvm_module
from tvm.contrib import graph_runtime, util

tmp_path = util.tempdir()


def generate_csource_module():
    """Mock the codegen with an external library (e.g., CBLAS/cuDNN)"""

    code = r'''
    #include <tvm/runtime/c_runtime_api.h>
    #include <dlpack/dlpack.h>
    #include <cstdint>
    #include <cstring>
    #include <iostream>

    #define GCC_BINARY_OP_1D(p_ID_, p_OP_, p_DIM1_)           \
      extern "C" void p_ID_(float* a, float* b, float* out) { \
        for (int64_t i = 0; i < p_DIM1_; ++i) {               \
          out[i] = a[i] p_OP_ b[i];                           \
        }                                                     \
      }

    #define GCC_BINARY_OP_2D(p_ID_, p_OP_, p_DIM1_, p_DIM2_)  \
      extern "C" void p_ID_(float* a, float* b, float* out) { \
        for (int64_t i = 0; i < p_DIM1_; ++i) {               \
          for (int64_t j = 0; j < p_DIM2_; ++j) {             \
            int64_t k = i * p_DIM2_ + j;                      \
            out[k] = a[k] p_OP_ b[k];                         \
          }                                                   \
        }                                                     \
      }
    GCC_BINARY_OP_2D(gcc_1_0, *, 10, 10);
    GCC_BINARY_OP_2D(gcc_1_1, -, 10, 10);
    GCC_BINARY_OP_2D(gcc_1_2, +, 10, 10);

    extern "C" void gcc_1_(float* gcc_input4, float* gcc_input5,
                           float* gcc_input6, float* gcc_input7, float* out) {
      float* buf_0 = (float*)malloc(4 * 100);
      float* buf_1 = (float*)malloc(4 * 100);
      gcc_1_2(gcc_input4, gcc_input5, buf_0);
      gcc_1_1(buf_0, gcc_input6, buf_1);
      gcc_1_0(buf_1, gcc_input7, out);
      free(buf_0);
      free(buf_1);
    }

    extern "C" int json_rt_1(TVMValue* value, int* type_code, int nargs) {
      if (nargs != 5) {
        printf("Expect 5 args, but get %d", nargs);
        return 1;
      }
      DLTensor* arg0 = static_cast<DLTensor*>(value[0].v_handle);
      DLTensor* arg1 = static_cast<DLTensor*>(value[1].v_handle);
      DLTensor* arg2 = static_cast<DLTensor*>(value[2].v_handle);
      DLTensor* arg3 = static_cast<DLTensor*>(value[3].v_handle);
      DLTensor* out = static_cast<DLTensor*>(value[4].v_handle);
      gcc_1_(static_cast<float*>(arg0->data), static_cast<float*>(arg1->data),
             static_cast<float*>(arg2->data), static_cast<float*>(arg3->data),
             static_cast<float*>(out->data));
      return 0;
    }

    GCC_BINARY_OP_2D(gcc_0_0, *, 10, 10);
    GCC_BINARY_OP_2D(gcc_0_1, -, 10, 10);
    GCC_BINARY_OP_2D(gcc_0_2, +, 10, 10);

    extern "C" void gcc_0_(float* gcc_input0, float* gcc_input1,
                           float* gcc_input2, float* gcc_input3, float* out) {
      float* buf_0 = (float*)malloc(4 * 100);
      float* buf_1 = (float*)malloc(4 * 100);
      gcc_0_2(gcc_input0, gcc_input1, buf_0);
      gcc_0_1(buf_0, gcc_input2, buf_1);
      gcc_0_0(buf_1, gcc_input3, out);
      free(buf_0);
      free(buf_1);
    }

    extern "C" int json_rt_0(TVMValue* value, int* type_code, int nargs) {
      if (nargs != 5) {
        printf("Expect 5 args, but get %d", nargs);
        return 1;
      }
      DLTensor* arg0 = static_cast<DLTensor*>(value[0].v_handle);
      DLTensor* arg1 = static_cast<DLTensor*>(value[1].v_handle);
      DLTensor* arg2 = static_cast<DLTensor*>(value[2].v_handle);
      DLTensor* arg3 = static_cast<DLTensor*>(value[3].v_handle);
      DLTensor* out = static_cast<DLTensor*>(value[4].v_handle);
      gcc_0_(static_cast<float*>(arg0->data), static_cast<float*>(arg1->data),
             static_cast<float*>(arg2->data), static_cast<float*>(arg3->data),
             static_cast<float*>(out->data));
      return 0;
    }
    '''
    csource_module = _tvm_module.csource_module_create(code, "cc")
    return csource_module


def generate_engine_module():
    """
    Mock the codegen of an external backend with its own runtime engine
    (e.g., MKL-DNN/TensorRT)
    """

    code = r'''
    #include <tvm/runtime/c_runtime_api.h>
    #include <dlpack/dlpack.h>
    #include "gcc_engine.h"

    extern "C" void gcc_1_(float* gcc_input4, float* gcc_input5,
                           float* gcc_input6, float* gcc_input7, float* out) {
      std::string graph =
          "add_2d,10,10\n"
          "sub_2d,10,10\n"
          "mul_2d,10,10\n";

      Engine engine;
      engine.run(graph, {gcc_input4, gcc_input5, gcc_input6, gcc_input7}, out);
    }

    extern "C" int json_rt_1(TVMValue* value, int* type_code, int nargs) {
      if (nargs != 5) {
        printf("Expect 5 args, but get %d", nargs);
        return 1;
      }
      DLTensor* arg0 = static_cast<DLTensor*>(value[0].v_handle);
      DLTensor* arg1 = static_cast<DLTensor*>(value[1].v_handle);
      DLTensor* arg2 = static_cast<DLTensor*>(value[2].v_handle);
      DLTensor* arg3 = static_cast<DLTensor*>(value[3].v_handle);
      DLTensor* out = static_cast<DLTensor*>(value[4].v_handle);
      gcc_1_(static_cast<float*>(arg0->data), static_cast<float*>(arg1->data),
             static_cast<float*>(arg2->data), static_cast<float*>(arg3->data),
             static_cast<float*>(out->data));
      return 0;
    }

    extern "C" void gcc_0_(float* gcc_input0, float* gcc_input1,
                           float* gcc_input2, float* gcc_input3, float* out) {
      std::string graph =
          "add_2d,10,10\n"
          "sub_2d,10,10\n"
          "mul_2d,10,10\n";

      Engine engine;
      engine.run(graph, {gcc_input0, gcc_input1, gcc_input2, gcc_input3}, out);
    }

    extern "C" int json_rt_0(TVMValue* value, int* type_code, int nargs) {
      if (nargs != 5) {
        printf("Expect 5 args, but get %d", nargs);
        return 1;
      }
      DLTensor* arg0 = static_cast<DLTensor*>(value[0].v_handle);
      DLTensor* arg1 = static_cast<DLTensor*>(value[1].v_handle);
      DLTensor* arg2 = static_cast<DLTensor*>(value[2].v_handle);
      DLTensor* arg3 = static_cast<DLTensor*>(value[3].v_handle);
      DLTensor* out = static_cast<DLTensor*>(value[4].v_handle);
      gcc_0_(static_cast<float*>(arg0->data), static_cast<float*>(arg1->data),
             static_cast<float*>(arg2->data), static_cast<float*>(arg3->data),
             static_cast<float*>(out->data));
      return 0;
    }
    '''

    gen_gcc_engine()
    csource_module = _tvm_module.csource_module_create(code, "cc")
    return csource_module


def gen_gcc_engine():
    """An example of an external backend runtime engine. This is supposed to be
    provided by third-party vendors and included when building the generated
    external kernel code.
    """

    code = r'''
    #ifndef _GCC_ENGINE_H_
    #define _GCC_ENGINE_H_
    #include <cstdint>
    #include <string>
    #include <sstream>
    #include <vector>

    #define GCC_BINARY_OP_2D(p_ID_, p_OP_)                                      \
      void p_ID_(int64_t dim1, int64_t dim2, float* a, float* b, float* out) {  \
        for (int64_t i = 0; i < dim1; ++i) {                                    \
          for (int64_t j = 0; j < dim2; ++j) {                                  \
            int64_t k = i * dim2 + j;                                           \
            out[k] = a[k] p_OP_ b[k];                                           \
          }                                                                     \
        }                                                                       \
      }
    GCC_BINARY_OP_2D(add_2d, +);
    GCC_BINARY_OP_2D(sub_2d, -);
    GCC_BINARY_OP_2D(mul_2d, *);

    struct Layer {
      void (*op)(int64_t, int64_t, float*, float*, float*);
      std::vector<int64_t> shapes;
      std::vector<float*> args;
    };

    class Engine {
     public:
      float* alloc_buffer(int64_t size) {
        float* buf = (float*)malloc(sizeof(float) * size);
        buffers.push_back(buf);
        return buf;
      }

      void add(std::string op, int64_t dim1, int64_t dim2, float* in1, float* in2, float* out) {
        Layer layer;
        layer.shapes.push_back(dim1);
        layer.shapes.push_back(dim2);
        layer.args.push_back(in1);
        layer.args.push_back(in2);
        layer.args.push_back(out);

        if (op == "add_2d")
          layer.op = &add_2d;
        else if (op == "sub_2d")
          layer.op = &sub_2d;
        else if (op == "mul_2d")
          layer.op = &mul_2d;
        net.push_back(layer);
        return;
      }

      void run(std::string graph, std::vector<float*> args, float* out) {
        std::stringstream ss(graph);
        std::string line;
        int layer_idx = 0;
        int arg_idx = 0;
        float* buf = nullptr;

        while (std::getline(ss, line, '\n')) {
          std::stringstream ss2(line);
          std::string token;
          std::vector<std::string> attrs;
          while (std::getline(ss2, token, ',')) {
            attrs.push_back(token);
          }
          int64_t dim1 = stoll(attrs[1]);
          int64_t dim2 = stoll(attrs[2]);
          auto out_buf = this->alloc_buffer(dim1 * dim2);

          if (layer_idx == 0) {
            this->add(attrs[0], dim1, dim2, args[0], args[1], out_buf);
            buf = out_buf;
            arg_idx = 2;
          } else {
            this->add(attrs[0], dim1, dim2, buf, args[arg_idx], out_buf);
            buf = out_buf;
            arg_idx++;
          }
          layer_idx++;
        }
        this->net.back().args.back() = out;

        for (auto layer : net) {
          (*layer.op)(layer.shapes[0], layer.shapes[1], layer.args[0], layer.args[1],
                      layer.args[2]);
        }
      }

      ~Engine() {
        for (auto buf : buffers) {
          free(buf);
        }
      }

     private:
      std::vector<Layer> net;
      std::vector<float*> buffers;
    };

    #endif
    '''
    header_file = tmp_path.relpath("gcc_engine.h")
    with open(header_file, 'w') as f:
        f.write(code)


def get_synthetic_lib():
    x = relay.var('x', shape=(10, 10))
    w0 = relay.var('w0', shape=(10, 10))
    w1 = relay.var('w1', shape=(10, 10))
    w2 = relay.var('w2', shape=(10, 10))
    w3 = relay.var('w3', shape=(10, 10))
    w4 = relay.var('w4', shape=(10, 10))
    w5 = relay.var('w5', shape=(10, 10))
    w6 = relay.var('w6', shape=(10, 10))
    w7 = relay.var('w7', shape=(10, 10))

    # subgraph0
    gcc_input0 = relay.var('gcc_input0', shape=(10, 10))
    gcc_input1 = relay.var('gcc_input1', shape=(10, 10))
    gcc_input2 = relay.var('gcc_input2', shape=(10, 10))
    gcc_input3 = relay.var('gcc_input3', shape=(10, 10))
    subgraph0 = relay.Function([gcc_input0, gcc_input1, gcc_input2,
                                gcc_input3], relay.copy(gcc_input0))
    subgraph0 = subgraph0.set_attribute(
        "Primitive", tvm.expr.IntImm("int32", 1))

    # Call subgraph0
    subgraph0_ret = relay.Call(subgraph0, [x, w0, w1, w2])

    # subgraph1
    gcc_input4 = relay.var('gcc_input4', shape=(10, 10))
    gcc_input5 = relay.var('gcc_input5', shape=(10, 10))
    gcc_input6 = relay.var('gcc_input6', shape=(10, 10))
    gcc_input7 = relay.var('gcc_input7', shape=(10, 10))
    subgraph1 = relay.Function([gcc_input4, gcc_input5, gcc_input6,
                                gcc_input7], relay.copy(gcc_input4))
    subgraph1 = subgraph1.set_attribute(
        "Primitive", tvm.expr.IntImm("int32", 1))

    # Call subgraph1
    subgraph1_ret = relay.Call(subgraph1, [x, w3, w4, w5])

    # Other ops that will be executed on TVM.
    add2 = relay.add(x, w6)
    sub2 = relay.subtract(add2, w7)
    ret = relay.concatenate((subgraph0_ret, subgraph1_ret, sub2), 0)
    func = relay.Function([x, w0, w1, w2, w3, w4, w5, w6, w7], ret)
    mod = relay.Module.from_expr(func)
    _, lib, _ = relay.build(mod, "llvm")
    return lib


def get_whole_graph_json():
    """Return the graph runtime JSON for the whole network, in which the two
    external subgraphs appear as the tvm_op nodes json_rt_0 and json_rt_1."""
    nodex = {"op": "null", "name": "x", "inputs": []}
    node0 = {"op": "null", "name": "w0", "inputs": []}
    node1 = {"op": "null", "name": "w1", "inputs": []}
    node2 = {"op": "null", "name": "w2", "inputs": []}
    node3 = {"op": "null", "name": "w3", "inputs": []}
    node4 = {"op": "null", "name": "w4", "inputs": []}
    node5 = {"op": "null", "name": "w5", "inputs": []}
    node6 = {"op": "null", "name": "w6", "inputs": []}
    node7 = {"op": "null", "name": "w7", "inputs": []}

    subgraph0 = {
        "op": "tvm_op",
        "name": "json_rt_0",
        "attrs": {
            "num_outputs": "1",
            "num_inputs": "4",
            "func_name": "json_rt_0",
            "flatten_data": "0"
        },
        "inputs": [
            [0, 0, 0],
            [1, 0, 0],
            [2, 0, 0],
            [3, 0, 0],
        ]
    }
    subgraph1 = {
        "op": "tvm_op",
        "name": "json_rt_1",
        "attrs": {
            "num_outputs": "1",
            "num_inputs": "4",
            "func_name": "json_rt_1",
            "flatten_data": "0"
        },
        "inputs": [
            [0, 0, 0],
            [4, 0, 0],
            [5, 0, 0],
            [6, 0, 0],
        ]
    }

    fused_op = {
        "op": "tvm_op",
        "name": "fused_add_subtract_concatenate",
        "attrs": {
            "num_outputs": "1",
            "num_inputs": "5",
            "func_name": "fused_add_subtract_concatenate",
            "flatten_data": "0"
        },
        "inputs": [
            [9, 0, 0],
            [10, 0, 0],
            [0, 0, 0],
            [7, 0, 0],
            [8, 0, 0]
        ]
    }
    nodes = [nodex, node0, node1, node2, node3, node4,
             node5, node6, node7, subgraph0, subgraph1, fused_op]
    arg_nodes = [0, 1, 2, 3, 4, 5, 6, 7, 8]
    heads = [[11, 0, 0]]
    node_row_ptr = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    storage_id = ["list_int", [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]]

    shape = ["list_shape", [
        [10, 10], [10, 10], [10, 10], [10, 10], [10, 10], [10, 10],
        [10, 10], [10, 10], [10, 10], [10, 10], [10, 10], [30, 10]]]

    dltype = ["list_str", [
        "float32", "float32", "float32", "float32", "float32", "float32",
        "float32", "float32", "float32", "float32", "float32", "float32"]]

    attrs = {
        "shape": shape,
        "dltype": dltype,
        "storage_id": storage_id,
    }

    graph = {"nodes": nodes,
             "arg_nodes": arg_nodes,
             "node_row_ptr": node_row_ptr,
             "heads": heads,
             "attrs": attrs}

    return json.dumps(graph)


def run_extern(label, get_extern_src, **kwargs):
    """Export the synthetic library together with the generated external source
    module, then load it with the graph runtime and check the result."""
    if which("gcc") is None:
        print("Skip test because gcc is not available.")
        return

    obj_name = "{}.o".format(label)
    lib_name = "external_{}.so".format(label)

    # Get Json and the compiled library.
    graph_json = get_whole_graph_json()
    lib = get_synthetic_lib()
    lib.save(obj_name)

    # Library that contains the external code.
    csource_module = get_extern_src()
    kwargs["options"] = [obj_name] + kwargs["options"]
    lib_path = tmp_path.relpath(lib_name)
    csource_module.export_library(lib_path, fcompile=False, **kwargs)

    # Load the module for execution.
    lib = tvm.module.load(lib_path)
    mod = tvm.contrib.graph_runtime.create(graph_json, lib, tvm.cpu(0))

    x_data = np.random.rand(10, 10).astype('float32')
    mod.set_input("x", x_data)
    w_data = []
    for i in range(8):
        data = np.random.rand(10, 10).astype('float32')
        w_data.append(data)
        var = "w" + str(i)
        mod.set_input(var, data)

    mod.run()
    out = tvm.nd.empty((30, 10), ctx=tvm.cpu())
    out = mod.get_output(0, out)
    tvm.testing.assert_allclose(
        out.asnumpy(),
        np.concatenate((((x_data + w_data[0]) - w_data[1]) * w_data[2],
                        ((x_data + w_data[3]) - w_data[4]) * w_data[5],
                        x_data + w_data[6] - w_data[7]),
                       axis=0))


def test_dso_extern():
    run_extern("lib", generate_csource_module, options=["-O2", "-std=c++11"])


def test_engine_extern():
    run_extern("engine",
               generate_engine_module,
               options=["-O2", "-std=c++11", "-I" + tmp_path.relpath("")])


def test_json_extern():
    if which("gcc") is None:
        print("Skip test because gcc is not available.")
        return

    # Get subgraph Json.
    subgraph_json = ("json_rt_0\n" +
                     "input 0 10 10\n" +
                     "input 1 10 10\n" +
                     "input 2 10 10\n" +
                     "input 3 10 10\n" +
                     "add 4 inputs: 0 1 shape: 10 10\n" +
                     "sub 5 inputs: 4 2 shape: 10 10\n" +
                     "mul 6 inputs: 5 3 shape: 10 10\n" +
                     "json_rt_1\n" +
                     "input 0 10 10\n" +
                     "input 1 10 10\n" +
                     "input 2 10 10\n" +
                     "input 3 10 10\n" +
                     "add 4 inputs: 0 1 shape: 10 10\n" +
                     "sub 5 inputs: 4 2 shape: 10 10\n" +
                     "mul 6 inputs: 5 3 shape: 10 10")

    subgraph_path = tmp_path.relpath('subgraph.examplejson')
    with open(subgraph_path, 'w') as f:
        f.write(subgraph_json)

    # Get Json and module.
    graph_json = get_whole_graph_json()

    lib = get_synthetic_lib()
    ext_lib = tvm.module.load(subgraph_path, "examplejson")
    lib.import_module(ext_lib)
    lib_name = 'external.so'
    lib_path = tmp_path.relpath(lib_name)
    lib.export_library(lib_path)

    # Load the module for execution.
    lib = tvm.module.load(lib_path)
    mod = tvm.contrib.graph_runtime.create(graph_json, lib, tvm.cpu(0))

    x_data = np.random.rand(10, 10).astype('float32')
    mod.set_input("x", x_data)
    w_data = []
    for i in range(8):
        data = np.random.rand(10, 10).astype('float32')
        w_data.append(data)
        var = "w" + str(i)
        mod.set_input(var, data)

    mod.run()
    out = tvm.nd.empty((30, 10), ctx=tvm.cpu())
    out = mod.get_output(0, out)
    tvm.testing.assert_allclose(
        out.asnumpy(),
        np.concatenate((((x_data + w_data[0]) - w_data[1]) * w_data[2],
                        ((x_data + w_data[3]) - w_data[4]) * w_data[5],
                        x_data + w_data[6] - w_data[7]),
                       axis=0))


if __name__ == "__main__":
    test_dso_extern()
    test_engine_extern()
    test_json_extern()