# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=too-many-locals
"""Unit tests for heterogeneous runtime"""
import json
import numpy as np

import tvm
from tvm import te
from tvm.contrib import graph_runtime, util
from tvm import topi


def get_simplex_graph(host_dev_type, device_dev_type):
    r"""Return the hand-crafted json object where only one copy node is
    inserted. This node copies data from the target device to cpu.
    The network is constructed as following:
                 A    B
                  \  /
             elemwise_add  (gpu)
                     \
                     copy      C
                       \      /
                     elemwise_sub  (cpu)

    Parameters
    ----------
    host_dev_type : int
        The device type of the host processor, e.g. cpu.
    device_dev_type : int
        The device type of the device processor, e.g. gpu, opencl, etc.

    Returns
    -------
    json : json
        A json encoded object.
    """
    # Construct each node in the graph.
    var_a = {"op": "null", "name": "A", "inputs": []}
    var_b = {"op": "null", "name": "B", "inputs": []}
    elemwise_add = {
        "op": "tvm_op",
        "name": "elemwise_add",
        "attrs": {
            "flatten_data": "1",
            "func_name": "elemwise_add",
            "num_inputs": "2",
            "num_outputs": "1",
        },
        "inputs": [[0, 0, 0], [1, 0, 0]],
    }
    # The copy node carries no compute; the runtime performs the
    # device-to-host data transfer for it.
    copy = {
        "op": "tvm_op",
        "name": "__copy_add_to_sub",
        "attrs": {
            "flatten_data": "0",
            "func_name": "__copy",
            "num_inputs": "1",
            "num_outputs": "1",
        },
        "inputs": [[2, 0, 0]],
    }
    var_c = {"op": "null", "name": "C", "inputs": []}
    elemwise_sub = {
        "op": "tvm_op",
        "name": "elemwise_sub",
        "attrs": {
            "flatten_data": "0",
            "func_name": "elemwise_sub",
            "num_inputs": "2",
            "num_outputs": "1",
        },
        "inputs": [[3, 0, 0], [4, 0, 0]],
    }

    # Group the nodes.
    nodes = [var_a, var_b, elemwise_add, copy, var_c, elemwise_sub]
    arg_nodes = [0, 1, 4]
    node_row_ptr = [0, 1, 2, 3, 4, 5, 6]
    heads = [[5, 0, 0]]
    shape = (4,)
    attrs = {
        "storage_id": ["list_int", [3, 4, 0, 1, 5, 2]],
        "shape": ["list_shape", [shape, shape, shape, shape, shape, shape]],
        # Nodes 0-2 (A, B, add) live on the device; nodes 3-5 (copy output,
        # C, sub) live on the host.
        "device_index": [
            "list_int",
            [
                device_dev_type,
                device_dev_type,
                device_dev_type,
                host_dev_type,
                host_dev_type,
                host_dev_type,
            ],
        ],
        "dtype": ["list_int", [0, 0, 0, 0, 0, 0]],
        "dltype": ["list_str", ["float32", "float32", "float32", "float32", "float32", "float32"]],
    }

    # Construct the graph.
    graph = {
        "nodes": nodes,
        "arg_nodes": arg_nodes,
        "node_row_ptr": node_row_ptr,
        "heads": heads,
        "attrs": attrs,
    }
    return json.dumps(graph)


def test_simplex_data_transferring():
    r"""
    Test the heterogeneous execution of a simple network where data
    transferring is from the target device to the host processor at runtime.
    The host processor is always assumed to be cpu, and the device varies.
    """
    host = "cpu"
    target_host = "llvm"
    host_ctx = tvm.context(host)
    if not tvm.runtime.enabled(target_host):
        print("Skip test because llvm is not enabled.")
        return

    def check_device(device, target_device):
        if not tvm.runtime.enabled(target_device):
            print("Skip test because {} is not enabled.".format(target_device))
            return

        device_ctx = tvm.context(device)
        graph = get_simplex_graph(host_ctx.device_type, device_ctx.device_type)
        shape = (4,)

        # Create module for add whose target is the device.
        tensor_a = te.placeholder(shape, name="A")
        tensor_b = te.placeholder(shape, name="B")
        elemwise_add = te.compute(
            shape, lambda *i: tensor_a(*i) + tensor_b(*i), name="elemwise_add"
        )
        target = topi.cpp.TEST_create_target(device)
        schedule_add = topi.cpp.cuda.schedule_injective(target, [elemwise_add])
        lower_add = tvm.lower(schedule_add, [tensor_a, tensor_b, elemwise_add], name="elemwise_add")

        # Insert copy. Neither compute nor schedule is required for the copy
        # node. The compute will be performed at runtime which is just data
        # copy from the input to the output.
        tensor_copy = te.placeholder(shape, name="__copy")

        # Create module for sub whose target is the host.
        tensor_c = te.placeholder(shape, name="C")
        elemwise_sub = te.compute(
            shape, lambda *i: tensor_copy(*i) - tensor_c(*i), name="elemwise_sub"
        )
        schedule_sub = te.create_schedule(elemwise_sub.op)
        lower_sub = tvm.lower(
            schedule_sub, [tensor_copy, tensor_c, elemwise_sub], name="elemwise_sub"
        )

        target_flist = {target_device: lower_add, target_host: lower_sub}
        mhost = tvm.build(target_flist, target_host=target_host)
        ctx = [host_ctx, device_ctx]
        mod = graph_runtime.create(graph, mhost, ctx)
        # Keep the host-side numpy inputs under their own names so the TE
        # placeholders above are not shadowed.
        a_np = np.random.uniform(size=shape).astype(tensor_a.dtype)
        b_np = np.random.uniform(size=shape).astype(tensor_b.dtype)
        c_np = np.random.uniform(size=shape).astype(tensor_c.dtype)
        params = {"A": a_np, "B": b_np, "C": c_np}
        mod.set_input(**params)
        mod.run()
        out = mod.get_output(0, tvm.nd.empty(shape))
        np.testing.assert_equal(out.asnumpy(), (a_np + b_np) - c_np)

    dev_tar = {"cuda": "cuda", "opencl": "opencl"}
    for device, target in dev_tar.items():
        with tvm.target.Target(device):
            check_device(device, target)


def get_duplex_graph(host_dev_type, device_dev_type):
    r"""Return the hand-crafted json object where two copy nodes are inserted.
    Data transferring happens back-and-forth between the target device and CPU.
    The network is constructed as following:
                 A    B
                  \  /
             elemwise_add  (gpu)
                     \
                     copy        C
                       \        /
                     elemwise_sub  (cpu)
                         \
                         copy          D
                           \          /
                           elemwise_add  (gpu)

    Parameters
    ----------
    host_dev_type : int
        The device type of the host processor, e.g. cpu.
    device_dev_type : int
        The device type of the device processor, e.g. gpu, opencl, etc.

    Returns
    -------
    json : json
        A json encoded object.
    """
    # Construct each node in the graph.
    var_a = {"op": "null", "name": "A", "inputs": []}
    var_b = {"op": "null", "name": "B", "inputs": []}
    elemwise_add0 = {
        "op": "tvm_op",
        "name": "elemwise_add0",
        "attrs": {
            "flatten_data": "1",
            "func_name": "elemwise_add0",
            "num_inputs": "2",
            "num_outputs": "1",
        },
        "inputs": [[0, 0, 0], [1, 0, 0]],
    }
    # Device-to-host transfer feeding the subtraction on cpu.
    copy_add_sub = {
        "op": "tvm_op",
        "name": "__copy_add_to_sub",
        "attrs": {
            "flatten_data": "0",
            "func_name": "__copy",
            "num_inputs": "1",
            "num_outputs": "1",
        },
        "inputs": [[2, 0, 0]],
    }
    var_c = {"op": "null", "name": "C", "inputs": []}
    elemwise_sub = {
        "op": "tvm_op",
        "name": "elemwise_sub",
        "attrs": {
            "flatten_data": "0",
            "func_name": "elemwise_sub",
            "num_inputs": "2",
            "num_outputs": "1",
        },
        "inputs": [[3, 0, 0], [4, 0, 0]],
    }
    # Host-to-device transfer feeding the second addition on the device.
    copy_sub_add = {
        "op": "tvm_op",
        "name": "__copy_sub_to_add",
        "attrs": {
            "flatten_data": "0",
            "func_name": "__copy",
            "num_inputs": "1",
            "num_outputs": "1",
        },
        "inputs": [[5, 0, 0]],
    }
    var_d = {"op": "null", "name": "D", "inputs": []}
    elemwise_add1 = {
        "op": "tvm_op",
        "name": "elemwise_add1",
        "attrs": {
            "flatten_data": "0",
            "func_name": "elemwise_add1",
            "num_inputs": "2",
            "num_outputs": "1",
        },
        "inputs": [[6, 0, 0], [7, 0, 0]],
    }

    # Group the nodes.
    nodes = [
        var_a,
        var_b,
        elemwise_add0,
        copy_add_sub,
        var_c,
        elemwise_sub,
        copy_sub_add,
        var_d,
        elemwise_add1,
    ]
    arg_nodes = [0, 1, 4, 7]
    node_row_ptr = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    heads = [[8, 0, 0]]
    shape = (4,)
    attrs = {
        "storage_id": ["list_int", [4, 5, 0, 1, 6, 2, 0, 7, 3]],
        "shape": ["list_shape", [shape, shape, shape, shape, shape, shape, shape, shape, shape]],
        "device_index": [
            "list_int",
            [
                device_dev_type,
                device_dev_type,
                device_dev_type,
                host_dev_type,
                host_dev_type,
                host_dev_type,
                device_dev_type,
                device_dev_type,
                device_dev_type,
            ],
        ],
        "dtype": ["list_int", [0, 0, 0, 0, 0, 0, 0, 0, 0]],
        "dltype": [
            "list_str",
            [
                "float32",
                "float32",
                "float32",
                "float32",
                "float32",
                "float32",
                "float32",
                "float32",
                "float32",
            ],
        ],
    }

    # Construct the graph.
    graph = {
        "nodes": nodes,
        "arg_nodes": arg_nodes,
        "node_row_ptr": node_row_ptr,
        "heads": heads,
        "attrs": attrs,
    }
    return json.dumps(graph)


def test_duplex_data_transferring():
    r"""
    Test the heterogeneous execution of a simple network where data
    transferring occurs back-and-forth between the target device and host
    processor.
    The host processor is always assumed to be cpu, and the target device
    varies.
    """
    host = "cpu"
    target_host = "llvm"
    host_ctx = tvm.context(host)
    if not tvm.runtime.enabled(target_host):
        print("Skip test because llvm is not enabled.")
        return

    def check_device(device, target_device):
        if not tvm.runtime.enabled(target_device):
            print("Skip test because {} is not enabled.".format(target_device))
            return

        device_ctx = tvm.context(device)
        graph = get_duplex_graph(host_ctx.device_type, device_ctx.device_type)
        shape = (4,)

        # Insert copy nodes for data transferring between add and sub nodes.
        # Transfers data from gpu to cpu.
        copy_add_sub = te.placeholder(shape, name="__copy0")
        # Transfers data from cpu to gpu.
        copy_sub_add = te.placeholder(shape, name="__copy1")

        # Create a module containing adds on the device.
        tensor_a = te.placeholder(shape, name="A")
        tensor_b = te.placeholder(shape, name="B")
        tensor_d = te.placeholder(shape, name="D")
        elemwise_add0 = te.compute(
            shape, lambda *i: tensor_a(*i) + tensor_b(*i), name="elemwise_add0"
        )
        elemwise_add1 = te.compute(
            shape, lambda *i: copy_sub_add(*i) + tensor_d(*i), name="elemwise_add1"
        )
        target = topi.cpp.TEST_create_target(device)
        add_schedule0 = topi.cpp.cuda.schedule_injective(target, [elemwise_add0])
        lower_add0 = tvm.lower(
            add_schedule0, [tensor_a, tensor_b, elemwise_add0], name="elemwise_add0"
        )
        add_schedule1 = topi.cpp.cuda.schedule_injective(target, [elemwise_add1])
        lower_add1 = tvm.lower(
            add_schedule1, [tensor_d, copy_sub_add, elemwise_add1], name="elemwise_add1"
        )
        # Create module for sub whose target is the host.
        tensor_c = te.placeholder(shape, name="C")
        elemwise_sub = te.compute(
            shape, lambda *i: copy_add_sub(*i) - tensor_c(*i), name="elemwise_sub"
        )
        sub_schedule = te.create_schedule(elemwise_sub.op)
        lower_sub = tvm.lower(
            sub_schedule, [copy_add_sub, tensor_c, elemwise_sub], name="elemwise_sub"
        )

        # Merge the two device modules before building both targets together.
        lower_add0.update(lower_add1)
        target_flist = {target_device: lower_add0, target_host: lower_sub}
        mhost = tvm.build(target_flist, target_host=target_host)
        ctx = [host_ctx, device_ctx]
        # Keep the host-side numpy inputs under their own names so the TE
        # placeholders above are not shadowed.
        a_np = np.random.uniform(size=shape).astype(tensor_a.dtype)
        b_np = np.random.uniform(size=shape).astype(tensor_b.dtype)
        c_np = np.random.uniform(size=shape).astype(tensor_c.dtype)
        d_np = np.random.uniform(size=shape).astype(tensor_d.dtype)
        params = {"A": a_np, "B": b_np, "C": c_np, "D": d_np}

        def check_verify():
            # Run the freshly built module directly.
            mod = graph_runtime.create(graph, mhost, ctx)
            mod.set_input(**params)
            mod.run()
            out = mod.get_output(0, tvm.nd.empty(shape))
            np.testing.assert_equal(out.asnumpy(), a_np + b_np - c_np + d_np)

        def check_load_module():
            # Export, reload, and run the module to exercise serialization.
            temp = util.tempdir()
            path_lib = temp.relpath("deploy.so")
            mhost.export_library(path_lib)
            with open(temp.relpath("deploy.json"), "w") as out_file:
                out_file.write(graph)
            loaded_lib = tvm.runtime.load_module(path_lib)
            # Close the graph file deterministically instead of leaking the
            # handle from a bare open(...).read().
            with open(temp.relpath("deploy.json")) as graph_file:
                loaded_graph = graph_file.read()
            mod = graph_runtime.create(loaded_graph, loaded_lib, ctx)
            mod.set_input(**params)
            mod.run()
            out = mod.get_output(0, tvm.nd.empty(shape))
            np.testing.assert_equal(out.asnumpy(), a_np + b_np - c_np + d_np)

        check_verify()
        check_load_module()

    dev_tar = {"cuda": "cuda", "opencl": "opencl"}
    for device, target in dev_tar.items():
        with tvm.target.Target(device):
            check_device(device, target)


if __name__ == "__main__":
    test_simplex_data_transferring()
    test_duplex_data_transferring()