# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=too-many-locals
"""Unit tests for heterogeneous runtime"""
import json
import numpy as np

import tvm
from tvm import te
from tvm.contrib import graph_runtime, util
from tvm import topi


def get_simplex_graph(host_dev_type, device_dev_type):
29    r""" Return the hand-crafted json object where only one copy node is
30    inserted. This node copies data from the target device to cpu.
31    The network is constructed as following:
32                 A    B
33                  \  /
34             elemwise_add  (gpu)
35                     \
36                     copy      C
37                       \      /
38                     elemwise_sub  (cpu)
39
40    Parameters
41    ----------
42    host_dev_type : int
43        The device type of the host processor, e.g. cpu.
44    device_dev_type : int
45        The device type of the device processor, e.g. gpu, opencl, etc.
46
47    Returns
48    -------
49    json : json
50        A json encoded object.
51    """
    # Construct each node in the graph.
    var_a = {"op": "null", "name": "A", "inputs": []}
    var_b = {"op": "null", "name": "B", "inputs": []}
    elemwise_add = {
        "op": "tvm_op",
        "name": "elemwise_add",
        "attrs": {
            "flatten_data": "1",
            "func_name": "elemwise_add",
            "num_inputs": "2",
            "num_outputs": "1",
        },
        "inputs": [[0, 0, 0], [1, 0, 0]],
    }
    copy = {
        "op": "tvm_op",
        "name": "__copy_add_to_sub",
        "attrs": {
            "flatten_data": "0",
            "func_name": "__copy",
            "num_inputs": "1",
            "num_outputs": "1",
        },
        "inputs": [[2, 0, 0]],
    }
    var_c = {"op": "null", "name": "C", "inputs": []}
    elemwise_sub = {
        "op": "tvm_op",
        "name": "elemwise_sub",
        "attrs": {
            "flatten_data": "0",
            "func_name": "elemwise_sub",
            "num_inputs": "2",
            "num_outputs": "1",
        },
        "inputs": [[3, 0, 0], [4, 0, 0]],
    }

    # Group the nodes.
    nodes = [var_a, var_b, elemwise_add, copy, var_c, elemwise_sub]
    arg_nodes = [0, 1, 4]
    node_row_ptr = [0, 1, 2, 3, 4, 5, 6]
    heads = [[5, 0, 0]]
    shape = (4,)
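    # Graph-level attributes; every list has one entry per node output (the
    # layout described by node_row_ptr above):
    #   storage_id   - memory-sharing plan; entries with the same id reuse a buffer.
    #   device_index - device assignment per entry. This is what makes the graph
    #                  heterogeneous: A, B and the add output live on the device,
    #                  while the copied value, C and the sub output live on the host.
    #   dtype/dltype - type code and type name of every entry.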
    attrs = {
        "storage_id": ["list_int", [3, 4, 0, 1, 5, 2]],
        "shape": ["list_shape", [shape, shape, shape, shape, shape, shape]],
        "device_index": [
            "list_int",
            [
                device_dev_type,
                device_dev_type,
                device_dev_type,
                host_dev_type,
                host_dev_type,
                host_dev_type,
            ],
        ],
        "dtype": ["list_int", [0, 0, 0, 0, 0, 0]],
        "dltype": ["list_str", ["float32", "float32", "float32", "float32", "float32", "float32"]],
    }

    # Construct the graph.
    graph = {
        "nodes": nodes,
        "arg_nodes": arg_nodes,
        "node_row_ptr": node_row_ptr,
        "heads": heads,
        "attrs": attrs,
    }
    return json.dumps(graph)


def test_simplex_data_transferring():
126    r"""
127    Test the heterogeneous execution of a simple network where data
128    transferring is from the target device to the host processor at runtime.
129    The host processor is always assumed to be cpu, and the device varies.
130    """
131    host = "cpu"
132    target_host = "llvm"
133    host_ctx = tvm.context(host)
134    if not tvm.runtime.enabled(target_host):
135        print("Skip test because llvm is not enabled.")
136        return
137
138    def check_device(device, target_device):
139        if not tvm.runtime.enabled(target_device):
140            print("Skip test because {} is not enabled.".format(target_device))
141            return
142
143        device_ctx = tvm.context(device)
144        graph = get_simplex_graph(host_ctx.device_type, device_ctx.device_type)
145        shape = (4,)
146
147        # Create module for add whose target is the device.
148        tensor_a = te.placeholder(shape, name="A")
149        tensor_b = te.placeholder(shape, name="B")
150        elemwise_add = te.compute(
151            shape, lambda *i: tensor_a(*i) + tensor_b(*i), name="elemwise_add"
152        )
153        target = topi.cpp.TEST_create_target(device)
154        schedule_add = topi.cpp.cuda.schedule_injective(target, [elemwise_add])
155        lower_add = tvm.lower(schedule_add, [tensor_a, tensor_b, elemwise_add], name="elemwise_add")
156
157        # Insert copy. Neither compute nor schedule is required for the copy
158        # node. The compute will be performed at runtime which is just data
159        # copy from the input to the output.
        tensor_copy = te.placeholder(shape, name="__copy")

        # Create module for sub whose target is the host.
        tensor_c = te.placeholder(shape, name="C")
        elemwise_sub = te.compute(
            shape, lambda *i: tensor_copy(*i) - tensor_c(*i), name="elemwise_sub"
        )
        schedule_sub = te.create_schedule(elemwise_sub.op)
        lower_sub = tvm.lower(
            schedule_sub, [tensor_copy, tensor_c, elemwise_sub], name="elemwise_sub"
        )

        target_flist = {target_device: lower_add, target_host: lower_sub}
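        # tvm.build accepts the target -> lowered-functions dict built above and,
        # together with target_host, packages the device kernel and the host
        # kernel into a single runtime module.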
        mhost = tvm.build(target_flist, target_host=target_host)
        ctx = [host_ctx, device_ctx]
        mod = graph_runtime.create(graph, mhost, ctx)
        params = {}
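        # Note that the placeholder variables are rebound to NumPy arrays below;
        # each placeholder's dtype is read before the rebinding, and the arrays
        # are then reused for the reference computation in the assertion.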
177        params["A"] = tensor_a = np.random.uniform(size=shape).astype(tensor_a.dtype)
178        params["B"] = tensor_b = np.random.uniform(size=shape).astype(tensor_b.dtype)
179        params["C"] = tensor_c = np.random.uniform(size=shape).astype(tensor_c.dtype)
180        mod.set_input(**params)
181        mod.run()
182        out = mod.get_output(0, tvm.nd.empty(shape))
183        np.testing.assert_equal(out.asnumpy(), (tensor_a + tensor_b) - tensor_c)
184
185    dev_tar = {"cuda": "cuda", "opencl": "opencl"}
186    for device, target in dev_tar.items():
187        with tvm.target.Target(device):
188            check_device(device, target)
189
190
191def get_duplex_graph(host_dev_type, device_dev_type):
192    r""" Return the hand-crafted json object where two copy nodes are inserted.
193    Data transferring happens back-and-forth between the target device and CPU.
194    The network is constructed as following:
195                 A    B
196                  \  /
197             elemwise_add  (gpu)
198                     \
199                     copy        C
200                       \        /
201                      elemwise_sub  (cpu)
202                         \
203                         copy          D
204                           \          /
205                           elemwise_add  (gpu)
206
207    Parameters
208    ----------
209    host_dev_type : int
210        The device type of the host processor, e.g. cpu.
211    device_dev_type : int
212        The device type of the device processor, e.g. gpu, opencl, etc.
213
214    Returns
215    -------
216    json : json
217        A json encoded object.
218    """
    # Construct each node in the graph.
    var_a = {"op": "null", "name": "A", "inputs": []}
    var_b = {"op": "null", "name": "B", "inputs": []}
    elemwise_add0 = {
        "op": "tvm_op",
        "name": "elemwise_add0",
        "attrs": {
            "flatten_data": "1",
            "func_name": "elemwise_add0",
            "num_inputs": "2",
            "num_outputs": "1",
        },
        "inputs": [[0, 0, 0], [1, 0, 0]],
    }
    copy_add_sub = {
        "op": "tvm_op",
        "name": "__copy_add_to_sub",
        "attrs": {
            "flatten_data": "0",
            "func_name": "__copy",
            "num_inputs": "1",
            "num_outputs": "1",
        },
        "inputs": [[2, 0, 0]],
    }
    var_c = {"op": "null", "name": "C", "inputs": []}
    elemwise_sub = {
        "op": "tvm_op",
        "name": "elemwise_sub",
        "attrs": {
            "flatten_data": "0",
            "func_name": "elemwise_sub",
            "num_inputs": "2",
            "num_outputs": "1",
        },
        "inputs": [[3, 0, 0], [4, 0, 0]],
    }
    copy_sub_add = {
        "op": "tvm_op",
        "name": "__copy_sub_to_add",
        "attrs": {
            "flatten_data": "0",
            "func_name": "__copy",
            "num_inputs": "1",
            "num_outputs": "1",
        },
        "inputs": [[5, 0, 0]],
    }
    var_d = {"op": "null", "name": "D", "inputs": []}
    elemwise_add1 = {
        "op": "tvm_op",
        "name": "elemwise_add1",
        "attrs": {
            "flatten_data": "0",
            "func_name": "elemwise_add1",
            "num_inputs": "2",
            "num_outputs": "1",
        },
        "inputs": [[6, 0, 0], [7, 0, 0]],
    }

    # Group the nodes.
    nodes = [
        var_a,
        var_b,
        elemwise_add0,
        copy_add_sub,
        var_c,
        elemwise_sub,
        copy_sub_add,
        var_d,
        elemwise_add1,
    ]
    arg_nodes = [0, 1, 4, 7]
    node_row_ptr = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    heads = [[8, 0, 0]]
    shape = (4,)
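    # The first six device_index entries below mirror the simplex graph; the
    # value copied back from the host, D, and the final add output are placed
    # on the device again, which is what requires the second copy node.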
    attrs = {
        "storage_id": ["list_int", [4, 5, 0, 1, 6, 2, 0, 7, 3]],
        "shape": ["list_shape", [shape, shape, shape, shape, shape, shape, shape, shape, shape]],
        "device_index": [
            "list_int",
            [
                device_dev_type,
                device_dev_type,
                device_dev_type,
                host_dev_type,
                host_dev_type,
                host_dev_type,
                device_dev_type,
                device_dev_type,
                device_dev_type,
            ],
        ],
        "dtype": ["list_int", [0, 0, 0, 0, 0, 0, 0, 0, 0]],
        "dltype": [
            "list_str",
            [
                "float32",
                "float32",
                "float32",
                "float32",
                "float32",
                "float32",
                "float32",
                "float32",
                "float32",
            ],
        ],
    }

    # Construct the graph.
    graph = {
        "nodes": nodes,
        "arg_nodes": arg_nodes,
        "node_row_ptr": node_row_ptr,
        "heads": heads,
        "attrs": attrs,
    }
    return json.dumps(graph)


def test_duplex_data_transferring():
342    r"""
343    Test the heterogeneous execution of a simple network where data
344    transferring occurs back-and-forth between the target device and host
345    processor.
346    The host processor is always assumed to be cpu, and the target device
347    varies.
348    """
349    host = "cpu"
350    target_host = "llvm"
351    host_ctx = tvm.context(host)
352    if not tvm.runtime.enabled(target_host):
353        print("Skip test because llvm is not enabled.")
354        return
355
356    def check_device(device, target_device):
357        if not tvm.runtime.enabled(target_device):
358            print("Skip test because {} is not enabled.".format(target_device))
359            return
360
361        device_ctx = tvm.context(device)
362        graph = get_duplex_graph(host_ctx.device_type, device_ctx.device_type)
363        shape = (4,)
364
365        # Insert copy nodes for data transferring between add and sub nodes.
366        # Transfers data from gpu to cpu.
367        copy_add_sub = te.placeholder(shape, name="__copy0")
368        # Transfers data from cpu to gpu.
369        copy_sub_add = te.placeholder(shape, name="__copy1")
370
371        # Create a module containing adds on the device.
372        tensor_a = te.placeholder(shape, name="A")
373        tensor_b = te.placeholder(shape, name="B")
374        tensor_d = te.placeholder(shape, name="D")
375        elemwise_add0 = te.compute(
376            shape, lambda *i: tensor_a(*i) + tensor_b(*i), name="elemwise_add0"
377        )
378        elemwise_add1 = te.compute(
379            shape, lambda *i: copy_sub_add(*i) + tensor_d(*i), name="elemwise_add1"
380        )
381        target = topi.cpp.TEST_create_target(device)
382        add_schedule0 = topi.cpp.cuda.schedule_injective(target, [elemwise_add0])
383        lower_add0 = tvm.lower(
384            add_schedule0, [tensor_a, tensor_b, elemwise_add0], name="elemwise_add0"
385        )
386        add_schedule1 = topi.cpp.cuda.schedule_injective(target, [elemwise_add1])
387        lower_add1 = tvm.lower(
388            add_schedule1, [tensor_d, copy_sub_add, elemwise_add1], name="elemwise_add1"
389        )
390        # Create module for sub whose target is the host.
391        tensor_c = te.placeholder(shape, name="C")
392        elemwise_sub = te.compute(
393            shape, lambda *i: copy_add_sub(*i) - tensor_c(*i), name="elemwise_sub"
394        )
395        sub_schedule = te.create_schedule(elemwise_sub.op)
396        lower_sub = tvm.lower(
397            sub_schedule, [copy_add_sub, tensor_c, elemwise_sub], name="elemwise_sub"
398        )
399
400        lower_add0.update(lower_add1)
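        # lower_add0 now holds both lowered add functions, so the device target
        # compiles both kernels while the host target only gets the sub kernel.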
        target_flist = {target_device: lower_add0, target_host: lower_sub}
        mhost = tvm.build(target_flist, target_host=target_host)
        ctx = [host_ctx, device_ctx]
        params = {}
        params["A"] = tensor_a = np.random.uniform(size=shape).astype(tensor_a.dtype)
        params["B"] = tensor_b = np.random.uniform(size=shape).astype(tensor_b.dtype)
        params["C"] = tensor_c = np.random.uniform(size=shape).astype(tensor_c.dtype)
        params["D"] = tensor_d = np.random.uniform(size=shape).astype(tensor_d.dtype)

        def check_verify():
            mod = graph_runtime.create(graph, mhost, ctx)
            mod.set_input(**params)
            mod.run()
            out = mod.get_output(0, tvm.nd.empty(shape))
            np.testing.assert_equal(out.asnumpy(), tensor_a + tensor_b - tensor_c + tensor_d)

        def check_load_module():
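            # Round-trip the compiled artifacts through disk: export the shared
            # library and the graph json, reload both, and check that the
            # reloaded module still produces the expected result.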
            temp = util.tempdir()
            path_lib = temp.relpath("deploy.so")
            mhost.export_library(path_lib)
            with open(temp.relpath("deploy.json"), "w") as out_file:
                out_file.write(graph)
            loaded_lib = tvm.runtime.load_module(path_lib)
            loaded_graph = open(temp.relpath("deploy.json")).read()
            mod = graph_runtime.create(loaded_graph, loaded_lib, ctx)
            mod.set_input(**params)
            mod.run()
            out = mod.get_output(0, tvm.nd.empty(shape))
            np.testing.assert_equal(out.asnumpy(), tensor_a + tensor_b - tensor_c + tensor_d)

        check_verify()
        check_load_module()

    dev_tar = {"cuda": "cuda", "opencl": "opencl"}
    for device, target in dev_tar.items():
        with tvm.target.Target(device):
            check_device(device, target)


if __name__ == "__main__":
    test_simplex_data_transferring()
    test_duplex_data_transferring()