1# Licensed to the Apache Software Foundation (ASF) under one
2# or more contributor license agreements.  See the NOTICE file
3# distributed with this work for additional information
4# regarding copyright ownership.  The ASF licenses this file
5# to you under the Apache License, Version 2.0 (the
6# "License"); you may not use this file except in compliance
7# with the License.  You may obtain a copy of the License at
8#
9#   http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing,
12# software distributed under the License is distributed on an
13# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14# KIND, either express or implied.  See the License for the
15# specific language governing permissions and limitations
16# under the License.
17from shutil import which
18import json
19import pytest
20import sys
21import numpy as np
22
23import tvm
24from tvm import relay
25from tvm import module as _tvm_module
26from tvm.contrib import util
27
# Module-level scratch directory shared by all tests below; generated
# headers, subgraph JSON files, and shared libraries are written here.
tmp_path = util.tempdir()
29
30
def generate_csource_module():
    """Mock the codegen with an external library (e.g., CBLAS/cuDNN)

    Returns a CSourceModule carrying hand-written C++ that implements the
    two external subgraph symbols (``json_rt_0`` and ``json_rt_1``)
    referenced by the graph JSON built in get_whole_graph_json().  Each
    subgraph computes ``((in0 + in1) - in2) * in3`` element-wise over
    10x10 float buffers (see the gcc_*_2 / gcc_*_1 / gcc_*_0 call chain).
    """

    code = r'''
    #include <tvm/runtime/c_runtime_api.h>
    #include <dlpack/dlpack.h>
    #include <cstdint>
    #include <cstring>
    #include <iostream>

    #define GCC_BINARY_OP_1D(p_ID_, p_OP_, p_DIM1_)           \
      extern "C" void p_ID_(float* a, float* b, float* out) { \
        for (int64_t i = 0; i < p_DIM1_; ++i) {               \
          out[i] = a[i] p_OP_ b[i];                           \
        }                                                     \
      }

    #define GCC_BINARY_OP_2D(p_ID_, p_OP_, p_DIM1_, p_DIM2_)  \
      extern "C" void p_ID_(float* a, float* b, float* out) { \
        for (int64_t i = 0; i < p_DIM1_; ++i) {               \
          for (int64_t j = 0; j < p_DIM2_; ++j) {             \
            int64_t k = i * p_DIM2_ + j;                      \
            out[k] = a[k] p_OP_ b[k];                         \
          }                                                   \
        }                                                     \
      }
    GCC_BINARY_OP_2D(gcc_1_0, *, 10, 10);
    GCC_BINARY_OP_2D(gcc_1_1, -, 10, 10);
    GCC_BINARY_OP_2D(gcc_1_2, +, 10, 10);

    extern "C" void gcc_1_(float* gcc_input4, float* gcc_input5,
                           float* gcc_input6, float* gcc_input7, float* out) {
      float* buf_0 = (float*)malloc(4 * 100);
      float* buf_1 = (float*)malloc(4 * 100);
      gcc_1_2(gcc_input4, gcc_input5, buf_0);
      gcc_1_1(buf_0, gcc_input6, buf_1);
      gcc_1_0(buf_1, gcc_input7, out);
      free(buf_0);
      free(buf_1);
    }

    extern "C" int json_rt_1(TVMValue* value, int* type_code, int nargs) {
      if (nargs != 5) {
        printf("Expect 5 args, but get %d", nargs);
        return 1;
      }
      DLTensor* arg0 = static_cast<DLTensor*>(value[0].v_handle);
      DLTensor* arg1 = static_cast<DLTensor*>(value[1].v_handle);
      DLTensor* arg2 = static_cast<DLTensor*>(value[2].v_handle);
      DLTensor* arg3 = static_cast<DLTensor*>(value[3].v_handle);
      DLTensor* out = static_cast<DLTensor*>(value[4].v_handle);
      gcc_1_(static_cast<float*>(arg0->data), static_cast<float*>(arg1->data),
             static_cast<float*>(arg2->data), static_cast<float*>(arg3->data),
             static_cast<float*>(out->data));
      return 0;
    }

    GCC_BINARY_OP_2D(gcc_0_0, *, 10, 10);
    GCC_BINARY_OP_2D(gcc_0_1, -, 10, 10);
    GCC_BINARY_OP_2D(gcc_0_2, +, 10, 10);

    extern "C" void gcc_0_(float* gcc_input0, float* gcc_input1,
                           float* gcc_input2, float* gcc_input3, float* out) {
      float* buf_0 = (float*)malloc(4 * 100);
      float* buf_1 = (float*)malloc(4 * 100);
      gcc_0_2(gcc_input0, gcc_input1, buf_0);
      gcc_0_1(buf_0, gcc_input2, buf_1);
      gcc_0_0(buf_1, gcc_input3, out);
      free(buf_0);
      free(buf_1);
    }

    extern "C" int json_rt_0(TVMValue* value, int* type_code, int nargs) {
      if (nargs != 5) {
        printf("Expect 5 args, but get %d", nargs);
        return 1;
      }
      DLTensor* arg0 = static_cast<DLTensor*>(value[0].v_handle);
      DLTensor* arg1 = static_cast<DLTensor*>(value[1].v_handle);
      DLTensor* arg2 = static_cast<DLTensor*>(value[2].v_handle);
      DLTensor* arg3 = static_cast<DLTensor*>(value[3].v_handle);
      DLTensor* out = static_cast<DLTensor*>(value[4].v_handle);
      gcc_0_(static_cast<float*>(arg0->data), static_cast<float*>(arg1->data),
             static_cast<float*>(arg2->data), static_cast<float*>(arg3->data),
             static_cast<float*>(out->data));
      return 0;
    }
    '''
    # Wrap the source in a CSourceModule; run_extern() later compiles it
    # together with the TVM-generated object file into one shared library.
    csource_module = _tvm_module.csource_module_create(code, "cc")
    return csource_module
121
122
def generate_engine_module():
    """
    Mock the codegen of an external backend with its own runtime engine
    (e.g., MKL-DNN/TensorRT)

    Unlike generate_csource_module(), the kernels are not emitted inline:
    the generated code encodes each subgraph as a small textual graph and
    delegates execution to the Engine class from "gcc_engine.h", which
    gen_gcc_engine() writes into tmp_path before this module is compiled
    (the test passes "-I<tmp_path>" so the header is found).
    """

    code = r'''
    #include <tvm/runtime/c_runtime_api.h>
    #include <dlpack/dlpack.h>
    #include "gcc_engine.h"

    extern "C" void gcc_1_(float* gcc_input4, float* gcc_input5,
            float* gcc_input6, float* gcc_input7, float* out) {

        std::string graph =
            "add_2d,10,10\n"
            "sub_2d,10,10\n"
            "mul_2d,10,10\n";

        Engine engine;
        engine.run(graph, {gcc_input4, gcc_input5, gcc_input6, gcc_input7}, out);
    }


    extern "C" int json_rt_1(TVMValue* value, int* type_code, int nargs) {
        if (nargs != 5) {
            printf("Expect 5 args, but get %d", nargs);
            return 1;
        }
        DLTensor* arg0 = static_cast<DLTensor*>(value[0].v_handle);
        DLTensor* arg1 = static_cast<DLTensor*>(value[1].v_handle);
        DLTensor* arg2 = static_cast<DLTensor*>(value[2].v_handle);
        DLTensor* arg3 = static_cast<DLTensor*>(value[3].v_handle);
        DLTensor* out = static_cast<DLTensor*>(value[4].v_handle);
        gcc_1_(static_cast<float*>(arg0->data), static_cast<float*>(arg1->data),
                static_cast<float*>(arg2->data), static_cast<float*>(arg3->data),
                static_cast<float*>(out->data));
        return 0;
    }

    extern "C" void gcc_0_(float* gcc_input0, float* gcc_input1,
            float* gcc_input2, float* gcc_input3, float* out) {

        std::string graph =
            "add_2d,10,10\n"
            "sub_2d,10,10\n"
            "mul_2d,10,10\n";

        Engine engine;
        engine.run(graph, {gcc_input0, gcc_input1, gcc_input2, gcc_input3}, out);

    }

    extern "C" int json_rt_0(TVMValue* value, int* type_code, int nargs) {
        if (nargs != 5) {
            printf("Expect 5 args, but get %d", nargs);
            return 1;
        }
        DLTensor* arg0 = static_cast<DLTensor*>(value[0].v_handle);
        DLTensor* arg1 = static_cast<DLTensor*>(value[1].v_handle);
        DLTensor* arg2 = static_cast<DLTensor*>(value[2].v_handle);
        DLTensor* arg3 = static_cast<DLTensor*>(value[3].v_handle);
        DLTensor* out = static_cast<DLTensor*>(value[4].v_handle);
        gcc_0_(static_cast<float*>(arg0->data), static_cast<float*>(arg1->data),
                static_cast<float*>(arg2->data), static_cast<float*>(arg3->data),
                static_cast<float*>(out->data));
        return 0;
    }
    '''

    # Emit the vendor-style header that the generated code #includes.
    gen_gcc_engine()
    csource_module = _tvm_module.csource_module_create(code, "cc")
    return csource_module
196
197
def gen_gcc_engine():
    """An example of external backend runtime engine. This is supposed to be provided
      by third-party vendors and included when building the generated external kernel code.

    Writes "gcc_engine.h" into tmp_path.  The header defines element-wise
    add/sub/mul kernels plus an Engine class that parses a comma-separated
    textual graph ("op,dim1,dim2" per line), chains the layers through
    internally allocated buffers, and writes the final layer's result into
    the caller-provided output buffer.
    """

    code = r'''
    #ifndef _GCC_ENGINE_H_
    #define _GCC_ENGINE_H_
    #include <cstdint>
    #include <string>
    #include <sstream>
    #include <vector>

    #define GCC_BINARY_OP_2D(p_ID_, p_OP_)  \
      void p_ID_(int64_t dim1, int64_t dim2, float* a, float* b, float* out) { \
        for (int64_t i = 0; i < dim1; ++i) {                                   \
          for (int64_t j = 0; j < dim2; ++j) {                                 \
            int64_t k = i * dim2 + j;                                          \
            out[k] = a[k] p_OP_ b[k];                                          \
          }                                                                    \
        }                                                                      \
      }
    GCC_BINARY_OP_2D(add_2d, +);
    GCC_BINARY_OP_2D(sub_2d, -);
    GCC_BINARY_OP_2D(mul_2d, *);

    struct Layer {
        void (*op)(int64_t, int64_t, float*, float*, float*);
        std::vector<int64_t> shapes;
        std::vector<float*> args;
    };

    class Engine {
    public:
        float* alloc_buffer(int64_t size) {
            float* buf = (float*)malloc(sizeof(float) * size);
            buffers.push_back(buf);
            return buf;
        }
        void add(std::string op, int64_t dim1, int64_t dim2, float* in1, float* in2, float* out) {
            Layer layer;
            layer.shapes.push_back(dim1);
            layer.shapes.push_back(dim2);
            layer.args.push_back(in1);
            layer.args.push_back(in2);
            layer.args.push_back(out);

            if (op == "add_2d")
                layer.op = &add_2d;
            else if (op == "sub_2d")
                layer.op = &sub_2d;
            else if (op == "mul_2d")
                layer.op = &mul_2d;
            net.push_back(layer);
            return ;
        }

        void run(std::string graph, std::vector<float*> args, float* out) {
            std::stringstream ss(graph);
            std::string line;
            int layer_idx = 0;
            int arg_idx = 0;
            float* buf = nullptr;

            while (std::getline(ss, line, '\n')) {
                std::stringstream ss2(line);
                std::string token;
                std::vector<std::string> attrs;
                while (std::getline(ss2, token, ',')) {
                    attrs.push_back(token);
                }
                int64_t dim1 = stoll(attrs[1]);
                int64_t dim2 = stoll(attrs[2]);
                auto out_buf = this->alloc_buffer(dim1 * dim2);

                if (layer_idx == 0) {
                    this->add(attrs[0], dim1, dim2, args[0], args[1], out_buf);
                    buf = out_buf;
                    arg_idx = 2;
                }
                else {
                    this->add(attrs[0], dim1, dim2, buf, args[arg_idx], out_buf);
                    buf = out_buf;
                    arg_idx++;
                }
                layer_idx++;
            }
            this->net.back().args.back() = out;

            for (auto layer : net) {
                (*layer.op)(layer.shapes[0], layer.shapes[1], layer.args[0], layer.args[1], layer.args[2]);
            }
        }
        ~Engine() {
            for (auto buf : buffers) {
                free(buf);
            }
        }
    private:
        std::vector<Layer> net;
        std::vector<float*> buffers;
    };

    #endif
    '''
    # Place the header in tmp_path; generate_engine_module()'s compilation
    # adds "-I<tmp_path>" so the #include "gcc_engine.h" resolves.
    header_file = tmp_path.relpath("gcc_engine.h")
    with open(header_file, 'w') as f:
        f.write(code)
306
307
def get_synthetic_lib():
    """Build and compile a synthetic Relay module for the tests.

    The module contains two "Primitive" subgraphs (whose compiled symbols
    are later replaced by external implementations of json_rt_0/json_rt_1)
    plus a small add/subtract tail that stays on the TVM code path; the
    three results are concatenated along axis 0.  Returns the compiled
    "llvm" runtime library.
    """
    def _primitive_subgraph(arg_names):
        # A placeholder function marked "Primitive" so it is kept as a
        # single opaque call; its body (copy of the first input) is a stub.
        params = [relay.var(name, shape=(10, 10)) for name in arg_names]
        fn = relay.Function(params, relay.copy(params[0]))
        return fn.set_attribute("Primitive", tvm.expr.IntImm("int32", 1))

    x = relay.var('x', shape=(10, 10))
    weights = [relay.var('w%d' % idx, shape=(10, 10)) for idx in range(8)]

    # Subgraph 0, invoked with x and the first three weights.
    subgraph0 = _primitive_subgraph(
        ['gcc_input0', 'gcc_input1', 'gcc_input2', 'gcc_input3'])
    subgraph0_ret = relay.Call(subgraph0, [x] + weights[0:3])

    # Subgraph 1, invoked with x and the next three weights.
    subgraph1 = _primitive_subgraph(
        ['gcc_input4', 'gcc_input5', 'gcc_input6', 'gcc_input7'])
    subgraph1_ret = relay.Call(subgraph1, [x] + weights[3:6])

    # Ops that remain on the TVM-generated code path.
    add2 = relay.add(x, weights[6])
    sub2 = relay.subtract(add2, weights[7])
    ret = relay.concatenate((subgraph0_ret, subgraph1_ret, sub2), 0)

    func = relay.Function([x] + weights, ret)
    mod = relay.Module.from_expr(func)
    _, lib, _ = relay.build(mod, "llvm")
    return lib
353
def get_whole_graph_json():
    """Assemble the graph-runtime JSON for the whole synthetic network.

    Nodes 0-8 are the data inputs (x, w0..w7); nodes 9 and 10 are the two
    external subgraphs (json_rt_0 / json_rt_1); node 11 is the TVM-fused
    add/subtract/concatenate op whose (30, 10) result is the single head.
    Returns the serialized JSON string.
    """
    def _data_node(name):
        # Placeholder input node.
        return {"op": "null", "name": name, "inputs": []}

    def _op_node(func_name, num_inputs, inputs):
        # A tvm_op node whose node name equals its function symbol.
        return {
            "op": "tvm_op",
            "name": func_name,
            "attrs": {
                "num_outputs": "1",
                "num_inputs": str(num_inputs),
                "func_name": func_name,
                "flatten_data": "0"
            },
            "inputs": inputs
        }

    nodes = [_data_node("x")] + [_data_node("w%d" % idx) for idx in range(8)]
    nodes.append(_op_node("json_rt_0", 4,
                          [[0, 0, 0], [1, 0, 0], [2, 0, 0], [3, 0, 0]]))
    nodes.append(_op_node("json_rt_1", 4,
                          [[0, 0, 0], [4, 0, 0], [5, 0, 0], [6, 0, 0]]))
    nodes.append(_op_node("fused_add_subtract_concatenate", 5,
                          [[9, 0, 0], [10, 0, 0], [0, 0, 0],
                           [7, 0, 0], [8, 0, 0]]))

    graph = {
        "nodes": nodes,
        "arg_nodes": list(range(9)),
        "node_row_ptr": list(range(13)),
        "heads": [[11, 0, 0]],
        "attrs": {
            # All tensors are 10x10 float32 except the concatenated output.
            "shape": ["list_shape", [[10, 10]] * 11 + [[30, 10]]],
            "dltype": ["list_str", ["float32"] * 12],
            "storage_id": ["list_int", list(range(12))],
        },
    }
    return json.dumps(graph)
443
444
def run_extern(label, get_extern_src, **kwargs):
    """Compile the synthetic library with mocked external code and verify it.

    Parameters
    ----------
    label : str
        Tag used to name the intermediate object file and shared library.
    get_extern_src : callable
        Zero-argument function returning a CSourceModule that implements
        the external symbols json_rt_0 / json_rt_1.
    **kwargs
        Forwarded to export_library(); must contain "options", a list of
        extra compiler flags.
    """
    if which("gcc") is None:
        print("Skip test because gcc is not available.")
        # BUGFIX: without this return the function kept going and crashed
        # when gcc was missing instead of skipping the test.
        return

    obj_name = "{}.o".format(label)
    lib_name = "external_{}.so".format(label)

    # Get Json and the compiled library.
    graph_json = get_whole_graph_json()
    lib = get_synthetic_lib()
    lib.save(obj_name)  # NOTE: saved into the current working directory.

    # Library that contains external code; link it against the TVM object.
    csource_module = get_extern_src()
    kwargs["options"] = [obj_name] + kwargs["options"]
    lib_path = tmp_path.relpath(lib_name)
    csource_module.export_library(lib_path, fcompile=False, **kwargs)

    # Load the combined module for execution.
    lib = tvm.module.load(lib_path)
    mod = tvm.contrib.graph_runtime.create(graph_json, lib, tvm.cpu(0))

    # Feed random inputs and execute the whole graph.
    x_data = np.random.rand(10, 10).astype('float32')
    mod.set_input("x", x_data)
    w_data = []
    for i in range(8):
        data = np.random.rand(10, 10).astype('float32')
        w_data.append(data)
        mod.set_input("w" + str(i), data)
    mod.run()
    out = tvm.nd.empty((30, 10), ctx=tvm.cpu())
    out = mod.get_output(0, out)

    # Expected result: the two external ((x + w) - w) * w subgraph chains,
    # then the TVM-fused add/subtract, concatenated along axis 0.
    tvm.testing.assert_allclose(
        out.asnumpy(),
        np.concatenate((((x_data + w_data[0]) - w_data[1]) * w_data[2],
                        ((x_data + w_data[3]) - w_data[4]) * w_data[5],
                        x_data + w_data[6] - w_data[7]),
                       axis=0))
483
484
def test_dso_extern():
    """Exercise the external runtime with plain C source kernels."""
    compile_flags = ["-O2", "-std=c++11"]
    run_extern("lib", generate_csource_module, options=compile_flags)
487
488
def test_engine_extern():
    """Exercise the external runtime with the mock vendor engine."""
    # The engine module #includes "gcc_engine.h" from tmp_path.
    include_flag = "-I" + tmp_path.relpath("")
    compile_flags = ["-O2", "-std=c++11", include_flag]
    run_extern("engine", generate_engine_module, options=compile_flags)
493
def test_json_extern():
    """End-to-end test loading the external "examplejson" module format."""
    if which("gcc") is None:
        print("Skip test because gcc is not available.")
        # BUGFIX: previously fell through and ran the test without gcc.
        return

    # Get subgraph Json.
    subgraph_json = ("json_rt_0\n" +
                     "input 0 10 10\n" +
                     "input 1 10 10\n" +
                     "input 2 10 10\n" +
                     "input 3 10 10\n" +
                     "add 4 inputs: 0 1 shape: 10 10\n" +
                     "sub 5 inputs: 4 2 shape: 10 10\n" +
                     "mul 6 inputs: 5 3 shape: 10 10\n" +
                     "json_rt_1\n" +
                     "input 0 10 10\n" +
                     "input 1 10 10\n" +
                     "input 2 10 10\n" +
                     "input 3 10 10\n" +
                     "add 4 inputs: 0 1 shape: 10 10\n" +
                     "sub 5 inputs: 4 2 shape: 10 10\n" +
                     "mul 6 inputs: 5 3 shape: 10 10")

    subgraph_path = tmp_path.relpath('subgraph.examplejson')
    with open(subgraph_path, 'w') as f:
        f.write(subgraph_json)

    # Get Json and module.
    graph_json = get_whole_graph_json()

    # Import the example-JSON module into the TVM library and export both
    # as one shared library.
    lib = get_synthetic_lib()
    ext_lib = tvm.module.load(subgraph_path, "examplejson")
    lib.import_module(ext_lib)
    lib_name = 'external.so'
    lib_path = tmp_path.relpath(lib_name)
    lib.export_library(lib_path)

    # Load module for execution.
    lib = tvm.module.load(lib_path)
    mod = tvm.contrib.graph_runtime.create(graph_json, lib, tvm.cpu(0))

    # Feed random inputs and execute the whole graph.
    x_data = np.random.rand(10, 10).astype('float32')
    mod.set_input("x", x_data)
    w_data = []
    for i in range(8):
        data = np.random.rand(10, 10).astype('float32')
        w_data.append(data)
        mod.set_input("w" + str(i), data)

    mod.run()
    out = tvm.nd.empty((30, 10), ctx=tvm.cpu())
    out = mod.get_output(0, out)

    # Expected result: the two external ((x + w) - w) * w subgraph chains,
    # then the TVM-fused add/subtract, concatenated along axis 0.
    tvm.testing.assert_allclose(
        out.asnumpy(),
        np.concatenate((((x_data + w_data[0]) - w_data[1]) * w_data[2],
                        ((x_data + w_data[3]) - w_data[4]) * w_data[5],
                        x_data + w_data[6] - w_data[7]),
                       axis=0))
553
554
if __name__ == "__main__":
    # Run each external-runtime scenario in sequence.
    for _test_fn in (test_dso_extern, test_engine_extern, test_json_extern):
        _test_fn()
559