# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=import-self, invalid-name, unused-argument, too-many-lines, len-as-condition
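"""Benchmark a uint8 x int8 fully-connected (GEMM) workload with 16-bit accumulation,
tensorized with the AVX-512 dot_16x1x16_uint8_int8_int16 intrinsic from TOPI."""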
import tvm
import numpy as np
from topi.x86.tensor_intrin import dot_16x1x16_uint8_int8_int16


def benchmark_fc_int8_acc16():
    m = 128
    n = 128
    k = 128

    X = tvm.placeholder((m, k), name='X', dtype="uint8")
    W = tvm.placeholder((n, k), name='W', dtype="int8")

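    # Assumed theoretical peak (Gops/s) of one AVX-512 core with 16-bit accumulation:
    # 512-bit vectors hold 32 int16 lanes, each multiply-add counts as 2 ops, and the
    # remaining factors of 2 stand in for the assumed issue width and clock of the
    # target CPU. The GEMM itself performs 2*n*m*k operations (a multiply and an add
    # per term).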
    peak = 512/16*2*2*2
    gops_per_mm = 2*n*m*k
    print("Peak {} Gops/s \n".format(peak))

    def verify(target="llvm -mcpu=skylake-avx512"):
        if not tvm.module.enabled(target):
            print("skip because %s is not enabled..." % target)
            return

        ctx = tvm.context(target, 0)
        X = tvm.placeholder((m, k), name='X', dtype="uint8")
        W = tvm.placeholder((n, k), name='W', dtype="int8")
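        # dot_16x1x16_uint8_int8_int16 returns a tensor intrinsic that, per its TOPI
        # definition, dots a 2-element uint8 vector against 128 2-element int8 weight
        # vectors and accumulates the results into 128 int16 output lanes.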
        pc = dot_16x1x16_uint8_int8_int16()
        ak = tvm.reduce_axis((0, k), name='k')

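        # Weights are consumed in a pre-packed layout of shape (n//128, 128*(k//2), 2),
        # so each intrinsic call reads 128 output channels x 2 reduction elements
        # contiguously (see the packing loop below for the exact index mapping).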
        packedW = tvm.placeholder((n//128, 128*(k//2), 2), name='packedW', dtype="int8")
        t_fc = tvm.compute(
            (m, n),
            lambda i, j: tvm.sum(
                X[i, ak].astype("int16") *
                packedW[j//128, (ak//2)*128 + j%128, ak%2].astype("int16"),
                axis=ak),
            name="F")

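        # Schedule: split the output dimension by 128 and the reduction by 2 so that the
        # innermost (a_yi, a_ki) loop nest matches the intrinsic's 128 int16 lanes and
        # 2-element reduction; the remaining splits and the reorder only block the outer
        # loops for locality.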
        t_sch = tvm.create_schedule(t_fc.op)
        a_x, a_y = t_fc.op.axis
        a_k, = t_fc.op.reduce_axis

        a_yo, a_yi = t_sch[t_fc].split(a_y, factor=128)
        a_ko, a_ki = t_sch[t_fc].split(a_k, factor=2)

        a_xo, a_xi = t_sch[t_fc].split(a_x, factor=128)
        a_koo, a_koi = t_sch[t_fc].split(a_ko, factor=32)
        t_sch[t_fc].reorder(a_yo, a_xo, a_koo, a_xi, a_koi, a_yi, a_ki)

        t_sch[t_fc].tensorize(a_yi, pc)
        # print(tvm.lower(t_sch, [X, packedW, t_fc], simple_mode=True))
        t_func = tvm.build(t_sch, [X, packedW, t_fc], target, name="intrinsic")
        t_evaluator = t_func.time_evaluator(t_func.entry_name, ctx, number=10)

        # generate the plain data
        a_ = np.random.uniform(1, 10, size=(m, k)).astype("uint8")
        b_ = np.random.uniform(1, 10, size=(n, k)).astype("int8")

        packW = np.random.uniform(1, 10, size=(n//128, 128*(k//2), 2)).astype("int8")
        # In a real workload this packing happens once, in a pre-compute stage.
        for r_idx in range(n//128):
            for s_idx in range(128*(k//2)):
                for t_idx in range(2):
                    packW[r_idx][s_idx][t_idx] = b_[r_idx*128+s_idx%128][s_idx//128*2+t_idx]
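        # The same packing, written as a (hypothetical) vectorized NumPy equivalent:
        #   packW = b_.reshape(n//128, 128, k//2, 2).transpose(0, 2, 1, 3) \
        #             .reshape(n//128, 128*(k//2), 2)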

        x = tvm.nd.array(a_, ctx)
        w = tvm.nd.array(packW, ctx)
        y = tvm.nd.array(np.zeros((m, n), dtype="int16"), ctx)

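        # Time the kernel and check it against a NumPy reference. With inputs in [1, 10]
        # and k = 128, the largest possible accumulator value is 10*10*128 = 12800, so
        # the int16 accumulation cannot overflow here.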
        result = t_evaluator(x, w, y)
        gops_per_sec = gops_per_mm/result.mean/1e9
        tvm.testing.assert_allclose(
            y.asnumpy(), np.dot(a_, b_.T), rtol=1e-5)
        print('Tensorization: running time: {:.3f} ms, {:.2f} Gops/s, efficiency: {:.2f}.'.format(
            result.mean*1000, gops_per_sec, gops_per_sec/peak))
        #t_func.export_library("gemm_tensorize.o")

    verify()


if __name__ == "__main__":
    benchmark_fc_int8_acc16()