# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=import-self, invalid-name, unused-argument, too-many-lines, len-as-condition
import tvm
import numpy as np
from topi.x86.tensor_intrin import dot_16x1x16_uint8_int8_int16


def benchmark_fc_int8_acc16():
    m = 128
    n = 128
    k = 128

    X = tvm.placeholder((m, k), name='X', dtype="uint8")
    W = tvm.placeholder((n, k), name='W', dtype="int8")

    # Rough theoretical peak for 16-bit accumulation on AVX-512 Skylake:
    # 512-bit vectors / 16-bit lanes = 32 lanes, x2 for multiply+add,
    # x2 vector units, x ~2 GHz clock -> 256 Gops/s.
    peak = 512/16*2*2*2
    # One multiply and one add per MAC, m*n*k MACs per matmul.
    gops_per_mm = 2*n*m*k
    print("Peak {} Gops/s\n".format(peak))

    def verify(target="llvm -mcpu=skylake-avx512"):
        if not tvm.module.enabled(target):
            print("skip because %s is not enabled..." % target)
            return

        ctx = tvm.context(target, 0)
        X = tvm.placeholder((m, k), name='X', dtype="uint8")
        W = tvm.placeholder((n, k), name='W', dtype="int8")
        pc = dot_16x1x16_uint8_int8_int16()
        ak = tvm.reduce_axis((0, k), name='k')

        # W is consumed in a packed layout: 128 output channels per block,
        # with pairs of consecutive reduction elements stored innermost so
        # the intrinsic can multiply-add two int8 values per int16 lane.
        packedW = tvm.placeholder((n//128, 128*(k//2), 2),
                                  name='packedW', dtype="int8")
        t_fc = tvm.compute((m, n), lambda i, j: tvm.sum(
            X[i, ak].astype("int16") *
            packedW[j//128, (ak//2)*128 + j%128, ak%2].astype("int16"),
            axis=ak), name="F")

        t_sch = tvm.create_schedule(t_fc.op)
        a_x, a_y = t_fc.op.axis
        a_k, = t_fc.op.reduce_axis

        a_yo, a_yi = t_sch[t_fc].split(a_y, factor=128)
        a_ko, a_ki = t_sch[t_fc].split(a_k, factor=2)

        a_xo, a_xi = t_sch[t_fc].split(a_x, factor=128)
        a_koo, a_koi = t_sch[t_fc].split(a_ko, factor=32)
        t_sch[t_fc].reorder(a_yo, a_xo, a_koo, a_xi, a_koi, a_yi, a_ki)

        # Replace the innermost output block with the dot-product intrinsic.
        t_sch[t_fc].tensorize(a_yi, pc)
        # print(tvm.lower(t_sch, [X, packedW, t_fc], simple_mode=True))
        t_func = tvm.build(t_sch, [X, packedW, t_fc], target, name="intrinsic")
        t_evaluator = t_func.time_evaluator(t_func.entry_name, ctx, number=10)

        # generate the plain data
        a_ = np.random.uniform(1, 10, size=(m, k)).astype("uint8")
        b_ = np.random.uniform(1, 10, size=(n, k)).astype("int8")

        packW = np.random.uniform(1, 10, size=(n//128, 128*(k//2), 2)).astype("int8")
        # Pack W into the intrinsic's layout; in a real pipeline this would
        # happen once, in a pre-compute stage.
        for r_idx in range(n//128):
            for s_idx in range(128*(k//2)):
                for t_idx in range(2):
                    packW[r_idx][s_idx][t_idx] = \
                        b_[r_idx*128 + s_idx % 128][s_idx//128*2 + t_idx]

        x = tvm.nd.array(a_, ctx)
        w = tvm.nd.array(packW, ctx)
        y = tvm.nd.array(np.zeros((m, n), dtype="int16"), ctx)

        result = t_evaluator(x, w, y)
        gops_per_sec = gops_per_mm/result.mean/1e9
        tvm.testing.assert_allclose(
            y.asnumpy(), np.dot(a_, b_.T), rtol=1e-5)
        print('Tensorization: running time: {:.3f} ms, {:.2f} Gops/s, '
              'efficiency: {:.2f}.'.format(
                  result.mean*1000, gops_per_sec, gops_per_sec/peak))
        #t_func.export_library("gemm_tensorize.o")

    verify()


if __name__ == "__main__":
    benchmark_fc_int8_acc16()
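

# A NumPy-only sketch of the weight relayout used in verify() above, kept
# here for reference. The helper and its name `pack_weights_vectorized` are
# illustrative additions, not part of the benchmark; they assume the same
# 128-channel blocking and pairwise-k packing as the triple loop above.
# The reshape/transpose form makes the layout explicit:
#   packedW[j//128, (c//2)*128 + j%128, c%2] == W[j, c]
def pack_weights_vectorized(b, n, k):
    """Vectorized equivalent of the packing loop in verify()."""
    assert b.shape == (n, k) and n % 128 == 0 and k % 2 == 0
    # (n, k) -> (n//128, 128, k//2, 2): split output channels into blocks
    # of 128 and reduction elements into consecutive pairs, then move the
    # pair index (k//2) ahead of the in-block channel index (128).
    return (b.reshape(n//128, 128, k//2, 2)
             .transpose(0, 2, 1, 3)
             .reshape(n//128, 128*(k//2), 2))
# For b_ and packW as built in verify(),
# np.array_equal(pack_weights_vectorized(b_, 128, 128), packW) should hold.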