1# Licensed to the Apache Software Foundation (ASF) under one
2# or more contributor license agreements.  See the NOTICE file
3# distributed with this work for additional information
4# regarding copyright ownership.  The ASF licenses this file
5# to you under the Apache License, Version 2.0 (the
6# "License"); you may not use this file except in compliance
7# with the License.  You may obtain a copy of the License at
8#
9#   http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing,
12# software distributed under the License is distributed on an
13# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14# KIND, either express or implied.  See the License for the
15# specific language governing permissions and limitations
16# under the License.
17"""
18.. _opt-gemm:
19
20How to optimize GEMM on CPU
21===========================
22**Author**: `Jian Weng <https://github.com/were>`_, \
23            `Ruofei Yu <https://github.com/yuruofeifei>`_
24
(TL;DR) TVM provides abstract interfaces which allow users to describe an algorithm and the
algorithm's implementation organization (the so-called schedule) separately. Typically, writing
an algorithm directly in a high-performance way hurts its readability and modularity. Also,
trying various seemingly promising schedules is time-consuming. With the help of TVM, we can
try these schedules efficiently to enhance performance.
30
In this tutorial, we will demonstrate how to use TVM to optimize square matrix multiplication
and achieve a 200x speedup over the baseline by simply adding 18 extra lines of code.
33
There are two important optimizations for compute-intensive applications executed on CPU:
    1. Increase the cache hit rate of memory accesses. Both complex numerical computation and
       hot-spot memory accesses can be accelerated by a high cache hit rate. This requires us to
       transform the original memory access pattern into one that fits the cache policy.
    2. SIMD (Single Instruction, Multiple Data), also known as the vector processing unit. On
       each step, a small batch of data, rather than a single value, is processed. This requires
       us to transform the data access pattern in the loop body into a uniform pattern so that
       the LLVM backend can lower it to SIMD.
42
Actually, all the methodologies used in this tutorial are a subset of the tricks mentioned in this
`repo <https://github.com/flame/how-to-optimize-gemm>`_. Some of them have been applied by the TVM
abstraction automatically, but some of them cannot be simply applied due to TVM constraints.
46
All the experiment results mentioned below were obtained on a 2015 15-inch MacBook equipped with an
Intel i7-4770HQ CPU. The cache line size should be 64 bytes for all x86 CPUs.
49"""
50
51################################################################################################
52# Preparation and Baseline
53# ------------------------
# In this tutorial, we will demonstrate how to use TVM to optimize matrix multiplication.
# Before the actual demonstration, we first define these variables.
# Then we write a baseline implementation, the simplest way to write a matrix multiplication in TVM.
57
58import tvm
59import numpy
60import timeit
61
# The size of the matrices
# (M, K) x (K, N)
# You are free to try out different shapes; sometimes TVM optimization outperforms numpy with MKL.
65M = 1024
66K = 1024
67N = 1024
68
69# The default tensor type in tvm
70dtype = "float32"
71
# Use the Intel AVX2 (Advanced Vector Extensions) ISA for SIMD.
# To get the best performance, please change the following line
# to "llvm -mcpu=core-avx2", or the specific type of CPU you use.
75target = 'llvm'
76ctx = tvm.context(target, 0)
77
# Randomly generated tensors for testing
79a = tvm.nd.array(numpy.random.rand(M, K).astype(dtype), ctx)
80b = tvm.nd.array(numpy.random.rand(K, N).astype(dtype), ctx)
81
82np_repeat = 100
np_running_time = timeit.timeit(setup='import numpy\n'
                                      'M = ' + str(M) + '\n'
                                      'K = ' + str(K) + '\n'
                                      'N = ' + str(N) + '\n'
                                      'dtype = "float32"\n'
                                      'a = numpy.random.rand(M, K).astype(dtype)\n'
                                      'b = numpy.random.rand(K, N).astype(dtype)\n',
                                stmt='answer = numpy.dot(a, b)',
                                number=np_repeat)
print("Numpy running time: %f" % (np_running_time / np_repeat))
93
94answer = numpy.dot(a.asnumpy(), b.asnumpy())
95
96# Algorithm
97k = tvm.reduce_axis((0, K), 'k')
98A = tvm.placeholder((M, K), name='A')
99B = tvm.placeholder((K, N), name='B')
100C = tvm.compute(
101           (M, N),
102           lambda x, y: tvm.sum(A[x, k] * B[k, y], axis=k),
103           name='C')
104
105# Default schedule
106s = tvm.create_schedule(C.op)
107func = tvm.build(s, [A, B, C], target=target, name='mmult')
108assert func
109
110c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx)
111func(a, b, c)
112tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
113
114evaluator = func.time_evaluator(func.entry_name, ctx, number=1)
115print('Baseline: %f' % evaluator(a, b, c).mean)
116
117################################################################################################
# In TVM, we can always inspect the lower-level IR to debug or optimize our schedule.
# Here is the generated IR using our baseline schedule.
120
121print(tvm.lower(s, [A, B, C], simple_mode=True))
122
123################################################################################################
124# Blocking
125# --------
# An important trick to enhance the cache hit rate is blocking --- data chunks are computed
# block by block. The memory accesses inside a block form a small neighbourhood with high
# memory locality. In this tutorial, we pick 32 as the blocking factor, so one block fills
# 32 * 32 * sizeof(float), i.e. 4KB, of the 32KB L1 data cache.
130
131bn = 32
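
# A quick sanity check of the claim above: one bn x bn float32 block occupies
# bn * bn * 4 bytes, i.e. 4 KB, which fits comfortably into a 32 KB L1 data cache.
print('Block working set: %d KB' % (bn * bn * 4 // 1024))
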
132s = tvm.create_schedule(C.op)
133
134# Blocking by loop tiling
135xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
136k, = s[C].op.reduce_axis
137ko, ki = s[C].split(k, factor=4)
138
139# Hoist reduction domain outside the blocking loop
140s[C].reorder(xo, yo, ko, ki, xi, yi)
141
142func = tvm.build(s, [A, B, C], target=target, name='mmult')
143assert func
144
c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx)
146func(a, b, c)
147tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
148
# By simply tiling the loop 32x32 and hoisting ko, ki outside the blocking loops,
# we can see a big speedup compared with the baseline.
151evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
152print('Opt1: %f' % evaluator(a, b, c).mean)
153
154################################################################################################
155# Here is the generated IR after blocking.
156
157print(tvm.lower(s, [A, B, C], simple_mode=True))
158
159###################################################################################################
160# Vectorization
161# -------------
# Another important trick is vectorization. When the memory access pattern is uniform,
# the compiler can detect this pattern and pass the continuous memory to the vector processor. In TVM,
# we can use the `vectorize` interface to hint this pattern to the compiler, so that it can be accelerated vastly.
#
# In this tutorial, we choose to vectorize the inner loop row data since it is cache friendly.
167
168s = tvm.create_schedule(C.op)
169xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
170k, = s[C].op.reduce_axis
171ko, ki = s[C].split(k, factor=4)
172
173s[C].reorder(xo, yo, ko, ki, xi, yi)
174
175# Vectorization
176s[C].vectorize(yi)
177
178func = tvm.build(s, [A, B, C], target=target, name='mmult')
179assert func
180
c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx)
182func(a, b, c)
183tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
184
185evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
186print('Opt2: %f' % evaluator(a, b, c).mean)
187
188################################################################################################
189# Here is the generated IR after vectorization.
190
191print(tvm.lower(s, [A, B, C], simple_mode=True))
192
193###################################################################################################
194# Loop Permutation
195# ----------------
# If we look at the above IR, we can see that the inner loop row data is vectorized.
# Next, let us look at the access pattern of A. In the current schedule, A is accessed
# column by column, which is not cache friendly. If we change the nested loop order of ki
# and the inner axis xi, the access pattern for the A matrix will be more cache friendly.
201
202s = tvm.create_schedule(C.op)
203xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
204k, = s[C].op.reduce_axis
205ko, ki = s[C].split(k, factor=4)
206
207# re-ordering
208s[C].reorder(xo, yo, ko, xi, ki, yi)
209s[C].vectorize(yi)
210
211func = tvm.build(s, [A, B, C], target=target, name='mmult')
212assert func
213
c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx)
215func(a, b, c)
216tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
217
218evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
219print('Opt3: %f' % evaluator(a, b, c).mean)
220
221################################################################################################
222# Here is the generated IR after loop permutation.
223
224print(tvm.lower(s, [A, B, C], simple_mode=True))
225
226###################################################################################################
227# Array Packing
228# -------------
# Another important trick is array packing. This trick reorders the storage dimensions of the
# array so that the continuous access pattern on a certain dimension becomes a sequential pattern
# after flattening.
232#
233# .. image:: https://github.com/dmlc/web-data/raw/master/tvm/tutorial/array-packing.png
234#      :align: center
235#      :scale: 100%
236#
237
238
239###################################################################################################
# Just as shown in the figure above, after blocking the computations, we can observe the array
# access pattern of B (after flattening), which is regular but discontinuous. We expect that after
# some transformation we can get a continuous access pattern. We can reorder a [16][16] array to
# a [16/4][16][4] array, so that the access pattern of B will be sequential when grabbing
# the corresponding value from the packed array.
245#
246
247# We have to re-write the algorithm slightly.
packedB = tvm.compute((N // bn, K, bn), lambda x, y, z: B[y, x * bn + z], name='packedB')
249C = tvm.compute((M, N),
250                lambda x, y: tvm.sum(A[x, k] * packedB[y // bn, k, tvm.indexmod(y, bn)], axis=k),
                name='C')
252
253s = tvm.create_schedule(C.op)
254
255xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
256k, = s[C].op.reduce_axis
257ko, ki = s[C].split(k, factor=4)
258
259s[C].reorder(xo, yo, ko, xi, ki, yi)
260s[C].vectorize(yi)
261
262x, y, z = s[packedB].op.axis
263s[packedB].vectorize(z)
264s[packedB].parallel(x)
265
266func = tvm.build(s, [A, B, C], target=target, name='mmult')
267assert func
268
c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx)
270func(a, b, c)
271tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
272
273evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
274print('Opt4: %f' % evaluator(a, b, c).mean)
275
276################################################################################################
277# Here is the generated IR after array packing.
278
279print(tvm.lower(s, [A, B, C], simple_mode=True))
280
281################################################################################################
282# Write cache for blocks
283# ----------------------
# After blocking, the program writes the result to C block by block, and the access pattern
# is not sequential. So we can use a sequential cache array to hold the block results and
# write them to C once all the block results are ready.
287#
288
289s = tvm.create_schedule(C.op)
290
291# Allocate write cache
292CC = s.cache_write(C, 'global')
293
294xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
295
296# Write cache is computed at yo
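# (compute_at moves the computation of CC inside the yo loop of C, so each bn x bn
# block is accumulated in the small sequential buffer before being written out to C)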
297s[CC].compute_at(s[C], yo)
298
299# New inner axes
300xc, yc = s[CC].op.axis
301
302k, = s[CC].op.reduce_axis
303ko, ki = s[CC].split(k, factor=4)
304s[CC].reorder(ko, xc, ki, yc)
305s[CC].unroll(ki)
306s[CC].vectorize(yc)
307
308x, y, z = s[packedB].op.axis
309s[packedB].vectorize(z)
310s[packedB].parallel(x)
311
312func = tvm.build(s, [A, B, C], target=target, name='mmult')
313assert func
314
c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx)
316func(a, b, c)
317tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
318
319evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
320print('Opt5: %f' % evaluator(a, b, c).mean)
321
322################################################################################################
# Here is the generated IR after write caching.
324
325print(tvm.lower(s, [A, B, C], simple_mode=True))
326
327###################################################################################################
328# Parallel
329# --------
# Furthermore, we can also utilize multi-core processors to do thread-level parallelization.
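# As a quick, machine-dependent check, we report how many CPU cores are available; the size
# of the TVM runtime thread pool can typically be capped via the TVM_NUM_THREADS environment
# variable (set before the process starts).
import multiprocessing
print('Available CPU cores: %d' % multiprocessing.cpu_count())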
331
332s = tvm.create_schedule(C.op)
333
334CC = s.cache_write(C, 'global')
335
336xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
337
338s[CC].compute_at(s[C], yo)
339
340xc, yc = s[CC].op.axis
341
342k, = s[CC].op.reduce_axis
343ko, ki = s[CC].split(k, factor=4)
344s[CC].reorder(ko, xc, ki, yc)
345s[CC].unroll(ki)
346s[CC].vectorize(yc)
347
348# parallel
349s[C].parallel(xo)
350
351x, y, z = s[packedB].op.axis
352s[packedB].vectorize(z)
353s[packedB].parallel(x)
354
func = tvm.build(s, [A, B, C], target=target, name='mmult')
356assert func
357
c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx)
359func(a, b, c)
360tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
361
362evaluator = func.time_evaluator(func.entry_name, ctx, number=50)
363opt6_time = evaluator(a, b, c).mean
364print('Opt6: %f' % opt6_time)
365
366################################################################################################
367# Here is the generated IR after parallelization.
368
369print(tvm.lower(s, [A, B, C], simple_mode=True))
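
################################################################################################
# Finally, as a rough and machine-dependent comparison (timings vary across runs and machines),
# we can relate the parallelized kernel to the numpy baseline measured at the beginning.

print('Opt6 time / numpy time: %.2f' % (opt6_time / (np_running_time / np_repeat)))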
370
371###################################################################################################
372
373##################################################################################################
374# Summary
375# -------
# After applying the above simple optimizations with only 18 lines of code,
# our generated code can achieve 60% of the `numpy` performance with MKL.
# Note that the outputs on the web page reflect the running times on a non-exclusive
# Docker container, so they are *unreliable*. You are highly encouraged to run the
# tutorial yourself to observe the performance gain achieved by TVM.
381