//
// CPUMatMul.cpp
// MNN
//
// Created by MNN on 2018/08/06.
// Copyright © 2018, Alibaba Group Holding Limited
//

#include "CPUMatMul.hpp"
#include "CPUBackend.hpp"
#include "math/Matrix.hpp"
#include "compute/CommonOptFunction.h"
#include "core/Macro.h"
#include "core/Concurrency.h"
#include "core/AutoStorage.h"
#include "math/Vec.hpp"
#include <limits>

using Vec4 = MNN::Math::Vec<float, 4>;
namespace MNN {

CPUMatMul::CPUMatMul(Backend* backend, bool transposeA, bool transposeB, bool transposeC, bool multiThread)
    : Execution(backend), mTransposeA(transposeA), mTransposeB(transposeB), mTransposeC(transposeC), mSupportMultiThread(multiThread) {
    mComputer.reset(new StrassenMatrixComputor(backend, mSupportMultiThread, 5));
}

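// Special schedule for e == 1: C[1, h] = A[1, l] * B, i.e. a row vector times a
// matrix. No packing or Strassen encoding is needed; the work is recorded as a
// single post-function that calls the backend's MNNComputeMatMulForE_1 kernel,
// which receives the thread id and thread count so it can partition the work.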
void CPUMatMul::_scheduleForVecE(int e, int l, int h) {
    int numberThread = mSupportMultiThread ? static_cast<CPUBackend*>(backend())->threadNumber() : 1;
    MNN_ASSERT(e == 1);
    MatMulParam param;
    param.e = 1;
    param.l = l;
    param.h = h;
    param.BTranspose = mTransposeB;
    param.numberThread = numberThread;
    auto func = static_cast<CPUBackend*>(backend())->functions()->MNNComputeMatMulForE_1;
    mPostFunctions.emplace_back(std::make_pair([param, func](
        int tId, const float* A, const float* B, const float* biasPtr, float* C) {
        func(A, B, C, biasPtr, &param, tId);
    }, numberThread));
}

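// Special schedule for h == 1: C[e, 1] = A * B[l, 1], a matrix times a column
// vector. As above, the computation is recorded as one post-function that calls
// MNNComputeMatMulForH_1 directly on the unpacked inputs.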
void CPUMatMul::_scheduleForVec(int e, int l, int h) {
    int numberThread = mSupportMultiThread ? static_cast<CPUBackend*>(backend())->threadNumber() : 1;
    MatMulParam param;
    param.e = e;
    param.l = l;
    param.h = 1;
    param.ATranspose = mTransposeA;
    param.numberThread = numberThread;
    auto func = static_cast<CPUBackend*>(backend())->functions()->MNNComputeMatMulForH_1;
    // TODO: Support e = 1
    MNN_ASSERT(h == 1);
    mPostFunctions.emplace_back(std::make_pair([param, func](
        int tId, const float* A, const float* B, const float* biasPtr, float* C) {
        func(A, B, C, biasPtr, &param, tId);
    }, numberThread));
}

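// onResize builds the whole execution plan and records it as lambdas that
// execute() replays. Dimension convention: A is [e, l] ([l, e] when mTransposeA),
// B is [l, h] ([h, l] when mTransposeB), and C = A * B has e rows and h columns.
// The general path has three stages: pre-functions pack A and B into blocked
// layouts, the StrassenMatrixComputor encodes the packed multiply (with an
// optional fused bias), and post-functions unpack the packed result into the
// output tensor. The h == 1 and e == 1 cases skip packing entirely and go
// through the vector kernels scheduled above.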
ErrorCode CPUMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    const Tensor* A = inputs[0];
    const Tensor* B = inputs[1];
    Tensor* C = outputs[0];
    auto w0 = inputs[0]->length(1);
    auto h0 = inputs[0]->length(0);
    auto core = static_cast<CPUBackend*>(backend())->functions();
    mPreFunctions.clear();
    mPostFunctions.clear();
    auto e = A->length(0);
    auto h = B->length(1);
    auto l = A->length(1);
    if (mTransposeA) {
        l = A->length(0);
        e = A->length(1);
    }
    if (mTransposeB) {
        h = B->length(0);
    }
    // If the op was encoded before but is now resized with h == 1 or e == 1,
    // the Strassen computer must be reset first.
    mComputer->onReset();
    if (h == 1) {
        _scheduleForVec(e, l, h);
        return NO_ERROR;
    }
    if (e == 1) {
        const float* biasPtr = nullptr;
        _scheduleForVecE(e, l, h);
        return NO_ERROR;
    }
    int eP, lP, hP;
    core->MNNGetMatMulPackMode(&eP, &lP, &hP);
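    // Packed work buffers for the generic path. eP/lP/hP are the backend's
    // preferred packing units for the matmul kernels, and core->pack is its
    // channel pack size (typically 4 for the float pipeline). AT holds A as
    // [UP_DIV(l, pack), e, pack], BT holds B in the kernel's weight layout
    // [UP_DIV(h, hP), UP_DIV(l, lP) * lP, hP], and CT receives the packed
    // result [UP_DIV(h, pack), e, pack].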
    AutoRelease<Tensor> AT(Tensor::createDevice<float>({UP_DIV(l, core->pack), e, core->pack}));
    AutoRelease<Tensor> BT(Tensor::createDevice<float>({UP_DIV(h, hP), UP_DIV(l, lP) * lP, hP}));
    AutoRelease<Tensor> CT(Tensor::createDevice<float>({UP_DIV(h, core->pack), e, core->pack}));
    auto res = backend()->onAcquireBuffer(BT.get(), Backend::DYNAMIC);
    if (!res) {
        return OUT_OF_MEMORY;
    }
    auto BTPtr = BT->host<float>();
    float* BTempPtr = BTPtr;
    int numberThread = mSupportMultiThread ? ((CPUBackend*)backend())->threadNumber() : 1;
    mPreFunctions.emplace_back(std::make_pair([BTempPtr, l, h, this, core](int tId, const float* APtr, const float* BPtr, const float* Bias) {
        core->MNNPackForMatMul_B(BTempPtr, BPtr, h, l, mTransposeB);
    }, 1));
    res = backend()->onAcquireBuffer(AT.get(), Backend::DYNAMIC);
    res = res && backend()->onAcquireBuffer(CT.get(), Backend::DYNAMIC);
    if (!res) {
        return OUT_OF_MEMORY;
    }
    auto ATPtr = AT->host<float>();
    if (mTransposeA) {
        // l, e -> lC4, e, 4
        mPreFunctions.emplace_back(std::make_pair([ATPtr, e, l, core](int tId, const float* APtr, const float* BPtr, const float* Bias) {
            core->MNNPackCUnit(ATPtr, APtr, e, l);
        }, 1));
    } else {
        // e, l -> lC4, e, 4
        mPreFunctions.emplace_back(std::make_pair(
            [ATPtr, e, l, core](int tId, const float* APtr, const float* BPtr, const float* Bias) {
                core->MNNPackCUnitTranspose(ATPtr, APtr, e, l);
            }, 1));
    }
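    // Note that the pre-functions above run on every execute() call: the host
    // pointers of A and B are taken fresh from the inputs each time, so both
    // are repacked into AT/BT before the encoded Strassen plan is replayed.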
    AutoRelease<Tensor> biasWrap;
    std::vector<Tensor*> strassenInputs = {AT.get(), BT.get()};
    std::vector<float> postParameters;
    if (inputs.size() > 2) {
        auto bias = inputs[2];
        auto biasLength = bias->elementSize();
        if (biasLength % core->pack != 0) {
            mStrassenUseBiasDirectly = false;
            // Pad the bias to a multiple of core->pack
            biasWrap.reset(Tensor::createDevice<float>({UP_DIV(biasLength, core->pack) * core->pack}));
            res = backend()->onAcquireBuffer(biasWrap.get(), Backend::DYNAMIC);
            if (!res) {
                return OUT_OF_MEMORY;
            }
            auto bdest = biasWrap->host<float>();
            mPreFunctions.emplace_back(std::make_pair(
                [biasLength, bdest, core](int tId, const float* APtr, const float* BPtr, const float* borigin) {
                    ::memset(bdest, 0, UP_DIV(biasLength, core->pack) * core->bytes * core->pack);
                    ::memcpy(bdest, borigin, biasLength * core->bytes);
                }, 1));
            strassenInputs.emplace_back(biasWrap.get());
        } else {
            mStrassenUseBiasDirectly = true;
            strassenInputs.emplace_back(bias);
        }
        postParameters = {
            1.0f,
            1.0f,
            -std::numeric_limits<float>::max(),
            std::numeric_limits<float>::max(),
        };
    }
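    // postParameters follow the packed-matmul kernels' convention: the last two
    // entries are the clamp bounds applied after the bias add (±FLT_MAX here,
    // i.e. effectively no clamping); the first two are presumably scale factors
    // and are left at 1.0f. Passing a non-empty vector is what tells the Strassen
    // computor to fuse the bias input into its epilogue.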
    auto code = mComputer->onEncode(strassenInputs, {CT.get()}, postParameters, l);
    if (NO_ERROR != code) {
        return code;
    }
    if (nullptr != biasWrap.get()) {
        backend()->onReleaseBuffer(biasWrap.get(), Backend::DYNAMIC);
    }

    auto CTPtr = CT->host<float>();
    // hC4, e, 4 -> e, h
    if (mTransposeC) {
        mPostFunctions.emplace_back(std::make_pair([CTPtr, e, h, core](
            int tId, const float* APtr, const float* BPtr, const float* biasPtr, float* CPtr) {
            core->MNNUnpackCUnitTranspose(CPtr, CTPtr, e, h);
        }, 1));
    } else {
        mPostFunctions.emplace_back(std::make_pair([CTPtr, e, h, core](
            int tId, const float* APtr, const float* BPtr, const float* biasPtr, float* CPtr) {
            core->MNNUnpackCUnit(CPtr, CTPtr, e, h);
        }, 1));
    }
    backend()->onReleaseBuffer(AT.get(), Backend::DYNAMIC);
    backend()->onReleaseBuffer(BT.get(), Backend::DYNAMIC);
    backend()->onReleaseBuffer(CT.get(), Backend::DYNAMIC);
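    // The DYNAMIC buffers are released while still inside onResize. This is the
    // usual MNN resize-time allocation pattern: releasing presumably only marks
    // the memory as reusable for later ops in the same resize pass, while the
    // host pointers captured by the lambdas and by the encoded Strassen plan
    // remain valid at execution time.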
    return NO_ERROR;
}

ErrorCode CPUMatMul::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    auto APtr = inputs[0]->host<float>();
    auto BPtr = inputs[1]->host<float>();
    auto CPtr = outputs[0]->host<float>();

    const float* biasPtr = nullptr;
    if (inputs.size() > 2) {
        biasPtr = inputs[2]->host<float>();
    }
    execute(APtr, BPtr, CPtr, biasPtr);
    return NO_ERROR;
}

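// execute() replays the plan recorded in onResize() on raw host pointers:
// pre-functions (packing), then the Strassen multiply, then post-functions
// (unpacking, or the vector kernels for the e == 1 / h == 1 paths). Each
// recorded pair carries its own thread count, which drives MNN_CONCURRENCY.
// When the bias was usable as-is (mStrassenUseBiasDirectly), its pointer is
// handed to the Strassen computor at execution time instead of being copied
// into a padded buffer beforehand.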
void CPUMatMul::execute(const float* APtr, const float* BPtr, float* CPtr, const float* biasPtr) {
    for (auto& f : mPreFunctions) {
        MNN_CONCURRENCY_BEGIN(tId, f.second) {
            f.first(tId, APtr, BPtr, biasPtr);
        }
        MNN_CONCURRENCY_END();
    }
    if (mStrassenUseBiasDirectly) {
        mComputer->onExecute(nullptr, nullptr, (uint8_t*)biasPtr, nullptr);
    } else {
        mComputer->onExecute();
    }
    for (auto& f : mPostFunctions) {
        MNN_CONCURRENCY_BEGIN(tId, f.second) {
            f.first(tId, APtr, BPtr, biasPtr, CPtr);
        }
        MNN_CONCURRENCY_END();
    }
}

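// Creator registered for OpType_MatMul. Note that transposeC and multiThread are
// always passed as true here; only the A/B transpose flags come from the op.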
class CPUMatMulCreator : public CPUBackend::Creator {
public:
    virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                const MNN::Op* op, Backend* backend) const override {
        auto param = op->main_as_MatMul();
        return new CPUMatMul(backend, param->transposeA(), param->transposeB(), true, true);
    }
};

REGISTER_CPU_OP_CREATOR(CPUMatMulCreator, OpType_MatMul);

} // namespace MNN