//
//  CPUMatMul.cpp
//  MNN
//
//  Created by MNN on 2018/08/06.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "CPUMatMul.hpp"
#include "CPUBackend.hpp"
#include "math/Matrix.hpp"
#include "compute/CommonOptFunction.h"
#include "core/Macro.h"
#include "core/Concurrency.h"
#include "core/AutoStorage.h"
#include "math/Vec.hpp"
#include <limits>

using Vec4 = MNN::Math::Vec<float, 4>;
namespace MNN {

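// CPUMatMul computes C = op(A) * op(B) (+ optional bias). Degenerate shapes
// (h == 1 or e == 1) are handled by dedicated vector kernels registered as
// post-functions; the general case packs A/B/C and encodes the multiplication
// into the Strassen-based matrix computor.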
CPUMatMul::CPUMatMul(Backend* backend, bool transposeA, bool transposeB, bool transposeC, bool multiThread)
    : Execution(backend), mTransposeA(transposeA), mTransposeB(transposeB), mTransposeC(transposeC), mSupportMultiThread(multiThread) {
    mComputer.reset(new StrassenMatrixComputor(backend, mSupportMultiThread, 5));
}

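// e == 1: A is a single row, so the whole product is computed by the backend's
// MNNComputeMatMulForE_1 kernel, split across numberThread post-function calls.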
void CPUMatMul::_scheduleForVecE(int e, int l, int h) {
    int numberThread = mSupportMultiThread ? static_cast<CPUBackend*>(backend())->threadNumber() : 1;
    MNN_ASSERT(e == 1);
    MatMulParam param;
    param.e = 1;
    param.l = l;
    param.h = h;
    param.BTranspose = mTransposeB;
    param.numberThread = numberThread;
    auto func = static_cast<CPUBackend*>(backend())->functions()->MNNComputeMatMulForE_1;
    mPostFunctions.emplace_back(std::make_pair([param, func](
            int tId, const float* A, const float* B, const float* biasPtr, float* C) {
        func(A, B, C, biasPtr, &param, tId);
    }, numberThread));
}

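// h == 1: B is a single column, so the product is computed by the backend's
// MNNComputeMatMulForH_1 kernel, split across threads the same way.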
void CPUMatMul::_scheduleForVec(int e, int l, int h) {
    int numberThread = mSupportMultiThread ? static_cast<CPUBackend*>(backend())->threadNumber() : 1;
    MatMulParam param;
    param.e = e;
    param.l = l;
    param.h = 1;
    param.ATranspose = mTransposeA;
    param.numberThread = numberThread;
    auto func = static_cast<CPUBackend*>(backend())->functions()->MNNComputeMatMulForH_1;
    // TODO: Support e = 1
    MNN_ASSERT(h == 1);
    mPostFunctions.emplace_back(std::make_pair([param, func](
        int tId, const float* A, const float* B, const float* biasPtr, float* C) {
        func(A, B, C, biasPtr, &param, tId);
    }, numberThread));
}

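// Resolve the matmul dimensions from the input shapes, then either schedule a
// vector kernel (h == 1 / e == 1) or set up packing pre/post-functions around a
// Strassen encoding.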
ErrorCode CPUMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    const Tensor* A = inputs[0];
    const Tensor* B = inputs[1];
    Tensor* C       = outputs[0];
    auto core       = static_cast<CPUBackend*>(backend())->functions();
    mPreFunctions.clear();
    mPostFunctions.clear();
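    // Dimension convention: C is [e, h] and the reduction length is l,
    // i.e. C[e, h] = op(A)[e, l] * op(B)[l, h].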
    auto e = A->length(0);
    auto h = B->length(1);
    auto l = A->length(1);
    if (mTransposeA) {
        l = A->length(0);
        e = A->length(1);
    }
    if (mTransposeB) {
        h = B->length(0);
    }
    // If a previous resize encoded the computor but this resize takes the h == 1 / e == 1
    // path, the stale encoding must be cleared first.
    mComputer->onReset();
    if (h == 1) {
        _scheduleForVec(e, l, h);
        return NO_ERROR;
    }
    if (e == 1) {
        _scheduleForVecE(e, l, h);
        return NO_ERROR;
    }
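    // General case: pack the operands into the backend's blocked layouts and let the
    // Strassen computor work on the packed buffers.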
    int eP, lP, hP;
    core->MNNGetMatMulPackMode(&eP, &lP, &hP);
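    // Packed buffers: AT is [UP_DIV(l, pack), e, pack], BT is the kernel's B layout
    // [UP_DIV(h, hP), UP_DIV(l, lP) * lP, hP], and CT is [UP_DIV(h, pack), e, pack].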
    AutoRelease<Tensor> AT(Tensor::createDevice<float>({UP_DIV(l, core->pack), e, core->pack}));
    AutoRelease<Tensor> BT(Tensor::createDevice<float>({UP_DIV(h, hP), UP_DIV(l, lP) * lP, hP}));
    AutoRelease<Tensor> CT(Tensor::createDevice<float>({UP_DIV(h, core->pack), e, core->pack}));
    auto res = backend()->onAcquireBuffer(BT.get(), Backend::DYNAMIC);
    if (!res) {
        return OUT_OF_MEMORY;
    }
    auto BTPtr = BT->host<float>();
    float* BTempPtr = BTPtr;
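    // B may change between executions, so it is repacked by a pre-function on every run.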
    mPreFunctions.emplace_back(std::make_pair([BTempPtr, l, h, this, core](int tId, const float* APtr, const float* BPtr, const float* Bias) {
        core->MNNPackForMatMul_B(BTempPtr, BPtr, h, l, mTransposeB);
    }, 1));
    res = backend()->onAcquireBuffer(AT.get(), Backend::DYNAMIC);
    res = res && backend()->onAcquireBuffer(CT.get(), Backend::DYNAMIC);
    if (!res) {
        return OUT_OF_MEMORY;
    }
    auto ATPtr = AT->host<float>();
    if (mTransposeA) {
        // A is [l, e]: pack to [l/pack, e, pack]
        mPreFunctions.emplace_back(std::make_pair([ATPtr, e, l, core](int tId, const float* APtr, const float* BPtr, const float* Bias) {
            core->MNNPackCUnit(ATPtr, APtr, e, l);
        }, 1));
    } else {
        // A is [e, l]: pack to [l/pack, e, pack]
        mPreFunctions.emplace_back(std::make_pair(
            [ATPtr, e, l, core](int tId, const float* APtr, const float* BPtr, const float* Bias) {
            core->MNNPackCUnitTranspose(ATPtr, APtr, e, l);
        }, 1));
    }
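    // Optional bias: pass it to the Strassen computor directly when its length is a
    // multiple of core->pack, otherwise copy it into a zero-padded temporary first.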
    AutoRelease<Tensor> biasWrap;
    std::vector<Tensor*> strassenInputs = {AT.get(), BT.get()};
    std::vector<float> postParameters;
    if (inputs.size() > 2) {
        auto bias = inputs[2];
        auto biasLength = bias->elementSize();
        if (biasLength % core->pack != 0) {
            mStrassenUseBiasDirectly = false;
            // Zero-pad the bias to a multiple of core->pack
            biasWrap.reset(Tensor::createDevice<float>({UP_DIV(biasLength, core->pack) * core->pack}));
            res = backend()->onAcquireBuffer(biasWrap.get(), Backend::DYNAMIC);
            if (!res) {
                return OUT_OF_MEMORY;
            }
            auto bdest = biasWrap->host<float>();
            mPreFunctions.emplace_back(std::make_pair(
                [biasLength, bdest, core](int tId, const float* APtr, const float* BPtr, const float* borigin) {
                ::memset(bdest, 0, UP_DIV(biasLength, core->pack) * core->bytes * core->pack);
                ::memcpy(bdest, borigin, biasLength * core->bytes);
            }, 1));
            strassenInputs.emplace_back(biasWrap.get());
        } else {
            mStrassenUseBiasDirectly = true;
            strassenInputs.emplace_back(bias);
        }
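        // The last two post parameters act as output clamp bounds; ±FLT_MAX leaves the
        // result effectively unclamped.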
        postParameters = {
            1.0f,
            1.0f,
            -std::numeric_limits<float>::max(),
            std::numeric_limits<float>::max(),
        };
    }
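    // Encode the packed multiplication now, at resize time; onExecute only replays the
    // recorded computation.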
    auto code = mComputer->onEncode(strassenInputs, {CT.get()}, postParameters, l);
    if (NO_ERROR != code) {
        return code;
    }
    if (nullptr != biasWrap.get()) {
        backend()->onReleaseBuffer(biasWrap.get(), Backend::DYNAMIC);
    }

    auto CTPtr = CT->host<float>();
    // Unpack [h/pack, e, pack] -> e, h
    if (mTransposeC) {
        mPostFunctions.emplace_back(std::make_pair([CTPtr, e, h, core](
                int tId, const float* APtr, const float* BPtr, const float* biasPtr, float* CPtr) {
            core->MNNUnpackCUnitTranspose(CPtr, CTPtr, e, h);
        }, 1));
    } else {
        mPostFunctions.emplace_back(std::make_pair([CTPtr, e, h, core](
                int tId, const float* APtr, const float* BPtr, const float* biasPtr, float* CPtr) {
            core->MNNUnpackCUnit(CPtr, CTPtr, e, h);
        }, 1));
    }
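    // Return the temporaries to the dynamic allocator; MNN's memory plan keeps the
    // allocations valid for this op's execution while letting later ops reuse them.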
    backend()->onReleaseBuffer(AT.get(), Backend::DYNAMIC);
    backend()->onReleaseBuffer(BT.get(), Backend::DYNAMIC);
    backend()->onReleaseBuffer(CT.get(), Backend::DYNAMIC);
    return NO_ERROR;
}

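// Gather the raw host pointers and run the scheduled pipeline.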
ErrorCode CPUMatMul::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    auto APtr = inputs[0]->host<float>();
    auto BPtr = inputs[1]->host<float>();
    auto CPtr = outputs[0]->host<float>();

    const float* biasPtr = nullptr;
    if (inputs.size() > 2) {
        biasPtr = inputs[2]->host<float>();
    }
    execute(APtr, BPtr, CPtr, biasPtr);
    return NO_ERROR;
}

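// Replay the recorded pipeline: pre-functions pack the inputs, the Strassen computor
// performs the packed multiply (a no-op for the vector paths, where the matmul itself
// runs as a post-function), and the post-functions write the final C.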
void CPUMatMul::execute(const float* APtr, const float* BPtr, float* CPtr, const float* biasPtr) {
    for (auto& f : mPreFunctions) {
        MNN_CONCURRENCY_BEGIN(tId, f.second) {
            f.first(tId, APtr, BPtr, biasPtr);
        }
        MNN_CONCURRENCY_END();
    }
    if (mStrassenUseBiasDirectly) {
        mComputer->onExecute(nullptr, nullptr, (uint8_t*)biasPtr, nullptr);
    } else {
        mComputer->onExecute();
    }
    for (auto& f : mPostFunctions) {
        MNN_CONCURRENCY_BEGIN(tId, f.second) {
            f.first(tId, APtr, BPtr, biasPtr, CPtr);
        }
        MNN_CONCURRENCY_END();
    }
}

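// The MatMul creator always requests transposeC and multi-threading; transposeA/B come
// from the op parameters.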
class CPUMatMulCreator : public CPUBackend::Creator {
public:
    virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                const MNN::Op* op, Backend* backend) const override {
        auto param = op->main_as_MatMul();
        return new CPUMatMul(backend, param->transposeA(), param->transposeB(), true, true);
    }
};

REGISTER_CPU_OP_CREATOR(CPUMatMulCreator, OpType_MatMul);

} // namespace MNN