//
//  DenseConvolutionTiledExecutor.cpp
//  MNN
//
//  Created by MNN on 2018/07/16.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "DenseConvolutionTiledExecutor.hpp"
#include <MNN/AutoTime.hpp>
#include "backend/cpu/CPUBackend.hpp"
#include "CommonOptFunction.h"
#include "core/Concurrency.h"
#include "ConvOpt.h"
#include "core/Macro.h"
#include "core/TensorUtils.hpp"
#include "math/Vec.hpp"
#include "core/BufferAllocator.hpp"
#include "core/MemoryFormater.h"

using Vec4 = MNN::Math::Vec<float, 4>;
namespace MNN {

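// Prepare the constant convolution weights for the tiled GEMM: the shared
// ConvolutionTiledExecutor::initWeight reorders the raw weights into `cache`
// (swapping the kernel and input-channel dimensions), and MNNPackForMatMul_B
// then repacks the result into the backend-specific B-matrix tile layout.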
void DenseConvolutionTiledExecutor::initWeight(float *dest, const float *source, float* cache, int depth, int outputCount, int kernelSize, const CoreFunctions* function) {
    ConvolutionTiledExecutor::initWeight(source, cache, depth, outputCount, kernelSize, function);
    function->MNNPackForMatMul_B(dest, cache, outputCount, kernelSize * depth, true);
    /*MNN_PRINT("dense weight matrix tile:");
    formatMatrix(dest, {UP_DIV(outputCount, 4), kernelSize * depth, 4});*/
}

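// Constructor for the common case where the weights are constants stored in the model.
// The packed weight lives in a STATIC buffer owned by mResource so it can be shared with
// cloned executions; a temporary float tensor is used only as scratch while packing.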
DenseConvolutionTiledExecutor::DenseConvolutionTiledExecutor(const Convolution2DCommon* common, Backend* b,
                                                   const float* originWeight, size_t originWeightSize,
                                                   const float* bias, size_t biasSize)
    : ConvolutionTiledExecutor(b, bias, biasSize) {

    auto outputCount = (int)biasSize;
    int eP, lP, hP;
    auto core = static_cast<CPUBackend*>(b)->functions();
    int bytes = core->bytes;
    core->MNNGetMatMulPackMode(&eP, &lP, &hP);
    // Don't use common->inputCount: for old models common->inputCount is zero,
    // so derive the input channel count from the weight size instead.
    auto srcCount    = (int)originWeightSize / outputCount / common->kernelX() / common->kernelY();
    auto lSize = srcCount * common->kernelX() * common->kernelY();
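    // Packed weight buffer: h (= outputCount) padded up to hP and l (= ic * kx * ky)
    // padded up to lP, stored with the backend's element size.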
    mResource->mWeight.reset(Tensor::createDevice<uint8_t>(
        {UP_DIV(outputCount, hP) * UP_DIV(lSize, lP) * hP * lP * bytes}));
    std::shared_ptr<Tensor> cache(Tensor::createDevice<uint8_t>({outputCount * srcCount * common->kernelX() * common->kernelY() * (int)sizeof(float)})); // cache must be float

    mValid = mValid && backend()->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC);
    mValid = mValid && backend()->onAcquireBuffer(cache.get(), Backend::STATIC);
    if (!mValid) {
        return;
    }
    initWeight(mResource->mWeight->host<float>(), originWeight, cache->host<float>(), srcCount, outputCount, common->kernelX() * common->kernelY(), core);
    backend()->onReleaseBuffer(cache.get(), Backend::STATIC);
    mProxy.reset(new DenseConvolutionTiledImpl(common, b));
}
57 
DenseConvolutionTiledExecutor(std::shared_ptr<CPUConvolution::Resource> res,const Convolution2DCommon * common,Backend * b)58 DenseConvolutionTiledExecutor::DenseConvolutionTiledExecutor(std::shared_ptr<CPUConvolution::Resource> res, const Convolution2DCommon* common, Backend* b) : ConvolutionTiledExecutor(res, b) {
59     mProxy.reset(new DenseConvolutionTiledImpl(common, b));
60 }
61 
~DenseConvolutionTiledExecutor()62 DenseConvolutionTiledExecutor::~DenseConvolutionTiledExecutor() {
63     // Do nothing
64 }
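// Cloning shares the immutable packed weight/bias resource instead of repacking it.
// A null dst appears to be a capability query: report that cloning is supported without
// actually creating the new execution.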
bool DenseConvolutionTiledExecutor::onClone(Backend* bn, const Op* op, Execution** dst) {
    if (!mValid) {
        return false;
    }
    if (nullptr == dst) {
        return true;
    }
    *dst = new DenseConvolutionTiledExecutor(mResource, op->main_as_Convolution2D()->common(), bn);
    return true;
}

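// Multi-input convolution: the weights (inputs[1]) and optionally the bias (inputs[2])
// arrive as runtime tensors, so they have to be transposed and repacked on every execution.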
ErrorCode ConvolutionTiledExecutorMultiInput::onExecute(const std::vector<Tensor*>& inputs,
                                                        const std::vector<Tensor*>& outputs) {
    int depth       = inputs[1]->channel();
    int outputCount = inputs[1]->batch();
    auto function = static_cast<CPUBackend*>(backend())->functions();
    if (nullptr != mTempBias) {
        ::memset(mTempBias->host<float>(), 0, mTempBias->elementSize() * function->bytes);
        if (inputs.size() > 2) {
            ::memcpy(mTempBias->host<float>(), inputs[2]->host<float>(), inputs[2]->elementSize() * function->bytes);
        }
    }
    auto cache = mTempWeightCache->host<float>();
    auto source = inputs[1]->host<float>();
    auto kernelSize = inputs[1]->width() * inputs[1]->height();
    // Swap k, ic
    int dims[4] = {
        depth,
        kernelSize,
        kernelSize,
        depth
    };
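    // For low-precision backends (bytes < 4) the runtime weights are first expanded to
    // fp32 in the second half of mTempWeightCache, transposed per output channel into the
    // first half, and converted back in place before packing; fp32 backends transpose directly.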
    if (function->bytes < 4) {
        // TODO: Opt it
        // Lowp
        source = mTempWeightCache->host<float>() + mTempWeightCache->stride(0);
        function->MNNLowpToFp32(inputs[1]->host<int16_t>(), source, inputs[1]->elementSize());
        for (int o=0; o<outputCount; ++o) {
            auto dO = cache + o * depth * kernelSize;
            auto sO = source + o * depth * kernelSize;
            MNNTranspose32Bit((int32_t*)dO, (const int32_t*)sO, &dims[0]);
        }
        function->MNNFp32ToLowp(cache, (int16_t*)cache, inputs[1]->elementSize());
    } else {
        for (int o=0; o<outputCount; ++o) {
            auto dO = cache + o * depth * kernelSize;
            auto sO = source + o * depth * kernelSize;
            MNNTranspose32Bit((int32_t*)dO, (const int32_t*)sO, &dims[0]);
        }
    }
    function->MNNPackForMatMul_B(mTempWeight->host<float>(), mTempWeightCache->host<float>(), outputCount, inputs[1]->width() * inputs[1]->height() * depth, true);
    return mProxy->onExecute(mInputs, outputs);
}
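// Resize for the multi-input case: allocate the dynamic scratch tensors (packed weight,
// transpose cache and, when needed, a pack-aligned bias) and let the proxy plan the tiled GEMM.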
ErrorCode ConvolutionTiledExecutorMultiInput::onResize(const std::vector<Tensor*>& inputs,
                                                       const std::vector<Tensor*>& outputs) {
    int depth       = inputs[1]->channel();
    int outputCount = outputs[0]->channel();
    auto function = static_cast<CPUBackend*>(backend())->functions();
    int eP, lP, hP;
    function->MNNGetMatMulPackMode(&eP, &lP, &hP);
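    // Note: `kernelSize` below is the whole l dimension of the GEMM (ic * kw * kh), not just
    // the spatial kernel size. The low-precision cache is allocated twice as large so its
    // second half can hold the fp32 staging copy used in onExecute.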
    auto kernelSize = depth * inputs[1]->width() * inputs[1]->height();
    mTempWeight.reset(Tensor::createDevice<float>(
        {UP_DIV(outputCount, hP), UP_DIV(kernelSize, lP), lP * hP}));
    if (function->bytes < 4) {
        mTempWeightCache.reset(Tensor::createDevice<int32_t>({2, outputCount * kernelSize}));
    } else {
        mTempWeightCache.reset(Tensor::createDevice<float>({outputCount * kernelSize}));
    }
    auto res = backend()->onAcquireBuffer(mTempWeight.get(), Backend::DYNAMIC);
    res = res && backend()->onAcquireBuffer(mTempWeightCache.get(), Backend::DYNAMIC);
    mTempBias.reset();
    if (!res) {
        return OUT_OF_MEMORY;
    }
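    // The bias fed to the tiled GEMM must cover a multiple of function->pack channels.
    // If the provided bias is already aligned it is used directly; otherwise a zero-padded
    // temporary bias is allocated here and filled in onExecute.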
    if (inputs.size() > 2 && inputs[2]->elementSize() % function->pack == 0) {
        mInputs = {inputs[0], mTempWeight.get(), inputs[2]};
    } else {
        mTempBias.reset(Tensor::createDevice<float>({UP_DIV(outputCount, function->pack) * function->pack}));
        backend()->onAcquireBuffer(mTempBias.get(), Backend::DYNAMIC);
        mInputs = {inputs[0], mTempWeight.get(), mTempBias.get()};
    }
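    // Dynamic buffers are released at resize time so the allocator can reuse the memory for
    // later ops; the tensors remain addressable for this op's onExecute. This appears to
    // follow the usual MNN dynamic-allocation pattern.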
    backend()->onReleaseBuffer(mTempWeightCache.get(), Backend::DYNAMIC);
    auto errorCode = mProxy->onResize(mInputs, outputs);
    backend()->onReleaseBuffer(mTempWeight.get(), Backend::DYNAMIC);
    if (nullptr != mTempBias) {
        backend()->onReleaseBuffer(mTempBias.get(), Backend::DYNAMIC);
    }
    return errorCode;
}


void DenseConvolutionTiledImpl::getPackParameter(int* eP, int* lP, int* hP, const CoreFunctions* core) {
    core->MNNGetMatMulPackMode(eP, lP, hP);
    return;
}

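// The GENERATE_* macros below supply the backend-specific pieces (matmul functors, weight
// pointer, and the per-tile GEMM call) that are stitched together by the GENERATE_RESIZE()
// macro invoked in onResize, which is defined elsewhere in the shared tiled-executor code.
// A full tile (xC == CONVOLUTION_TILED_NUMBER) uses the packed matmul kernel; partial tiles
// fall back to the remain kernel.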
#define GENERATE_FUNCTOR()                     \
    auto matmulUnit   = core->MNNPackedMatMul; \
    auto matmulRemain = core->MNNPackedMatMulRemain;

#define GENERATE_WEIGHT()                      \
    auto weightPtr = weight->host<float>();

#define GENERATE_MM()                                                                                                  \
    if (xC == CONVOLUTION_TILED_NUMBER) {                                                                              \
        matmulUnit((float*)(dstOrigin + start * unit * bytes), (float*)gemmBuffer, weightPtr, parameters.data(),       \
                   postParameters.data(), biasPtr);                                                                    \
    } else {                                                                                                           \
        matmulRemain((float*)(dstOrigin + start * unit * bytes), (float*)gemmBuffer, weightPtr, xC, parameters.data(), \
                     postParameters.data(), biasPtr);                                                                  \
    }                                                                                                                  \
    /*MNN_PRINT("formatMatrix gemm. xC:%d, eP:%d\n", xC, eP);*/                                                        \
    /*formatMatrix((float*)(dstOrigin + start * 4 * bytes), {UP_DIV(outputChannel, hP), xC, hP});*/

ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs,
                                                  const std::vector<Tensor*>& outputs) {
    GENERATE_RESIZE();
}

#undef GENERATE_FUNCTOR
#undef GENERATE_WEIGHT
#undef GENERATE_MM

} // namespace MNN