//
//  VulkanConvolutionImpl.cpp
//  MNN
//
//  Created by MNN on 2019/01/31.
//  Copyright © 2018, Alibaba Group Holding Limited
//

9 #include "VulkanConvolutionImpl.hpp"
10 #include "core/Macro.h"
11 #include "VulkanConvolution.hpp"
12 #include "VulkanConvolutionWinograd.hpp"
13 #include "VulkanMatMul.hpp"
14 //#define MNN_OPEN_TIME_TRACE
15 #include <MNN/AutoTime.hpp>
16 namespace MNN {
17 //#define VULKAN_IM2COL_GEMM_UNIT 512
writeParameters(VulkanMatMul::Reorder::nchwBuffer & parameters,int co,int ci,int kh,int kw)18 static void writeParameters(VulkanMatMul::Reorder::nchwBuffer& parameters, int co, int ci, int kh, int kw) {
19     parameters.size[0] = co;
20     parameters.size[1] = ci;
21     parameters.size[2] = kh;
22     parameters.size[3] = kw;
23     parameters.stride[0] = ci * kh * kw;
24     parameters.stride[1] = kh * kw;
25     parameters.stride[2] = kw;
26     parameters.stride[3] = 1;
27 }
// Convolution executed as im2col + GEMM (+ col2im with fused post-treatment)
// on the Vulkan backend. With a static weight pointer the kernel image is
// reordered once on the GPU in the constructor; per-encode, the output is
// tiled over batch/height so each GEMM slice fits the device image limits.
class VulkanConvolutionIm2Col : public VulkanBasicExecution {
public:

    // backend:    owning Vulkan backend (pipelines, pools, samplers).
    // convOption: conv parameters (kernel size, pads, strides, post-op).
    // weightPtr:  host weight data in [co, ci, kh, kw] order, or nullptr for
    //             dynamic weights supplied at execution time.
    // biasPtr:    host bias of length co, or nullptr.
    // ci/co:      input/output channel counts.
    VulkanConvolutionIm2Col(VulkanBackend* backend, const Convolution2DCommon* convOption, const float* weightPtr,
                            const float* biasPtr, int ci, int co) : VulkanBasicExecution(backend), mConvCommonOption(convOption) {
        auto kw = convOption->kernelX();
        auto kh = convOption->kernelY();
        if (nullptr != weightPtr) {
            // Static weight
            VulkanMatMul::Reorder reorder(backend, true);
            VulkanMatMul::Reorder::nchwBuffer parameters;
            writeParameters(parameters, co, ci, kh, kw);
            // Kernel image width is channel-aligned: ALIGN_UP4(ci) * kh * kw.
            mKernel = VulkanMatrixMultier4x4::createKernel(backend, nullptr, ALIGN_UP4(ci) * kh * kw, co, 1);
            auto weightSize = ci * co * kh * kw;
            // Staging buffer: copy host weights in, then run the reorder
            // compute pass into mKernel and block until it completes.
            std::shared_ptr<VulkanBuffer> tempBuffer(new VulkanBuffer(backend->getMemoryPool(), false, weightSize*sizeof(float), nullptr, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT));
            auto tempWeightBuffer = tempBuffer->map();
            ::memcpy(tempWeightBuffer, weightPtr, weightSize * sizeof(float));
            tempBuffer->unmap();
            std::shared_ptr<VulkanBuffer> tempBuffer2(new VulkanBuffer(backend->getMemoryPool(), false, reorder.computeMiddleBufferSize(co, kh, kw, ci) *sizeof(float), nullptr, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT));
            std::shared_ptr<VulkanCommandPool::Buffer> cmdBuffer(backend->getPool().allocBuffer());
            cmdBuffer->begin(0);
            reorder.encode(tempBuffer->buffer(), tempBuffer->size(), tempBuffer2->buffer()
                           , tempBuffer2->size(), mKernel.get(), cmdBuffer.get(), parameters);
            cmdBuffer->end();
            backend->getPool().submitAndWait(cmdBuffer->get());
        }
        // Factory for the per-tile GEMM object; all tiles share the prebuilt
        // kernel image (mKernel may be null on the dynamic-weight path).
        mMultiCreator = [ci, kh, kw, co, backend, this]() {
            auto multi = std::make_shared<VulkanMatrixMultier4x4>(backend, nullptr, ALIGN_UP4(ci) * kh * kw, co, 1, mKernel);
            return multi;
        };
        std::vector<VkDescriptorType> im2Coltypes{
            VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER};
        // 1x1 kernel with no padding degenerates to a plain layout copy, so a
        // cheaper specialized shader is used.
        if (kw == 1 && kh == 1 && convOption->padX() == 0 && convOption->padY() == 0) {
            mIm2Col =
                backend->getPipeline("glsl_im2col1x1_comp", /* glsl_im2col1x1_comp, glsl_im2col1x1_comp_len,*/ im2Coltypes);
        } else {
            mIm2Col = backend->getPipeline("glsl_im2col_comp", /*glsl_im2col_comp, glsl_im2col_comp_len,*/ im2Coltypes);
        }
        std::vector<VkDescriptorType> Col2imTypes{
            VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
            VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER};
        // Post-treatment (e.g. activation) is baked into the col2im shader name.
        auto macro = VulkanConvolutionCommon::getPostTreatMacro(convOption);
        mCol2Im    = backend->getPipeline("glsl_col2Im_" + macro + "comp", Col2imTypes);

        mSampler      = backend->getCommonSampler();
        if (nullptr != biasPtr) {
            // Static bias
            // Zero-pad the bias up to a multiple of 4 channels, then upload it
            // into a 1-row image read by the col2im shader.
            mBias         = std::make_shared<VulkanImage>(backend->getMemoryPool(), false, UP_DIV(co, 4), 1);
            auto tempBias = std::make_shared<VulkanBuffer>(backend->getMemoryPool(), false, sizeof(float) * ALIGN_UP4(co));
            auto bias     = tempBias->map();
            ::memset(bias, 0, sizeof(float) * ALIGN_UP4(co));
            ::memcpy(bias, biasPtr, sizeof(float) * co);
            tempBias->unmap();
            backend->copyBufferToImage(tempBias.get(), mBias.get(), VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
        }
        // NOTE(review): when biasPtr is null, mBias stays null, yet onEncode
        // dereferences mBias unconditionally — confirm the dynamic-bias path
        // is handled elsewhere before this executes.
    }
    ~VulkanConvolutionIm2Col() {
        // Do nothing
    }
    // Encodes the full conv: for each (batch, height) tile — im2col into the
    // GEMM source image, multiply, then col2im (+bias/post-op) into dst.
    // NOTE(review): on the dynamic-weight path inputs[1]/inputs[2] are not
    // consumed here — verify who feeds mKernel in that case.
    virtual ErrorCode onEncode(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                               const VulkanCommandPool::Buffer* cmdBuffer) override {
        auto src         = inputs[0];
        auto dst         = outputs[0];
        const int icDiv4 = UP_DIV(src->channel(), 4);
        const int ocDiv4 = UP_DIV(dst->channel(), 4);
        auto vkBn = (VulkanBackend*)backend();
        // Max number of output elements per GEMM slice; presumably x4 because
        // each image texel packs 4 values — TODO confirm against the shaders.
        int limit = vkBn->proty().limits.maxImageDimension2D * 4;
#ifdef VULKAN_IM2COL_GEMM_UNIT
        limit = VULKAN_IM2COL_GEMM_UNIT;
#endif
        // A single output row must fit: the height split below cannot help if
        // even one row of width exceeds the limit.
        if (limit < dst->width()) {
            MNN_ERROR("Don't support width too large feature: %d x %d, limit = %d\n", dst->width(), dst->height(), limit);
            return NOT_SUPPORT;
        }
        // Choose a tiling: whole tensor if it fits, else split over batch,
        // and additionally over height when one batch plane is too large.
        int batchLoopNumber = 1;
        int heightLoopNumber = 1;
        int unitHeight = dst->height();
        int unitBatch = dst->batch();
        auto area = dst->width() * dst->height();
        if (limit < area) {
            batchLoopNumber = dst->batch();
            unitBatch = 1;
            unitHeight = limit / dst->width();
            heightLoopNumber = UP_DIV(dst->height(), unitHeight);
        } else if (limit < area * dst->batch()) {
            unitBatch = limit / area;
            batchLoopNumber = UP_DIV(dst->batch(), unitBatch);
        }
        int loopNumber = batchLoopNumber * heightLoopNumber;
        // Per-tile resources: uniform params, descriptor sets, GEMM objects.
        mConvParams.resize(loopNumber);
        mMultilers.resize(loopNumber);
        mIm2ColSet.resize(loopNumber);
        mCol2ImSet.resize(loopNumber);

        for (int i=0; i<batchLoopNumber; ++i) {
            int batchOffset = i * unitBatch;
            // Clamp the last tile to the remaining batches.
            int currentBatch = dst->batch() - batchOffset;
            if (currentBatch > unitBatch) {
                currentBatch = unitBatch;
            }
            for (int j=0; j<heightLoopNumber; ++j) {
                int heightOffset = j * unitHeight;
                // Clamp the last tile to the remaining rows.
                int currentHeight = dst->height() - heightOffset;
                if (currentHeight > unitHeight) {
                    currentHeight = unitHeight;
                }
                auto index = i * heightLoopNumber + j;
                // Work-item counts for the im2col / col2im dispatches.
                auto totalNumberInput = currentBatch * icDiv4 * dst->width() * currentHeight;
                auto totalNumberOutput = currentBatch * ocDiv4 * dst->width() * currentHeight;
                mConvParams[index] = std::make_shared<VulkanBuffer>(vkBn->getMemoryPool(), false,
                                                        sizeof(VulkanConvolutionCommon::ConvolutionParameter), nullptr,
                                                        VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT);
                {
                    // Write the shared conv parameters, then override the
                    // offsets/extents with this tile's window.
                    auto convCons = reinterpret_cast<VulkanConvolutionCommon::ConvolutionParameter*>(mConvParams[index]->map());
                    VulkanConvolutionCommon::writeParameter(convCons, mConvCommonOption, src, dst);
                    convCons->offset[0] = batchOffset;
                    convCons->offset[1] = heightOffset;
                    convCons->outputSize[3] = currentBatch;
                    convCons->outputSize[1] = currentHeight;
                    mConvParams[index]->unmap();
                }
                mIm2ColSet[index].reset(mIm2Col->createSet());
                mCol2ImSet[index].reset(mCol2Im->createSet());
                mMultilers[index] = mMultiCreator();
                mMultilers[index]->prepare(cmdBuffer, dst->width() * currentHeight * currentBatch);
                auto mMultiler = mMultilers[index].get();
                // Stage 1: im2col — src image -> GEMM source image.
                if (true) {
                    auto colImage = mMultiler->source();
                    // GENERAL layout for storage-image writes by im2col.
                    cmdBuffer->barrierImageIfNeeded(colImage, VK_IMAGE_LAYOUT_GENERAL);
                    mIm2ColSet[index]->writeImage(colImage->view(), mSampler->get(), VK_IMAGE_LAYOUT_GENERAL, 0);
                    mIm2ColSet[index]->writeImage((reinterpret_cast<VulkanTensor*>(src->deviceId()))->image()->view(), mSampler->get(),
                                        VK_IMAGE_LAYOUT_GENERAL, 1);
                    mIm2ColSet[index]->writeBuffer(mConvParams[index]->buffer(), 2, mConvParams[index]->size());
                    mIm2Col->bind(cmdBuffer->get(), mIm2ColSet[index]->get());
                    vkCmdDispatch(cmdBuffer->get(), UP_DIV(totalNumberInput, VulkanConvolutionCommon::gImage2ColLocal),
                                1, 1);
                    // Transition for sampled reads by the GEMM.
                    cmdBuffer->barrierImageIfNeeded(colImage, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
                }
                // Stage 2: GEMM on the packed columns.
                mMultilers[index]->compute(cmdBuffer);
                // Stage 3: col2im — GEMM result (+bias, post-op) -> dst image.
                if (true) {
                    auto dstImage = mMultiler->dest();
                    mCol2ImSet[index]->writeImage(dstImage->view(), mSampler->get(), VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, 0);
                    mCol2ImSet[index]->writeImage((reinterpret_cast<VulkanTensor*>(dst->deviceId()))->image()->view(), mSampler->get(),
                                        VK_IMAGE_LAYOUT_GENERAL, 1);

                    mCol2ImSet[index]->writeImage(mBias->view(), mSampler->get(), VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, 2);
                    mCol2ImSet[index]->writeBuffer(mConvParams[index]->buffer(), 3, mConvParams[index]->size());
                    mCol2Im->bind(cmdBuffer->get(), mCol2ImSet[index]->get());
                    cmdBuffer->barrierImageIfNeeded(dstImage, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
                    // cmdBuffer->barrierImage(dstImage->get(), VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
                    vkCmdDispatch(cmdBuffer->get(), UP_DIV(totalNumberOutput, VulkanConvolutionCommon::gImage2ColLocal),
                                1, 1);
                }
            }
        }
        return NO_ERROR;
    }
private:
    const VulkanPipeline* mIm2Col;   // im2col compute pipeline (non-owning)
    const VulkanPipeline* mCol2Im;   // col2im + post-treat pipeline (non-owning)
    const VulkanSampler* mSampler;   // shared sampler from the backend

    std::shared_ptr<VulkanImage> mBias;     // bias image; null when bias is dynamic
    std::shared_ptr<VulkanImage> mKernel;   // reordered weight image; null when dynamic
    const Convolution2DCommon* mConvCommonOption;
    // One entry per (batch, height) tile, rebuilt on every onEncode.
    std::vector<std::shared_ptr<VulkanPipeline::DescriptorSet>> mCol2ImSet;
    std::vector<std::shared_ptr<VulkanPipeline::DescriptorSet>> mIm2ColSet;
    std::vector<std::shared_ptr<VulkanBuffer>> mConvParams;
    std::vector<std::shared_ptr<VulkanMatrixMultier4x4>> mMultilers;
    std::function<std::shared_ptr<VulkanMatrixMultier4x4>()> mMultiCreator;
};
199 
create(VulkanBackend * backend,const Convolution2DCommon * convOption,const std::vector<Tensor * > & inputs,const Tensor * output,const float * weightPtr,const float * biasPtr,int ci,int co)200 VulkanBasicExecution* VulkanConvolutionImpl::create(VulkanBackend* backend, const Convolution2DCommon* convOption,
201                                                          const std::vector<Tensor*>& inputs, const Tensor* output,
202                                                          const float* weightPtr, const float* biasPtr, int ci, int co) {
203     AUTOTIME;
204     if (inputs.size() > 1) {
205         return new VulkanConvolutionIm2Col(backend, convOption, weightPtr, biasPtr, ci, co);
206     }
207     auto imageLimit = backend->proty().limits.maxImageDimension2D;
208     if (VulkanConvolutionWinograd::support(convOption)) {
209         if (output->width() >= 4 && output->height() >= 4 && output->batch() == 1) {
210             return new VulkanConvolutionWinograd(backend, convOption, weightPtr, biasPtr, ci, co);
211         }
212     }
213     if (ALIGN_UP4(ci) * convOption->kernelX() * convOption->kernelY() > imageLimit) {
214         return nullptr;
215     }
216     return new VulkanConvolutionIm2Col(backend, convOption, weightPtr, biasPtr, ci, co);
217 }
218 
219 } // namespace MNN
220