1 #include "VulkanRaster.hpp"
2 #include "VulkanMatMul.hpp"
3 #include "core/TensorUtils.hpp"
4 #include <algorithm>
5 #include "core/OpCommonUtils.hpp"
6 #include "core/Macro.h"
7 namespace MNN {
writeNCHW(int * dims,Tensor * origin)8 static void writeNCHW(int* dims, Tensor* origin) {
9     int w     = std::max(origin->width(), 1);
10     int h     = std::max(origin->height(), 1);
11     int b     = origin->batch();
12     dims[0]   = w;
13     dims[1]   = h;
14     dims[2]   = origin->channel();
15     dims[3]   = b;
16 }
// Uniform block consumed by the blit compute shaders; layout must match the
// GLSL side, so the 4th lane of each ivec4 is repurposed to carry extra data.
struct SamplerInfo {
    ivec4 stride;//src strides for the 3 region axes; stride[3] = src offset
    ivec4 size;//region size for the 3 axes; size[3] = total element count
    ivec4 extent;//dst strides for the 3 axes; extent[3] = dst offset
    ivec4 imageSize;// {srcW, srcH, dstW, dstH}
    ivec2 depth;//channel/4 (c4) depth for src and dst images
};
24 
writeSamplerInfo(SamplerInfo & info,const Tensor::InsideDescribe::Region & sampler)25 static void writeSamplerInfo(SamplerInfo& info, const Tensor::InsideDescribe::Region& sampler) {
26     int sizeTotal = 1;
27     for (int i=0; i<3; ++i) {
28         info.size[i] = sampler.size[i];
29         info.stride[i] = sampler.src.stride[i];
30         info.extent[i] = sampler.dst.stride[i];
31         sizeTotal *= info.size[i];
32     }
33     info.size[3] = sizeTotal;
34     info.stride[3] = sampler.src.offset;
35     info.extent[3] = sampler.dst.offset;
36 }
37 
onEncodeFast(const Tensor * input,const Tensor * output,const VulkanCommandPool::Buffer * cmdBuffer,bool zero)38 void VulkanRaster::onEncodeFast(const Tensor* input, const Tensor* output, const VulkanCommandPool::Buffer *cmdBuffer, bool zero) {
39     auto des = TensorUtils::getDescribe(input);
40     mBlitImages.resize(des->regions.size());
41     auto vkBn = static_cast<VulkanBackend*>(backend());
42     auto dstTensor = reinterpret_cast<VulkanTensor*>(output->deviceId());
43     if (zero) {
44         auto fillPipeline = vkBn->getPipeline("glsl_fill_image_comp", {
45             VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
46             VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER
47         });
48         struct FillImage {
49             vec4 value;
50             ivec4 imageSize;
51         };
52         FillImage uniformInfo;
53         ::memset(&uniformInfo, 0, sizeof(FillImage));
54         auto image = dstTensor->image();
55         uniformInfo.imageSize[0] = image->width();
56         uniformInfo.imageSize[1] = image->height();
57         uniformInfo.imageSize[2] = 0;
58         uniformInfo.imageSize[3] = image->width() * image->height();
59         std::shared_ptr<VulkanBuffer> uniform(new VulkanBuffer(vkBn->getMemoryPool(), false, sizeof(FillImage), &uniformInfo, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT));
60         mExtraUniform.emplace_back(uniform);
61         std::shared_ptr<VulkanPipeline::DescriptorSet> des(fillPipeline->createSet());
62         des->writeImage(image->view(), vkBn->getCommonSampler()->get(), VK_IMAGE_LAYOUT_GENERAL, 0);
63         des->writeBuffer(uniform->buffer(), 1, uniform->size());
64         auto totalSize = UP_DIV(uniformInfo.imageSize[3], 256);
65         mExtraDescribes.emplace_back(des);
66         fillPipeline->bind(cmdBuffer->get(), des->get());
67         vkCmdDispatch(cmdBuffer->get(), totalSize, 1, 1);
68         cmdBuffer->barrierImage(image->get(), VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_GENERAL, VulkanCommandPool::Buffer::WRITE_WRITE);
69     }
70 
71     auto blitPipeline = vkBn->getPipeline("glsl_blit_image_comp", {
72         VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
73         VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
74         VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER
75     });
76 
77     for (int i=0; i< des->regions.size(); ++i) {
78         auto& slice = des->regions[i];
79         Tensor::InsideDescribe::Region newRegion;
80         OpCommonUtils::turnToPackRegion(slice, newRegion, output, 4);
81         // TODO: Find better way
82         newRegion.dst.offset /= 4;
83         newRegion.src.offset /= 4;
84         auto& dst = mBlitImages[i];
85         SamplerInfo info;
86         writeSamplerInfo(info, newRegion);
87         auto nhwcSrc = VulkanTensor::tensorShapeFormat(slice.origin);
88         auto nhwcDst = VulkanTensor::tensorShapeFormat(output);
89         info.imageSize[0] = nhwcSrc[2];
90         info.imageSize[1] = nhwcSrc[1];
91         info.imageSize[2] = nhwcDst[2];
92         info.imageSize[3] = nhwcDst[1];
93         info.depth[0] = UP_DIV(nhwcSrc[3], 4);
94         info.depth[1] = UP_DIV(nhwcDst[3], 4);
95         auto total = info.size[0] * info.size[1] * info.size[2];
96         auto group = UP_DIV(total, 256);
97         dst.describe.reset(blitPipeline->createSet());
98         dst.uniform.reset(new VulkanBuffer(vkBn->getMemoryPool(), false, sizeof(SamplerInfo), &info, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT));
99         auto srcTensor = reinterpret_cast<VulkanTensor*>(slice.origin->deviceId());
100         dst.describe->writeImage(srcTensor->image()->view(), vkBn->getCommonSampler()->get(), VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, 1);
101         dst.describe->writeImage(dstTensor->image()->view(), vkBn->getCommonSampler()->get(), VK_IMAGE_LAYOUT_GENERAL, 0);
102         dst.describe->writeBuffer(dst.uniform->buffer(), 2, dst.uniform->size());
103         cmdBuffer->barrierImage(srcTensor->image()->get(), VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
104         blitPipeline->bind(cmdBuffer->get(), dst.describe->get());
105         vkCmdDispatch(cmdBuffer->get(), group, 1, 1);
106     }
107 }
108 
109 
// Encode a raster (region-copy) op. If output and all region sources are
// NC4HW4 and every region can be blitted 4-aligned, take the image fast path;
// otherwise fall back to converting sources to NCHW buffers, blitting with a
// buffer shader, and converting the result back into the output image.
ErrorCode VulkanRaster::onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
                           const VulkanCommandPool::Buffer *cmdBuffer) {
    MNN_ASSERT(inputs.size() == 1);
    MNN_ASSERT(outputs.size() == 1);
    auto input = inputs[0];
    auto output = outputs[0];
    auto des = TensorUtils::getDescribe(input);
    auto outputDes = TensorUtils::getDescribe(output);
    // If the regions don't cover the whole output, it must be zeroed first.
    bool needZero = !TensorUtils::regionIsFull(input);

    /** Alloc Begin*/
    mInputBuffers.clear();
    mOutputBuffer.buffer = nullptr;
    mBlits.resize(des->regions.size());
    mBlitImages.clear();
    mExtraUniform.clear();
    mExtraDescribes.clear();
    if (outputDes->dimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
        // TODO: Optimize it
        // Fast path is only valid when every source region is also NC4HW4
        // and satisfies the 4-aligned blit constraints.
        bool fast = true;
        for (int i=0; i< des->regions.size(); ++i) {
            auto& slice = des->regions[i];
            if (TensorUtils::getDescribe(slice.origin)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) {
                fast = false;
                break;
            }
            if (!OpCommonUtils::canBlitFast(slice, output)) {
                fast = false;
                break;
            }
        }
        if (fast) {
            onEncodeFast(input, output, cmdBuffer, needZero);
            return NO_ERROR;
        }
    }
    auto vkBn = static_cast<VulkanBackend*>(backend());
    std::vector<VkDescriptorType> nchwConvertTypes{VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
                                        VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER};
    // Allocate one staging buffer + converter per distinct source tensor.
    for (auto& slice : des->regions) {
        auto origin = slice.origin;
        if (mInputBuffers.find(origin)!=mInputBuffers.end()) {
            continue;
        }
        MNN_ASSERT(origin->deviceId() != 0);
        int bufferSize = sizeof(float);
        for (int i=0; i<origin->dimensions(); ++i) {
            bufferSize *= origin->length(i);
        }
        ivec4 dims;
        writeNCHW(dims, origin);
        ConvertInfo info;
        info.buffer.reset(new VulkanBuffer(vkBn->getDynamicMemoryPool(),
                                           false, bufferSize, nullptr, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT));
        info.convert.reset(new VulkanImageConverter(vkBn));
        mInputBuffers.insert(std::make_pair(origin, std::move(info)));
    }
    // Staging buffer for the assembled output before conversion to image.
    {
        int bufferSize = sizeof(float);
        for (int i=0; i<output->dimensions(); ++i) {
            bufferSize *= output->length(i);
        }
        mOutputBuffer.convert.reset(new VulkanImageConverter(vkBn));
        mOutputBuffer.buffer.reset(new VulkanBuffer(vkBn->getDynamicMemoryPool(), false, bufferSize, nullptr, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT));
    }
    // Release the dynamic-pool allocations now so the memory can be reused by
    // later ops; the handles stay valid for this command buffer's encoding.
    for (auto& iter : mInputBuffers) {
        iter.second.buffer->release();
    }
    if (nullptr != mOutputBuffer.buffer) {
        mOutputBuffer.buffer->release();
    }
    auto blitPipeline = vkBn->getPipeline("glsl_blit_comp", {
        VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER
    });
    // Prepare one buffer-to-buffer blit per region.
    for (int i=0; i<mBlits.size(); ++i) {
        auto& origin = des->regions[i];
        auto& dst = mBlits[i];
        SamplerInfo info;
        writeSamplerInfo(info, origin);
        auto total = info.size[0] * info.size[1] * info.size[2];
        dst.workGroup[2] = 1;
        dst.workGroup[1] = 1;
        // 256 elements per workgroup (must match the shader's local size).
        dst.workGroup[0] = UP_DIV(total, 256);
        dst.pipeline = blitPipeline;
        dst.describe.reset(blitPipeline->createSet());
        dst.uniform.reset(new VulkanBuffer(vkBn->getMemoryPool(), false, sizeof(info), &info, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT));
        auto srcIter = mInputBuffers.find(origin.origin);
        dst.srcBuffer = srcIter->second.buffer->buffer();
        dst.srcBufferSize = srcIter->second.buffer->size();
        dst.dstBuffer = mOutputBuffer.buffer->buffer();
        dst.dstBufferSize = mOutputBuffer.buffer->size();
    }
    if (needZero) {
        mZero.dstBuffer = mOutputBuffer.buffer->buffer();
        mZero.dstBufferSize = mOutputBuffer.buffer->size();
    }
    /** Alloc End*/

    /** Encode Begin*/
    // Convert NC4HW4 image to buffer
    for (auto& iter : mInputBuffers) {
        auto& info = iter.second;
        info.convert->encodeTensorToBuffer(iter.first, info.buffer->buffer(), info.buffer->size(), 0, VulkanImageConverter::getTensorLinearFormat(iter.first), cmdBuffer);
    }
    //Blit
    if (needZero) {
        // Clear the staging output, then fence the fill against blit writes.
        vkCmdFillBuffer(cmdBuffer->get(), mZero.dstBuffer, 0, mZero.dstBufferSize, 0);
        cmdBuffer->barrierSource(mZero.dstBuffer, 0, mZero.dstBufferSize, VulkanCommandPool::Buffer::WRITE_WRITE);
    }
    for (auto& info : mBlits) {
        info.describe->writeBuffer(info.dstBuffer, 0, info.dstBufferSize);
        info.describe->writeBuffer(info.srcBuffer, 1, info.srcBufferSize);
        info.describe->writeBuffer(info.uniform->buffer(), 2, info.uniform->size());
        info.pipeline->bind(cmdBuffer->get(), info.describe->get());
        // Ensure the image->buffer conversion finished before reading src.
        cmdBuffer->barrierSource(info.srcBuffer, 0, info.srcBufferSize);
        vkCmdDispatch(cmdBuffer->get(), info.workGroup[0], info.workGroup[1], info.workGroup[2]);
    }

    // Convert buffer to NC4HW4 image
    {
        auto& info = mOutputBuffer;
        info.convert->encodeBufferToTensor(info.buffer->buffer(), output, info.buffer->size(), 0, VulkanImageConverter::getTensorLinearFormat(output), cmdBuffer);
    }
    /** Encode End*/
    return NO_ERROR;
}
238 class VulkanRasterCreator : public VulkanBackend::Creator {
239 public:
onCreate(const std::vector<Tensor * > & inputs,const std::vector<Tensor * > & outputs,const MNN::Op * op,Backend * bn) const240     virtual VulkanBasicExecution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, const MNN::Op* op, Backend* bn) const override {
241         return new VulkanRaster(bn);
242     }
243 };
244 
245 
246 class VulkanLoop : public VulkanBasicExecution {
247 public:
VulkanLoop(Backend * bn,const LoopParam * loop)248     VulkanLoop(Backend *bn, const LoopParam* loop) : VulkanBasicExecution(bn) {
249         mLoop = loop;
250     }
251     virtual ~VulkanLoop() = default;
252 
onEncode(const std::vector<Tensor * > & inputs,const std::vector<Tensor * > & outputs,const VulkanCommandPool::Buffer * cmdBuffer)253     virtual ErrorCode onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
254                                const VulkanCommandPool::Buffer *cmdBuffer) override {
255         mExecutions.clear();
256         auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
257         std::vector<Tensor*> tensors(mLoop->tensorNumber());
258         for (int i=0; i<mLoop->inputIndexes()->size(); ++i) {
259             tensors[mLoop->inputIndexes()->data()[i]] = inputs[i];
260         }
261         for (int i=0; i<mLoop->outputIndexes()->size(); ++i) {
262             tensors[mLoop->outputIndexes()->data()[i]] = outputs[i];
263         }
264         auto C = tensors[cmd->indexes()->data()[0]];
265         auto A = tensors[cmd->indexes()->data()[1]];
266         auto B = tensors[cmd->indexes()->data()[2]];
267         for (int i=0; i<mLoop->loopNumber(); ++i) {
268             VulkanMatMul::MatMulInfo matInfo;
269             matInfo.e = cmd->size()->data()[0];
270             matInfo.l = cmd->size()->data()[1];
271             matInfo.h = cmd->size()->data()[2];
272             matInfo.offsetC = cmd->view()->GetAs<View>(0)->offset() + i * cmd->steps()->data()[0];
273             matInfo.offsetA = cmd->view()->GetAs<View>(1)->offset() + i * cmd->steps()->data()[1];
274             matInfo.offsetB = cmd->view()->GetAs<View>(2)->offset() + i * cmd->steps()->data()[2];
275             ::memcpy(matInfo.aStride, cmd->view()->GetAs<View>(1)->stride()->data(), 3 * sizeof(int));
276             ::memcpy(matInfo.bStride, cmd->view()->GetAs<View>(2)->stride()->data(), 3 * sizeof(int));
277             ::memcpy(matInfo.cStride, cmd->view()->GetAs<View>(0)->stride()->data(), 3 * sizeof(int));
278             Tensor* bias = nullptr;
279             if (cmd->indexes()->size() > 3) {
280                 bias = tensors[cmd->indexes()->data()[3]];
281                 matInfo.offsetBias = cmd->view()->GetAs<View>(3)->offset() + i * cmd->steps()->data()[3];
282             }
283             auto matmulOp = cmd->op();
284             std::shared_ptr<VulkanBasicExecution> exe(new VulkanMatMul(matmulOp->main_as_MatMul()->transposeA(), matmulOp->main_as_MatMul()->transposeB(), backend()));
285             auto matmulExe = static_cast<VulkanMatMul*>(exe.get());
286             bool res = true;
287             if (bias == nullptr) {
288                 res = matmulExe->encode({{A, B}}, {C}, cmdBuffer, matInfo);
289             } else {
290                 res = matmulExe->encode({{A, B, bias}}, {C}, cmdBuffer, matInfo);
291             }
292             if (!res) {
293                 return NOT_SUPPORT;
294             }
295             mExecutions.emplace_back(exe);
296         }
297         return NO_ERROR;
298     }
299 private:
300     std::vector<std::shared_ptr<VulkanBasicExecution>> mExecutions;
301     const LoopParam* mLoop;
302 };
303 
304 class VulkanLoopCreator : public VulkanBackend::Creator {
305 public:
onCreate(const std::vector<Tensor * > & inputs,const std::vector<Tensor * > & outputs,const MNN::Op * op,Backend * bn) const306     virtual VulkanBasicExecution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, const MNN::Op* op, Backend* bn) const override {
307         auto loop = op->main_as_LoopParam();
308         if (1 != loop->commands()->size()) {
309             return nullptr;
310         }
311         auto cmd = loop->commands()->GetAs<RegionCommand>(0);
312         if (OpType_MatMul != cmd->op()->type()) {
313             return nullptr;
314         }
315         return new VulkanLoop(bn, loop);
316     }
317 };
318 
319 
__anon11930a7e0102() 320 static bool gResistor = []() {
321     VulkanBackend::addCreator(OpType_Raster, new VulkanRasterCreator);
322     VulkanBackend::addCreator(OpType_While, new VulkanLoopCreator);
323     return true;
324 }();
325 
326 };
327