1 #include "VulkanRaster.hpp"
2 #include "VulkanMatMul.hpp"
3 #include "core/TensorUtils.hpp"
4 #include <algorithm>
5 #include "core/OpCommonUtils.hpp"
6 #include "core/Macro.h"
7 namespace MNN {
writeNCHW(int * dims,Tensor * origin)8 static void writeNCHW(int* dims, Tensor* origin) {
9 int w = std::max(origin->width(), 1);
10 int h = std::max(origin->height(), 1);
11 int b = origin->batch();
12 dims[0] = w;
13 dims[1] = h;
14 dims[2] = origin->channel();
15 dims[3] = b;
16 }
// Uniform-buffer layout shared with the blit compute shaders. Each ivec4
// packs three per-axis values plus one extra scalar in the .w slot.
struct SamplerInfo {
    ivec4 stride;   // src strides for the 3 region axes; .w = src element offset
    ivec4 size;     // region size per axis; .w = total element count
    ivec4 extent;   // dst strides for the 3 axes; .w = dst element offset
    ivec4 imageSize;// src image w/h in .xy, dst image w/h in .zw
    ivec2 depth;    // c4 (channel/4) depth for src (.x) and dst (.y)
};
24
writeSamplerInfo(SamplerInfo & info,const Tensor::InsideDescribe::Region & sampler)25 static void writeSamplerInfo(SamplerInfo& info, const Tensor::InsideDescribe::Region& sampler) {
26 int sizeTotal = 1;
27 for (int i=0; i<3; ++i) {
28 info.size[i] = sampler.size[i];
29 info.stride[i] = sampler.src.stride[i];
30 info.extent[i] = sampler.dst.stride[i];
31 sizeTotal *= info.size[i];
32 }
33 info.size[3] = sizeTotal;
34 info.stride[3] = sampler.src.offset;
35 info.extent[3] = sampler.dst.offset;
36 }
37
onEncodeFast(const Tensor * input,const Tensor * output,const VulkanCommandPool::Buffer * cmdBuffer,bool zero)38 void VulkanRaster::onEncodeFast(const Tensor* input, const Tensor* output, const VulkanCommandPool::Buffer *cmdBuffer, bool zero) {
39 auto des = TensorUtils::getDescribe(input);
40 mBlitImages.resize(des->regions.size());
41 auto vkBn = static_cast<VulkanBackend*>(backend());
42 auto dstTensor = reinterpret_cast<VulkanTensor*>(output->deviceId());
43 if (zero) {
44 auto fillPipeline = vkBn->getPipeline("glsl_fill_image_comp", {
45 VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
46 VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER
47 });
48 struct FillImage {
49 vec4 value;
50 ivec4 imageSize;
51 };
52 FillImage uniformInfo;
53 ::memset(&uniformInfo, 0, sizeof(FillImage));
54 auto image = dstTensor->image();
55 uniformInfo.imageSize[0] = image->width();
56 uniformInfo.imageSize[1] = image->height();
57 uniformInfo.imageSize[2] = 0;
58 uniformInfo.imageSize[3] = image->width() * image->height();
59 std::shared_ptr<VulkanBuffer> uniform(new VulkanBuffer(vkBn->getMemoryPool(), false, sizeof(FillImage), &uniformInfo, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT));
60 mExtraUniform.emplace_back(uniform);
61 std::shared_ptr<VulkanPipeline::DescriptorSet> des(fillPipeline->createSet());
62 des->writeImage(image->view(), vkBn->getCommonSampler()->get(), VK_IMAGE_LAYOUT_GENERAL, 0);
63 des->writeBuffer(uniform->buffer(), 1, uniform->size());
64 auto totalSize = UP_DIV(uniformInfo.imageSize[3], 256);
65 mExtraDescribes.emplace_back(des);
66 fillPipeline->bind(cmdBuffer->get(), des->get());
67 vkCmdDispatch(cmdBuffer->get(), totalSize, 1, 1);
68 cmdBuffer->barrierImage(image->get(), VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_GENERAL, VulkanCommandPool::Buffer::WRITE_WRITE);
69 }
70
71 auto blitPipeline = vkBn->getPipeline("glsl_blit_image_comp", {
72 VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
73 VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
74 VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER
75 });
76
77 for (int i=0; i< des->regions.size(); ++i) {
78 auto& slice = des->regions[i];
79 Tensor::InsideDescribe::Region newRegion;
80 OpCommonUtils::turnToPackRegion(slice, newRegion, output, 4);
81 // TODO: Find better way
82 newRegion.dst.offset /= 4;
83 newRegion.src.offset /= 4;
84 auto& dst = mBlitImages[i];
85 SamplerInfo info;
86 writeSamplerInfo(info, newRegion);
87 auto nhwcSrc = VulkanTensor::tensorShapeFormat(slice.origin);
88 auto nhwcDst = VulkanTensor::tensorShapeFormat(output);
89 info.imageSize[0] = nhwcSrc[2];
90 info.imageSize[1] = nhwcSrc[1];
91 info.imageSize[2] = nhwcDst[2];
92 info.imageSize[3] = nhwcDst[1];
93 info.depth[0] = UP_DIV(nhwcSrc[3], 4);
94 info.depth[1] = UP_DIV(nhwcDst[3], 4);
95 auto total = info.size[0] * info.size[1] * info.size[2];
96 auto group = UP_DIV(total, 256);
97 dst.describe.reset(blitPipeline->createSet());
98 dst.uniform.reset(new VulkanBuffer(vkBn->getMemoryPool(), false, sizeof(SamplerInfo), &info, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT));
99 auto srcTensor = reinterpret_cast<VulkanTensor*>(slice.origin->deviceId());
100 dst.describe->writeImage(srcTensor->image()->view(), vkBn->getCommonSampler()->get(), VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, 1);
101 dst.describe->writeImage(dstTensor->image()->view(), vkBn->getCommonSampler()->get(), VK_IMAGE_LAYOUT_GENERAL, 0);
102 dst.describe->writeBuffer(dst.uniform->buffer(), 2, dst.uniform->size());
103 cmdBuffer->barrierImage(srcTensor->image()->get(), VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
104 blitPipeline->bind(cmdBuffer->get(), dst.describe->get());
105 vkCmdDispatch(cmdBuffer->get(), group, 1, 1);
106 }
107 }
108
109
// Generic raster path: convert each distinct source tensor to a linear
// buffer, run buffer->buffer region blits with a compute shader, then
// convert the merged buffer back into the output image. When the output is
// NC4HW4 and every region qualifies, delegates to the image->image fast
// path (onEncodeFast) instead.
ErrorCode VulkanRaster::onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
                                 const VulkanCommandPool::Buffer *cmdBuffer) {
    MNN_ASSERT(inputs.size() == 1);
    MNN_ASSERT(outputs.size() == 1);
    auto input = inputs[0];
    auto output = outputs[0];
    auto des = TensorUtils::getDescribe(input);
    auto outputDes = TensorUtils::getDescribe(output);
    // If the regions don't cover the whole output, it must be zeroed first.
    bool needZero = !TensorUtils::regionIsFull(input);

    /** Alloc Begin*/
    mInputBuffers.clear();
    mOutputBuffer.buffer = nullptr;
    mBlits.resize(des->regions.size());
    mBlitImages.clear();
    mExtraUniform.clear();
    mExtraDescribes.clear();
    if (outputDes->dimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
        // TODO: Optimize it
        // Fast path applies only if every source is NC4HW4 and each region
        // can be blitted in packed (c4) coordinates.
        bool fast = true;
        for (int i=0; i< des->regions.size(); ++i) {
            auto& slice = des->regions[i];
            if (TensorUtils::getDescribe(slice.origin)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) {
                fast = false;
                break;
            }
            if (!OpCommonUtils::canBlitFast(slice, output)) {
                fast = false;
                break;
            }
        }
        if (fast) {
            onEncodeFast(input, output, cmdBuffer, needZero);
            return NO_ERROR;
        }
    }
    auto vkBn = static_cast<VulkanBackend*>(backend());
    std::vector<VkDescriptorType> nchwConvertTypes{VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
        VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER};
    // Allocate one staging buffer + converter per distinct source tensor
    // (several regions may share the same origin).
    for (auto& slice : des->regions) {
        auto origin = slice.origin;
        if (mInputBuffers.find(origin)!=mInputBuffers.end()) {
            continue;
        }
        MNN_ASSERT(origin->deviceId() != 0);
        int bufferSize = sizeof(float);
        for (int i=0; i<origin->dimensions(); ++i) {
            bufferSize *= origin->length(i);
        }
        ivec4 dims;
        writeNCHW(dims, origin);
        ConvertInfo info;
        info.buffer.reset(new VulkanBuffer(vkBn->getDynamicMemoryPool(),
                                           false, bufferSize, nullptr, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT));
        info.convert.reset(new VulkanImageConverter(vkBn));
        mInputBuffers.insert(std::make_pair(origin, std::move(info)));
    }
    {
        // Staging buffer that accumulates all blit results for the output.
        int bufferSize = sizeof(float);
        for (int i=0; i<output->dimensions(); ++i) {
            bufferSize *= output->length(i);
        }
        mOutputBuffer.convert.reset(new VulkanImageConverter(vkBn));
        mOutputBuffer.buffer.reset(new VulkanBuffer(vkBn->getDynamicMemoryPool(), false, bufferSize, nullptr, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT));
    }
    // Release the dynamic-pool allocations now so later executions can reuse
    // the memory; the VkBuffer handles captured below stay valid for this
    // command buffer.
    for (auto& iter : mInputBuffers) {
        iter.second.buffer->release();
    }
    if (nullptr != mOutputBuffer.buffer) {
        mOutputBuffer.buffer->release();
    }
    auto blitPipeline = vkBn->getPipeline("glsl_blit_comp", {
        VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER
    });
    // Build one dispatch description per region.
    for (int i=0; i<mBlits.size(); ++i) {
        auto& origin = des->regions[i];
        auto& dst = mBlits[i];
        SamplerInfo info;
        writeSamplerInfo(info, origin);
        auto total = info.size[0] * info.size[1] * info.size[2];
        dst.workGroup[2] = 1;
        dst.workGroup[1] = 1;
        // 256 = compute shader local workgroup size.
        dst.workGroup[0] = UP_DIV(total, 256);
        dst.pipeline = blitPipeline;
        dst.describe.reset(blitPipeline->createSet());
        dst.uniform.reset(new VulkanBuffer(vkBn->getMemoryPool(), false, sizeof(info), &info, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT));
        auto srcIter = mInputBuffers.find(origin.origin);
        dst.srcBuffer = srcIter->second.buffer->buffer();
        dst.srcBufferSize = srcIter->second.buffer->size();
        dst.dstBuffer = mOutputBuffer.buffer->buffer();
        dst.dstBufferSize = mOutputBuffer.buffer->size();
    }
    if (needZero) {
        mZero.dstBuffer = mOutputBuffer.buffer->buffer();
        mZero.dstBufferSize = mOutputBuffer.buffer->size();
    }
    /** Alloc End*/

    /** Encode Begin*/
    // Convert NC4HW4 image to buffer
    for (auto& iter : mInputBuffers) {
        auto& info = iter.second;
        info.convert->encodeTensorToBuffer(iter.first, info.buffer->buffer(), info.buffer->size(), 0, VulkanImageConverter::getTensorLinearFormat(iter.first), cmdBuffer);
    }
    //Blit
    if (needZero) {
        // Clear the whole output staging buffer, then fence the fill against
        // the blit writes that follow.
        vkCmdFillBuffer(cmdBuffer->get(), mZero.dstBuffer, 0, mZero.dstBufferSize, 0);
        cmdBuffer->barrierSource(mZero.dstBuffer, 0, mZero.dstBufferSize, VulkanCommandPool::Buffer::WRITE_WRITE);
    }
    for (auto& info : mBlits) {
        info.describe->writeBuffer(info.dstBuffer, 0, info.dstBufferSize);
        info.describe->writeBuffer(info.srcBuffer, 1, info.srcBufferSize);
        info.describe->writeBuffer(info.uniform->buffer(), 2, info.uniform->size());
        info.pipeline->bind(cmdBuffer->get(), info.describe->get());
        // Make the image->buffer conversion visible before reading the source.
        cmdBuffer->barrierSource(info.srcBuffer, 0, info.srcBufferSize);
        vkCmdDispatch(cmdBuffer->get(), info.workGroup[0], info.workGroup[1], info.workGroup[2]);
    }

    // Convert buffer to NC4HW4 image
    {
        auto& info = mOutputBuffer;
        info.convert->encodeBufferToTensor(info.buffer->buffer(), output, info.buffer->size(), 0, VulkanImageConverter::getTensorLinearFormat(output), cmdBuffer);
    }
    /** Encode End*/
    return NO_ERROR;
}
238 class VulkanRasterCreator : public VulkanBackend::Creator {
239 public:
onCreate(const std::vector<Tensor * > & inputs,const std::vector<Tensor * > & outputs,const MNN::Op * op,Backend * bn) const240 virtual VulkanBasicExecution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, const MNN::Op* op, Backend* bn) const override {
241 return new VulkanRaster(bn);
242 }
243 };
244
245
// Executes a LoopParam whose single command is a MatMul: each of the
// loopNumber() iterations encodes one VulkanMatMul with per-iteration
// offsets derived from the command's views and steps.
class VulkanLoop : public VulkanBasicExecution {
public:
    VulkanLoop(Backend *bn, const LoopParam* loop) : VulkanBasicExecution(bn) {
        // mLoop points into the op's flatbuffer; the op outlives this execution.
        mLoop = loop;
    }
    virtual ~VulkanLoop() = default;

    virtual ErrorCode onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
                               const VulkanCommandPool::Buffer *cmdBuffer) override {
        mExecutions.clear();
        auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
        // Map loop-local tensor indexes to the actual input/output tensors.
        std::vector<Tensor*> tensors(mLoop->tensorNumber());
        for (int i=0; i<mLoop->inputIndexes()->size(); ++i) {
            tensors[mLoop->inputIndexes()->data()[i]] = inputs[i];
        }
        for (int i=0; i<mLoop->outputIndexes()->size(); ++i) {
            tensors[mLoop->outputIndexes()->data()[i]] = outputs[i];
        }
        // Command index order: 0 = C (dst), 1 = A, 2 = B, optional 3 = bias.
        auto C = tensors[cmd->indexes()->data()[0]];
        auto A = tensors[cmd->indexes()->data()[1]];
        auto B = tensors[cmd->indexes()->data()[2]];
        for (int i=0; i<mLoop->loopNumber(); ++i) {
            VulkanMatMul::MatMulInfo matInfo;
            // MatMul dimensions: C[e, h] = A[e, l] * B[l, h].
            matInfo.e = cmd->size()->data()[0];
            matInfo.l = cmd->size()->data()[1];
            matInfo.h = cmd->size()->data()[2];
            // Per-iteration offsets: base view offset advanced by i * step.
            matInfo.offsetC = cmd->view()->GetAs<View>(0)->offset() + i * cmd->steps()->data()[0];
            matInfo.offsetA = cmd->view()->GetAs<View>(1)->offset() + i * cmd->steps()->data()[1];
            matInfo.offsetB = cmd->view()->GetAs<View>(2)->offset() + i * cmd->steps()->data()[2];
            ::memcpy(matInfo.aStride, cmd->view()->GetAs<View>(1)->stride()->data(), 3 * sizeof(int));
            ::memcpy(matInfo.bStride, cmd->view()->GetAs<View>(2)->stride()->data(), 3 * sizeof(int));
            ::memcpy(matInfo.cStride, cmd->view()->GetAs<View>(0)->stride()->data(), 3 * sizeof(int));
            Tensor* bias = nullptr;
            if (cmd->indexes()->size() > 3) {
                bias = tensors[cmd->indexes()->data()[3]];
                matInfo.offsetBias = cmd->view()->GetAs<View>(3)->offset() + i * cmd->steps()->data()[3];
            }
            auto matmulOp = cmd->op();
            std::shared_ptr<VulkanBasicExecution> exe(new VulkanMatMul(matmulOp->main_as_MatMul()->transposeA(), matmulOp->main_as_MatMul()->transposeB(), backend()));
            auto matmulExe = static_cast<VulkanMatMul*>(exe.get());
            bool res = true;
            if (bias == nullptr) {
                res = matmulExe->encode({{A, B}}, {C}, cmdBuffer, matInfo);
            } else {
                res = matmulExe->encode({{A, B, bias}}, {C}, cmdBuffer, matInfo);
            }
            if (!res) {
                return NOT_SUPPORT;
            }
            // Keep the execution alive: it owns resources referenced by the
            // recorded command buffer.
            mExecutions.emplace_back(exe);
        }
        return NO_ERROR;
    }
private:
    std::vector<std::shared_ptr<VulkanBasicExecution>> mExecutions; // one per loop iteration
    const LoopParam* mLoop; // non-owning; backed by the op's flatbuffer
};
303
304 class VulkanLoopCreator : public VulkanBackend::Creator {
305 public:
onCreate(const std::vector<Tensor * > & inputs,const std::vector<Tensor * > & outputs,const MNN::Op * op,Backend * bn) const306 virtual VulkanBasicExecution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, const MNN::Op* op, Backend* bn) const override {
307 auto loop = op->main_as_LoopParam();
308 if (1 != loop->commands()->size()) {
309 return nullptr;
310 }
311 auto cmd = loop->commands()->GetAs<RegionCommand>(0);
312 if (OpType_MatMul != cmd->op()->type()) {
313 return nullptr;
314 }
315 return new VulkanLoop(bn, loop);
316 }
317 };
318
319
__anon11930a7e0102() 320 static bool gResistor = []() {
321 VulkanBackend::addCreator(OpType_Raster, new VulkanRasterCreator);
322 VulkanBackend::addCreator(OpType_While, new VulkanLoopCreator);
323 return true;
324 }();
325
326 };
327