//
//  ConvBufWinograd.cpp
//  MNN
//
//  Created by MNN on 2019/01/08.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#ifndef MNN_OPENCL_BUFFER_CLOSED

#include "backend/opencl/execution/buffer/ConvBufWinograd.hpp"
#include "core/Backend.hpp"
#include "core/ConvolutionCommon.hpp"
#include "math/WingoradGenerater.hpp"
#include "backend/opencl/core/OpenCLRunningUtils.hpp"

#define UNIT 2
#define INTERP 1
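// UNIT is the Winograd output tile size and INTERP the interpolation parameter handed to
// Math::WinogradGenerater, giving an F(2x2, 3x3) transform with alpha = UNIT + 3 - 1 = 4.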
namespace MNN {
namespace OpenCL {
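// The buffer Winograd path only handles 3x3, stride-1, non-dilated convolutions and is skipped
// for small channel counts where the transform overhead outweighs the GEMM savings.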
bool ConvBufWinograd::valid(const Convolution2DCommon* common, const Tensor* input, int limit) {
    if (common->strideX() != 1 || common->strideY() != 1) {
        return false;
    }
    if (common->dilateX() != 1 || common->dilateY() != 1) {
        return false;
    }
    if (input->channel() < 8 || common->outputCount() < 8) {
        return false;
    }
    return (common->kernelX() == 3 && common->kernelY() == 3);
}

ConvBufWinograd::ConvBufWinograd(const MNN::Convolution2D* op, Backend* backend) : Execution(backend) {
    mOpenCLBackend = static_cast<OpenCLBackend*>(backend);
    mCommon        = op->common();
    MNN_ASSERT((3 == mCommon->kernelY() && 3 == mCommon->kernelX()));
    MNN_ASSERT(1 == mCommon->strideX() && 1 == mCommon->strideY());
    MNN_ASSERT(1 == mCommon->dilateX() && 1 == mCommon->dilateY());
    auto runTime = mOpenCLBackend->getOpenCLRuntime();
    int ky       = mCommon->kernelY();
    int kx       = mCommon->kernelX();

    int weightSize             = 0;
    const float* filterDataPtr = nullptr;
    std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
    ConvolutionCommon::getConvParameters(&quanCommon, op, &filterDataPtr, &weightSize);
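    // getConvParameters yields the filter as plain floats (decoding it first when the model
    // stores compressed/quantized weights), so ic can be derived from the element count below.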

    int oc     = mCommon->outputCount();
    int ic     = weightSize / oc / mCommon->kernelX() / mCommon->kernelY();
    auto ocC4  = UP_DIV(oc, 4);
    auto icC4  = UP_DIV(ic, 4);
    auto queue = runTime->commandQueue();

    auto imageChannelType = CL_HALF_FLOAT;
    if (mOpenCLBackend->getPrecision() == BackendConfig::Precision_High) {
        imageChannelType = CL_FLOAT;
    }
    // Create Buffer Object
    {
        cl_int ret_code;
        size_t bias_element = ALIGN_UP4(oc);
        size_t buffer_size;
        if(mOpenCLBackend->getOpenCLRuntime()->isSupportedFP16()) {
            buffer_size = bias_element * sizeof(half_float::half);
        } else {
            buffer_size = bias_element * sizeof(float);
        }

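        // Upload bias: zero-fill an ALIGN_UP4(oc) sized buffer and copy the oc bias values,
        // converting to half precision when the runtime runs in FP16.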
        mBias.reset(Tensor::createDevice<float>({1, 1, 1, (int)ALIGN_UP4(oc)}));
        mOpenCLBackend->onAcquireBuffer(mBias.get(), Backend::STATIC);
        cl::Buffer &bias_buffer = *(cl::Buffer *)mBias->buffer().device;

        auto bias_ptr = queue.enqueueMapBuffer(bias_buffer, CL_TRUE, CL_MAP_WRITE, 0, buffer_size, nullptr, nullptr, &ret_code);
        if(bias_ptr == nullptr || ret_code != CL_SUCCESS) {
            MNN_ERROR("clBuffer map error!\n");
        } else {
            ::memset(bias_ptr, 0, buffer_size);
            if(mOpenCLBackend->getOpenCLRuntime()->isSupportedFP16()) {
                for(int i = 0; i < oc; i++) {
                    ((half_float::half *)bias_ptr)[i] = (half_float::half)op->bias()->data()[i];
                }
            } else {
                ::memcpy(bias_ptr, op->bias()->data(), oc * sizeof(float));
            }
        }
        queue.enqueueUnmapMemObject(bias_buffer, bias_ptr);

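        // Weight transform: wrap the raw OIHW filter in a Tensor, run the CPU-side Winograd
        // weight transform (one alpha x alpha tile per output/input channel pair), then upload
        // the result into mWeight, laid out as {1, ocC4 * alpha * alpha, icC4 * 4, 4}.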
        std::shared_ptr<Tensor> sourceWeight(
            Tensor::create<float>(std::vector<int>{oc, ic, ky, kx}, (void*)(filterDataPtr), Tensor::CAFFE));

        int unit       = UNIT;
        int kernelSize = kx;
        Math::WinogradGenerater generator(unit, kernelSize, INTERP);
        int alpha       = unit + kernelSize - 1;
        auto weightDest = generator.allocTransformWeight(sourceWeight.get());
        generator.transformWeight(weightDest.get(), sourceWeight.get());
        auto weightDestSize = weightDest->size();

        buffer_size = weightDest->elementSize();
        if(mOpenCLBackend->getOpenCLRuntime()->isSupportedFP16()) {
            buffer_size *= sizeof(half_float::half);
        } else {
            buffer_size *= sizeof(float);
        }

        mWeight.reset(Tensor::createDevice<float>({1, ocC4 * alpha * alpha, icC4 * 4, 4})); // NHWC
        mOpenCLBackend->onAcquireBuffer(mWeight.get(), Backend::STATIC);

        cl::Buffer &weightBuffer = *(cl::Buffer *)mWeight->buffer().device;

        auto weight_ptr = queue.enqueueMapBuffer(weightBuffer, CL_TRUE, CL_MAP_WRITE, 0, buffer_size, nullptr, nullptr, &ret_code);
        if(weight_ptr != nullptr && ret_code == CL_SUCCESS) {
            if(mOpenCLBackend->getOpenCLRuntime()->isSupportedFP16()) {
                for(int i = 0; i < weightDest->elementSize(); i++) {
                    ((half_float::half*)weight_ptr)[i] = (half_float::half)(weightDest->host<float>()[i]);
                }
            } else {
                ::memcpy(weight_ptr, weightDest->host<float>(), buffer_size);
            }
        } else {
            MNN_ERROR("Map error weightPtr == nullptr \n");
        }

        queue.enqueueUnmapMemObject(weightBuffer, weight_ptr);
    }
}

ConvBufWinograd::~ConvBufWinograd() {
    mOpenCLBackend->onReleaseBuffer(mWeight.get(), Backend::STATIC);
    mOpenCLBackend->onReleaseBuffer(mBias.get(), Backend::STATIC);
}

ErrorCode ConvBufWinograd::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    auto input  = inputs[0];
    auto output = outputs[0];
    mKernelX    = mCommon->kernelX();
    mKernelY    = mCommon->kernelY();
    mStrideX    = mCommon->strideX();
    mStrideY    = mCommon->strideY();

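    // Tile the output into UNIT x UNIT blocks: wUnit x hUnit tiles per image, each processed in
    // the alpha x alpha (4x4) Winograd domain.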
    int alpha  = mKernelX + UNIT - 1;
    auto wUnit = UP_DIV(output->width(), UNIT);
    auto hUnit = UP_DIV(output->height(), UNIT);

    auto pad = ConvolutionCommon::convolutionPad(input, output, mCommon);
    int padY = pad.second;
    int padX = pad.first;

    auto runTime = mOpenCLBackend->getOpenCLRuntime();

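    // Intermediate tensors: mSource holds the transformed input tiles and mDest the per-tile
    // GEMM output before the inverse transform. They are acquired and released from the dynamic
    // pool right away so the same memory can be reused by later ops after this resize.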
    mSource.reset(Tensor::createDevice<float>(
        std::vector<int>{alpha * alpha, input->channel(), ROUND_UP(UP_DIV(wUnit * hUnit, 4), 2), 4}, Tensor::CAFFE_C4));
    mDest.reset(Tensor::createDevice<float>(
        std::vector<int>{4, wUnit * hUnit, UP_DIV(output->channel(), 4), alpha * alpha}, Tensor::CAFFE_C4));

    mOpenCLBackend->onAcquireBuffer(mSource.get(), Backend::DYNAMIC);
    mOpenCLBackend->onAcquireBuffer(mDest.get(), Backend::DYNAMIC);
    mOpenCLBackend->onReleaseBuffer(mSource.get(), Backend::DYNAMIC);
    mOpenCLBackend->onReleaseBuffer(mDest.get(), Backend::DYNAMIC);

    auto icC4 = UP_DIV(input->channel(), 4);
    auto ocC4 = UP_DIV(output->channel(), 4);

    uint32_t total_num = input->batch();
    mSourceTransform.resize(total_num);
    mMatMul.resize(total_num);
    mDestTransform.resize(total_num);
    mMaxWGS_S.resize(total_num);
    mMaxWGS_D.resize(total_num);
    mMaxWGS_M.resize(total_num);

    std::set<std::string> basic;
    /*Create Kernel*/
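    // One source/dest transform kernel is built per batch image. The kernel name encodes
    // UNIT, kernel size and INTERP (e.g. "winoTransSrcBuf2_3_1"), and the dest transform gets
    // the fused RELU/RELU6 build options when the layer requests them.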
    for(int i = 0; i < total_num; i++) {
        char format[20];
        ::memset(format, 0, sizeof(format));
        sprintf(format, "%d_%d_%d", UNIT, mKernelX, INTERP);
        auto formatStr = std::string(format);
        mSourceTransform[i] =
            runTime->buildKernel("winogradTransform_buf",
                                 "winoTransSrcBuf" + formatStr, basic);
        mMaxWGS_S[i] = static_cast<uint32_t>(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(mSourceTransform[i]));
        {
            std::set<std::string> buildOptions = basic;
            if (mCommon->relu()) {
                buildOptions.emplace("-DRELU");
            }
            if (mCommon->relu6()) {
                buildOptions.emplace("-DRELU6");
            }
            mDestTransform[i] =
                runTime->buildKernel("winogradTransform_buf",
                                     "winoTransDstBuf" + formatStr, buildOptions);
            mMaxWGS_D[i] = static_cast<uint32_t>(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(mDestTransform[i]));
        }
    }

    mGWS_S.resize(total_num);
    mGWS_D.resize(total_num);
    mGWS_M.resize(total_num);
    mLWS_S.resize(total_num);
    mLWS_D.resize(total_num);
    mLWS_M.resize(total_num);

    for (int b = 0; b < input->batch(); ++b) {
        int hCount = hUnit;
        int wCount = wUnit;

        // Source Transform
        {
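            // Transform each padded UNIT x UNIT input tile into the 4x4 Winograd domain;
            // the 2D global size covers (tiles, input channel blocks) for this batch image.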
            mGWS_S[b] = {static_cast<uint32_t>(wCount * hCount), static_cast<uint32_t>(icC4)};
            int index = 0;
            mSourceTransform[b].setArg(index++, mGWS_S[b][0]);
            mSourceTransform[b].setArg(index++, mGWS_S[b][1]);
            mSourceTransform[b].setArg(index++, openCLBuffer(input));
            mSourceTransform[b].setArg(index++, openCLBuffer(mSource.get()));
            mSourceTransform[b].setArg(index++, wCount);
            mSourceTransform[b].setArg(index++, hCount);
            mSourceTransform[b].setArg(index++, padX);
            mSourceTransform[b].setArg(index++, padY);
            mSourceTransform[b].setArg(index++, input->width());
            mSourceTransform[b].setArg(index++, input->height());
            mSourceTransform[b].setArg(index++, icC4);
            mSourceTransform[b].setArg(index++, b);

            std::string kernelName = "winoTransSrcBuf";
            mLWS_S[b] = localWS2DDefault(mGWS_S[b], mMaxWGS_S[b], mOpenCLBackend->getOpenCLRuntime(), kernelName, mSourceTransform[b]).first;
        }

        // MatMul
        {
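            // Batched GEMM over the alpha*alpha transformed matrices. Two candidate kernels are
            // tried ("gemm_buf" covers one output column block per work-item, "gemm_buf2" two);
            // unless the tune level limits the search, both are timed with localWS2DDefault and
            // the faster variant is rebuilt and kept as mMatMul[b] for execution.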
            auto gemmHeight = ocC4;
            auto gemmWidth = UP_DIV(wCount * hCount, 4);

            const int total_kernel = 2;
            const std::string kernelName[total_kernel] = {"gemm_buf", "gemm_buf2"};
            int itemW[total_kernel] = {1, 2};

            int actual_kernel = total_kernel;
            if(mOpenCLBackend->getOpenCLRuntime()->getCLTuneLevel() == Normal || mOpenCLBackend->getOpenCLRuntime()->getCLTuneLevel() == Fast || mOpenCLBackend->getOpenCLRuntime()->getCLTuneLevel() == None) {
                actual_kernel = 1;
            }

            cl::Kernel kernel[total_kernel];
            std::vector<uint32_t> globalWorkSize[total_kernel];
            std::vector<uint32_t> localWorkSize[total_kernel];
            std::pair<uint32_t, int> min_cost(UINT_MAX, 0); // (min_time, min_index)
            for(int knl_idx = 0; knl_idx < actual_kernel; knl_idx++) {
                kernel[knl_idx]        = mOpenCLBackend->getOpenCLRuntime()->buildKernel("gemm_buf", kernelName[knl_idx], basic);
                uint32_t maxWorkGroupSize = static_cast<uint32_t>(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(kernel[knl_idx]));

                globalWorkSize[knl_idx] = {static_cast<uint32_t>(UP_DIV(gemmWidth, itemW[knl_idx]) * gemmHeight), static_cast<uint32_t>(alpha * alpha)};
                uint32_t index = 0;
                kernel[knl_idx].setArg(index++, globalWorkSize[knl_idx][0]);
                kernel[knl_idx].setArg(index++, globalWorkSize[knl_idx][1]);
                kernel[knl_idx].setArg(index++, openCLBuffer(mSource.get()));
                kernel[knl_idx].setArg(index++, openCLBuffer(mWeight.get()));
                kernel[knl_idx].setArg(index++, openCLBuffer(mDest.get()));
                kernel[knl_idx].setArg(index++, gemmWidth);
                kernel[knl_idx].setArg(index++, gemmHeight);
                kernel[knl_idx].setArg(index++, icC4);
                kernel[knl_idx].setArg(index++, alpha * alpha);

                std::pair<std::vector<uint32_t>, uint32_t> retTune;
                retTune = localWS2DDefault(globalWorkSize[knl_idx], maxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), kernelName[knl_idx], kernel[knl_idx]);
                //printf("gemm %d, %d\n", knl_idx, retTune.second);
                if(min_cost.first > retTune.second) {
                    min_cost.first = retTune.second;
                    min_cost.second = knl_idx;
                    mLWS_M[b] = {retTune.first[0], retTune.first[1]};
                }
            }
            int min_index  = min_cost.second;
            //mKernel = kernel[min_index];
            mGWS_M[b] = {globalWorkSize[min_index][0], globalWorkSize[min_index][1]};
            mMatMul[b] = mOpenCLBackend->getOpenCLRuntime()->buildKernel("gemm_buf", kernelName[min_index], basic);

            int index = 0;
            mMatMul[b].setArg(index++, mGWS_M[b][0]);
            mMatMul[b].setArg(index++, mGWS_M[b][1]);
            mMatMul[b].setArg(index++, openCLBuffer(mSource.get()));
            mMatMul[b].setArg(index++, openCLBuffer(mWeight.get()));
            mMatMul[b].setArg(index++, openCLBuffer(mDest.get()));
            mMatMul[b].setArg(index++, gemmWidth);
            mMatMul[b].setArg(index++, gemmHeight);
            mMatMul[b].setArg(index++, icC4);
            mMatMul[b].setArg(index++, alpha * alpha);
        }

        // Dest Transform
        {
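            // Inverse Winograd transform: fold the alpha x alpha GEMM results back into UNIT x UNIT
            // output tiles, add the bias and apply any fused RELU/RELU6 (enabled via build options).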
            mGWS_D[b] = {static_cast<uint32_t>(wCount * hCount), static_cast<uint32_t>(ocC4)};

            int index = 0;
            mDestTransform[b].setArg(index++, mGWS_D[b][0]);
            mDestTransform[b].setArg(index++, mGWS_D[b][1]);
            mDestTransform[b].setArg(index++, openCLBuffer(mDest.get()));
            mDestTransform[b].setArg(index++, openCLBuffer(mBias.get()));
            mDestTransform[b].setArg(index++, openCLBuffer(output));
            mDestTransform[b].setArg(index++, wCount);
            mDestTransform[b].setArg(index++, hCount);
            mDestTransform[b].setArg(index++, output->width());
            mDestTransform[b].setArg(index++, output->height());
            mDestTransform[b].setArg(index++, ocC4);
            mDestTransform[b].setArg(index++, b);

            std::string kernelName = "winoTransDstBuf";
            mLWS_D[b] = localWS2DDefault(mGWS_D[b], mMaxWGS_D[b], mOpenCLBackend->getOpenCLRuntime(), kernelName, mDestTransform[b]).first;
        }
    }

    return NO_ERROR;
}

ErrorCode ConvBufWinograd::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    auto input  = inputs[0];
    auto output = outputs[0];

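    // Per batch image, enqueue the three stages in order: source transform, batched GEMM,
    // dest transform. With ENABLE_OPENCL_TIME_PROFILER each kernel is timed through its event.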
    #ifdef ENABLE_OPENCL_TIME_PROFILER
    int costTime = 0;
    #endif
    for (int b = 0; b < input->batch(); ++b) {
        int index = b;
        /*Source Transform*/
        {
        #ifdef ENABLE_OPENCL_TIME_PROFILER
            cl::Event event;
            runKernel2D(mSourceTransform[index], mGWS_S[index], mLWS_S[index],
                        mOpenCLBackend->getOpenCLRuntime(), &event);

            int costTime0 = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
            costTime += costTime0;
            MNN_PRINT("kernel cost:%d    us ConvWino0\n", costTime0);
        #else
            runKernel2D(mSourceTransform[index], mGWS_S[index], mLWS_S[index],
                        mOpenCLBackend->getOpenCLRuntime());
        #endif
        }

        /*MatMul*/
        {
        #ifdef ENABLE_OPENCL_TIME_PROFILER
            cl::Event event;
            runKernel2D(mMatMul[index], mGWS_M[index], mLWS_M[index],
                        mOpenCLBackend->getOpenCLRuntime(), &event);

            int costTime1 = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
            costTime += costTime1;
            MNN_PRINT("kernel cost:%d    us ConvWino1\n", costTime1);
        #else
            runKernel2D(mMatMul[index], mGWS_M[index], mLWS_M[index],
                        mOpenCLBackend->getOpenCLRuntime());
        #endif
        }

        // Dest Transform
        {
        #ifdef ENABLE_OPENCL_TIME_PROFILER
            cl::Event event;
            runKernel2D(mDestTransform[index], mGWS_D[index], mLWS_D[index],
                        mOpenCLBackend->getOpenCLRuntime(), &event);

            int costTime2 = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
            costTime += costTime2;
            MNN_PRINT("kernel cost:%d    us ConvWino2\n", costTime2);
        #else
            runKernel2D(mDestTransform[index], mGWS_D[index], mLWS_D[index],
                        mOpenCLBackend->getOpenCLRuntime());
        #endif
        }
    }
    #ifdef ENABLE_OPENCL_TIME_PROFILER
    MNN_PRINT("kernel cost:%d    us ConvWino total\n", costTime);
    #endif

    return NO_ERROR;
}

} // namespace OpenCL
} // namespace MNN
#endif /* MNN_OPENCL_BUFFER_CLOSED */