1 //
2 // ConvBufWinograd.cpp
3 // MNN
4 //
5 // Created by MNN on 2019/01/08.
6 // Copyright © 2018, Alibaba Group Holding Limited
7 //
8
9 #ifndef MNN_OPENCL_BUFFER_CLOSED
10
11 #include "backend/opencl/execution/buffer/ConvBufWinograd.hpp"
12 #include "core/Backend.hpp"
13 #include "core/ConvolutionCommon.hpp"
14 #include "math/WingoradGenerater.hpp"
15 #include "backend/opencl/core/OpenCLRunningUtils.hpp"
16
17 #define UNIT 2
18 #define INTERP 1
19 namespace MNN {
20 namespace OpenCL {
valid(const Convolution2DCommon * common,const Tensor * input,int limit)21 bool ConvBufWinograd::valid(const Convolution2DCommon* common, const Tensor* input, int limit) {
22 if (common->strideX() != 1 || common->strideY() != 1) {
23 return false;
24 }
25 if (common->dilateX() != 1 || common->dilateY() != 1) {
26 return false;
27 }
28 if (input->channel() < 8 || common->outputCount() < 8) {
29 return false;
30 }
31 return (common->kernelX() == 3 && common->kernelY() == 3);
32 }
33
ConvBufWinograd(const MNN::Convolution2D * op,Backend * backend)34 ConvBufWinograd::ConvBufWinograd(const MNN::Convolution2D* op, Backend* backend) : Execution(backend) {
35 mOpenCLBackend = static_cast<OpenCLBackend*>(backend);
36 mCommon = op->common();
37 MNN_ASSERT((3 == mCommon->kernelY() && 3 == mCommon->kernelX()));
38 MNN_ASSERT(1 == mCommon->strideX() && 1 == mCommon->strideY());
39 MNN_ASSERT(1 == mCommon->dilateX() && 1 == mCommon->dilateY());
40 auto runTime = mOpenCLBackend->getOpenCLRuntime();
41 int ky = mCommon->kernelY();
42 int kx = mCommon->kernelX();
43
44 int weightSize = 0;
45 const float* filterDataPtr = nullptr;
46 std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
47 ConvolutionCommon::getConvParameters(&quanCommon, op, &filterDataPtr, &weightSize);
48
49 int oc = mCommon->outputCount();
50 int ic = weightSize / oc / mCommon->kernelX() / mCommon->kernelY();
51 auto ocC4 = UP_DIV(oc, 4);
52 auto icC4 = UP_DIV(ic, 4);
53 auto queue = runTime->commandQueue();
54
55 auto imageChannelType = CL_HALF_FLOAT;
56 if (mOpenCLBackend->getPrecision() == BackendConfig::Precision_High) {
57 imageChannelType = CL_FLOAT;
58 }
59 // Create Buffer Object
60 {
61 cl_int ret_code;
62 size_t bias_element = ALIGN_UP4(oc);
63 size_t buffer_size;
64 if(mOpenCLBackend->getOpenCLRuntime()->isSupportedFP16()) {
65 buffer_size = bias_element * sizeof(half_float::half);
66 } else {
67 buffer_size = bias_element * sizeof(float);
68 }
69
70 mBias.reset(Tensor::createDevice<float>({1, 1, 1, (int)ALIGN_UP4(oc)}));
71 mOpenCLBackend->onAcquireBuffer(mBias.get(), Backend::STATIC);
72 cl::Buffer &bias_buffer = *(cl::Buffer *)mBias->buffer().device;
73
74 auto bias_ptr = queue.enqueueMapBuffer(bias_buffer, CL_TRUE, CL_MAP_WRITE, 0, buffer_size, nullptr, nullptr, &ret_code);
75 if(bias_ptr == nullptr || ret_code) {
76 MNN_ERROR("clBuffer map error!\n");
77 }
78 ::memset(bias_ptr, 0, buffer_size);
79 if(mOpenCLBackend->getOpenCLRuntime()->isSupportedFP16()) {
80 for(int i=0; i<oc; i++) {
81 ((half_float::half *)bias_ptr)[i] = (half_float::half)op->bias()->data()[i];
82 }
83 } else {
84 ::memcpy(bias_ptr, op->bias()->data(), oc*sizeof(float));
85 }
86 queue.enqueueUnmapMemObject(bias_buffer, bias_ptr);
87
88
89 std::shared_ptr<Tensor> sourceWeight(
90 Tensor::create<float>(std::vector<int>{oc, ic, ky, kx}, (void*)(filterDataPtr), Tensor::CAFFE));
91
92 int unit = UNIT;
93 int kernelSize = kx;
94 Math::WinogradGenerater generator(unit, kernelSize, INTERP);
95 int alpha = unit + kernelSize - 1;
96 auto weightDest = generator.allocTransformWeight(sourceWeight.get());
97 generator.transformWeight(weightDest.get(), sourceWeight.get());
98 auto weightDestSize = weightDest->size();
99
100 buffer_size = weightDest->elementSize();
101 if(mOpenCLBackend->getOpenCLRuntime()->isSupportedFP16()) {
102 buffer_size *= sizeof(half_float::half);
103 } else {
104 buffer_size *= sizeof(float);
105 }
106
107 mWeight.reset(Tensor::createDevice<float>({1, ocC4 * alpha * alpha, icC4 * 4, 4}));//NHWC
108 mOpenCLBackend->onAcquireBuffer(mWeight.get(), Backend::STATIC);
109
110 cl::Buffer &weightBuffer = *(cl::Buffer *)mWeight->buffer().device;
111
112 auto weight_ptr = queue.enqueueMapBuffer(weightBuffer, CL_TRUE, CL_MAP_WRITE, 0, buffer_size, nullptr, nullptr, &ret_code);
113 if(weight_ptr != nullptr && ret_code == CL_SUCCESS){
114 if(mOpenCLBackend->getOpenCLRuntime()->isSupportedFP16()){
115 for(int i=0; i<weightDest->elementSize(); i++) {
116 ((half_float::half*)weight_ptr)[i] = (half_float::half)(weightDest->host<float>()[i]);
117 }
118 }else{
119 ::memcpy(weight_ptr, weightDest->host<float>(), buffer_size);
120 }
121 } else{
122 MNN_ERROR("Map error weightPtr == nullptr \n");
123 }
124
125 queue.enqueueUnmapMemObject(weightBuffer, weight_ptr);
126
127 }
128 }
129
~ConvBufWinograd()130 ConvBufWinograd::~ConvBufWinograd() {
131 mOpenCLBackend->onReleaseBuffer(mWeight.get(), Backend::STATIC);
132 mOpenCLBackend->onReleaseBuffer(mBias.get(), Backend::STATIC);
133 }
134
// onResize: (re)builds the three-stage Winograd pipeline for the current
// shapes — source transform, per-tile GEMM, destination transform — one
// kernel set per batch index. Also tunes local work sizes and, for the GEMM,
// picks the faster of two kernel variants.
// NOTE: argument indices below must match the OpenCL kernel signatures in
// winogradTransform_buf / gemm_buf exactly; keep the setArg order intact.
ErrorCode ConvBufWinograd::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    auto input  = inputs[0];
    auto output = outputs[0];
    mKernelX    = mCommon->kernelX();
    mKernelY    = mCommon->kernelY();
    mStrideX    = mCommon->strideX();
    mStrideY    = mCommon->strideY();

    // alpha = tile side after transform (UNIT + kernel - 1); wUnit/hUnit =
    // number of UNITxUNIT output tiles along each spatial axis.
    int alpha  = mKernelX + UNIT - 1;
    auto wUnit = UP_DIV(output->width(), UNIT);
    auto hUnit = UP_DIV(output->height(), UNIT);

    auto pad = ConvolutionCommon::convolutionPad(input, output, mCommon);
    int padY = pad.second;
    int padX = pad.first;

    auto runTime = mOpenCLBackend->getOpenCLRuntime();

    // Intermediate tensors: mSource holds transformed input tiles, mDest the
    // GEMM output before the inverse transform.
    mSource.reset(Tensor::createDevice<float>(
        std::vector<int>{alpha * alpha, input->channel(), ROUND_UP(UP_DIV(wUnit * hUnit, 4), 2), 4}, Tensor::CAFFE_C4));
    mDest.reset(Tensor::createDevice<float>(
        std::vector<int>{4, wUnit * hUnit, UP_DIV(output->channel(), 4), alpha * alpha}, Tensor::CAFFE_C4));

    // Acquire then immediately release (DYNAMIC): reserves the memory for the
    // execution phase while letting later ops reuse it afterwards.
    mOpenCLBackend->onAcquireBuffer(mSource.get(), Backend::DYNAMIC);
    mOpenCLBackend->onAcquireBuffer(mDest.get(), Backend::DYNAMIC);
    mOpenCLBackend->onReleaseBuffer(mSource.get(), Backend::DYNAMIC);
    mOpenCLBackend->onReleaseBuffer(mDest.get(), Backend::DYNAMIC);

    auto icC4 = UP_DIV(input->channel(), 4);
    auto ocC4 = UP_DIV(output->channel(), 4);

    // One kernel/worksize slot per batch element.
    uint32_t total_num = input->batch();
    mSourceTransform.resize(total_num);
    mMatMul.resize(total_num);
    mDestTransform.resize(total_num);
    mMaxWGS_S.resize(total_num);
    mMaxWGS_D.resize(total_num);
    mMaxWGS_M.resize(total_num);

    std::set<std::string> basic;
    /*Create Kernel*/
    for(int i = 0; i < total_num; i++) {
        // Kernel names are specialized by "<UNIT>_<kernel>_<INTERP>".
        char format[20];
        ::memset(format, 0, sizeof(format));
        sprintf(format, "%d_%d_%d", UNIT, mKernelX, INTERP);
        auto formatStr = std::string(format);
        mSourceTransform[i] =
            runTime->buildKernel("winogradTransform_buf",
                                 "winoTransSrcBuf" + formatStr, basic);
        mMaxWGS_S[i] = static_cast<uint32_t>(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(mSourceTransform[i]));
        {
            // Activation is fused into the destination transform kernel.
            std::set<std::string> buildOptions = basic;
            if (mCommon->relu()) {
                buildOptions.emplace("-DRELU");
            }
            if (mCommon->relu6()) {
                buildOptions.emplace("-DRELU6");
            }
            mDestTransform[i] =
                runTime->buildKernel("winogradTransform_buf",
                                     "winoTransDstBuf" + formatStr, buildOptions);
            mMaxWGS_D[i] = static_cast<uint32_t>(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(mDestTransform[i]));
        }
    }

    mGWS_S.resize(total_num);
    mGWS_D.resize(total_num);
    mGWS_M.resize(total_num);
    mLWS_S.resize(total_num);
    mLWS_D.resize(total_num);
    mLWS_M.resize(total_num);

    for (int b = 0; b < input->batch(); ++b) {
        int hCount = hUnit;
        int wCount = wUnit;

        // Source Transform: image -> Winograd domain tiles.
        {
            mGWS_S[b] = {static_cast<uint32_t>(wCount * hCount), static_cast<uint32_t>(icC4)};
            int index = 0;
            mSourceTransform[b].setArg(index++, mGWS_S[b][0]);
            mSourceTransform[b].setArg(index++, mGWS_S[b][1]);
            mSourceTransform[b].setArg(index++, openCLBuffer(input));
            mSourceTransform[b].setArg(index++, openCLBuffer(mSource.get()));
            mSourceTransform[b].setArg(index++, wCount);
            mSourceTransform[b].setArg(index++, hCount);
            mSourceTransform[b].setArg(index++, padX);
            mSourceTransform[b].setArg(index++, padY);
            mSourceTransform[b].setArg(index++, input->width());
            mSourceTransform[b].setArg(index++, input->height());
            mSourceTransform[b].setArg(index++, icC4);
            mSourceTransform[b].setArg(index++, b);

            std::string kernelName = "winoTransSrcBuf";
            mLWS_S[b] = localWS2DDefault(mGWS_S[b], mMaxWGS_S[b], mOpenCLBackend->getOpenCLRuntime(), kernelName, mSourceTransform[b]).first;
        }

        // MatMul: batched GEMM over the alpha*alpha Winograd positions.
        {
            auto gemmHeight = ocC4;
            auto gemmWidth = UP_DIV(wCount * hCount, 4);

            // Two candidate kernels (1 vs 2 output columns per work item);
            // tune both unless the tune level asks for the cheap path.
            const int total_kernel = 2;
            const std::string kernelName[total_kernel] = {"gemm_buf", "gemm_buf2"};
            int itemW[total_kernel] = {1, 2};

            int actual_kernel = total_kernel;
            if(mOpenCLBackend->getOpenCLRuntime()->getCLTuneLevel() == Normal || mOpenCLBackend->getOpenCLRuntime()->getCLTuneLevel() == Fast || mOpenCLBackend->getOpenCLRuntime()->getCLTuneLevel() == None) {
                actual_kernel = 1;
            }

            cl::Kernel kernel[total_kernel];
            std::vector<uint32_t> globalWorkSize[total_kernel];
            std::vector<uint32_t> localWorkSize[total_kernel];
            std::pair<uint32_t, int> min_cost(UINT_MAX, 0);//(min_time, min_index)
            for(int knl_idx = 0; knl_idx < actual_kernel; knl_idx++) {
                kernel[knl_idx] = mOpenCLBackend->getOpenCLRuntime()->buildKernel("gemm_buf", kernelName[knl_idx], basic);
                uint32_t maxWorkGroupSize = static_cast<uint32_t>(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(kernel[knl_idx]));

                globalWorkSize[knl_idx] = {static_cast<uint32_t>(UP_DIV(gemmWidth, itemW[knl_idx])*gemmHeight), static_cast<uint32_t>(alpha * alpha)};
                uint32_t index = 0;
                kernel[knl_idx].setArg(index++, globalWorkSize[knl_idx][0]);
                kernel[knl_idx].setArg(index++, globalWorkSize[knl_idx][1]);
                kernel[knl_idx].setArg(index++, openCLBuffer(mSource.get()));
                kernel[knl_idx].setArg(index++, openCLBuffer(mWeight.get()));
                kernel[knl_idx].setArg(index++, openCLBuffer(mDest.get()));
                kernel[knl_idx].setArg(index++, gemmWidth);
                kernel[knl_idx].setArg(index++, gemmHeight);
                kernel[knl_idx].setArg(index++, icC4);
                kernel[knl_idx].setArg(index++, alpha*alpha);

                // Time this variant and remember the fastest.
                std::pair<std::vector<uint32_t>, uint32_t> retTune;
                retTune = localWS2DDefault(globalWorkSize[knl_idx], maxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), kernelName[knl_idx], kernel[knl_idx]);
                //printf("gemm %d, %d\n", knl_idx, retTune.second);
                if(min_cost.first > retTune.second) {
                    min_cost.first = retTune.second;
                    min_cost.second = knl_idx;
                    mLWS_M[b] = {retTune.first[0], retTune.first[1]};
                }
            }
            // Rebuild the winning kernel and set its args for execution.
            int min_index = min_cost.second;
            //mKernel = kernel[min_index];
            mGWS_M[b] = {globalWorkSize[min_index][0], globalWorkSize[min_index][1]};
            mMatMul[b] = mOpenCLBackend->getOpenCLRuntime()->buildKernel("gemm_buf", kernelName[min_index], basic);

            int index = 0;
            mMatMul[b].setArg(index++, mGWS_M[b][0]);
            mMatMul[b].setArg(index++, mGWS_M[b][1]);
            mMatMul[b].setArg(index++, openCLBuffer(mSource.get()));
            mMatMul[b].setArg(index++, openCLBuffer(mWeight.get()));
            mMatMul[b].setArg(index++, openCLBuffer(mDest.get()));
            mMatMul[b].setArg(index++, gemmWidth);
            mMatMul[b].setArg(index++, gemmHeight);
            mMatMul[b].setArg(index++, icC4);
            mMatMul[b].setArg(index++, alpha*alpha);
        }

        // Dest Transform: Winograd domain -> output image (+ bias, activation).
        {
            mGWS_D[b] = {static_cast<uint32_t>(wCount*hCount), static_cast<uint32_t>(ocC4)};

            int index = 0;
            mDestTransform[b].setArg(index++, mGWS_D[b][0]);
            mDestTransform[b].setArg(index++, mGWS_D[b][1]);
            mDestTransform[b].setArg(index++, openCLBuffer(mDest.get()));
            mDestTransform[b].setArg(index++, openCLBuffer(mBias.get()));
            mDestTransform[b].setArg(index++, openCLBuffer(output));
            mDestTransform[b].setArg(index++, wCount);
            mDestTransform[b].setArg(index++, hCount);
            mDestTransform[b].setArg(index++, output->width());
            mDestTransform[b].setArg(index++, output->height());
            mDestTransform[b].setArg(index++, ocC4);
            mDestTransform[b].setArg(index++, b);

            std::string kernelName = "winoTransDstBuf";
            mLWS_D[b] = localWS2DDefault(mGWS_D[b], mMaxWGS_D[b], mOpenCLBackend->getOpenCLRuntime(), kernelName, mDestTransform[b]).first;
        }
    }

    return NO_ERROR;
}
316
// onExecute: enqueues the three pre-configured kernels (source transform,
// GEMM, destination transform) for each batch element, using the work sizes
// tuned in onResize. With ENABLE_OPENCL_TIME_PROFILER defined, each kernel is
// timed via a cl::Event and per-kernel plus total costs are printed.
ErrorCode ConvBufWinograd::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    auto input  = inputs[0];
    auto output = outputs[0];

#ifdef ENABLE_OPENCL_TIME_PROFILER
    int costTime = 0;
#endif
    for (int b = 0; b < input->batch(); ++b) {
        int index = b; // kernels/work sizes were prepared per batch in onResize
        /*Source Transform*/
        {
#ifdef ENABLE_OPENCL_TIME_PROFILER
            cl::Event event;
            runKernel2D(mSourceTransform[index], mGWS_S[index], mLWS_S[index],
                        mOpenCLBackend->getOpenCLRuntime(), &event);

            int costTime0 = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
            costTime += costTime0;
            MNN_PRINT("kernel cost:%d    us ConvWino0\n",costTime0);
#else
            runKernel2D(mSourceTransform[index], mGWS_S[index], mLWS_S[index],
                        mOpenCLBackend->getOpenCLRuntime());
#endif
        }

        /*MatMul*/
        {
#ifdef ENABLE_OPENCL_TIME_PROFILER
            cl::Event event;
            runKernel2D(mMatMul[index], mGWS_M[index], mLWS_M[index],
                        mOpenCLBackend->getOpenCLRuntime(), &event);

            int costTime1 = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
            costTime += costTime1;
            MNN_PRINT("kernel cost:%d    us ConvWino1\n",costTime1);
#else
            runKernel2D(mMatMul[index], mGWS_M[index], mLWS_M[index],
                        mOpenCLBackend->getOpenCLRuntime());
#endif
        }

        // Dest Transform
        {
#ifdef ENABLE_OPENCL_TIME_PROFILER
            cl::Event event;
            runKernel2D(mDestTransform[index], mGWS_D[index], mLWS_D[index],
                        mOpenCLBackend->getOpenCLRuntime(), &event);

            int costTime2 = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
            costTime += costTime2;
            MNN_PRINT("kernel cost:%d    us ConvWino2\n",costTime2);
#else
            runKernel2D(mDestTransform[index], mGWS_D[index], mLWS_D[index],
                        mOpenCLBackend->getOpenCLRuntime());
#endif
        }
    }
#ifdef ENABLE_OPENCL_TIME_PROFILER
    MNN_PRINT("kernel cost:%d    us ConvWino total\n",costTime);
#endif

    return NO_ERROR;
}
380
381 } // namespace OpenCL
382 } // namespace MNN
383 #endif /* MNN_OPENCL_BUFFER_CLOSED */
384