//
//  ConvExecution.cpp
//  MNN
//
//  Created by MNN on 2019/02/28.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "ConvExecution.hpp"
#include "ConvWinograd.hpp"
#include "core/ConvolutionCommon.hpp"
#include "core/Macro.h"
#include "core/TensorUtils.hpp"
#include "backend/opencl/core/OpenCLBackend.hpp"
#include "backend/opencl/core/OpenCLRunningUtils.hpp"

#define UNIT 4
namespace MNN {
namespace OpenCL {

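// ConvCommonExecution uploads the convolution bias to the device: the bias is
// padded up to a multiple of 4 floats (ALIGN_UP4), written into a host-mapped
// OpenCL buffer (as fp16 or fp32 depending on the runtime), and finally copied
// into the OpenCL image held by mBias.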
ConvCommonExecution::ConvCommonExecution(const Convolution2D *conv2dParams, Backend *backend) : Execution(backend) {
    auto openclBackend       = (OpenCLBackend *)backend;
    int biasSize             = conv2dParams->bias()->size();
    const float *biasDataPtr = conv2dParams->bias()->data();

    int buffer_size = ALIGN_UP4(biasSize);
    if(openclBackend->getOpenCLRuntime()->isWeightCpuTransHalf()) {
        buffer_size *= sizeof(half_float::half);
    } else {
        buffer_size *= sizeof(float);
    }
    cl::Buffer biasBuffer(openclBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, buffer_size);
    cl_int error;
    auto biasPtrCL = openclBackend->getOpenCLRuntime()->commandQueue().enqueueMapBuffer(
        biasBuffer, true, CL_MAP_WRITE, 0, buffer_size, nullptr, nullptr, &error);
    if(biasPtrCL != nullptr && error == CL_SUCCESS){
        if(openclBackend->getOpenCLRuntime()->isWeightCpuTransHalf()){
            for(int i=0; i<biasSize; i++) {
                ((half_float::half*)biasPtrCL)[i] = (half_float::half)(biasDataPtr[i]);
            }
            for(int i=biasSize; i<ALIGN_UP4(biasSize); i++) {
                ((half_float::half*)biasPtrCL)[i] = (half_float::half)(0.0f);
            }
        }else{
            ::memset(biasPtrCL, 0, ALIGN_UP4(biasSize) * sizeof(float));
            ::memcpy(biasPtrCL, biasDataPtr, biasSize * sizeof(float));
        }
    }else{
        MNN_ERROR("Map error biasPtrCL == nullptr \n");
    }
    openclBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(biasBuffer, biasPtrCL);
    mBias.reset(Tensor::createDevice<float>({1, 1, 1, biasSize}));
    backend->onAcquireBuffer(mBias.get(), Backend::STATIC);
    copyBufferToImage(openclBackend->getOpenCLRuntime(), biasBuffer, openCLImage(mBias.get()), UP_DIV(biasSize, 4), 1);
}
ConvCommonExecution::~ConvCommonExecution() {
    MNN_ASSERT(nullptr != mBias);
    backend()->onReleaseBuffer(mBias.get(), Backend::STATIC);
}

ConvExecution::ConvExecution(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, const MNN::Op *op, Backend *backend)
    : ConvCommonExecution(op->main_as_Convolution2D(), backend) {
#ifdef LOG_VERBOSE
    MNN_PRINT("Start ConvExecution init !\n");
#endif
    mOpenCLBackend                 = static_cast<OpenCLBackend *>(backend);
    const auto *conv2dParams       = op->main_as_Convolution2D();
    const auto *conv2dCommonParams = conv2dParams->common();
    mConv2dCommonParams            = conv2dCommonParams;
    mStrides                       = {conv2dCommonParams->strideY(), conv2dCommonParams->strideX()};
    mDilations                     = {conv2dCommonParams->dilateY(), conv2dCommonParams->dilateX()};

    auto pad = ConvolutionCommon::convolutionPad(inputs[0], outputs[0], mConv2dCommonParams);
    mPaddings[0] = pad.second;
    mPaddings[1] = pad.first;

    int kernelWidth   = conv2dCommonParams->kernelX();
    int kernelHeight  = conv2dCommonParams->kernelY();
    int outputChannel = conv2dCommonParams->outputCount();

    int weightSize             = 0;
    const float *filterDataPtr = nullptr;

    std::shared_ptr<MNN::ConvolutionCommon::Int8Common> quanCommon;
    if (nullptr != conv2dParams->quanParameter()) {
        quanCommon = ConvolutionCommon::load(conv2dParams->quanParameter(), true);
        if (nullptr == quanCommon) {
            MNN_ERROR("Memory not Enough, can't extract IDST Convolution: %s \n", op->name()->c_str());
        }
        if (quanCommon->weightFloat.get() == nullptr) {
            MNN_PRINT("quanCommon->weightFloat.get() == nullptr \n");
        }
        // Back to float
        filterDataPtr = quanCommon->weightFloat.get();
        weightSize    = quanCommon->weightFloat.size();
    } else if (nullptr == conv2dParams->weight() || nullptr == conv2dParams->bias()) {
        MNN_ERROR("%s has no weight or bias. The model may be a benchmark model; please restore the weight/bias first\n", op->name()->c_str());
    }

    if (nullptr == filterDataPtr) {
        weightSize    = conv2dParams->weight()->size();
        filterDataPtr = conv2dParams->weight()->data();
    }
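    // The filter data is laid out as [outputChannel, inputChannel, kernelH, kernelW],
    // so the input channel count can be recovered from the flat weight size.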
    int inputChannel = weightSize / (kernelWidth * kernelHeight * outputChannel);

    auto gpuType = mOpenCLBackend->getOpenCLRuntime()->getGpuType();

    // Select the preferred conv kernel
    std::string kernelName = "conv_2d";
    if (kernelHeight == kernelWidth && kernelHeight == 1 && mPaddings[0] == 0 &&
        mPaddings[1] == 0) {
        mConv1x1Opt = (mStrides[0] == 1 && mStrides[1] == 1 && gpuType == GpuType::MALI);
#if 0
        if((gpuType == GpuType::ADRENO)){
            uint64_t useLocalSize = UNIT*UNIT*4*sizeof(float)*4;
            if(useLocalSize >= mOpenCLBackend->getOpenCLRuntime()->getMaxLocalMem()){
                mUseLocalMem = false;
            }else{
                kernelName = "conv_2d_1x1_local";
                mUseLocalMem = true;
            }
        }
#endif
        if(!mUseLocalMem){
            if(mConv1x1Opt){
                kernelName = "conv_2d_1x1_mali";
            }else{
                kernelName = "conv_2d_1x1";
            }
        }
    }

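    // MALI 1x1 fast path: keep the filter in a plain OpenCL buffer rather than an
    // image, repacked into 4x4 channel blocks ([Oc/4][Ic/4][o%4][i%4], see
    // bufferIdx below), and keep the bias in a separate buffer as well.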
    if(mConv1x1Opt && !mUseLocalMem){
        cl_int error;
        std::shared_ptr<Tensor> filterBuffer(Tensor::createDevice<float>({UP_DIV(outputChannel, 4)*4, UP_DIV(inputChannel, 4)*4, kernelWidth, kernelHeight}));

        int buffer_size = filterBuffer->elementSize();
        if(mOpenCLBackend->getOpenCLRuntime()->isSupportedFP16()) {
            buffer_size *= sizeof(half_float::half);
        } else {
            buffer_size *= sizeof(float);
        }

        mKernelBuffer.reset(new cl::Buffer(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, buffer_size));
        auto kernelBufferPtr = mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueMapBuffer(*(mKernelBuffer.get()), true, CL_MAP_WRITE, 0, buffer_size, nullptr, nullptr, &error);
        if(kernelBufferPtr != nullptr && error == CL_SUCCESS){
            ::memset(kernelBufferPtr, 0, buffer_size);
            for(int o = 0; o < outputChannel; o++){
                for(int i = 0; i < inputChannel; i++){
                    int bufferIdx = (o/4) * ROUND_UP(inputChannel, 4)*4 + (i/4)*16 + (o%4)*4 + (i%4);
                    int filterIdx = o*inputChannel + i;
                    if(mOpenCLBackend->getOpenCLRuntime()->isSupportedFP16()){
                        ((half_float::half*)kernelBufferPtr)[bufferIdx] = (half_float::half)(filterDataPtr[filterIdx]);
                    }else{
                        ((float*)kernelBufferPtr)[bufferIdx] = (float)(filterDataPtr[filterIdx]);
                    }
                }
            }
        }else{
            MNN_ERROR("Map error ptrCL == nullptr \n");
        }
        mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(*(mKernelBuffer.get()), kernelBufferPtr);

        // bias
        int biasSize             = conv2dParams->bias()->size();
        const float *biasDataPtr = conv2dParams->bias()->data();

        buffer_size = ALIGN_UP4(biasSize);
        if(mOpenCLBackend->getOpenCLRuntime()->isSupportedFP16()) {
            buffer_size *= sizeof(half_float::half);
        } else {
            buffer_size *= sizeof(float);
        }

        mBiasBuffer.reset(new cl::Buffer(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, buffer_size));
        auto biasPtrCL = mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueMapBuffer(
            *(mBiasBuffer.get()), true, CL_MAP_WRITE, 0, buffer_size, nullptr, nullptr, &error);
        if(biasPtrCL != nullptr && error == CL_SUCCESS){
            if(mOpenCLBackend->getOpenCLRuntime()->isSupportedFP16()){
                for(int i = 0; i < biasSize; i++) {
                    ((half_float::half*)biasPtrCL)[i] = (half_float::half)(biasDataPtr[i]);
                }
                for(int i=biasSize; i<ALIGN_UP4(biasSize); i++) {
                    ((half_float::half*)biasPtrCL)[i] = (half_float::half)(0.0f);
                }
            }else{
                ::memset(biasPtrCL, 0, ALIGN_UP4(biasSize) * sizeof(float));
                ::memcpy(biasPtrCL, biasDataPtr, biasSize * sizeof(float));
            }
        }else{
            MNN_ERROR("Map error biasPtrCL == nullptr \n");
        }
        mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(*(mBiasBuffer.get()), biasPtrCL);

    }else{
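        // Generic path: stage the raw OIHW filter in a host-visible buffer, then
        // convert it to a CONV2D_FILTER image that the image-based kernels read.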
        std::vector<int> filterImageShape{(int)inputChannel, (int)(UP_DIV(outputChannel, 4) * kernelWidth * kernelHeight)};
        std::shared_ptr<Tensor> filterBuffer(
            Tensor::createDevice<float>({outputChannel, inputChannel, kernelWidth, kernelHeight}));

        int buffer_size = filterBuffer->elementSize();
        if(mOpenCLBackend->getOpenCLRuntime()->isWeightCpuTransHalf()) {
            buffer_size *= sizeof(half_float::half);
        } else {
            buffer_size *= sizeof(float);
        }
        cl::Buffer filterBufferCL(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, buffer_size);
        filterBuffer->buffer().device = (uint64_t)(&filterBufferCL);

        cl_int error;
        auto ptrCL = mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueMapBuffer(filterBufferCL, true, CL_MAP_WRITE, 0, buffer_size, nullptr, nullptr, &error);
        if(ptrCL != nullptr && error == CL_SUCCESS) {
            ::memset(ptrCL, 0, buffer_size);
            if(mOpenCLBackend->getOpenCLRuntime()->isWeightCpuTransHalf()){
                for(int i = 0; i < filterBuffer->elementSize(); i++){
                    ((half_float::half*)ptrCL)[i] = (half_float::half)(filterDataPtr[i]);
                }
            }else{
                ::memcpy(ptrCL, filterDataPtr, filterBuffer->size());
            }
        }else{
            MNN_ERROR("Map error ptrCL == nullptr \n");
        }
        mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(filterBufferCL, ptrCL);

        mFilter.reset(Tensor::createDevice<float>({1, filterImageShape[1], 1, 4 * filterImageShape[0]}));
        mOpenCLBackend->onAcquireBuffer(mFilter.get(), Backend::STATIC);
        MNN::OpenCL::ImageBufferConvertor imageBufferConvertor{mOpenCLBackend->getOpenCLRuntime()};

        std::string buildOption = "";
        if(mOpenCLBackend->getOpenCLRuntime()->isWeightCpuTransHalf() == false){
            buildOption = "-DBUFFER_INP_FP32";
        }
        imageBufferConvertor.convertBufferToImage(filterBuffer.get(), MNN::OpenCL::CONV2D_FILTER, mFilter.get(), false, buildOption);
    }

    // Create kernel
    std::set<std::string> buildOptions;
    buildOptions.emplace("-DBIAS");
    if (mConv2dCommonParams->relu()) {
        buildOptions.emplace("-DRELU");
    } else if (mConv2dCommonParams->relu6()) {
        buildOptions.emplace("-DRELU6");
    }

    mKernel           = mOpenCLBackend->getOpenCLRuntime()->buildKernel("conv_2d", kernelName, buildOptions);
    mMaxWorkGroupSize = static_cast<uint32_t>(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(mKernel));

#ifdef LOG_VERBOSE
    MNN_PRINT("end ConvExecution init !\n");
#endif
}

ConvExecution::~ConvExecution() {
    if(mUseLocalMem || !mConv1x1Opt){
        mOpenCLBackend->onReleaseBuffer(mFilter.get(), Backend::STATIC);
    }
}

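// onResize recomputes the padding, picks the global/local work sizes for the
// kernel chosen in the constructor, and binds all kernel arguments for the
// current input/output shapes.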
ErrorCode ConvExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
#ifdef LOG_VERBOSE
    MNN_PRINT("Start ConvExecution onResize !\n");
#endif
    auto input  = inputs[0];
    auto output = outputs[0];

    std::vector<int> inputShape  = tensorShapeFormat(input);
    std::vector<int> outputShape = tensorShapeFormat(output);
    const int height             = outputShape.at(1);
    const int width              = outputShape.at(2);

    const int inputHeight   = inputShape.at(1);
    const int inputWidth    = inputShape.at(2);
    const int inputChannels = inputShape.at(3);

    const int inputChannelBlocks = UP_DIV(inputChannels, 4);
    int kernelHeight = mConv2dCommonParams->kernelY();
    int kernelWidth  = mConv2dCommonParams->kernelX();

    auto pad = ConvolutionCommon::convolutionPad(input, output, mConv2dCommonParams);
    mPaddings[0] = pad.second;
    mPaddings[1] = pad.first;

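    // 1x1 kernels with zero padding take the specialized paths prepared in the
    // constructor; every other configuration falls through to the generic
    // conv_2d kernel with explicit kernel/stride/padding/dilation arguments.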
    if (kernelHeight == kernelWidth && kernelHeight == 1 && mPaddings[0] == 0 && mPaddings[1] == 0) {
        if(mConv1x1Opt){
            auto kernel             = &mKernel;
            uint32_t idx            = 0;

            if(mUseLocalMem){
                mGlobalWorkSize = {static_cast<uint32_t>(UP_DIV(outputShape.at(3), 4)), static_cast<uint32_t>(UP_DIV(outputShape.at(2), 4)),
                                   static_cast<uint32_t>(outputShape.at(0) * outputShape.at(1))};
                std::vector<uint32_t> lws{UNIT, UNIT, 1};
                mLocalWorkSize = lws;
                kernel->setArg(idx++, mGlobalWorkSize[0]);
                kernel->setArg(idx++, mGlobalWorkSize[1]);
                kernel->setArg(idx++, mGlobalWorkSize[2]);
                kernel->setArg(idx++, openCLImage(input));
                kernel->setArg(idx++, openCLImage(mFilter.get()));
                kernel->setArg(idx++, openCLImage(mBias.get()));
                kernel->setArg(idx++, openCLImage(output));
                kernel->setArg(idx++, static_cast<int>(inputChannelBlocks));
                kernel->setArg(idx++, height);
                kernel->setArg(idx++, width);
            }else{
                mGlobalWorkSize = {static_cast<uint32_t>(UP_DIV(outputShape.at(3), 4) * UP_DIV(outputShape.at(2), 4)),
                                   static_cast<uint32_t>(outputShape.at(0) * outputShape.at(1))};
                kernel->setArg(idx++, mGlobalWorkSize[0]);
                kernel->setArg(idx++, mGlobalWorkSize[1]);
                kernel->setArg(idx++, UP_DIV(width, 4));
                kernel->setArg(idx++, openCLImage(input));
                kernel->setArg(idx++, *mKernelBuffer.get());
                kernel->setArg(idx++, *mBiasBuffer.get());
                kernel->setArg(idx++, openCLImage(output));
                kernel->setArg(idx++, static_cast<int>(inputChannelBlocks));
                kernel->setArg(idx++, height);
                kernel->setArg(idx++, width);

                std::string kernelName = "conv_2d_1x1_mali";
                mLocalWorkSize = localWS2DDefault(mGlobalWorkSize, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), kernelName, mKernel).first;
            }
        }else{
            mGlobalWorkSize = {static_cast<uint32_t>(UP_DIV(outputShape.at(3), 4) * UP_DIV(outputShape.at(2), 4)),
                               static_cast<uint32_t>(outputShape.at(0) * outputShape.at(1))};

            auto kernel             = &mKernel;
            uint32_t idx            = 0;
            int inputImageShape[2]  = {inputHeight, inputWidth};
            int outputImageShape[2] = {height, width};
            int strideShape[2]      = {mStrides[0], mStrides[1]};
            kernel->setArg(idx++, mGlobalWorkSize[0]);
            kernel->setArg(idx++, mGlobalWorkSize[1]);
            kernel->setArg(idx++, openCLImage(input));
            kernel->setArg(idx++, openCLImage(mFilter.get()));
            kernel->setArg(idx++, openCLImage(mBias.get()));
            kernel->setArg(idx++, openCLImage(output));
            kernel->setArg(idx++, sizeof(inputImageShape), inputImageShape);
            kernel->setArg(idx++, static_cast<int>(inputChannelBlocks));
            kernel->setArg(idx++, sizeof(outputImageShape), outputImageShape);
            kernel->setArg(idx++, sizeof(strideShape), strideShape);
            kernel->setArg(idx++, UP_DIV(width, 4));
            std::string kernelName = "conv_2d_1x1";
            mLocalWorkSize = localWS2DDefault(mGlobalWorkSize, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), kernelName, mKernel).first;
        }
    } else {
        mGlobalWorkSize         = {static_cast<uint32_t>(UP_DIV(outputShape.at(3), 4) * UP_DIV(outputShape.at(2), 4)),
                                   static_cast<uint32_t>(outputShape.at(0) * outputShape.at(1))};

        int inputImageShape[2]  = {inputHeight, inputWidth};
        int outputImageShape[2] = {height, width};
        int kernelShape[2]      = {kernelHeight, kernelWidth};
        int strideShape[2]      = {mStrides[0], mStrides[1]};
        int paddingShape[2]     = {mPaddings[0], mPaddings[1]};
        int dilationShape[2]    = {mDilations[0], mDilations[1]};
        uint32_t idx            = 0;
        auto kernel             = &mKernel;
        kernel->setArg(idx++, mGlobalWorkSize[0]);
        kernel->setArg(idx++, mGlobalWorkSize[1]);
        kernel->setArg(idx++, openCLImage(input));
        kernel->setArg(idx++, openCLImage(mFilter.get()));
        kernel->setArg(idx++, openCLImage(mBias.get()));
        kernel->setArg(idx++, openCLImage(output));
        kernel->setArg(idx++, sizeof(inputImageShape), inputImageShape);
        kernel->setArg(idx++, inputChannelBlocks);
        kernel->setArg(idx++, sizeof(outputImageShape), outputImageShape);
        kernel->setArg(idx++, sizeof(kernelShape), kernelShape);
        kernel->setArg(idx++, sizeof(strideShape), strideShape);
        kernel->setArg(idx++, sizeof(paddingShape), paddingShape);
        kernel->setArg(idx++, sizeof(dilationShape), dilationShape);
        kernel->setArg(idx++, UP_DIV(width, 4));

        std::string kernelName = "conv_2d";
        mLocalWorkSize = localWS2DDefault(mGlobalWorkSize, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), kernelName, mKernel).first;
    }

#ifdef LOG_VERBOSE
    MNN_PRINT("end ConvExecution onResize !\n");
#endif
    return NO_ERROR;
}

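// onExecute enqueues the kernel configured in onResize. The 3D local-memory
// variant only runs when mUseLocalMem is set; the only code that enables it is
// the Adreno branch in the constructor, which is currently compiled out (#if 0).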
ErrorCode ConvExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
#ifdef LOG_VERBOSE
    MNN_PRINT("Start ConvExecution onExecute !\n");
#endif
    if(mUseLocalMem){
    #ifdef ENABLE_OPENCL_TIME_PROFILER
        cl::Event event;
        run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalWorkSize,
                           mOpenCLBackend->getOpenCLRuntime(), &event);

        float costTime = mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
        MNN_PRINT("kernel cost:%f    us Conv UseLocalMem\n", costTime);
    #else
        run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalWorkSize,
                           mOpenCLBackend->getOpenCLRuntime());
    #endif
    }

#ifdef ENABLE_OPENCL_TIME_PROFILER
    cl::Event event;
    runKernel2D(mKernel, mGlobalWorkSize, mLocalWorkSize,
                mOpenCLBackend->getOpenCLRuntime(), &event);

    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
    MNN_PRINT("kernel cost:%d    us Conv2D\n", costTime);
#else
    runKernel2D(mKernel, mGlobalWorkSize, mLocalWorkSize,
                mOpenCLBackend->getOpenCLRuntime());
#endif

#ifdef LOG_VERBOSE
    MNN_PRINT("end ConvExecution onExecute !\n");
#endif
    return NO_ERROR;
}

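// The creator rejects multi-input convolutions and IDST int8-quantized weights,
// prefers the Winograd implementation when ConvWinograd::valid() accepts the
// configuration, and otherwise falls back to ConvExecution.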
class ConvolutionCreator : public OpenCLBackend::Creator {
public:
    virtual ~ConvolutionCreator() = default;
    virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
                                const MNN::Op *op, Backend *backend) const override {
        if (inputs.size() > 1) {
            return nullptr;
        }
        if (nullptr != op->main_as_Convolution2D()->quanParameter()) {
            auto quan = op->main_as_Convolution2D()->quanParameter();
            if (1 == quan->type() || 2 == quan->type()) {
                if (quan->has_scaleInt()) {
                    // Don't support IDST-int8 because of error
                    return nullptr;
                }
            }
        }

        auto conv2D = op->main_as_Convolution2D();
        if (ConvWinograd::valid(conv2D->common(), inputs[0])) {
            return new ConvWinograd(conv2D, backend);
        }

        return new ConvExecution(inputs, outputs, op, backend);
    }
};

OpenCLCreatorRegister<ConvolutionCreator> __conv_op(OpType_Convolution, IMAGE);

} // namespace OpenCL
} // namespace MNN