1 //
2 //  GLConvolution.cpp
3 //  MNN
4 //
5 //  Created by MNN on 2019/01/31.
6 //  Copyright © 2018, Alibaba Group Holding Limited
7 //
8 
9 #include "backend/opengl/GLConvolution.hpp"
10 #include <MNN/AutoTime.hpp>
11 
12 #include <sstream>
13 #include "AllShader.hpp"
14 #include "core/Macro.h"
15 #include "backend/opengl/GLConvolutionIm2col.hpp"
16 namespace MNN {
17 namespace OpenGL {
18 
19 #define UNIT 4
20 
GPUConvolution(const Op * convOp,Backend * b)21 GPUConvolution::GPUConvolution(const Op *convOp, Backend *b) : MNN::Execution(b) {
22     mCommon          = convOp->main_as_Convolution2D()->common();
23     auto convReal    = convOp->main_as_Convolution2D();
24     auto outputCount = mCommon->outputCount();
25     mInputDepth        = 0;
26 
27     if (convReal->weight() != NULL) {
28         auto weightSize = convReal->weight()->size();
29         mInputDepth       = weightSize * mCommon->group() / mCommon->kernelX() / mCommon->kernelY() / outputCount;
30     }
31 }
~GPUConvolution()32 GPUConvolution::~GPUConvolution() {
33 }
34 
onResize(const std::vector<Tensor * > & inputs,const std::vector<Tensor * > & outputs)35 ErrorCode GPUConvolution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
36     auto input  = inputs[0];
37     auto output = outputs[0];
38     if (mCommon->padMode() == PadMode_SAME) {
39         int kernelWidthSize = (mCommon->kernelX() - 1) * mCommon->dilateX() + 1;
40         int kernelHeightSize = (mCommon->kernelY() - 1) * mCommon->dilateY() + 1;
41         int pad_needed_width  = (output->width() - 1) * mCommon->strideX() + kernelWidthSize - input->width();
42         int pad_needed_height = (output->height() - 1) * mCommon->strideY() + kernelHeightSize - input->height();
43 
44         mPadX = (pad_needed_width > 0 ?  pad_needed_width : 0) / 2;
45         mPadY = (pad_needed_height > 0 ?  pad_needed_height : 0) / 2;
46         return NO_ERROR;
47     }
48     mPadX = mCommon->padX();
49     mPadY = mCommon->padY();
50 
51     return NO_ERROR;
52 }
53 
~GLConvolution()54 GLConvolution::~GLConvolution() {
55 }
56 
GLConvolution(const std::vector<Tensor * > & inputs,const Op * convOp,Backend * bn)57 GLConvolution::GLConvolution(const std::vector<Tensor *> &inputs, const Op *convOp, Backend *bn) : GPUConvolution(convOp, bn) {
58     auto totalWeightSize =
59         ALIGN_UP4(mCommon->outputCount()) * ALIGN_UP4(mInputDepth) * (mCommon->kernelY() * mCommon->kernelX());
60     auto extra = (GLBackend *)bn;
61 
62     mBiasBuffer.reset(new GLSSBOBuffer(sizeof(float) * ALIGN_UP4(mCommon->outputCount())));
63     float* bias = (float*)(mBiasBuffer->map(GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_BUFFER_BIT));
64     if(bias != nullptr){
65         ::memset(bias, 0, ALIGN_UP4(mCommon->outputCount()) * sizeof(float));
66         ::memcpy(bias, convOp->main_as_Convolution2D()->bias()->data(),
67                  convOp->main_as_Convolution2D()->bias()->size() * sizeof(float));
68     }
69     mBiasBuffer->unmap();
70 
71     auto mKernelBuffer = std::shared_ptr<GLSSBOBuffer>(new GLSSBOBuffer(sizeof(float) * totalWeightSize));
72     int fw                = mCommon->kernelX();
73     int fh                = mCommon->kernelY();
74     int unit              = 4;
75     int unit2             = unit * unit;
76     int alignedWeightSize = UP_DIV(mInputDepth, unit) * fw * fh * unit2;
77     int oc_4         = UP_DIV(mCommon->outputCount(), unit);
78 
79     float *dest           = (float *)mKernelBuffer->map(GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_BUFFER_BIT);
80     if(dest != nullptr){
81         ::memset(dest, 0, alignedWeightSize * sizeof(float));
82         const float *source = convOp->main_as_Convolution2D()->weight()->data();
83         int cur             = 0;
84 
85         //weight : oc ic h w -> oc/4, ic/4 ky kx ic4 oc4
86         for (int b = 0; b < mCommon->outputCount(); ++b) {
87             int b_4      = b / unit;
88             float *dst_b = dest + b_4 * alignedWeightSize;
89             int mx       = b % unit;
90             for (int d = 0; d < mInputDepth; ++d) {
91                 int my       = d % unit;
92                 int d_4      = d / unit;
93                 float *dst_d = dst_b + d_4 * fw * fh * unit2;
94                 for (int y = 0; y < fh; ++y) {
95                     float *dst_y = dst_d + y * fw * unit2;
96                     for (int x = 0; x < fw; ++x) {
97                         float *dst_x          = dst_y + x * unit2;
98                         dst_x[unit * my + mx] = source[cur++];
99                     }
100                 }
101             }
102         }
103     }
104 
105     mKernelBuffer->unmap();
106 
107     int ic_4      = UP_DIV(mInputDepth, unit);
108     //weight image : ky kx, oc/4, ic/4*ic4 oc4
109     mKernelTexture =
110     std::shared_ptr<GLTexture>(new GLTexture(ic_4 * unit, oc_4, fw * fh, ((GLBackend *)backend())->getTextrueFormat() , GL_TEXTURE_3D, false));
111 
112     auto transform = extra->getProgram("transform_kernel_image_adreno", glsl_kernel2image_adreno_glsl);
113     transform->useProgram();
114     glBindImageTexture(0, mKernelTexture->id(), 0, GL_TRUE, 0, GL_WRITE_ONLY, ((GLBackend *)backend())->getTextrueFormat());
115     glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, mKernelBuffer->getId());
116     OPENGL_CHECK_ERROR;
117     glUniform1i(3, fw * fh);
118     glUniform1i(4, ic_4);
119     OPENGL_CHECK_ERROR;
120 
121     ((GLBackend *)backend())->compute(ic_4, oc_4, fw * fh);
122     OPENGL_CHECK_ERROR;
123 }
124 
onResize(const std::vector<Tensor * > & inputs,const std::vector<Tensor * > & outputs)125 ErrorCode GLConvolution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
126     GPUConvolution::onResize(inputs, outputs);
127     auto extra = (GLBackend *)backend();
128     std::vector<std::string> prefix;
129     if (mCommon->relu()) {
130         prefix.push_back("#define RELU");
131     }
132     if (mCommon->relu6()) {
133         prefix.push_back("#define RELU6");
134     }
135 
136     auto dstDepthQuad = UP_DIV(outputs[0]->channel(), 4);
137 
138     setLocalSize(prefix, mLocalSize, 1, 1, dstDepthQuad);
139 
140     if (1 == mCommon->kernelY() && 1 == mCommon->kernelX() && 1 == mCommon->strideY() && 1 == mCommon->strideX() &&
141         0 == mCommon->padX() && 0 == mCommon->padY()) {
142         mIs1x1      = true;
143     }
144 
145     if (mIs1x1) {
146         mProgram = extra->getProgram("convolution1x1", glsl_convolution1x1_glsl, prefix);
147     } else {
148         mKx      = mCommon->kernelX();
149         mKy      = mCommon->kernelY();
150         mSx      = mCommon->strideX();
151         mSy      = mCommon->strideY();
152         mDx      = mCommon->dilateX();
153         mDy      = mCommon->dilateY();
154         mProgram = extra->getProgram("convolution", glsl_convolution_glsl, prefix);
155     }
156 
157     return NO_ERROR;
158 }
159 
160 
onExecute(const std::vector<Tensor * > & inputs,const std::vector<Tensor * > & outputs)161 ErrorCode GLConvolution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
162     {
163         auto convLayer = mCommon;
164 
165         auto input         = inputs[0];
166         auto output        = outputs[0];
167         auto inputTexture  = input->deviceId();
168         auto outputTexture = output->deviceId();
169         int oc_4 = UP_DIV(output->channel(), 4);
170 
171         mProgram->useProgram();
172         glBindImageTexture(0, outputTexture, 0, GL_TRUE, 0, GL_WRITE_ONLY, ((GLBackend *)backend())->getTextrueFormat());
173         {
174             int texId = 0;
175             glActiveTexture(GL_TEXTURE0 + texId);
176             glUniform1i(1, texId);
177             glBindTexture(GL_TEXTURE_3D, inputTexture);
178             OPENGL_CHECK_ERROR;
179         }
180         {
181             int texId = 1;
182             glActiveTexture(GL_TEXTURE0 + texId);
183             OPENGL_CHECK_ERROR;
184             glUniform1i(2, texId);
185             OPENGL_CHECK_ERROR;
186             glBindTexture(GL_TEXTURE_3D, mKernelTexture->id());
187             OPENGL_CHECK_ERROR;
188         }
189         glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, mBiasBuffer->getId());
190 
191         if(!mIs1x1){
192             glUniform2i(4, mPadX, mPadY);
193             glUniform2i(5, mKx, mKy);
194             glUniform2i(6, mSx, mSy);
195             glUniform2i(7, mDx, mDy);
196         }
197         OPENGL_CHECK_ERROR;
198         glUniform3i(10, output->width(), output->height(), UP_DIV(output->channel(), 4));
199         glUniform3i(11, input->width(), input->height(), UP_DIV(input->channel(), 4));
200 
201         glUniform1i(8, UNIT);
202         OPENGL_CHECK_ERROR;
203 
204         ((GLBackend *)backend())->compute(UP_DIV(output->width(), UNIT*mLocalSize[0]), UP_DIV(output->height(), mLocalSize[1]),
205                                                 UP_DIV(oc_4, mLocalSize[2]));
206 
207         OPENGL_CHECK_ERROR;
208     }
209 
210     return NO_ERROR;
211 }
212 
213 
214 class ConvolutionCreator : public GLBackend::Creator {
215 public:
216     virtual ~ConvolutionCreator() = default;
onCreate(const std::vector<Tensor * > & inputs,const std::vector<Tensor * > & outputs,const MNN::Op * op,Backend * backend) const217     virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
218                                 const MNN::Op *op, Backend *backend) const override {
219         auto common = op->main_as_Convolution2D()->common();
220 
221         //TODO: bugfix
222         if(common->padX() == 1 || common->strideX() != 1){
223             return new GLConvolution(inputs, op, backend);
224         }
225         if(((GLBackend *)backend)->gpuType() == GLBackend::ADRENO){
226             if(((GLBackend *)backend)->glVersion() >= 269){
227                 return new GLConvolution(inputs, op, backend);
228             }else{
229                 return new GLConvolutionIm2col(inputs, op, backend);
230             }
231         }else{
232             return new GLConvolutionIm2col(inputs, op, backend);
233         }
234     }
235 };
236 
237 GLCreatorRegister<ConvolutionCreator> __gl_conv_op(OpType_Convolution);
238 } // namespace OpenGL
239 } // namespace MNN
240