//
//  ReluExecution.cpp
//  MNN
//
//  Created by MNN on 2019/02/28.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "backend/opencl/execution/image/ReluExecution.hpp"
#include "core/TensorUtils.hpp"
#include "backend/opencl/execution/image/UnaryExecution.hpp"
#include <stdio.h>
#include <string.h>
namespace MNN {
namespace OpenCL {

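// Uploads the per-channel PReLU slopes to the GPU once at construction time:
// the slopes are staged in a host-mappable buffer, zero-padded to a multiple
// of four, and copied into an OpenCL image that the kernel samples at runtime.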
ReluExecution::ReluExecution(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend)
    : CommonExecution(backend) {
    auto mOpenCLBackend       = static_cast<OpenCLBackend *>(backend);
    auto mPreluParamPtr       = op->main_as_PRelu();
    int preluSize             = mPreluParamPtr->slopeCount();
    const float *preluDataPtr = mPreluParamPtr->slope()->data();

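    // ALIGN_UP4 pads the slope count to a multiple of 4 so the data fills whole
    // RGBA texels in the image it is later copied into. The element size matches
    // how this backend stores weights (fp16 or fp32).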
    int buffer_size = ALIGN_UP4(preluSize);
    if (mOpenCLBackend->getOpenCLRuntime()->isWeightCpuTransHalf()) {
        buffer_size *= sizeof(half_float::half);
    } else {
        buffer_size *= sizeof(float);
    }
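    // Stage the slopes in a host-mappable buffer, converting to half precision
    // when the backend keeps weights in fp16.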
    cl::Buffer preluBuffer(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, buffer_size);
    cl_int error;
    auto preluDataPtrCL = mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueMapBuffer(
        preluBuffer, true, CL_MAP_WRITE, 0, buffer_size, nullptr, nullptr, &error);
    if (preluDataPtrCL != nullptr && error == CL_SUCCESS) {
        if (mOpenCLBackend->getOpenCLRuntime()->isWeightCpuTransHalf()) {
            // Convert each slope to half precision and zero the padded tail.
            for (int i = 0; i < preluSize; i++) {
                ((half_float::half *)preluDataPtrCL)[i] = (half_float::half)(preluDataPtr[i]);
            }
            for (int i = preluSize; i < ALIGN_UP4(preluSize); i++) {
                ((half_float::half *)preluDataPtrCL)[i] = (half_float::half)(0.0f);
            }
        } else {
            ::memset(preluDataPtrCL, 0, buffer_size);
            ::memcpy(preluDataPtrCL, preluDataPtr, preluSize * sizeof(float));
        }
        // Only unmap when the map succeeded; unmapping a null pointer is invalid.
        mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(preluBuffer, preluDataPtrCL);
    } else {
        MNN_ERROR("ReluExecution: enqueueMapBuffer failed, error = %d\n", error);
    }
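    // Wrap the uploaded slopes in a [1, 1, 1, preluSize] device tensor backed by
    // an OpenCL image; each image pixel holds four consecutive slope values.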
    mPreluParam.reset(Tensor::createDevice<float>({1, 1, 1, preluSize}));
    mOpenCLBackend->onAcquireBuffer(mPreluParam.get(), Backend::STATIC);
    copyBufferToImage(mOpenCLBackend->getOpenCLRuntime(), preluBuffer, openCLImage(mPreluParam.get()),
                      UP_DIV(preluSize, 4), 1);
    mOp = op;
}
ReluExecution::~ReluExecution() {
    backend()->onReleaseBuffer(mPreluParam.get(), Backend::STATIC);
}

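// Builds a single kernel unit: a broadcasted binary op between the input image
// and the 1-pixel-high slope image prepared in the constructor.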
ErrorCode ReluExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    mUnits.resize(1);
    auto nhwc        = tensorShapeFormat(outputs[0]);
    int nhwcArray[4] = {nhwc[0], nhwc[1], nhwc[2], UP_DIV(nhwc[3], 4)};

    auto imageWidth  = nhwc[2] * UP_DIV(nhwc[3], 4);
    auto imageHeight = nhwc[0] * nhwc[1];
    int reluImageWH[2] = {1, 1};
    int reluStride[4]  = {0, 0, 0, 1};
    cl::NDRange localSize  = {4, 4};
    cl::NDRange globalSize = {(uint32_t)UP_DIV(imageWidth, 4) * 4, (uint32_t)UP_DIV(imageHeight, 4) * 4};

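    // OPERATOR expands inside the generic binary kernel: select(in0*in1, in0,
    // in0 >= 0) keeps positive inputs and scales negative ones by the slope.
    // reluImageWH = {1, 1} and the zero strides broadcast the slope image
    // across batch and spatial positions, indexing it by channel only.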
    auto runTime     = ((OpenCLBackend *)backend())->getOpenCLRuntime();
    mUnits[0].kernel = runTime->buildKernel("binary", "binary_prelu", {"-DOPERATOR=select(in0*in1,in0,in0>=(FLOAT4)0)"});
    mUnits[0].kernel.setArg(0, openCLImage(inputs[0]));
    mUnits[0].kernel.setArg(1, openCLImage(mPreluParam.get()));
    mUnits[0].kernel.setArg(2, openCLImage(outputs[0]));
    mUnits[0].kernel.setArg(3, nhwcArray);
    mUnits[0].kernel.setArg(4, reluImageWH);
    mUnits[0].kernel.setArg(5, reluStride);
    mUnits[0].globalWorkSize = globalSize;
    mUnits[0].localWorkSize  = localSize;

    return NO_ERROR;
}
class ReluCreator : public OpenCLBackend::Creator {
public:
    virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
                                const MNN::Op *op, Backend *backend) const override {
        // There seems to be a bug in the OpenCL compiler of the AMD Radeon HD 7000 series:
        // when the build option -Dname=definition is used, the definition is truncated at a
        // comma, which violates the OpenCL specification (quote: 'In particular, the definition
        // will be truncated by embedded newline characters').
        // So we use the ternary operator (A ? B : C) instead of a function call with commas
        // (e.g. fmax(in,(float4)(0))) when a Radeon GPU is detected.
        bool isRadeonGpu = (static_cast<OpenCLBackend *>(backend)->getOpenCLRuntime()->getGpuType() == RADEON);

        if (op->type() == OpType_ReLU6) {
            // ReLU6 (and the general clip case) clamps the input to [minValue, maxValue].
            char storage[256];
            float minValue = 0.0f;
            float maxValue = 6.0f;
            if (nullptr != op->main_as_Relu6()) {
                minValue = op->main_as_Relu6()->minValue();
                maxValue = op->main_as_Relu6()->maxValue();
            }
            if (isRadeonGpu) {
                std::string temp = "(in<=(FLOAT4)((FLOAT)%f)?(FLOAT4)((FLOAT)%f):(in>=(FLOAT4)((FLOAT)%f)?(FLOAT4)((FLOAT)%f):in))";
                snprintf(storage, sizeof(storage), temp.c_str(), minValue, minValue, maxValue, maxValue);
                return new UnaryExecution(storage, backend);
            }
            std::string temp = "clamp(in,(FLOAT4)((FLOAT)%f),(FLOAT4)((FLOAT)%f))";
            snprintf(storage, sizeof(storage), temp.c_str(), minValue, maxValue);
            return new UnaryExecution(storage, backend);
        }
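        // Plain ReLU (slope == 0) maps to a single max; a non-zero slope turns it
        // into leaky ReLU with the scalar slope baked into the kernel source.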
        if (op->type() == OpType_ReLU) {
            if (op->main_as_Relu()->slope() == 0.0f) {
                if (isRadeonGpu) {
                    return new UnaryExecution("(in>(FLOAT4)((FLOAT)0)?in:(FLOAT4)((FLOAT)0))", backend);
                }
                return new UnaryExecution("fmax(in,(FLOAT4)((FLOAT)0))", backend);
            }
            auto slope         = op->main_as_Relu()->slope();
            char slopeCStr[30] = {};
            snprintf(slopeCStr, sizeof(slopeCStr), "%.8f", slope);
            std::string slopeStr = slopeCStr;
            if (isRadeonGpu) {
                return new UnaryExecution("in<(FLOAT4)((FLOAT)0)?(FLOAT)(" + slopeStr + "f)*in:in", backend);
            }
            return new UnaryExecution("select((FLOAT)(" + slopeStr + "f)*in,in,in>=(FLOAT4)((FLOAT)0))", backend);
        }
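        // PReLU with a single shared slope degenerates to leaky ReLU; only the
        // per-channel case needs the slope image built by ReluExecution.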
        if (op->type() == OpType_PReLU) {
            if (op->main_as_PRelu()->slopeCount() == 1) {
                auto slope         = op->main_as_PRelu()->slope()->data()[0];
                char slopeCStr[30] = {};
                snprintf(slopeCStr, sizeof(slopeCStr), "%.8f", slope);
                std::string slopeStr = slopeCStr;
                if (isRadeonGpu) {
                    return new UnaryExecution("in<(FLOAT4)((FLOAT)0)?(FLOAT)(" + slopeStr + "f)*in:in", backend);
                }
                return new UnaryExecution("select((FLOAT)(" + slopeStr + "f)*in,in,in>=(FLOAT4)((FLOAT)0))", backend);
            }
            return new ReluExecution(inputs, op, backend);
        }
        return nullptr;
    }
};

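// Register this creator for ReLU, PReLU, and ReLU6 in the image-memory backend.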
OpenCLCreatorRegister<ReluCreator> __Relu_op(OpType_ReLU, IMAGE);
OpenCLCreatorRegister<ReluCreator> __PRelu_op(OpType_PReLU, IMAGE);
OpenCLCreatorRegister<ReluCreator> __Relu6_op(OpType_ReLU6, IMAGE);

} // namespace OpenCL
} // namespace MNN