//
//  ReluExecution.cpp
//  MNN
//
//  Created by MNN on 2019/02/28.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "backend/opencl/execution/image/ReluExecution.hpp"
#include "core/TensorUtils.hpp"
#include "backend/opencl/execution/image/UnaryExecution.hpp"
#include <string.h>
namespace MNN {
namespace OpenCL {
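
// ReluExecution implements PReLU with per-channel slopes: the constructor stages the
// slope values in a host-visible buffer (padded to a multiple of 4 and converted to
// half precision when the runtime keeps weights in fp16), then copies them into a
// static device image used by the kernel set up in onResize.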
ReluExecution::ReluExecution(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend)
    : CommonExecution(backend) {
    auto mOpenCLBackend = static_cast<OpenCLBackend *>(backend);
    auto mPreluParamPtr = op->main_as_PRelu();
    int preluSize = mPreluParamPtr->slopeCount();
    const float *preluDataPtr = mPreluParamPtr->slope()->data();

    // Stage the slope values in a host-visible buffer, padded up to a multiple of 4.
    int buffer_size = ALIGN_UP4(preluSize);
    if (mOpenCLBackend->getOpenCLRuntime()->isWeightCpuTransHalf()) {
        buffer_size *= sizeof(half_float::half);
    } else {
        buffer_size *= sizeof(float);
    }
    cl::Buffer preluBuffer(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, buffer_size);
    cl_int error;
    auto preluDataPtrCL = mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueMapBuffer(
        preluBuffer, true, CL_MAP_WRITE, 0, buffer_size, nullptr, nullptr, &error);
    if (preluDataPtrCL != nullptr && error == CL_SUCCESS) {
        if (mOpenCLBackend->getOpenCLRuntime()->isWeightCpuTransHalf()) {
            // Convert to half precision and zero the padded tail.
            for (int i = 0; i < preluSize; i++) {
                ((half_float::half *)preluDataPtrCL)[i] = (half_float::half)(preluDataPtr[i]);
            }
            for (int i = preluSize; i < ALIGN_UP4(preluSize); i++) {
                ((half_float::half *)preluDataPtrCL)[i] = (half_float::half)(0.0f);
            }
        } else {
            ::memset(preluDataPtrCL, 0, buffer_size);
            ::memcpy(preluDataPtrCL, preluDataPtr, preluSize * sizeof(float));
        }
    } else {
        MNN_ERROR("Map error preluDataPtrCL == nullptr \n");
    }
    mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(preluBuffer, preluDataPtrCL);
    // Copy the staged slopes into a static device image of shape {1, 1, 1, preluSize}.
    mPreluParam.reset(Tensor::createDevice<float>({1, 1, 1, preluSize}));
    mOpenCLBackend->onAcquireBuffer(mPreluParam.get(), Backend::STATIC);
    copyBufferToImage(mOpenCLBackend->getOpenCLRuntime(), preluBuffer, openCLImage(mPreluParam.get()),
                      UP_DIV(preluSize, 4), 1);
    mOp = op;
}
ReluExecution::~ReluExecution() {
    backend()->onReleaseBuffer(mPreluParam.get(), Backend::STATIC);
}
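
// onResize records a single "binary_prelu" kernel built from the "binary" program with
// OPERATOR = select(in0 * in1, in0, in0 >= 0), where in0 is the input image and in1 is
// the slope image prepared in the constructor.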
ErrorCode ReluExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    mUnits.resize(1);
    auto nhwc = tensorShapeFormat(outputs[0]);
    int nhwcArray[4] = {nhwc[0], nhwc[1], nhwc[2], UP_DIV(nhwc[3], 4)};

    auto imageWidth  = nhwc[2] * UP_DIV(nhwc[3], 4);
    auto imageHeight = nhwc[0] * nhwc[1];
    int reluImageWH[2] = {1, 1};
    int reluStride[4]  = {0, 0, 0, 1};
    cl::NDRange localSize  = {4, 4};
    cl::NDRange globalSize = {(uint32_t)UP_DIV(imageWidth, 4) * 4, (uint32_t)UP_DIV(imageHeight, 4) * 4};

    auto runTime = ((OpenCLBackend *)backend())->getOpenCLRuntime();
    mUnits[0].kernel = runTime->buildKernel("binary", "binary_prelu", {"-DOPERATOR=select(in0*in1,in0,in0>=(FLOAT4)0)"});
    mUnits[0].kernel.setArg(0, openCLImage(inputs[0]));
    mUnits[0].kernel.setArg(1, openCLImage(mPreluParam.get()));
    mUnits[0].kernel.setArg(2, openCLImage(outputs[0]));
    mUnits[0].kernel.setArg(3, nhwcArray);
    mUnits[0].kernel.setArg(4, reluImageWH);
    mUnits[0].kernel.setArg(5, reluStride);
    mUnits[0].globalWorkSize = globalSize;
    mUnits[0].localWorkSize  = localSize;

    return NO_ERROR;
}
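
// ReluCreator dispatches by op type: ReLU6, plain/leaky ReLU, and single-slope PReLU
// are expressed as one-operand UnaryExecution formulas, while PReLU with per-channel
// slopes falls back to the image-based ReluExecution above.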
class ReluCreator : public OpenCLBackend::Creator {
public:
    virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
                                const MNN::Op *op, Backend *backend) const override {
        // There seems to be a bug in the OpenCL compiler for the AMD Radeon HD 7000 series:
        // with a build option of the form -Dname=definition, the definition is truncated at a
        // comma, which violates the OpenCL specification (the spec only says 'the definition
        // will be truncated by embedded newline characters').
        // So, when a Radeon GPU is detected, we use a ternary expression (A ? B : C) instead of
        // a function call that contains a comma (e.g. fmax(in,(float4)(0))).
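        // For example, the two equivalent ReLU expressions used below:
        //   comma form:   fmax(in,(FLOAT4)((FLOAT)0))                     (may be truncated at the comma on Radeon)
        //   ternary form: (in>(FLOAT4)((FLOAT)0)?in:(FLOAT4)((FLOAT)0))   (used when isRadeonGpu is true)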
        bool isRadeonGpu = (static_cast<OpenCLBackend*>(backend)->getOpenCLRuntime()->getGpuType() == RADEON);

        if (op->type() == OpType_ReLU6) {
            char storage[128];
            float minValue = 0.0f;
            float maxValue = 6.0f;
            if (nullptr != op->main_as_Relu6()) {
                minValue = op->main_as_Relu6()->minValue();
                maxValue = op->main_as_Relu6()->maxValue();
            }
            if (isRadeonGpu) {
                std::string temp = "(in<=(FLOAT4)((FLOAT)%f)?(FLOAT4)((FLOAT)%f):(in>=(FLOAT4)((FLOAT)%f)?(FLOAT4)((FLOAT)%f):in))";
                sprintf(storage, temp.c_str(), minValue, minValue, maxValue, maxValue);
                return new UnaryExecution(storage, backend);
            }
            std::string temp = "clamp(in,(FLOAT4)((FLOAT)%f),(FLOAT4)((FLOAT)%f))";
            sprintf(storage, temp.c_str(), minValue, maxValue);
            return new UnaryExecution(storage, backend);
        }
        if (op->type() == OpType_ReLU) {
            if (op->main_as_Relu()->slope() == 0.0f) {
                if (isRadeonGpu) {
                    return new UnaryExecution("(in>(FLOAT4)((FLOAT)0)?in:(FLOAT4)((FLOAT)0))", backend);
                }
                return new UnaryExecution("fmax(in,(FLOAT4)((FLOAT)0))", backend);
            }
            auto slope = op->main_as_Relu()->slope();
            char slopeCStr[30] = {};
            sprintf(slopeCStr, "%.8f", slope);
            std::string slopeStr = slopeCStr;
            if (isRadeonGpu) {
                return new UnaryExecution("in<(FLOAT4)((FLOAT)0)?(FLOAT)(" + slopeStr + "f)*in:in", backend);
            }
            return new UnaryExecution("select((FLOAT)(" + slopeStr + "f)*in,in,in>=(FLOAT4)((FLOAT)0))", backend);
        }
        if (op->type() == OpType_PReLU) {
            if (op->main_as_PRelu()->slopeCount() == 1) {
                auto slope = op->main_as_PRelu()->slope()->data()[0];
                char slopeCStr[30] = {};
                sprintf(slopeCStr, "%.8f", slope);
                std::string slopeStr = slopeCStr;
                if (isRadeonGpu) {
                    return new UnaryExecution("in<(FLOAT4)((FLOAT)0)?(FLOAT)(" + slopeStr + "f)*in:in", backend);
                }
                return new UnaryExecution("select((FLOAT)(" + slopeStr + "f)*in,in,in>=(FLOAT4)((FLOAT)0))", backend);
            }
            // FUNC_PRINT(1);
            return new ReluExecution(inputs, op, backend);
        }
        return nullptr;
    }
};

OpenCLCreatorRegister<ReluCreator> __Relu_op(OpType_ReLU, IMAGE);
OpenCLCreatorRegister<ReluCreator> __PRelu_op(OpType_PReLU, IMAGE);
OpenCLCreatorRegister<ReluCreator> __Relu6_op(OpType_ReLU6, IMAGE);

} // namespace OpenCL
} // namespace MNN