//
//  CPURelu.cpp
//  MNN
//
//  Created by MNN on 2018/07/15.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include <string.h>
#include "backend/cpu/CPURelu.hpp"
#include "backend/cpu/CPUBackend.hpp"
#include "backend/cpu/compute/CommonOptFunction.h"
#include "core/Macro.h"
#include "core/Concurrency.h"
#include "CPUBackend.hpp"
#include "core/TensorUtils.hpp"

namespace MNN {

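// Returns the number of elements the kernels actually touch, including NC4HW4 padding:
// for NC4HW4 tensors the channel dimension (dim 1) is rounded up to a multiple of the
// backend's pack size. Worked example (assuming pack == 4): a [2, 3, 32, 32] NC4HW4
// tensor gives 2 * UP_DIV(3, 4) * 4 * 32 * 32 = 2 * 4 * 32 * 32 = 8192 elements.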
static int getTensorElementSizeHelper(const Tensor* t, int pack) {
    int size = 1;
    for (int i = 0; i < t->dimensions(); i++) {
        int currentDimSize = t->length(i);
        if (TensorUtils::getDescribe(t)->dimensionFormat == MNN_DATA_FORMAT_NC4HW4 && 1 == i) {
            currentDimSize = UP_DIV(currentDimSize, pack) * pack;
        }
        size *= currentDimSize;
    }
    return size;
}

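// CPURelu also covers leaky ReLU: the scalar slope is replicated into a pack-sized vector so
// the vectorized kernel can apply it lane by lane. When core->bytes < 4 the backend works in
// a reduced-precision format (presumably FP16), so the replicated slope is converted with
// MNNFp32ToLowp.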
CPURelu::CPURelu(Backend *b, float slope) : Execution(b) {
    auto core = static_cast<CPUBackend*>(b)->functions();
    mSlope.reset(core->bytes * core->pack);
    if (core->bytes < 4) {
        // For Lowp
        std::vector<float> tempSlope(core->pack);
        for (int i = 0; i < core->pack; ++i) {
            tempSlope[i] = slope;
        }
        core->MNNFp32ToLowp(tempSlope.data(), (int16_t*)mSlope.get(), core->pack);
    } else {
        for (int i = 0; i < core->pack; ++i) {
            ((float*)mSlope.get())[i] = slope;
        }
    }
}

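// onResize only prepares the tail handling: when the padded element count is not a multiple
// of the pack size, two pack-sized scratch buffers are allocated so the leftover elements can
// still be fed through the vectorized kernel.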
ErrorCode CPURelu::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    auto core = static_cast<CPUBackend*>(backend())->functions();
    mRealSize = getTensorElementSizeHelper(inputs[0], core->pack);
    if (mRealSize % core->pack != 0) {
        mCacheDst.reset(core->pack * core->bytes);
        mCacheSrc.reset(core->pack * core->bytes);
    }
    return NO_ERROR;
}

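// onExecute has two paths:
//   - int8 input: MNNReluInt8 clamps blocks of 16 values, split across threads, and any tail
//     shorter than 16 values is handled by a scalar loop.
//   - everything else: MNNReluWithSlopeChannel processes pack-sized blocks using the replicated
//     slope; the tail is copied into the scratch buffers from onResize, run as one full block,
//     and copied back.
// Worked example for the int8 split (assuming size = 1000 and numberThread = 4):
//   sizeQuad = 1000 / 16 = 62 blocks, sizeDivide = 62 / 4 = 15 blocks per thread, the last
//   thread takes 62 - 3 * 15 = 17 blocks, and the final 1000 - 992 = 8 values go through the
//   scalar loop.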
ErrorCode CPURelu::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    auto& ib = inputs[0]->buffer();
    auto& ob = outputs[0]->buffer();

    if (inputs[0]->getType() == halide_type_of<int8_t>()) {
        const int8_t* srcO = (const int8_t*)ib.host;
        int8_t* dstO = (int8_t*)ob.host;
        auto size = inputs[0]->size() / sizeof(int8_t);
        auto numberThread = ((CPUBackend*)backend())->threadNumber();
        int sizeQuad = size / 16;
        int remain = sizeQuad * 16;
        int sizeDivide = sizeQuad / numberThread;
        if (sizeQuad > 0) {
            MNN_CONCURRENCY_BEGIN(tId, numberThread) {
                int number = sizeDivide;
                if (tId == numberThread - 1) {
                    number = sizeQuad - tId * sizeDivide;
                }
                MNNReluInt8(dstO + 16 * tId * sizeDivide, srcO + 16 * tId * sizeDivide, number * 16);
            }
            MNN_CONCURRENCY_END();
        }
        for (int i = remain; i < size; i++) {
            dstO[i] = srcO[i] > 0 ? srcO[i] : 0;
        }
        return NO_ERROR;
    }
    auto core = static_cast<CPUBackend*>(backend())->functions();
    const uint8_t* srcO = (const uint8_t*)ib.host;
    uint8_t* dstO = (uint8_t*)ob.host;
    auto size = mRealSize;
    auto numberThread = ((CPUBackend*)backend())->threadNumber();
    int sizeQuad = size / core->pack;
    int remain = size % core->pack;
    int sizeDivide = sizeQuad / numberThread;
    if (sizeQuad > 0) {
        MNN_CONCURRENCY_BEGIN(tId, numberThread) {
            int number = sizeDivide;
            if (tId == numberThread - 1) {
                number = sizeQuad - tId * sizeDivide;
            }
            core->MNNReluWithSlopeChannel((float*)(dstO + core->pack * core->bytes * tId * sizeDivide), (const float*)(srcO + core->pack * core->bytes * tId * sizeDivide), (const float*)mSlope.get(), number, 1);
        }
        MNN_CONCURRENCY_END();
    }
    if (remain > 0) {
        ::memcpy(mCacheSrc.get(), srcO + sizeQuad * core->pack * core->bytes, remain * core->bytes);
        core->MNNReluWithSlopeChannel((float*)(mCacheDst.get()), (const float*)(mCacheSrc.get()), (const float*)mSlope.get(), 1, 1);
        ::memcpy(dstO + sizeQuad * core->pack * core->bytes, mCacheDst.get(), remain * core->bytes);
    }
    return NO_ERROR;
}

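// CPURelu6::onResize mirrors CPURelu::onResize: cache the padded element count and, if it is
// not a multiple of the pack size, allocate pack-sized scratch buffers for the tail.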
ErrorCode CPURelu6::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    auto core = static_cast<CPUBackend*>(backend())->functions();
    mRealSize = getTensorElementSizeHelper(inputs[0], core->pack);
    if (mRealSize % core->pack != 0) {
        mCacheDst.reset(core->pack * core->bytes);
        mCacheSrc.reset(core->pack * core->bytes);
    }
    return NO_ERROR;
}

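// ReLU6 reuses the backend's fused multiply-add-with-clamp kernel: MNNAxByClampBroadcastUnit is
// called with an all-zero bias vector, and mParam (presumably populated from the min/max values
// handed to the CPURelu6 constructor, 0 and 6 by default) supplies the clamp bounds. The tail is
// handled with the same scratch-buffer scheme as CPURelu.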
ErrorCode CPURelu6::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    auto& ib = inputs[0]->buffer();
    auto& ob = outputs[0]->buffer();
    auto core = static_cast<CPUBackend*>(backend())->functions();
    const uint8_t* srcO = (const uint8_t*)ib.host;
    uint8_t* dstO = (uint8_t*)ob.host;
    auto size = mRealSize;
    auto numberThread = ((CPUBackend*)backend())->threadNumber();
    int sizeQuad = size / core->pack;
    int remain = size % core->pack;
    int sizeDivide = sizeQuad / numberThread;
    std::vector<uint8_t> bias(core->pack * core->bytes, 0);
    auto biasPtr = (float*)bias.data();
    if (sizeQuad > 0) {
        MNN_CONCURRENCY_BEGIN(tId, numberThread) {
            int number = sizeDivide;
            if (tId == numberThread - 1) {
                number = sizeQuad - tId * sizeDivide;
            }
            core->MNNAxByClampBroadcastUnit((float*)(dstO + core->pack * core->bytes * tId * sizeDivide), (const float*)(srcO + core->pack * core->bytes * tId * sizeDivide), biasPtr, number, 0, 0, 1, mParam.data());
        }
        MNN_CONCURRENCY_END();
    }
    if (remain > 0) {
        ::memcpy(mCacheSrc.get(), srcO + sizeQuad * core->pack * core->bytes, remain * core->bytes);
        core->MNNAxByClampBroadcastUnit((float*)(mCacheDst.get()), (const float*)(mCacheSrc.get()), biasPtr, 1, 0, 0, 1, mParam.data());
        ::memcpy(dstO + sizeQuad * core->pack * core->bytes, mCacheDst.get(), remain * core->bytes);
    }
    return NO_ERROR;
}

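// CPUPRelu keeps one slope per channel. The slopes live in a STATIC tensor whose length is
// rounded up to a multiple of the pack size and zero-padded; they are converted to the
// backend's reduced-precision format when core->bytes < 4. The buffer is released again in
// the destructor.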
CPUPRelu::CPUPRelu(Backend* b, const Op* op) : MNN::Execution(b) {
    auto c = op->main_as_PRelu();
    auto core = static_cast<CPUBackend*>(b)->functions();
    mSlope.buffer().dimensions = 1;
    mSlope.buffer().dim[0].extent = UP_DIV(c->slopeCount(), core->pack) * core->pack;
    mValid = b->onAcquireBuffer(&mSlope, Backend::STATIC);
    if (!mValid) {
        return;
    }
    ::memset(mSlope.host<void>(), 0, mSlope.length(0) * core->bytes);
    if (core->bytes < 4) {
        // For Lowp
        core->MNNFp32ToLowp(c->slope()->data(), mSlope.host<int16_t>(), c->slopeCount());
    } else {
        ::memcpy(mSlope.host<void>(), c->slope()->data(), c->slopeCount() * sizeof(float));
    }
}

CPUPRelu::~CPUPRelu() {
    if (mValid) {
        backend()->onReleaseBuffer(&mSlope, Backend::STATIC);
    }
}

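// The work is split into batch * depthQuad jobs (one per pack of channels per batch element)
// and distributed round-robin over the threads. Each job applies its channel pack's slopes to
// sizeQuad spatial elements via MNNReluWithSlopeChannel, i.e. y = x for x > 0 and
// y = slope * x otherwise.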
ErrorCode CPUPRelu::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    auto& ib = inputs[0]->buffer();
    auto& ob = outputs[0]->buffer();
    int sizeQuad = 1;
    for (int i = 2; i < ib.dimensions; ++i) {
        sizeQuad *= ib.dim[i].extent;
    }
    auto core = static_cast<CPUBackend*>(backend())->functions();
    const int channel = ib.dim[1].extent;
    const int batch = ib.dim[0].extent;
    const int depthQuad = UP_DIV(channel, core->pack);
    const uint8_t* srcO = (const uint8_t*)ib.host;
    uint8_t* dstO = (uint8_t*)ob.host;
    auto totalCount = batch * depthQuad;
    auto numberThread = ((CPUBackend*)backend())->threadNumber();
    MNN_CONCURRENCY_BEGIN(tId, numberThread) {
        for (int b = tId; b < totalCount; b += numberThread) {
            auto c = b % depthQuad;
            core->MNNReluWithSlopeChannel((float*)(dstO + sizeQuad * core->bytes * core->pack * b), (const float*)(srcO + sizeQuad * core->pack * core->bytes * b), (const float*)(mSlope.host<uint8_t>() + core->bytes * core->pack * c), sizeQuad, 1);
        }
    }
    MNN_CONCURRENCY_END();
    return NO_ERROR;
}

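// CPUReluCreator serves both ReLU and PReLU: plain ReLU (optionally with a leaky slope) maps to
// CPURelu, and a PReLU with slopeCount == 1 collapses to the same single-slope kernel; only a
// genuine per-channel PReLU needs CPUPRelu.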
class CPUReluCreator : public CPUBackend::Creator {
public:
    virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                const MNN::Op* op, Backend* backend) const {
        if (op->type() == OpType_ReLU) {
            auto slope = 0.0f;
            if (nullptr != op->main() && OpParameter_Relu == op->main_type()) {
                slope = op->main_as_Relu()->slope();
            }
            return new CPURelu(backend, slope);
        }
        MNN_ASSERT(op->type() == OpType_PReLU);
        if (op->main_as_PRelu()->slopeCount() == 1) {
            return new CPURelu(backend, op->main_as_PRelu()->slope()->data()[0]);
        }
        return new CPUPRelu(backend, op);
    }
};

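// The Relu6 creator honours custom clamp bounds from the op parameter and otherwise falls back
// to the standard [0, 6] range.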
class CPURelu6Creator : public CPUBackend::Creator {
public:
    virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                const MNN::Op* op, Backend* backend) const {
        float minV = 0.0f;
        float maxV = 6.0f;
        if (nullptr != op->main()) {
            auto p = op->main_as_Relu6();
            minV = p->minValue();
            maxV = p->maxValue();
        }
        return new CPURelu6(maxV, minV, backend);
    }
};

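// Register the creators so the CPU backend can build these executions from the op types.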
REGISTER_CPU_OP_CREATOR(CPUReluCreator, OpType_ReLU);
REGISTER_CPU_OP_CREATOR(CPUReluCreator, OpType_PReLU);
REGISTER_CPU_OP_CREATOR(CPURelu6Creator, OpType_ReLU6);
} // namespace MNN