//
//  CPURelu.cpp
//  MNN
//
//  Created by MNN on 2018/07/15.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include <string.h>
#include "backend/cpu/CPURelu.hpp"
#include "backend/cpu/CPUBackend.hpp"
#include "backend/cpu/compute/CommonOptFunction.h"
#include "core/Macro.h"
#include "core/Concurrency.h"
#include "CPUBackend.hpp"
#include "core/TensorUtils.hpp"
namespace MNN {
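// Returns the number of elements to process for a tensor. For NC4HW4 tensors the
// channel dimension (index 1) is rounded up to a multiple of the backend's pack
// size, so the count matches the padded memory layout rather than the logical shape.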
static int getTensorElementSizeHelper(const Tensor* t, int pack) {
    int size = 1;
    for (int i = 0; i < t->dimensions(); i++) {
        int currentDimSize = t->length(i);
        if (TensorUtils::getDescribe(t)->dimensionFormat == MNN_DATA_FORMAT_NC4HW4 && 1 == i) {
            currentDimSize = UP_DIV(currentDimSize, pack) * pack;
        }
        size *= currentDimSize;
    }
    return size;
}

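// Broadcast the scalar slope into one pack-sized vector so the vectorized
// MNNReluWithSlopeChannel kernel can be reused. Low-precision backends
// (core->bytes < 4) convert the slope with MNNFp32ToLowp first.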
CPURelu::CPURelu(Backend *b, float slope) : Execution(b) {
    auto core = static_cast<CPUBackend*>(b)->functions();
    mSlope.reset(core->bytes * core->pack);
    if (core->bytes < 4) {
        // For Lowp
        std::vector<float> tempSlope(core->pack);
        for (int i=0; i<core->pack; ++i) {
            tempSlope[i] = slope;
        }
        core->MNNFp32ToLowp(tempSlope.data(), (int16_t*)mSlope.get(), core->pack);
    } else {
        for (int i=0; i<core->pack; ++i) {
            ((float*)mSlope.get())[i] = slope;
        }
    }
}
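// If the element count is not a multiple of the pack size, allocate small
// pack-sized staging buffers so the tail can be run through the same vectorized
// kernel without reading or writing past the end of the tensors.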
ErrorCode CPURelu::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    auto core = static_cast<CPUBackend*>(backend())->functions();
    mRealSize = getTensorElementSizeHelper(inputs[0], core->pack);
    if (mRealSize % core->pack != 0) {
        mCacheDst.reset(core->pack * core->bytes);
        mCacheSrc.reset(core->pack * core->bytes);
    }
    return NO_ERROR;
}

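// onExecute dispatches on input type: int8 tensors are split into 16-element
// groups handled by MNNReluInt8 across threads, with the leftover clamped
// scalar-wise; all other types fall through to the packed slope kernel below.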
ErrorCode CPURelu::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    auto& ib = inputs[0]->buffer();
    auto& ob = outputs[0]->buffer();

    if (inputs[0]->getType() == halide_type_of<int8_t>()) {
        const int8_t* srcO = (const int8_t*)ib.host;
        int8_t* dstO       = (int8_t*)ob.host;
        auto size         = inputs[0]->size() / sizeof(int8_t);
        auto numberThread = ((CPUBackend*)backend())->threadNumber();
        int sizeQuad     = size / 16;
        int remain       = sizeQuad * 16;
        int sizeDivide = sizeQuad / numberThread;
        if (sizeQuad > 0) {
            MNN_CONCURRENCY_BEGIN(tId, numberThread) {
                int number = sizeDivide;
                if (tId == numberThread - 1) {
                    number = sizeQuad - tId * sizeDivide;
                }
                MNNReluInt8(dstO + 16 * tId * sizeDivide, srcO + 16 * tId * sizeDivide, number * 16);
            }
            MNN_CONCURRENCY_END();
        }
        for (int i = remain; i < size; i++) {
            dstO[i] = srcO[i] > 0 ? srcO[i] : 0;
        }
        return NO_ERROR;
    }
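    // Generic path (fp32 or lowp): process whole packs in parallel with the
    // slope kernel; a tail shorter than one pack is staged through mCacheSrc /
    // mCacheDst so the kernel never touches memory outside the tensors.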
    auto core = static_cast<CPUBackend*>(backend())->functions();
    const uint8_t* srcO = (const uint8_t*)ib.host;
    uint8_t* dstO       = (uint8_t*)ob.host;
    auto size         = mRealSize;
    auto numberThread = ((CPUBackend*)backend())->threadNumber();
    int sizeQuad     = size / core->pack;
    int remain       = size % core->pack;
    int sizeDivide = sizeQuad / numberThread;
    if (sizeQuad > 0) {
        MNN_CONCURRENCY_BEGIN(tId, numberThread) {
            int number = sizeDivide;
            if (tId == numberThread - 1) {
                number = sizeQuad - tId * sizeDivide;
            }
            core->MNNReluWithSlopeChannel((float*)(dstO + core->pack * core->bytes * tId * sizeDivide), (const float*)(srcO + core->pack * core->bytes * tId * sizeDivide), (const float*)mSlope.get(), number, 1);
        }
        MNN_CONCURRENCY_END();
    }
    if (remain > 0) {
        ::memcpy(mCacheSrc.get(), srcO + sizeQuad * core->pack * core->bytes, remain * core->bytes);
        core->MNNReluWithSlopeChannel((float*)(mCacheDst.get()), (const float*)(mCacheSrc.get()), (const float*)mSlope.get(), 1, 1);
        ::memcpy(dstO + sizeQuad * core->pack * core->bytes, mCacheDst.get(), remain * core->bytes);
    }
    return NO_ERROR;
}

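// Same tail handling as CPURelu::onResize: staging buffers are only needed when
// the padded element count is not a multiple of the pack size.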
ErrorCode CPURelu6::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    auto core = static_cast<CPUBackend*>(backend())->functions();
    mRealSize = getTensorElementSizeHelper(inputs[0], core->pack);
    if (mRealSize % core->pack != 0) {
        mCacheDst.reset(core->pack * core->bytes);
        mCacheSrc.reset(core->pack * core->bytes);
    }
    return NO_ERROR;
}

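// ReLU6 is implemented as a clamp: MNNAxByClampBroadcastUnit is called with a
// zero bias vector and the parameters held in mParam (filled by the constructor
// from minValue/maxValue), so each packed element is clamped to that range.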
ErrorCode CPURelu6::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    auto& ib = inputs[0]->buffer();
    auto& ob = outputs[0]->buffer();
    auto core = static_cast<CPUBackend*>(backend())->functions();
    const uint8_t* srcO = (const uint8_t*)ib.host;
    uint8_t* dstO       = (uint8_t*)ob.host;
    auto size         = mRealSize;
    auto numberThread = ((CPUBackend*)backend())->threadNumber();
    int sizeQuad     = size / core->pack;
    int remain       = size % core->pack;
    int sizeDivide = sizeQuad / numberThread;
    std::vector<uint8_t> bias(core->pack * core->bytes, 0);
    auto biasPtr = (float*)bias.data();
    if (sizeQuad > 0) {
        MNN_CONCURRENCY_BEGIN(tId, numberThread) {
            int number = sizeDivide;
            if (tId == numberThread - 1) {
                number = sizeQuad - tId * sizeDivide;
            }
            core->MNNAxByClampBroadcastUnit((float*)(dstO + core->pack * core->bytes * tId * sizeDivide), (const float*)(srcO + core->pack * core->bytes * tId * sizeDivide), biasPtr, number, 0, 0, 1, mParam.data());
        }
        MNN_CONCURRENCY_END();
    }
    if (remain > 0) {
        ::memcpy(mCacheSrc.get(), srcO + sizeQuad * core->pack * core->bytes, remain * core->bytes);
        core->MNNAxByClampBroadcastUnit((float*)(mCacheDst.get()), (const float*)(mCacheSrc.get()), biasPtr, 1, 0, 0, 1, mParam.data());
        ::memcpy(dstO + sizeQuad * core->pack * core->bytes, mCacheDst.get(), remain * core->bytes);
    }
    return NO_ERROR;
}

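// Per-channel PReLU: the slope vector is copied into a statically allocated
// buffer whose length is rounded up to a multiple of the pack size; the padded
// tail is zero-filled. Lowp backends convert the slopes with MNNFp32ToLowp.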
CPUPRelu::CPUPRelu(Backend* b, const Op* op) : MNN::Execution(b) {
    auto c = op->main_as_PRelu();
    auto core = static_cast<CPUBackend*>(b)->functions();
    mSlope.buffer().dimensions = 1;
    mSlope.buffer().dim[0].extent = UP_DIV(c->slopeCount(), core->pack) * core->pack;
    mValid = b->onAcquireBuffer(&mSlope, Backend::STATIC);
    if (!mValid) {
        return;
    }
    ::memset(mSlope.host<void>(), 0, mSlope.length(0) * core->bytes);
    if (core->bytes < 4) {
        // For Lowp
        core->MNNFp32ToLowp(c->slope()->data(), mSlope.host<int16_t>(), c->slopeCount());
    } else {
        ::memcpy(mSlope.host<void>(), c->slope()->data(), c->slopeCount() * sizeof(float));
    }
}
CPUPRelu::~CPUPRelu() {
    if (mValid) {
        backend()->onReleaseBuffer(&mSlope, Backend::STATIC);
    }
}

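// Assumes NC4HW4 layout: each (batch, channel-pack) plane of sizeQuad elements
// is an independent unit of work distributed round-robin over the threads, and
// the slope pointer is offset by the plane's channel-pack index c.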
ErrorCode CPUPRelu::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    auto& ib            = inputs[0]->buffer();
    auto& ob            = outputs[0]->buffer();
    int sizeQuad = 1;
    for (int i=2; i<ib.dimensions; ++i) {
        sizeQuad *= ib.dim[i].extent;
    }
    auto core = static_cast<CPUBackend*>(backend())->functions();
    const int channel   = ib.dim[1].extent;
    const int batch     = ib.dim[0].extent;
    const int depthQuad = UP_DIV(channel, core->pack);
    const uint8_t* srcO   = (const uint8_t*)ib.host;
    uint8_t* dstO         = (uint8_t*)ob.host;
    auto totalCount = batch * depthQuad;
    auto numberThread = ((CPUBackend*)backend())->threadNumber();
    MNN_CONCURRENCY_BEGIN(tId, numberThread) {
        for (int b=tId; b<totalCount; b+=numberThread) {
            auto c = b % depthQuad;
            core->MNNReluWithSlopeChannel((float*)(dstO + sizeQuad * core->bytes * core->pack * b), (const float*)(srcO + sizeQuad * core->pack * core->bytes * b), (const float*)(mSlope.host<uint8_t>() + core->bytes * core->pack * c), sizeQuad, 1);
        }
    }
    MNN_CONCURRENCY_END();
    return NO_ERROR;
}

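// Creator for OpType_ReLU and OpType_PReLU. A PReLU with a single shared slope
// degenerates to leaky ReLU and is handled by CPURelu; the per-channel case
// uses CPUPRelu.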
class CPUReluCreator : public CPUBackend::Creator {
public:
    virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                const MNN::Op* op, Backend* backend) const {
        if (op->type() == OpType_ReLU) {
            auto slope = 0.0f;
            if (nullptr != op->main() && OpParameter_Relu == op->main_type()) {
                slope = op->main_as_Relu()->slope();
            }
            return new CPURelu(backend, slope);
        }
        MNN_ASSERT(op->type() == OpType_PReLU);
        if (op->main_as_PRelu()->slopeCount() == 1) {
            return new CPURelu(backend, op->main_as_PRelu()->slope()->data()[0]);
        }
        return new CPUPRelu(backend, op);
    }
};

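// Creator for OpType_ReLU6. The clamp range defaults to [0, 6] and can be
// overridden by the op's Relu6 parameters.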
class CPURelu6Creator : public CPUBackend::Creator {
public:
    virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                const MNN::Op* op, Backend* backend) const {
        float minV = 0.0f;
        float maxV = 6.0f;
        if (nullptr != op->main()) {
            auto p = op->main_as_Relu6();
            minV = p->minValue();
            maxV = p->maxValue();
        }
        return new CPURelu6(maxV, minV, backend);
    }
};

REGISTER_CPU_OP_CREATOR(CPUReluCreator, OpType_ReLU);
REGISTER_CPU_OP_CREATOR(CPUReluCreator, OpType_PReLU);
REGISTER_CPU_OP_CREATOR(CPURelu6Creator, OpType_ReLU6);
} // namespace MNN