//
// Arm82Backend.cpp
// MNN
//
// Created by MNN on 2019/01/31.
// Copyright © 2018, Alibaba Group Holding Limited
//
#if defined(__ANDROID__) || defined(__aarch64__)

#include <algorithm>
#include <mutex>

#include "Arm82Backend.hpp"
#include "Arm82OptFunc.hpp"
#include "Arm82Functions.hpp"
#include "core/BufferAllocator.hpp"
#include "core/TensorUtils.hpp"
#include "core/OpCommonUtils.hpp"
#include "backend/cpu/compute/CommonOptFunction.h"
#include "half.hpp"

namespace MNN {

void registerArm82Ops();

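// Lazily-initialized registry that maps an OpType to its Arm82-specific creator.
// Ops register themselves into this map through addArm82Creator() below.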
static inline std::map<OpType, Arm82Backend::Arm82Creator*>* getArm82CreatorContainer() {
    static std::once_flag fg;
    static std::map<OpType, Arm82Backend::Arm82Creator*>* ret = nullptr;
    std::call_once(fg, [&] { ret = new std::map<OpType, Arm82Backend::Arm82Creator*>; });
    return ret;
}

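// Register a creator for the given op type. The first registration for a type wins;
// later registrations for the same type are silently ignored.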
bool Arm82Backend::addArm82Creator(OpType t, Arm82Creator* ct) {
    auto creatorContainer = getArm82CreatorContainer();
    if (creatorContainer->find(t) == creatorContainer->end()) {
        creatorContainer->insert(std::make_pair(t, ct));
    }
    return true;
}

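// The Arm82 backend is the CPU backend running at low precision: it reports itself as
// MNN_FORWARD_CPU_EXTENSION and swaps in the FP16 core functions.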
Arm82Backend::Arm82Backend(const CPURuntime* runtime) : CPUBackend(runtime, BackendConfig::Precision_Low, MNN_FORWARD_CPU_EXTENSION) {
    mCoreFunctions = Arm82Functions::get();
}

Arm82Backend::~Arm82Backend() {
    // nothing to do
}

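// Create an Execution for an op. Non-float outputs and quantized ops are rejected by
// returning nullptr. Ops that the low-precision CPU path can already handle
// (opCompabilityForLowp) go through CPUBackend::onCreate; everything else is looked up
// in the Arm82 creator registry.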
Execution* Arm82Backend::onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                  const MNN::Op* op) {
    for (auto t : outputs) {
        if (t->getType().code != halide_type_float) {
            return nullptr;
        }
    }
    auto quantInfo = OpCommonUtils::getQuantInfo(inputs);
    if (quantInfo.first) {
        return nullptr;
    }
    bool originCreate = OpCommonUtils::opCompabilityForLowp(op);
    if (originCreate) {
        return CPUBackend::onCreate(inputs, outputs, op);
    }
    auto creatorContainer = getArm82CreatorContainer();
    // MNN_PRINT("====> create Execution for type: %s\n", MNN::EnumNameOpType(op->type()));
    auto iter = creatorContainer->find(op->type());

    if (iter == creatorContainer->end()) {
        // MNN_PRINT("[MNNWarning]: ARMV82 don't support type: [%s]\n", MNN::EnumNameOpType(op->type()));
        return nullptr;
    }
    auto exe = iter->second->onCreate(inputs, outputs, op, this);
    if (exe == nullptr) {
        // MNN_PRINT("[MNNWarning]: ARMV82 don't support type: [%s]\n", MNN::EnumNameOpType(op->type()));
        return nullptr;
    }
    return exe;
}

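// Note: despite the NC4HW4 name, the Arm82 backend packs channels in units of 8
// (NC8HW8), so _getAliginSize below aligns the channel dimension up to a multiple of 8.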
static int _getAliginSize(const halide_buffer_t& buffer, MNN_DATA_FORMAT format) {
    // Although user-facing tensors default to FLOAT32, the Arm82 backend stores every
    // element as FP16, so the byte size is computed from sizeof(int16_t).
    int size = sizeof(int16_t);
    const int dimensions = buffer.dimensions;
    for (int i = 0; i < dimensions; i++) {
        int currentDimSize = buffer.dim[i].extent;
        if (format == MNN_DATA_FORMAT_NC4HW4 && 1 == i) {
            currentDimSize = ALIGN_UP8(currentDimSize);
        }
        size *= currentDimSize;
    }
    return size;
}

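// Allocate backing storage for a tensor. Float and FP16 tensors get FP16-sized storage
// computed by _getAliginSize(); other element types fall through to the regular CPU allocator.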
bool Arm82Backend::onAcquireBuffer(const Tensor* nativeTensor, StorageType storageType) {
    // arm82 backend tensor data type is fp16 by default
    auto tensor = const_cast<Tensor*>(nativeTensor);
    auto& buffer = tensor->buffer();
    if (buffer.type != halide_type_of<float>() && buffer.type != halide_type_of<FLOAT16>()) {
        return CPUBackend::onAcquireBuffer(nativeTensor, storageType);
    }
    auto res = allocBuffer(_getAliginSize(buffer, TensorUtils::getDescribe(nativeTensor)->dimensionFormat), (Tensor*)nativeTensor, storageType);
    if (!res) {
        return false;
    }
    // Mark buffer.device so that onCopyBuffer can tell this tensor belongs to the Arm82 backend.
    buffer.device = 1;
    return true;
}
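
// Convert layout between two FP16 buffers that both live inside the Arm82 backend:
// identical formats are a straight memcpy; NC4HW4 (stored packed as NC8HW8) <-> NCHW
// is repacked with MNNUnPackC8FP16 / MNNPackC8FP16.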
static void _convertFp16Inside(const halide_buffer_t& ib, const halide_buffer_t& ob, MNN_DATA_FORMAT source, MNN_DATA_FORMAT dest) {
    int area = 1;
    int channel = 0;
    if (source == dest) {
        ::memcpy(ob.host, ib.host, _getAliginSize(ib, source));
        return;
    }
    if (source == MNN_DATA_FORMAT_NC4HW4 || source == MNN_DATA_FORMAT_NCHW) {
        channel = ib.dim[1].extent;
        for (int axis = 2; axis < ib.dimensions; ++axis) {
            area *= ib.dim[axis].extent;
        }
    } else {
        channel = ib.dim[ib.dimensions - 1].extent;
        for (int axis = 1; axis < ib.dimensions - 1; ++axis) {
            area *= ib.dim[axis].extent;
        }
    }

    // Both buffers already hold FP16 data here; only the layout differs.
    const int batch = ib.dim[0].extent;

    if (source == MNN_DATA_FORMAT_NC4HW4 && dest == MNN_DATA_FORMAT_NCHW) {
        const int inBatchStride = UP_DIV(channel, ARMV82_CHANNEL_UNIT) * area * ARMV82_CHANNEL_UNIT;
        const int outBatchStride = channel * area;

        for (int i = 0; i < batch; ++i) {
            MNNUnPackC8FP16((FLOAT16*)ob.host + outBatchStride * i, (const FLOAT16*)ib.host + inBatchStride * i, area,
                            channel);
        }
        return;
    }

    if (source == MNN_DATA_FORMAT_NCHW && dest == MNN_DATA_FORMAT_NC4HW4) {
        const int inBatchStride = channel * area;
        const int outBatchStride = UP_DIV(channel, ARMV82_CHANNEL_UNIT) * area * ARMV82_CHANNEL_UNIT;
        for (int i = 0; i < batch; ++i) {
            MNNPackC8FP16((FLOAT16*)ob.host + outBatchStride * i, (const FLOAT16*)ib.host + inBatchStride * i, area,
                          channel);
        }
        return;
    }
    MNN_ERROR("Invalid format %d - %d copy for internal Arm82 Backend\n", source, dest);
}
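
// Copy a tensor between CPU (FP32) and Arm82 (FP16) memory. Same-device copies are
// either plain CPU copies or in-backend FP16 layout conversions; cross-device copies
// first reconcile differing layouts through a temporary FP32 tensor, then convert
// FP32 <-> FP16, repacking C4 <-> C8 channel units for NC4HW4.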
void Arm82Backend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const {
    auto& ib = srcTensor->buffer();
    auto& ob = dstTensor->buffer();
    if (ib.type.code != halide_type_float) {
        CPUBackend::onCopyBuffer(srcTensor, dstTensor);
        return;
    }
    auto source = TensorUtils::getDescribe(srcTensor)->dimensionFormat;
    auto dest = TensorUtils::getDescribe(dstTensor)->dimensionFormat;
    auto srcType = MNN_FORWARD_CPU;
    if (ib.device != 0) {
        srcType = MNN_FORWARD_CPU_EXTENSION;
    }
    auto dstType = MNN_FORWARD_CPU;
    if (ob.device != 0) {
        dstType = MNN_FORWARD_CPU_EXTENSION;
    }
    if (srcType == dstType) {
        if (srcType == MNN_FORWARD_CPU) {
            MNNCPUCopyBuffer(srcTensor, dstTensor);
        } else {
            _convertFp16Inside(ib, ob, source, dest);
        }
        return;
    }
    // Cross-device copy with differing layouts: use a CPU-side copy through a temporary
    // FP32 tensor to convert the layout first, so only the FP32 <-> FP16 conversion remains.
    std::shared_ptr<Tensor> tempTensor;
    if (source != dest) {
        if (srcType == MNN_FORWARD_CPU) {
            tempTensor.reset(Tensor::create<float>(dstTensor->shape(), nullptr, TensorUtils::getDimType(dstTensor)));
            MNNCPUCopyBuffer(srcTensor, tempTensor.get());
            srcTensor = tempTensor.get();
            source = dest;
        } else {
            tempTensor.reset(Tensor::create<float>(srcTensor->shape(), nullptr, TensorUtils::getDimType(srcTensor)), [dstTensor](void* ptr) {
                // Deferred: once the FP16 -> FP32 conversion below has filled this temporary
                // tensor, copy it into the real destination (converting the layout), then release it.
                auto tempT = (Tensor*)ptr;
                MNNCPUCopyBuffer(tempT, dstTensor);
                delete tempT;
            });
            dstTensor = tempTensor.get();
            dest = source;
        }
    }
    if (source == MNN_DATA_FORMAT_NC4HW4) {
        // Same NC4HW4 logical format, but the channel packing differs: C4 on the CPU side,
        // C8 on the Arm82 side, so repack while converting FP32 <-> FP16.
        int area = 1;
        int channel = srcTensor->length(1);
        for (int axis = 2; axis < ib.dimensions; ++axis) {
            area *= srcTensor->length(axis);
        }
        const int batch = srcTensor->length(0);
        if (srcType == MNN_FORWARD_CPU) {
            const int outBatchStride = UP_DIV(channel, ARMV82_CHANNEL_UNIT) * area * ARMV82_CHANNEL_UNIT;
            const int inBatchStride = UP_DIV(channel, 4) * area * 4;
            for (int i = 0; i < batch; ++i) {
                MNNNC4HW4TONC8HW8(dstTensor->host<FLOAT16>() + outBatchStride * i, srcTensor->host<float>() + inBatchStride * i, area,
                                  channel);
            }
        } else {
            const int inBatchStride = UP_DIV(channel, ARMV82_CHANNEL_UNIT) * area * ARMV82_CHANNEL_UNIT;
            const int outBatchStride = UP_DIV(channel, 4) * area * 4;
            for (int i = 0; i < batch; ++i) {
                MNNNC8HW8TONC4HW4(dstTensor->host<float>() + outBatchStride * i, srcTensor->host<FLOAT16>() + inBatchStride * i, area,
                                  channel);
            }
        }
        return;
    }
    //MNN_PRINT("%d, %d - %d, %d\n", source, srcType, dest, dstType);
    // The format is the same on both sides; just convert fp32 <-> fp16.
    const int elementSize = srcTensor->elementSize();
    // copy and quantize/dequantize data
    // cpu -> arm82 copy
    if (srcType == MNN_FORWARD_CPU) {
        const auto src = srcTensor->host<float>();
        auto dst = dstTensor->host<int16_t>();
        MNNQuantizeFP16(src, dst, elementSize);
        return;
    }
    // arm82 -> cpu copy
    if (srcType == MNN_FORWARD_CPU_EXTENSION) {
        const auto src = srcTensor->host<int16_t>();
        auto dst = dstTensor->host<float>();
        MNNDequantizeFP16(src, dst, elementSize);
        return;
    }
    MNN_ERROR("Invalid copy for internal Arm82 Backend\n");
    return;
}

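// Install the FP16 core functions and register all Arm82 op creators. When
// MNN_CODEGEN_REGISTER is not defined, the anonymous global initializer below
// triggers this once at library load time.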
void registerArm82RuntimeCreator() {
    Arm82Functions::init();
    registerArm82Ops();
}
#ifndef MNN_CODEGEN_REGISTER
static const auto __arm82_global_initializer = []() {
    registerArm82RuntimeCreator();
    return true;
}();
#endif

} // namespace MNN
#endif