//
//  Arm82Backend.cpp
//  MNN
//
//  Created by MNN on 2019/01/31.
//  Copyright © 2018, Alibaba Group Holding Limited
//
#if defined(__ANDROID__) || defined(__aarch64__)

#include <algorithm>
#include <mutex>

#include "Arm82Backend.hpp"
#include "Arm82OptFunc.hpp"
#include "Arm82Functions.hpp"
#include "core/BufferAllocator.hpp"
#include "core/TensorUtils.hpp"
#include "core/OpCommonUtils.hpp"
#include "backend/cpu/compute/CommonOptFunction.h"
#include "half.hpp"

namespace MNN {

void registerArm82Ops();

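// Registry of Arm82 (fp16) execution creators, keyed by op type.
// The map is created lazily via std::call_once and intentionally never freed,
// so registration from static initializers in other translation units is safe.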
static inline std::map<OpType, Arm82Backend::Arm82Creator*>* getArm82CreatorContainer() {
    static std::once_flag fg;
    static std::map<OpType, Arm82Backend::Arm82Creator*>* ret = nullptr;
    std::call_once(fg, [&] { ret = new std::map<OpType, Arm82Backend::Arm82Creator*>; });
    return ret;
}

bool Arm82Backend::addArm82Creator(OpType t, Arm82Creator* ct) {
    auto creatorContainer = getArm82CreatorContainer();
    if (creatorContainer->find(t) == creatorContainer->end()) {
        creatorContainer->insert(std::make_pair(t, ct));
    }
    return true;
}

Arm82Backend::Arm82Backend(const CPURuntime* runtime) : CPUBackend(runtime, BackendConfig::Precision_Low, MNN_FORWARD_CPU_EXTENSION) {
    mCoreFunctions = Arm82Functions::get();
}

Arm82Backend::~Arm82Backend() {
    // nothing to do
}

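// Create an execution for the given op. Returns nullptr when this backend cannot
// handle it (non-float outputs or quantized inputs). Ops that are compatible with
// the common lowp path reuse CPUBackend::onCreate; otherwise an Arm82-specific
// creator is looked up in the registry above.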
Execution* Arm82Backend::onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                  const MNN::Op* op) {
    for (auto t : outputs) {
        if (t->getType().code != halide_type_float) {
            return nullptr;
        }
    }
    auto quantInfo = OpCommonUtils::getQuantInfo(inputs);
    if (quantInfo.first) {
        return nullptr;
    }
    bool originCreate = OpCommonUtils::opCompabilityForLowp(op);
    if (originCreate) {
        return CPUBackend::onCreate(inputs, outputs, op);
    }
    auto creatorContainer = getArm82CreatorContainer();
    // MNN_PRINT("====> create Execution for type: %s\n", MNN::EnumNameOpType(op->type()));
    auto iter = creatorContainer->find(op->type());

    if (iter == creatorContainer->end()) {
        // MNN_PRINT("[MNNWarning]: ARMV82 don't support type: [%s]\n", MNN::EnumNameOpType(op->type()));
        return nullptr;
    }
    auto exe = iter->second->onCreate(inputs, outputs, op, this);
    if (exe == nullptr) {
        // MNN_PRINT("[MNNWarning]: ARMV82 don't support type: [%s]\n", MNN::EnumNameOpType(op->type()));
        return nullptr;
    }
    return exe;
}

static int _getAliginSize(const halide_buffer_t& buffer, MNN_DATA_FORMAT format) {
    // Arm82Backend stores float tensors as FLOAT16, so each element occupies sizeof(int16_t) bytes.
    // For NC4HW4 tensors the channel dimension is padded up to a multiple of 8 (NC8HW8 packing).
    int size          = sizeof(int16_t);
    const int dimensions = buffer.dimensions;
    for (int i = 0; i < dimensions; i++) {
        int currentDimSize = buffer.dim[i].extent;
        if (format == MNN_DATA_FORMAT_NC4HW4 && 1 == i) {
            currentDimSize = ALIGN_UP8(currentDimSize);
        }
        size *= currentDimSize;
    }
    return size;
}

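// Allocate storage for a tensor owned by this backend: float tensors get the fp16,
// channel-padded size computed by _getAliginSize; all other types fall back to the
// base CPU allocator.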
bool Arm82Backend::onAcquireBuffer(const Tensor* nativeTensor, StorageType storageType) {
    // The Arm82 backend stores float tensors as fp16 by default.
    auto tensor = const_cast<Tensor*>(nativeTensor);
    auto& buffer = tensor->buffer();
    if (buffer.type != halide_type_of<float>() && buffer.type != halide_type_of<FLOAT16>()) {
        return CPUBackend::onAcquireBuffer(nativeTensor, storageType);
    }
    auto res = allocBuffer(_getAliginSize(buffer, TensorUtils::getDescribe(nativeTensor)->dimensionFormat), (Tensor*)nativeTensor, storageType);
    if (!res) {
        return false;
    }
    // Mark the device field so onCopyBuffer can tell this tensor was allocated by the Arm82 (fp16) backend.
    buffer.device = 1;
    return true;
}
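// Repack fp16 data between this backend's internal layouts, batch by batch: NC4HW4
// here means channels packed in units of ARMV82_CHANNEL_UNIT (8). The element type
// is unchanged; this is used by onCopyBuffer when both tensors already live in the
// Arm82 backend.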
static void _convertFp16Inside(const halide_buffer_t& ib, const halide_buffer_t& ob, MNN_DATA_FORMAT source, MNN_DATA_FORMAT dest) {
    int area    = 1;
    int channel = 0;
    if (source == dest) {
        ::memcpy(ob.host, ib.host, _getAliginSize(ib, source));
        return;
    }
    if (source == MNN_DATA_FORMAT_NC4HW4 || source == MNN_DATA_FORMAT_NCHW) {
        channel = ib.dim[1].extent;
        for (int axis = 2; axis < ib.dimensions; ++axis) {
            area *= ib.dim[axis].extent;
        }
    } else {
        channel = ib.dim[ib.dimensions - 1].extent;
        for (int axis = 1; axis < ib.dimensions - 1; ++axis) {
            area *= ib.dim[axis].extent;
        }
    }

    // Repack between the packed (NC8HW8) and planar (NCHW) layouts, one batch at a time.
    const int batch = ib.dim[0].extent;

    if (source == MNN_DATA_FORMAT_NC4HW4 && dest == MNN_DATA_FORMAT_NCHW) {
        const int inBatchStride  = UP_DIV(channel, ARMV82_CHANNEL_UNIT) * area * ARMV82_CHANNEL_UNIT;
        const int outBatchStride = channel * area;

        for (int i = 0; i < batch; ++i) {
            MNNUnPackC8FP16((FLOAT16*)ob.host + outBatchStride * i, (const FLOAT16*)ib.host + inBatchStride * i, area,
                            channel);
        }
        return;
    }

    if (source == MNN_DATA_FORMAT_NCHW && dest == MNN_DATA_FORMAT_NC4HW4) {
        const int inBatchStride  = channel * area;
        const int outBatchStride = UP_DIV(channel, ARMV82_CHANNEL_UNIT) * area * ARMV82_CHANNEL_UNIT;
        for (int i = 0; i < batch; ++i) {
            MNNPackC8FP16((FLOAT16*)ob.host + outBatchStride * i, (const FLOAT16*)ib.host + inBatchStride * i, area,
                          channel);
        }
        return;
    }
    MNN_ERROR("Invalid format %d - %d copy for internal Arm82 Backend\n", source, dest);
}
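// Copy a tensor into or out of the Arm82 backend. Non-float tensors are delegated to
// CPUBackend::onCopyBuffer. For float tensors, four cases are handled:
//   CPU   -> CPU   : plain CPU copy
//   Arm82 -> Arm82 : fp16 layout repack (_convertFp16Inside)
//   CPU   -> Arm82 : optional layout conversion, then fp32 -> fp16
//   Arm82 -> CPU   : fp16 -> fp32, then optional layout conversion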
void Arm82Backend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const {
    auto& ib     = srcTensor->buffer();
    auto& ob     = dstTensor->buffer();
    if (ib.type.code != halide_type_float) {
        CPUBackend::onCopyBuffer(srcTensor, dstTensor);
        return;
    }
    auto source = TensorUtils::getDescribe(srcTensor)->dimensionFormat;
    auto dest   = TensorUtils::getDescribe(dstTensor)->dimensionFormat;
    // device != 0 marks a tensor allocated by this backend (see onAcquireBuffer).
    auto srcType = MNN_FORWARD_CPU;
    if (ib.device != 0) {
        srcType = MNN_FORWARD_CPU_EXTENSION;
    }
    auto dstType = MNN_FORWARD_CPU;
    if (ob.device != 0) {
        dstType = MNN_FORWARD_CPU_EXTENSION;
    }
    if (srcType == dstType) {
        if (srcType == MNN_FORWARD_CPU) {
            MNNCPUCopyBuffer(srcTensor, dstTensor);
        } else {
            _convertFp16Inside(ib, ob, source, dest);
        }
        return;
    }
    // When the layouts differ, go through a temporary CPU tensor so the fp16 <-> fp32
    // conversion below only has to deal with matching layouts.
    std::shared_ptr<Tensor> tempTensor;
    if (source != dest) {
        if (srcType == MNN_FORWARD_CPU) {
            tempTensor.reset(Tensor::create<float>(dstTensor->shape(), nullptr, TensorUtils::getDimType(dstTensor)));
            MNNCPUCopyBuffer(srcTensor, tempTensor.get());
            srcTensor = tempTensor.get();
            source = dest;
        } else {
            // The temporary tensor's deleter copies the converted fp32 data into dstTensor
            // once the fp16 -> fp32 conversion below has filled it.
            tempTensor.reset(Tensor::create<float>(srcTensor->shape(), nullptr, TensorUtils::getDimType(srcTensor)), [dstTensor](void* ptr) {
                auto tempT = (Tensor*)ptr;
                MNNCPUCopyBuffer(tempT, dstTensor);
                delete tempT;
            });
            dstTensor = tempTensor.get();
            dest = source;
        }
    }
    if (source == MNN_DATA_FORMAT_NC4HW4) {
        // NC4HW4 <-> NC8HW8: repack between the CPU backend's 4-channel unit and this
        // backend's 8-channel unit, converting fp32 <-> fp16 at the same time.
        int area    = 1;
        int channel = srcTensor->length(1);
        for (int axis = 2; axis < ib.dimensions; ++axis) {
            area *= srcTensor->length(axis);
        }
        const int batch = srcTensor->length(0);
        if (srcType == MNN_FORWARD_CPU) {
            const int outBatchStride = UP_DIV(channel, ARMV82_CHANNEL_UNIT) * area * ARMV82_CHANNEL_UNIT;
            const int inBatchStride  = UP_DIV(channel, 4) * area * 4;
            for (int i = 0; i < batch; ++i) {
                MNNNC4HW4TONC8HW8(dstTensor->host<FLOAT16>() + outBatchStride * i, srcTensor->host<float>() + inBatchStride * i, area,
                                  channel);
            }
        } else {
            const int inBatchStride  = UP_DIV(channel, ARMV82_CHANNEL_UNIT) * area * ARMV82_CHANNEL_UNIT;
            const int outBatchStride = UP_DIV(channel, 4) * area * 4;
            for (int i = 0; i < batch; ++i) {
                MNNNC8HW8TONC4HW4(dstTensor->host<float>() + outBatchStride * i, srcTensor->host<FLOAT16>() + inBatchStride * i, area,
                                  channel);
            }
        }
        return;
    }
    // MNN_PRINT("%d, %d - %d, %d\n", source, srcType, dest, dstType);
    // The layouts now match, so only the fp32 <-> fp16 element conversion remains.
    const int elementSize = srcTensor->elementSize();
    // cpu -> arm82: convert fp32 to fp16
    if (srcType == MNN_FORWARD_CPU) {
        const auto src = srcTensor->host<float>();
        auto dst       = dstTensor->host<int16_t>();
        MNNQuantizeFP16(src, dst, elementSize);
        return;
    }
    // arm82 -> cpu: convert fp16 back to fp32
    if (srcType == MNN_FORWARD_CPU_EXTENSION) {
        const auto src = srcTensor->host<int16_t>();
        auto dst       = dstTensor->host<float>();
        MNNDequantizeFP16(src, dst, elementSize);
        return;
    }
    MNN_ERROR("Invalid copy for internal Arm82 Backend\n");
    return;
}

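// Install the fp16 core functions and register every Arm82 op creator. Called from
// the static initializer below unless MNN_CODEGEN_REGISTER is defined.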
void registerArm82RuntimeCreator() {
    Arm82Functions::init();
    registerArm82Ops();
}
#ifndef MNN_CODEGEN_REGISTER
static const auto __arm82_global_initializer = []() {
    registerArm82RuntimeCreator();
    return true;
}();
#endif

} // namespace MNN
#endif