1 //
2 // CommonOptFunction.cpp
3 // MNN
4 //
5 // Created by MNN on 2018/09/06.
6 // Copyright © 2018, Alibaba Group Holding Limited
7 //
8
9 #include "CommonOptFunction.h"
10 #include "ConvOpt.h"
11 #include "WinogradOptFunction.hpp"
12 #include <string.h>
13 #include <algorithm>
14 #include <math.h>
15 #include "math/Vec.hpp"
16 #include <vector>
17 #include "../CPURuntime.hpp"
18 #include "core/MemoryFormater.h"
19 #include "core/OpCommonUtils.hpp"
20 // TODO: Find better way to optimize it
21 #include "../CPUBinary.hpp"
22 #include "../CPUUnary.hpp"
23 #include "../CPUPool.hpp"
24 #ifndef MNN_USE_SSE
MNNInt8ToInt16(int16_t * dest,const int8_t * source,size_t count)25 void MNNInt8ToInt16(int16_t* dest, const int8_t* source, size_t count) {
26 // Should not be called
27 MNN_ASSERT(false);
28 }
29 #endif
30
31 /*
32 source: source matrix is h x l
33 transpose: if false, export compressed matrix as h x l, other export as l x h.
34 */
MNNPackForSparseMatMul_B(float * dest,unsigned int * NNZMap,int * dataOffsetMap,int sparseBlockOC,const float * source,size_t h,size_t l,const int eP,bool transpose)35 void MNNPackForSparseMatMul_B(float* dest, unsigned int* NNZMap, int* dataOffsetMap, int sparseBlockOC, const float* source, size_t h, size_t l, const int eP, bool transpose) {
36 // 1. in convolution, source B layout is OC x (IC * KH * KW),
37 // the dest layout of weight is BCSC(block compressed sparse colum) format, which is OC(!=0) x (IC*KH*KW!=0), as a canceled result, just do BCSR, transpose should be false.
38 // 2. in ordinary sparse MatMul, transpose is corresponding to BCSR or BCSC
39
40 // BCSR
41 if (transpose) {
42 int rowOffset = 0;
43 for (int i = 0; i < l; i += 1) {
44 *NNZMap = 0;
45 for(int j = 0; j < h; j += sparseBlockOC) {
46 if(!MNN::OpCommonUtils::checkAllZeros(source + j * l + i, l, sparseBlockOC, 1)) {
47 *dest = *(source + j * l + l);
48 dest++;
49 *NNZMap = *NNZMap + 1;
50 *dataOffsetMap = rowOffset;
51 dataOffsetMap++;
52 rowOffset = 0;
53 }
54 rowOffset += eP;
55 }
56 NNZMap++;
57 rowOffset -= h * eP;
58 }
59 } else { // BCSC
60 int columOffset = 0;
61 int i = 0;
62 for (; i + sparseBlockOC <= h; i += sparseBlockOC) {
63 *NNZMap = 0;
64 for(int j = 0; j < l; j += 1) {
65 if (!MNN::OpCommonUtils::checkAllZeros(source, l, sparseBlockOC, 1)) {
66 for (int ioc = 0; ioc < sparseBlockOC; ioc++) {
67 *dest = *(source + ioc * l);
68 dest++;
69 }
70 *NNZMap = *NNZMap + 1;
71 *dataOffsetMap = columOffset;
72 dataOffsetMap++;
73 columOffset = 0;
74 }
75 columOffset += eP;
76 source++;
77 }
78 NNZMap++;
79 source += l * (sparseBlockOC - 1);
80 columOffset -= l * eP;
81 }
82
83 for (; i < h; i++) {
84 *NNZMap = 0;
85 for(int j = 0; j < l; j++) {
86 if (*source != 0.0f) {
87 *dest = *source;
88 dest++;
89 *NNZMap = *NNZMap + 1;
90 *dataOffsetMap = columOffset;
91 dataOffsetMap++;
92 columOffset = 0;
93 }
94 columOffset += eP;
95 source++;
96 }
97 NNZMap++;
98 columOffset -= l * eP;
99 }
100
101 *dataOffsetMap = columOffset; //
102 }
103 return;
104 }
105
106 #ifndef MNN_USE_NEON
107
MNNGetMatMulPackMode(int * eP,int * lP,int * hP)108 void MNNGetMatMulPackMode(int* eP, int *lP, int* hP) {
109 *eP = 16;
110 *lP = 1;
111 *hP = 4;
112 }
113
MNNGetSparseMatMulPackMode(int * eP,int * lP,int * hP)114 void MNNGetSparseMatMulPackMode(int* eP, int *lP, int* hP) {
115 *eP = 16;
116 *lP = 1;
117 *hP = 4;
118 // hp is corresponding to sparse block along right matrix colum dimension. in ramdom sparse, it is 1.
119 return;
120 }
121
MNNPackForMatMul_B(float * dest,const float * source,size_t h,size_t l,bool transpose)122 void MNNPackForMatMul_B(float* dest, const float* source, size_t h, size_t l, bool transpose) {
123 auto hP = h / 4;
124 auto hR = hP * 4;
125 if (hR != h) {
126 ::memset(dest, 0, UP_DIV(h, 4)*4*l*sizeof(float));
127 }
128 if (!transpose) {
129 for (int y=0; y<hP; ++y) {
130 auto destY = dest + y * 4 * l;
131 auto sourceY = source + y * 4;
132 for (int x=0; x<l; ++x) {
133 ::memcpy(destY + 4 * x, sourceY + x * h, 4 * sizeof(float));
134 }
135 }
136 auto hRemain = h - hR;
137 if (hRemain > 0) {
138 auto destY = dest + hP * 4 * l;
139 auto sourceY = source + hP * 4;
140 for (int x=0; x<l; ++x) {
141 ::memcpy(destY + 4 * x, sourceY + x * h, hRemain * sizeof(float));
142 }
143 }
144 return;
145 }
146 MNNPackC4(dest, source, l, h);
147 }
148
_MNNPackedMatMulRemain(float * C,const float * A,const float * B,size_t eSize,const size_t * parameter,const float * postParameters,const float * bias,int aStride)149 static void _MNNPackedMatMulRemain(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, int aStride) {
150 auto h = parameter[2];
151 auto l = parameter[1];
152 auto cStride = parameter[3] / sizeof(float);
153 auto hRemain = parameter[4];
154 auto bExtraStride = parameter[5] / sizeof(float);
155 auto bStride = bExtraStride + l * 4;
156 auto hC4 = UP_DIV(h, 4);
157 for (int y=0; y<hC4; ++y) {
158 ::memset(C + y * cStride, 0, eSize * 4 * sizeof(float));
159 }
160 float alpha = 1.0f;
161 float beta = 0.0f;
162 float minValue = -std::numeric_limits<float>().max();
163 float maxValue = std::numeric_limits<float>().max();
164 if (nullptr != postParameters) {
165 minValue = postParameters[2];
166 maxValue = postParameters[3];
167 alpha = postParameters[0];
168 beta = postParameters[1];
169 }
170
171 for (int x=0; x<eSize; ++x) {
172 auto dst = C + 4 * x;
173 auto src = A + x;
174 for (int y=0; y<hC4; ++y) {
175 auto dstY = dst + y * cStride;
176 auto weight = B + y * bStride;
177 float summer[4] = {
178 0.0f,
179 0.0f,
180 0.0f,
181 0.0f,
182 };
183 if (nullptr != bias) {
184 for (int v=0; v<4; ++v) {
185 summer[v] = bias[4 * y + v];
186 }
187 }
188 for (int z=0; z<l; ++z) {
189 auto aZ = src + z * aStride;
190 auto wZ = weight + z * 4;
191 summer[0] += wZ[0] * aZ[0];
192 summer[1] += wZ[1] * aZ[0];
193 summer[2] += wZ[2] * aZ[0];
194 summer[3] += wZ[3] * aZ[0];
195 }
196 for (int v=0; v<4; ++v) {
197 auto dstValue = std::min(summer[v], maxValue);
198 dstValue = std::max(dstValue, minValue);
199 dstY[v] = dstValue;
200 }
201 }
202 }
203 }
MNNPackedMatMul(float * C,const float * A,const float * B,const size_t * parameter,const float * postParameters,const float * bias)204 void MNNPackedMatMul(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias) {
205 return _MNNPackedMatMulRemain(C, A, B, 16, parameter, postParameters, bias, 16);
206 }
207
MNNPackedMatMulRemain(float * C,const float * A,const float * B,size_t eSize,const size_t * parameter,const float * postParameters,const float * bias)208 void MNNPackedMatMulRemain(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias) {
209 auto aStride = parameter[0] / sizeof(float);
210 _MNNPackedMatMulRemain(C, A, B, eSize, parameter, postParameters, bias, aStride);
211 }
212
213
MNNPackC4ForMatMul_A(float * destOrigin,float const ** sourceGroup,const int32_t * info,const int32_t * el)214 void MNNPackC4ForMatMul_A(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el) {
215 int number = info[0];
216 int eReal = info[1];
217 int eDest = info[2];
218 int offset = info[3];
219 for (int n=0; n<number; ++n) {
220 int e = el[4 * n + 0];
221 int l = el[4 * n + 1];
222 int eOffset = el[4 * n + 2];
223 int lOffset = el[4 * n + 3];
224 auto dest = destOrigin + lOffset * eDest + eOffset;
225 auto source = sourceGroup[n];
226
227 for (int y=0; y<e; ++y) {
228 auto yR = y % eDest;
229 for (int x=0; x<l; ++x) {
230 auto xR = x % 4;
231 auto xC = x / 4;
232 dest[(x) * eDest + yR] = source[xC * eReal * 4 + y * 4 * offset + xR];
233 }
234 }
235 }
236 }
237
MNNPackedSparseMatMulEpx1(float * C,const float * A,const float * B,size_t eSize,const size_t * parameter,const float * postParameters,const float * bias,unsigned int * NNZMap,int * dataOffsetMap)238 void MNNPackedSparseMatMulEpx1(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, unsigned int* NNZMap, int* dataOffsetMap) {
239
240 auto eP = parameter[0] / sizeof(float);
241 MNN_ASSERT((eP & 0x03) == 0); // In sparse calculate, eP should be evenly divided by 4
242 auto h = parameter[2];
243 auto l = parameter[1];
244 auto cStride = parameter[3] / sizeof(float);
245 auto aStride = eP * l;
246 auto hRemain = parameter[4];
247 auto bExtraStride = parameter[5] / sizeof(float);
248 auto bStride = bExtraStride + l * 4;
249 auto hC4 = UP_DIV(h, 4);
250 float minValue = -std::numeric_limits<float>().max();
251 float maxValue = std::numeric_limits<float>().max();
252 if (nullptr != postParameters) {
253 minValue = postParameters[2];
254 maxValue = postParameters[3];
255 }
256 // MNN_PRINT("MNNPackedSparseMatMul eP:%lu, eSize:%lu, l:%lu, h:%lu, cStride:%lu, aStride:%lu\n", eP, eSize, l, h, cStride, aStride);
257
258 const float* a = A;
259 size_t ie = 0;
260 for (ie = 0; ie < eSize && eP <= eSize; ie += eP) {
261 const int* dataOffset = dataOffsetMap;
262 const int diff = *dataOffset++;
263 a += diff;
264 const float* w = B;
265 float* blockC = C + (ie << 2);
266 const unsigned int* nnz = NNZMap;
267 for (auto ih = 0; ih < h; ih++) {
268 auto ihPack = ih >> 2;
269 auto ihSubIndex = ih & 0x03;
270 auto c = blockC + ihPack * cStride + ihSubIndex;
271 const float initValue = nullptr != bias ? bias[ih] : 0;
272 float acc0 = initValue;
273 float acc1 = initValue;
274 float acc2 = initValue;
275 float acc3 = initValue;
276 float acc4 = initValue;
277 float acc5 = initValue;
278 float acc6 = initValue;
279 float acc7 = initValue;
280 float acc8 = initValue;
281 float acc9 = initValue;
282 float acc10 = initValue;
283 float acc11 = initValue;
284 float acc12 = initValue;
285 float acc13 = initValue;
286 float acc14 = initValue;
287 float acc15 = initValue;
288 const int lElement = *nnz++;
289 for (auto il = 0; il < lElement; il++) {
290
291 const int diff = *dataOffset++;
292 const float a0 = a[0];
293 const float a1 = a[1];
294 const float a2 = a[2];
295 const float a3 = a[3];
296 const float a4 = a[4];
297 const float a5 = a[5];
298 const float a6 = a[6];
299 const float a7 = a[7];
300 const float a8 = a[8];
301 const float a9 = a[9];
302 const float a10 = a[10];
303 const float a11 = a[11];
304 const float a12 = a[12];
305 const float a13 = a[13];
306 const float a14 = a[14];
307 const float a15 = a[15];
308
309 const float oneW = *w++;
310
311 // MNN_PRINT("16-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-15]:", ie, a - A, w - B - 1, c - C, oneW);
312 // formatMatrix(a, {16});
313 // MNN_PRINT("\n");
314 a = a + diff;
315 acc0 += a0 * oneW;
316 acc1 += a1 * oneW;
317 acc2 += a2 * oneW;
318 acc3 += a3 * oneW;
319 acc4 += a4 * oneW;
320 acc5 += a5 * oneW;
321 acc6 += a6 * oneW;
322 acc7 += a7 * oneW;
323 acc8 += a8 * oneW;
324 acc9 += a9 * oneW;
325 acc10 += a10 * oneW;
326 acc11 += a11 * oneW;
327 acc12 += a12 * oneW;
328 acc13 += a13 * oneW;
329 acc14 += a14 * oneW;
330 acc15 += a15 * oneW;
331 }
332 acc0 = std::max(std::min(maxValue, acc0), minValue);
333 acc1 = std::max(std::min(maxValue, acc1), minValue);
334 acc2 = std::max(std::min(maxValue, acc2), minValue);
335 acc3 = std::max(std::min(maxValue, acc3), minValue);
336 acc4 = std::max(std::min(maxValue, acc4), minValue);
337 acc5 = std::max(std::min(maxValue, acc5), minValue);
338 acc6 = std::max(std::min(maxValue, acc6), minValue);
339 acc7 = std::max(std::min(maxValue, acc7), minValue);
340 acc8 = std::max(std::min(maxValue, acc8), minValue);
341 acc9 = std::max(std::min(maxValue, acc9), minValue);
342 acc10 = std::max(std::min(maxValue, acc10), minValue);
343 acc11 = std::max(std::min(maxValue, acc11), minValue);
344 acc12 = std::max(std::min(maxValue, acc12), minValue);
345 acc13 = std::max(std::min(maxValue, acc13), minValue);
346 acc14 = std::max(std::min(maxValue, acc14), minValue);
347 acc15 = std::max(std::min(maxValue, acc15), minValue);
348
349 // how to store faster: st4 / transpose /
350 c[0] = acc0;
351 c[4] = acc1;
352 c[4 * 2] = acc2;
353 c[4 * 3] = acc3;
354 c[4 * 4] = acc4;
355 c[4 * 5] = acc5;
356 c[4 * 6] = acc6;
357 c[4 * 7] = acc7;
358 c[4 * 8] = acc8;
359 c[4 * 9] = acc9;
360 c[4 * 10] = acc10;
361 c[4 * 11] = acc11;
362 c[4 * 12] = acc12;
363 c[4 * 13] = acc13;
364 c[4 * 14] = acc14;
365 c[4 * 15] = acc15;
366 }
367 a += aStride;
368 }
369 // const float* blockA = A + ie * l;
370 if (eSize & 0x08) {
371 const int* dataOffset = dataOffsetMap;
372 const int diff = *dataOffset++;
373 // a = blockA + diff;
374 a += diff;
375 const float* w = B;
376 float* blockC = C + (ie << 2);
377 const unsigned int* nnz = NNZMap;
378 for (auto ih = 0; ih < h; ih++) {
379 auto ihPack = ih >> 2;
380 auto ihSubIndex = ih & 0x03;
381 auto c = blockC + ihPack * cStride + ihSubIndex;
382 const float initValue = nullptr != bias ? bias[ih] : 0;
383 float acc0 = initValue;
384 float acc1 = initValue;
385 float acc2 = initValue;
386 float acc3 = initValue;
387 float acc4 = initValue;
388 float acc5 = initValue;
389 float acc6 = initValue;
390 float acc7 = initValue;
391
392 const int lElement = *nnz++;
393 for (auto il = 0; il < lElement; il++) {
394 const int diff = *dataOffset++;
395 const float a0 = a[0];
396 const float a1 = a[1];
397 const float a2 = a[2];
398 const float a3 = a[3];
399 const float a4 = a[4];
400 const float a5 = a[5];
401 const float a6 = a[6];
402 const float a7 = a[7];
403 const float oneW = *w++;
404 // MNN_PRINT("8-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-7]:", ie, a - A, w - B - 1, c - C, oneW);
405 // formatMatrix(a, {8});
406 // MNN_PRINT("\n");
407 a = a + diff;
408 acc0 += a0 * oneW;
409 acc1 += a1 * oneW;
410 acc2 += a2 * oneW;
411 acc3 += a3 * oneW;
412 acc4 += a4 * oneW;
413 acc5 += a5 * oneW;
414 acc6 += a6 * oneW;
415 acc7 += a7 * oneW;
416 }
417 acc0 = std::max(std::min(maxValue, acc0), minValue);
418 acc1 = std::max(std::min(maxValue, acc1), minValue);
419 acc2 = std::max(std::min(maxValue, acc2), minValue);
420 acc3 = std::max(std::min(maxValue, acc3), minValue);
421 acc4 = std::max(std::min(maxValue, acc4), minValue);
422 acc5 = std::max(std::min(maxValue, acc5), minValue);
423 acc6 = std::max(std::min(maxValue, acc6), minValue);
424 acc7 = std::max(std::min(maxValue, acc7), minValue);
425 // how to store faster: st4 / transpose /
426 c[0] = acc0;
427 c[4] = acc1;
428 c[4 * 2] = acc2;
429 c[4 * 3] = acc3;
430 c[4 * 4] = acc4;
431 c[4 * 5] = acc5;
432 c[4 * 6] = acc6;
433 c[4 * 7] = acc7;
434 }
435 ie += 8;
436 a += 8;
437 }
438
439 if (eSize & 0x04) {
440 const int* dataOffset = dataOffsetMap;
441 const int diff = *dataOffset++;
442 // const float* a = blockA + diff;
443 a += diff;
444 const float* w = B;
445 float* blockC = C + (ie << 2);
446 const unsigned int* nnz = NNZMap;
447 for (auto ih = 0; ih < h; ih++) {
448 auto ihPack = ih >> 2;
449 auto ihSubIndex = ih & 0x03;
450 auto c = blockC + ihPack * cStride + ihSubIndex;
451 const float initValue = nullptr != bias ? bias[ih] : 0;
452 float acc0 = initValue;
453 float acc1 = initValue;
454 float acc2 = initValue;
455 float acc3 = initValue;
456
457 const int lElement = *nnz++;
458 for (auto il = 0; il < lElement; il++) {
459 const int diff = *dataOffset++;
460 const float a0 = a[0];
461 const float a1 = a[1];
462 const float a2 = a[2];
463 const float a3 = a[3];
464 const float oneW = *w++;
465 // MNN_PRINT("4-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-3]:", ie, a - A, w - B - 1, c - C, oneW);
466 // formatMatrix(a, {4});
467 // MNN_PRINT("\n");
468 a = a + diff;
469 acc0 += a0 * oneW;
470 acc1 += a1 * oneW;
471 acc2 += a2 * oneW;
472 acc3 += a3 * oneW;
473 }
474 acc0 = std::max(std::min(maxValue, acc0), minValue);
475 acc1 = std::max(std::min(maxValue, acc1), minValue);
476 acc2 = std::max(std::min(maxValue, acc2), minValue);
477 acc3 = std::max(std::min(maxValue, acc3), minValue);
478 // how to store faster: st4 / transpose /
479 c[0] = acc0;
480 c[4] = acc1;
481 c[4 * 2] = acc2;
482 c[4 * 3] = acc3;
483 }
484 ie += 4;
485 a += 4;
486 }
487 if (eSize & 0x02) {
488 const int* dataOffset = dataOffsetMap;
489 const int diff = *dataOffset++;
490 // const float* a = blockA + diff;
491 a += diff;
492 const float* w = B;
493 float* blockC = C + (ie << 2);
494 const unsigned int* nnz = NNZMap;
495 for (auto ih = 0; ih < h; ih++) {
496 auto ihPack = ih >> 2;
497 auto ihSubIndex = ih & 0x03;
498 auto c = blockC + ihPack * cStride + ihSubIndex;
499 const float initValue = nullptr != bias ? bias[ih] : 0;
500 float acc0 = initValue;
501 float acc1 = initValue;
502
503 const int lElement = *nnz++;
504 for (auto il = 0; il < lElement; il++) {
505 const int diff = *dataOffset++;
506 const float a0 = a[0];
507 const float a1 = a[1];
508 const float oneW = *w++;
509 // MNN_PRINT("2-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-1]:", ie, a - A, w - B - 1, c - C, oneW);
510 // formatMatrix(a, {2});
511 // MNN_PRINT("\n");
512 a = a + diff;
513 acc0 += a0 * oneW;
514 acc1 += a1 * oneW;
515 }
516 acc0 = std::max(std::min(maxValue, acc0), minValue);
517 acc1 = std::max(std::min(maxValue, acc1), minValue);
518 // how to store faster: st4 / transpose /
519 c[0] = acc0;
520 c[4] = acc1;
521 }
522 ie += 2;
523 a += 2;
524 }
525 if (eSize & 0x01) {
526 const int* dataOffset = dataOffsetMap;
527 const int diff = *dataOffset++;
528 // const float* a = blockA + diff;
529 a += diff;
530 const float* w = B;
531 float* blockC = C + (ie << 2);
532 const unsigned int* nnz = NNZMap;
533 for (auto ih = 0; ih < h; ih++) {
534 auto ihPack = ih >> 2;
535 auto ihSubIndex = ih & 0x03;
536 auto c = blockC + ihPack * cStride + ihSubIndex;
537 const float initValue = nullptr != bias ? bias[ih] : 0;
538 float acc0 = initValue;
539
540 const int lElement = *nnz++;
541 for (auto il = 0; il < lElement; il++) {
542 const int diff = *dataOffset++;
543 const float a0 = a[0];
544 const float oneW = *w++;
545
546 // MNN_PRINT("1-loop: ie:%zu, a offset:%ld, c offset:%ld, w offset:%ld, w value:%f, a value[0]:", ie, a - A, w - B - 1, c - C, oneW);
547 // formatMatrix(a, {1});
548 // MNN_PRINT("\n");
549 a = a + diff;
550 acc0 += a0 * oneW;
551 }
552 acc0 = std::max(std::min(maxValue, acc0), minValue);
553 // how to store faster: st4 / transpose /
554 c[0] = acc0;
555 }
556 ie += 1;
557 // a += 1;
558 }
559
560 return;
561 }
562
MNNPackedSparseMatMulEpx4(float * C,const float * A,const float * B,size_t eSize,const size_t * parameter,const float * postParameters,const float * bias,unsigned int * NNZMap,int * dataOffsetMap)563 void MNNPackedSparseMatMulEpx4(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, unsigned int* NNZMap, int* dataOffsetMap) {
564
565 auto eP = parameter[0] / sizeof(float);
566 MNN_ASSERT((eP & 0x03) == 0); // In sparse calculate, eP should be evenly divided by 4
567 auto h = parameter[2];
568 auto l = parameter[1];
569 auto cStride = parameter[3] / sizeof(float);
570 auto aStride = eP * l;
571 auto hRemain = parameter[4];
572 auto bExtraStride = parameter[5] / sizeof(float);
573 auto bStride = bExtraStride + l * 4;
574 auto hC4 = UP_DIV(h, 4);
575 float minValue = -std::numeric_limits<float>().max();
576 float maxValue = std::numeric_limits<float>().max();
577 if (nullptr != postParameters) {
578 minValue = postParameters[2];
579 maxValue = postParameters[3];
580 }
581 // MNN_PRINT("MNNPackedSparseMatMul 16x4 eP:%lu, eSize:%lu, l:%lu, h:%lu, cStride:%lu, aStride:%lu\n", eP, eSize, l, h, cStride, aStride);
582 const int sparseBlockOC = 4;
583 const float* a = A;
584 size_t ie = 0;
585 for (ie = 0; ie < eSize && eP <= eSize; ie += eP) {
586 const int* dataOffset = dataOffsetMap;
587 const int diff = *dataOffset++;
588 a += diff;
589 const float* w = B;
590 float* blockC = C + (ie << 2);
591 const unsigned int* nnz = NNZMap;
592
593 size_t ih = 0;
594 for (; ih < (h & (~0x03)); ih += sparseBlockOC) {
595 auto ihPack = ih >> 2;
596 auto c = blockC + ihPack * cStride;
597
598 float initValue[4] = {0, 0, 0, 0};
599 if (nullptr != bias) {
600 memcpy(initValue, bias + ih, 4 * sizeof(float));
601 }
602 float acc0[4];
603 float acc1[4];
604 float acc2[4];
605 float acc3[4];
606 float acc4[4];
607 float acc5[4];
608 float acc6[4];
609 float acc7[4];
610 float acc8[4];
611 float acc9[4];
612 float acc10[4];
613 float acc11[4];
614 float acc12[4];
615 float acc13[4];
616 float acc14[4];
617 float acc15[4];
618 memcpy(acc0, initValue, 4 * sizeof(float));
619 memcpy(acc1, initValue, 4 * sizeof(float));
620 memcpy(acc2, initValue, 4 * sizeof(float));
621 memcpy(acc3, initValue, 4 * sizeof(float));
622 memcpy(acc4, initValue, 4 * sizeof(float));
623 memcpy(acc5, initValue, 4 * sizeof(float));
624 memcpy(acc6, initValue, 4 * sizeof(float));
625 memcpy(acc7, initValue, 4 * sizeof(float));
626 memcpy(acc8, initValue, 4 * sizeof(float));
627 memcpy(acc9, initValue, 4 * sizeof(float));
628 memcpy(acc10, initValue, 4 * sizeof(float));
629 memcpy(acc11, initValue, 4 * sizeof(float));
630 memcpy(acc12, initValue, 4 * sizeof(float));
631 memcpy(acc13, initValue, 4 * sizeof(float));
632 memcpy(acc14, initValue, 4 * sizeof(float));
633 memcpy(acc15, initValue, 4 * sizeof(float));
634
635 const int lElement = *nnz++;
636 for (auto il = 0; il < lElement; il++) {
637
638 const int diff = *dataOffset++;
639 const float a0 = a[0];
640 const float a1 = a[1];
641 const float a2 = a[2];
642 const float a3 = a[3];
643 const float a4 = a[4];
644 const float a5 = a[5];
645 const float a6 = a[6];
646 const float a7 = a[7];
647 const float a8 = a[8];
648 const float a9 = a[9];
649 const float a10 = a[10];
650 const float a11 = a[11];
651 const float a12 = a[12];
652 const float a13 = a[13];
653 const float a14 = a[14];
654 const float a15 = a[15];
655
656 const float wv[4] = {*w++, *w++, *w++, *w++};
657
658 // MNN_PRINT("16-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-15]:", ie, a - A, w - B - 1, c - C, oneW);
659 // formatMatrix(a, {16});
660 // MNN_PRINT("\n");
661 a = a + diff;
662 for (int lane = 0; lane < 4; lane++) {
663 acc0[lane] += a0 * wv[lane];
664 acc1[lane] += a1 * wv[lane];
665 acc2[lane] += a2 * wv[lane];
666 acc3[lane] += a3 * wv[lane];
667 acc4[lane] += a4 * wv[lane];
668 acc5[lane] += a5 * wv[lane];
669 acc6[lane] += a6 * wv[lane];
670 acc7[lane] += a7 * wv[lane];
671 acc8[lane] += a8 * wv[lane];
672 acc9[lane] += a9 * wv[lane];
673 acc10[lane] += a10 * wv[lane];
674 acc11[lane] += a11 * wv[lane];
675 acc12[lane] += a12 * wv[lane];
676 acc13[lane] += a13 * wv[lane];
677 acc14[lane] += a14 * wv[lane];
678 acc15[lane] += a15 * wv[lane];
679 }
680 }
681
682 for (int lane = 0; lane < 4; lane++) {
683 acc0[lane] = std::max(std::min(maxValue, acc0[lane]), minValue);
684 acc1[lane] = std::max(std::min(maxValue, acc1[lane]), minValue);
685 acc2[lane] = std::max(std::min(maxValue, acc2[lane]), minValue);
686 acc3[lane] = std::max(std::min(maxValue, acc3[lane]), minValue);
687 acc4[lane] = std::max(std::min(maxValue, acc4[lane]), minValue);
688 acc5[lane] = std::max(std::min(maxValue, acc5[lane]), minValue);
689 acc6[lane] = std::max(std::min(maxValue, acc6[lane]), minValue);
690 acc7[lane] = std::max(std::min(maxValue, acc7[lane]), minValue);
691 acc8[lane] = std::max(std::min(maxValue, acc8[lane]), minValue);
692 acc9[lane] = std::max(std::min(maxValue, acc9[lane]), minValue);
693 acc10[lane] = std::max(std::min(maxValue, acc10[lane]), minValue);
694 acc11[lane] = std::max(std::min(maxValue, acc11[lane]), minValue);
695 acc12[lane] = std::max(std::min(maxValue, acc12[lane]), minValue);
696 acc13[lane] = std::max(std::min(maxValue, acc13[lane]), minValue);
697 acc14[lane] = std::max(std::min(maxValue, acc14[lane]), minValue);
698 acc15[lane] = std::max(std::min(maxValue, acc15[lane]), minValue);
699 }
700
701 memcpy(c, acc0, 4 * sizeof(float)); // store continuous c
702 memcpy(c + 4, acc1, 4 * sizeof(float));
703 memcpy(c + 4 * 2, acc2, 4 * sizeof(float));
704 memcpy(c + 4 * 3, acc3, 4 * sizeof(float));
705 memcpy(c + 4 * 4, acc4, 4 * sizeof(float));
706 memcpy(c + 4 * 5, acc5, 4 * sizeof(float));
707 memcpy(c + 4 * 6, acc6, 4 * sizeof(float));
708 memcpy(c + 4 * 7, acc7, 4 * sizeof(float));
709 memcpy(c + 4 * 8, acc8, 4 * sizeof(float));
710 memcpy(c + 4 * 9, acc9, 4 * sizeof(float));
711 memcpy(c + 4 * 10, acc10, 4 * sizeof(float));
712 memcpy(c + 4 * 11, acc11, 4 * sizeof(float));
713 memcpy(c + 4 * 12, acc12, 4 * sizeof(float));
714 memcpy(c + 4 * 13, acc13, 4 * sizeof(float));
715 memcpy(c + 4 * 14, acc14, 4 * sizeof(float));
716 memcpy(c + 4 * 15, acc15, 4 * sizeof(float));
717 }
718
719 blockC += (h >> 2) * cStride;
720 for (; ih < h; ih++) {
721 auto ihSubIndex = ih & 0x03;
722 auto c = blockC + ihSubIndex;
723 const float initValue = nullptr != bias ? bias[ih] : 0;
724 float acc0 = initValue;
725 float acc1 = initValue;
726 float acc2 = initValue;
727 float acc3 = initValue;
728 float acc4 = initValue;
729 float acc5 = initValue;
730 float acc6 = initValue;
731 float acc7 = initValue;
732 float acc8 = initValue;
733 float acc9 = initValue;
734 float acc10 = initValue;
735 float acc11 = initValue;
736 float acc12 = initValue;
737 float acc13 = initValue;
738 float acc14 = initValue;
739 float acc15 = initValue;
740 const int lElement = *nnz++;
741 for (auto il = 0; il < lElement; il++) {
742
743 const int diff = *dataOffset++;
744 const float a0 = a[0];
745 const float a1 = a[1];
746 const float a2 = a[2];
747 const float a3 = a[3];
748 const float a4 = a[4];
749 const float a5 = a[5];
750 const float a6 = a[6];
751 const float a7 = a[7];
752 const float a8 = a[8];
753 const float a9 = a[9];
754 const float a10 = a[10];
755 const float a11 = a[11];
756 const float a12 = a[12];
757 const float a13 = a[13];
758 const float a14 = a[14];
759 const float a15 = a[15];
760
761 const float oneW = *w++;
762
763 // MNN_PRINT("16-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-15]:", ie, a - A, w - B - 1, c - C, oneW);
764 // formatMatrix(a, {16});
765 // MNN_PRINT("\n");
766 a = a + diff;
767 acc0 += a0 * oneW;
768 acc1 += a1 * oneW;
769 acc2 += a2 * oneW;
770 acc3 += a3 * oneW;
771 acc4 += a4 * oneW;
772 acc5 += a5 * oneW;
773 acc6 += a6 * oneW;
774 acc7 += a7 * oneW;
775 acc8 += a8 * oneW;
776 acc9 += a9 * oneW;
777 acc10 += a10 * oneW;
778 acc11 += a11 * oneW;
779 acc12 += a12 * oneW;
780 acc13 += a13 * oneW;
781 acc14 += a14 * oneW;
782 acc15 += a15 * oneW;
783 }
784 acc0 = std::max(std::min(maxValue, acc0), minValue);
785 acc1 = std::max(std::min(maxValue, acc1), minValue);
786 acc2 = std::max(std::min(maxValue, acc2), minValue);
787 acc3 = std::max(std::min(maxValue, acc3), minValue);
788 acc4 = std::max(std::min(maxValue, acc4), minValue);
789 acc5 = std::max(std::min(maxValue, acc5), minValue);
790 acc6 = std::max(std::min(maxValue, acc6), minValue);
791 acc7 = std::max(std::min(maxValue, acc7), minValue);
792 acc8 = std::max(std::min(maxValue, acc8), minValue);
793 acc9 = std::max(std::min(maxValue, acc9), minValue);
794 acc10 = std::max(std::min(maxValue, acc10), minValue);
795 acc11 = std::max(std::min(maxValue, acc11), minValue);
796 acc12 = std::max(std::min(maxValue, acc12), minValue);
797 acc13 = std::max(std::min(maxValue, acc13), minValue);
798 acc14 = std::max(std::min(maxValue, acc14), minValue);
799 acc15 = std::max(std::min(maxValue, acc15), minValue);
800
801 // how to store faster: st4 / transpose /
802 c[0] = acc0;
803 c[4] = acc1;
804 c[4 * 2] = acc2;
805 c[4 * 3] = acc3;
806 c[4 * 4] = acc4;
807 c[4 * 5] = acc5;
808 c[4 * 6] = acc6;
809 c[4 * 7] = acc7;
810 c[4 * 8] = acc8;
811 c[4 * 9] = acc9;
812 c[4 * 10] = acc10;
813 c[4 * 11] = acc11;
814 c[4 * 12] = acc12;
815 c[4 * 13] = acc13;
816 c[4 * 14] = acc14;
817 c[4 * 15] = acc15;
818 }
819 a += aStride;
820 }
821 // const float* blockA = A + ie * l;
822 if (eSize & 0x08) {
823 const int* dataOffset = dataOffsetMap;
824 const int diff = *dataOffset++;
825 // a = blockA + diff;
826 a += diff;
827 const float* w = B;
828 float* blockC = C + (ie << 2);
829 const unsigned int* nnz = NNZMap;
830
831 size_t ih = 0;
832 for (; ih < (h & (~0x03)); ih += sparseBlockOC) {
833 auto ihPack = ih >> 2;
834 auto c = blockC + ihPack * cStride;
835 float initValue[4] = {0, 0, 0, 0};
836 if (nullptr != bias) {
837 memcpy(initValue, bias + ih, 4 * sizeof(float));
838 }
839 float acc0[4];
840 float acc1[4];
841 float acc2[4];
842 float acc3[4];
843 float acc4[4];
844 float acc5[4];
845 float acc6[4];
846 float acc7[4];
847 memcpy(acc0, initValue, 4 * sizeof(float));
848 memcpy(acc1, initValue, 4 * sizeof(float));
849 memcpy(acc2, initValue, 4 * sizeof(float));
850 memcpy(acc3, initValue, 4 * sizeof(float));
851 memcpy(acc4, initValue, 4 * sizeof(float));
852 memcpy(acc5, initValue, 4 * sizeof(float));
853 memcpy(acc6, initValue, 4 * sizeof(float));
854 memcpy(acc7, initValue, 4 * sizeof(float));
855 const int lElement = *nnz++;
856 for (auto il = 0; il < lElement; il++) {
857
858 const int diff = *dataOffset++;
859 const float a0 = a[0];
860 const float a1 = a[1];
861 const float a2 = a[2];
862 const float a3 = a[3];
863 const float a4 = a[4];
864 const float a5 = a[5];
865 const float a6 = a[6];
866 const float a7 = a[7];
867 const float wv[4] = {*w++, *w++, *w++, *w++};
868 // MNN_PRINT("16-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-15]:", ie, a - A, w - B - 1, c - C, oneW);
869 // formatMatrix(a, {16});
870 // MNN_PRINT("\n");
871 a = a + diff;
872 for (int lane = 0; lane < 4; lane++) {
873 acc0[lane] += a0 * wv[lane];
874 acc1[lane] += a1 * wv[lane];
875 acc2[lane] += a2 * wv[lane];
876 acc3[lane] += a3 * wv[lane];
877 acc4[lane] += a4 * wv[lane];
878 acc5[lane] += a5 * wv[lane];
879 acc6[lane] += a6 * wv[lane];
880 acc7[lane] += a7 * wv[lane];
881 }
882 }
883
884 for (int lane = 0; lane < 4; lane++) {
885 acc0[lane] = std::max(std::min(maxValue, acc0[lane]), minValue);
886 acc1[lane] = std::max(std::min(maxValue, acc1[lane]), minValue);
887 acc2[lane] = std::max(std::min(maxValue, acc2[lane]), minValue);
888 acc3[lane] = std::max(std::min(maxValue, acc3[lane]), minValue);
889 acc4[lane] = std::max(std::min(maxValue, acc4[lane]), minValue);
890 acc5[lane] = std::max(std::min(maxValue, acc5[lane]), minValue);
891 acc6[lane] = std::max(std::min(maxValue, acc6[lane]), minValue);
892 acc7[lane] = std::max(std::min(maxValue, acc7[lane]), minValue);
893 }
894
895 memcpy(c, acc0, 4 * sizeof(float)); // store continuous c
896 memcpy(c + 4, acc1, 4 * sizeof(float));
897 memcpy(c + 4 * 2, acc2, 4 * sizeof(float));
898 memcpy(c + 4 * 3, acc3, 4 * sizeof(float));
899 memcpy(c + 4 * 4, acc4, 4 * sizeof(float));
900 memcpy(c + 4 * 5, acc5, 4 * sizeof(float));
901 memcpy(c + 4 * 6, acc6, 4 * sizeof(float));
902 memcpy(c + 4 * 7, acc7, 4 * sizeof(float));
903 }
904 blockC += (ih >> 2) * cStride;
905 for (; ih < h; ih++) {
906 auto ihSubIndex = ih & 0x03;
907 auto c = blockC + ihSubIndex;
908 const float initValue = nullptr != bias ? bias[ih] : 0;
909 float acc0 = initValue;
910 float acc1 = initValue;
911 float acc2 = initValue;
912 float acc3 = initValue;
913 float acc4 = initValue;
914 float acc5 = initValue;
915 float acc6 = initValue;
916 float acc7 = initValue;
917
918 const int lElement = *nnz++;
919 for (auto il = 0; il < lElement; il++) {
920 const int diff = *dataOffset++;
921 const float a0 = a[0];
922 const float a1 = a[1];
923 const float a2 = a[2];
924 const float a3 = a[3];
925 const float a4 = a[4];
926 const float a5 = a[5];
927 const float a6 = a[6];
928 const float a7 = a[7];
929 const float oneW = *w++;
930 // MNN_PRINT("8-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-7]:", ie, a - A, w - B - 1, c - C, oneW);
931 // formatMatrix(a, {8});
932 // MNN_PRINT("\n");
933 a = a + diff;
934 acc0 += a0 * oneW;
935 acc1 += a1 * oneW;
936 acc2 += a2 * oneW;
937 acc3 += a3 * oneW;
938 acc4 += a4 * oneW;
939 acc5 += a5 * oneW;
940 acc6 += a6 * oneW;
941 acc7 += a7 * oneW;
942 }
943 acc0 = std::max(std::min(maxValue, acc0), minValue);
944 acc1 = std::max(std::min(maxValue, acc1), minValue);
945 acc2 = std::max(std::min(maxValue, acc2), minValue);
946 acc3 = std::max(std::min(maxValue, acc3), minValue);
947 acc4 = std::max(std::min(maxValue, acc4), minValue);
948 acc5 = std::max(std::min(maxValue, acc5), minValue);
949 acc6 = std::max(std::min(maxValue, acc6), minValue);
950 acc7 = std::max(std::min(maxValue, acc7), minValue);
951 // how to store faster: st4 / transpose /
952 c[0] = acc0;
953 c[4] = acc1;
954 c[4 * 2] = acc2;
955 c[4 * 3] = acc3;
956 c[4 * 4] = acc4;
957 c[4 * 5] = acc5;
958 c[4 * 6] = acc6;
959 c[4 * 7] = acc7;
960 }
961 ie += 8;
962 a += 8;
963 }
964
965 if (eSize & 0x04) {
966 const int* dataOffset = dataOffsetMap;
967 const int diff = *dataOffset++;
968 // const float* a = blockA + diff;
969 a += diff;
970 const float* w = B;
971 float* blockC = C + (ie << 2);
972 const unsigned int* nnz = NNZMap;
973
974 size_t ih = 0;
975 for (; ih < (h & (~0x03)); ih += sparseBlockOC) {
976 auto ihPack = ih >> 2;
977 auto c = blockC + ihPack * cStride;
978 float initValue[4] = {0, 0, 0, 0};
979 if (nullptr != bias) {
980 memcpy(initValue, bias + ih, 4 * sizeof(float));
981 }
982 float acc0[4];
983 float acc1[4];
984 float acc2[4];
985 float acc3[4];
986 memcpy(acc0, initValue, 4 * sizeof(float));
987 memcpy(acc1, initValue, 4 * sizeof(float));
988 memcpy(acc2, initValue, 4 * sizeof(float));
989 memcpy(acc3, initValue, 4 * sizeof(float));
990
991 const int lElement = *nnz++;
992 for (auto il = 0; il < lElement; il++) {
993 const int diff = *dataOffset++;
994 const float a0 = a[0];
995 const float a1 = a[1];
996 const float a2 = a[2];
997 const float a3 = a[3];
998 const float wv[4] = {*w++, *w++, *w++, *w++};
999 // MNN_PRINT("16-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-15]:", ie, a - A, w - B - 1, c - C, oneW);
1000 // formatMatrix(a, {16});
1001 // MNN_PRINT("\n");
1002 a = a + diff;
1003 for (int lane = 0; lane < 4; lane++) {
1004 acc0[lane] += a0 * wv[lane];
1005 acc1[lane] += a1 * wv[lane];
1006 acc2[lane] += a2 * wv[lane];
1007 acc3[lane] += a3 * wv[lane];
1008 }
1009 }
1010
1011 for (int lane = 0; lane < 4; lane++) {
1012 acc0[lane] = std::max(std::min(maxValue, acc0[lane]), minValue);
1013 acc1[lane] = std::max(std::min(maxValue, acc1[lane]), minValue);
1014 acc2[lane] = std::max(std::min(maxValue, acc2[lane]), minValue);
1015 acc3[lane] = std::max(std::min(maxValue, acc3[lane]), minValue);
1016 }
1017
1018 memcpy(c, acc0, 4 * sizeof(float)); // store continuous c
1019 memcpy(c + 4, acc1, 4 * sizeof(float));
1020 memcpy(c + 4 * 2, acc2, 4 * sizeof(float));
1021 memcpy(c + 4 * 3, acc3, 4 * sizeof(float));
1022 }
1023 blockC += (ih >> 2) * cStride;
1024 for (; ih < h; ih++) {
1025 auto ihSubIndex = ih & 0x03;
1026 auto c = blockC + ihSubIndex;
1027 const float initValue = nullptr != bias ? bias[ih] : 0;
1028 float acc0 = initValue;
1029 float acc1 = initValue;
1030 float acc2 = initValue;
1031 float acc3 = initValue;
1032
1033 const int lElement = *nnz++;
1034 for (auto il = 0; il < lElement; il++) {
1035 const int diff = *dataOffset++;
1036 const float a0 = a[0];
1037 const float a1 = a[1];
1038 const float a2 = a[2];
1039 const float a3 = a[3];
1040 const float oneW = *w++;
1041 // MNN_PRINT("4-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-3]:", ie, a - A, w - B - 1, c - C, oneW);
1042 // formatMatrix(a, {4});
1043 // MNN_PRINT("\n");
1044 a = a + diff;
1045 acc0 += a0 * oneW;
1046 acc1 += a1 * oneW;
1047 acc2 += a2 * oneW;
1048 acc3 += a3 * oneW;
1049 }
1050 acc0 = std::max(std::min(maxValue, acc0), minValue);
1051 acc1 = std::max(std::min(maxValue, acc1), minValue);
1052 acc2 = std::max(std::min(maxValue, acc2), minValue);
1053 acc3 = std::max(std::min(maxValue, acc3), minValue);
1054 // how to store faster: st4 / transpose /
1055 c[0] = acc0;
1056 c[4] = acc1;
1057 c[4 * 2] = acc2;
1058 c[4 * 3] = acc3;
1059 }
1060 ie += 4;
1061 a += 4;
1062 }
1063 if (eSize & 0x02) {
1064 const int* dataOffset = dataOffsetMap;
1065 const int diff = *dataOffset++;
1066 // const float* a = blockA + diff;
1067 a += diff;
1068 const float* w = B;
1069 float* blockC = C + (ie << 2);
1070 const unsigned int* nnz = NNZMap;
1071
1072 size_t ih = 0;
1073 for (; ih < (h & (~0x03)); ih += sparseBlockOC) {
1074 auto ihPack = ih >> 2;
1075 auto c = blockC + ihPack * cStride;
1076 float initValue[4] = {0, 0, 0, 0};
1077 if (nullptr != bias) {
1078 memcpy(initValue, bias + ih, 4 * sizeof(float));
1079 }
1080 float acc0[4];
1081 float acc1[4];
1082 memcpy(acc0, initValue, 4 * sizeof(float));
1083 memcpy(acc1, initValue, 4 * sizeof(float));
1084 const int lElement = *nnz++;
1085 for (auto il = 0; il < lElement; il++) {
1086
1087 const int diff = *dataOffset++;
1088 const float a0 = a[0];
1089 const float a1 = a[1];
1090 const float wv[4] = {*w++, *w++, *w++, *w++};
1091 // MNN_PRINT("16-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-15]:", ie, a - A, w - B - 1, c - C, oneW);
1092 // formatMatrix(a, {16});
1093 // MNN_PRINT("\n");
1094 a = a + diff;
1095 for (int lane = 0; lane < 4; lane++) {
1096 acc0[lane] += a0 * wv[lane];
1097 acc1[lane] += a1 * wv[lane];
1098 }
1099 }
1100
1101 for (int lane = 0; lane < 4; lane++) {
1102 acc0[lane] = std::max(std::min(maxValue, acc0[lane]), minValue);
1103 acc1[lane] = std::max(std::min(maxValue, acc1[lane]), minValue);
1104 }
1105
1106 memcpy(c, acc0, 4 * sizeof(float)); // store continuous c
1107 memcpy(c + 4, acc1, 4 * sizeof(float));
1108 }
1109 blockC += (ih >> 2) * cStride;
1110 for (; ih < h; ih++) {
1111 auto ihPack = ih >> 2;
1112 auto ihSubIndex = ih & 0x03;
1113 auto c = blockC + ihSubIndex;
1114 const float initValue = nullptr != bias ? bias[ih] : 0;
1115 float acc0 = initValue;
1116 float acc1 = initValue;
1117
1118 const int lElement = *nnz++;
1119 for (auto il = 0; il < lElement; il++) {
1120 const int diff = *dataOffset++;
1121 const float a0 = a[0];
1122 const float a1 = a[1];
1123 const float oneW = *w++;
1124 // MNN_PRINT("2-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-1]:", ie, a - A, w - B - 1, c - C, oneW);
1125 // formatMatrix(a, {2});
1126 // MNN_PRINT("\n");
1127 a = a + diff;
1128 acc0 += a0 * oneW;
1129 acc1 += a1 * oneW;
1130 }
1131 acc0 = std::max(std::min(maxValue, acc0), minValue);
1132 acc1 = std::max(std::min(maxValue, acc1), minValue);
1133 // how to store faster: st4 / transpose /
1134 c[0] = acc0;
1135 c[4] = acc1;
1136 }
1137 ie += 2;
1138 a += 2;
1139 }
1140 if (eSize & 0x01) {
1141 const int* dataOffset = dataOffsetMap;
1142 const int diff = *dataOffset++;
1143 // const float* a = blockA + diff;
1144 a += diff;
1145 const float* w = B;
1146 float* blockC = C + (ie << 2);
1147 const unsigned int* nnz = NNZMap;
1148
1149 size_t ih = 0;
1150 for (; ih < (h & (~0x03)); ih += sparseBlockOC) {
1151 auto ihPack = ih >> 2;
1152 auto c = blockC + ihPack * cStride;
1153 float initValue[4] = {0, 0, 0, 0};
1154 if (nullptr != bias) {
1155 memcpy(initValue, bias + ih, 4 * sizeof(float));
1156 }
1157 float acc0[4];
1158 memcpy(acc0, initValue, 4 * sizeof(float));
1159 const int lElement = *nnz++;
1160 for (auto il = 0; il < lElement; il++) {
1161
1162 const int diff = *dataOffset++;
1163 const float a0 = a[0];
1164 const float wv[4] = {*w++, *w++, *w++, *w++};
1165 // MNN_PRINT("16-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-15]:", ie, a - A, w - B - 1, c - C, oneW);
1166 // formatMatrix(a, {16});
1167 // MNN_PRINT("\n");
1168 a = a + diff;
1169 for (int lane = 0; lane < 4; lane++) {
1170 acc0[lane] += a0 * wv[lane];
1171 }
1172 }
1173
1174 for (int lane = 0; lane < 4; lane++) {
1175 acc0[lane] = std::max(std::min(maxValue, acc0[lane]), minValue);
1176 }
1177 memcpy(c, acc0, 4 * sizeof(float)); // store continuous c
1178 }
1179 blockC += (ih >> 2) * cStride;
1180 for (; ih < h; ih++) {
1181 auto ihSubIndex = ih & 0x03;
1182 auto c = blockC + ihSubIndex;
1183 const float initValue = nullptr != bias ? bias[ih] : 0;
1184 float acc0 = initValue;
1185
1186 const int lElement = *nnz++;
1187 for (auto il = 0; il < lElement; il++) {
1188 const int diff = *dataOffset++;
1189 const float a0 = a[0];
1190 const float oneW = *w++;
1191
1192 // MNN_PRINT("1-loop: ie:%zu, a offset:%ld, c offset:%ld, w offset:%ld, w value:%f, a value[0]:", ie, a - A, w - B - 1, c - C, oneW);
1193 // formatMatrix(a, {1});
1194 // MNN_PRINT("\n");
1195 a = a + diff;
1196 acc0 += a0 * oneW;
1197 }
1198 acc0 = std::max(std::min(maxValue, acc0), minValue);
1199 // how to store faster: st4 / transpose /
1200 c[0] = acc0;
1201 }
1202 ie += 1;
1203 // a += 1;
1204 }
1205
1206 return;
1207 }
1208
1209 #endif
1210
1211 #ifndef MNN_USE_SSE
1212 #ifndef MNN_USE_NEON
MNNTranspose32Bit(int32_t * dstO,const int32_t * srcO,int32_t * dim)1213 void MNNTranspose32Bit(int32_t* dstO, const int32_t* srcO, int32_t* dim) {
1214 int w = dim[0];
1215 int h = dim[1];
1216 int srcStride = dim[2];
1217 int dstStride = dim[3];
1218 for (int i=0; i<h; ++i) {
1219 auto si = srcO + i;
1220 auto di = dstO + i * dstStride;
1221 for (int j=0; j<w; ++j) {
1222 auto sj = si + j * srcStride;
1223 auto dj = di + j;
1224 *dj = *sj;
1225 }
1226 }
1227 }
1228 #endif
MNNFunctionInit()1229 void MNNFunctionInit() {
1230 // Do nothing
1231 }
1232 #endif
1233
1234 #ifdef MNN_USE_NEON
1235 #include <arm_neon.h>
1236 #endif
1237
1238 #define UNIT 4
1239 using Vec4 = MNN::Math::Vec<float, 4>;
1240
1241 #ifndef MNN_USE_NEON
1242
1243 #ifndef MNN_USE_SSE
1244
MNNCopyC4WithStride(const float * source,float * dest,size_t srcStride,size_t dstStride,size_t count)1245 void MNNCopyC4WithStride(const float* source, float* dest, size_t srcStride, size_t dstStride, size_t count) {
1246 for (int i = 0; i < count; ++i) {
1247 auto s = source + i * srcStride;
1248 auto d = dest + i * dstStride;
1249 for (int j = 0; j < 4; ++j) {
1250 d[j] = s[j];
1251 }
1252 }
1253 }
1254
MNNAddC4WithStride(const float * source,float * dest,size_t srcStride,size_t dstStride,size_t count)1255 void MNNAddC4WithStride(const float* source, float* dest, size_t srcStride, size_t dstStride, size_t count) {
1256 for (int i = 0; i < count; ++i) {
1257 auto s = source + i * srcStride;
1258 auto d = dest + i * dstStride;
1259 for (int j = 0; j < 4; ++j) {
1260 d[j] += s[j];
1261 }
1262 }
1263 }
1264
MNNReluWithSlopeChannel(float * dst,const float * src,const float * slope,size_t sizeQuad,size_t depthQuad)1265 void MNNReluWithSlopeChannel(float* dst, const float* src, const float* slope, size_t sizeQuad, size_t depthQuad) {
1266 for (int j = 0; j < depthQuad; j++) {
1267 const float* slopeZ = slope + 4 * j;
1268 const float* srcZ = src + 4 * j * sizeQuad;
1269 float* dstZ = dst + 4 * j * sizeQuad;
1270 for (int i = 0; i < sizeQuad; i++) {
1271 for (int c = 0; c < 4; c++) {
1272 if (srcZ[4 * i + c] < 0) {
1273 dstZ[4 * i + c] = srcZ[4 * i + c] * slopeZ[c];
1274 } else {
1275 dstZ[4 * i + c] = srcZ[4 * i + c];
1276 }
1277 }
1278 }
1279 }
1280 }
1281
MNNPackC4(float * dst,const float * src,size_t area,size_t depth)1282 void MNNPackC4(float* dst, const float* src, size_t area, size_t depth) {
1283 int depthC4 = depth / 4;
1284 int depthRemain = depthC4 * 4;
1285 int remain = depth - depthRemain;
1286 int z, x, y;
1287 const float* srcChannel[4];
1288 const float* srcOffset = src;
1289 for(z = 0; z < depthC4; ++z) {
1290 for(y = 0; y < 4; ++y) {
1291 srcChannel[y] = srcOffset + area * y;
1292 }
1293 for(x = 0; x < area; ++x) {
1294 for(y = 0; y < 4; ++y) {
1295 dst[0] = srcChannel[y][0];
1296 srcChannel[y]++;
1297 dst++;
1298 }
1299 }
1300 srcOffset += area * 4;
1301 }
1302 if(remain > 0){
1303 for(y = 0; y < remain; ++y) {
1304 srcChannel[y] = srcOffset + area * y;
1305 }
1306 for(x = 0; x < area; ++x) {
1307 for(y = 0; y < remain; ++y) {
1308 dst[0] = srcChannel[y][0];
1309 srcChannel[y]++;
1310 dst++;
1311 }
1312 for(y = remain; y < 4; ++y) {
1313 dst[0] = 0;
1314 dst++;
1315 }
1316 }
1317 }
1318 }
1319
MNNUnpackC4(float * dst,const float * src,size_t area,size_t depth)1320 void MNNUnpackC4(float* dst, const float* src, size_t area, size_t depth) {
1321 int depthC4 = depth / 4;
1322 int depthRemain = depthC4 * 4;
1323 int remain = depth - depthRemain;
1324 int z, x, y;
1325 const float* srcChannel[4];
1326 const float* srcOffset = src;
1327 for(z = 0; z < depthC4; ++z) {
1328 for(y = 0; y < 4; ++y) {
1329 srcChannel[y] = srcOffset + y;
1330 for(x = 0; x < area; ++x) {
1331 dst[0] = srcChannel[y][0];
1332 srcChannel[y] += 4;
1333 dst++;
1334 }
1335 }
1336 srcOffset += area * 4;
1337 }
1338 if(remain > 0){
1339 for(y = 0; y < remain; ++y) {
1340 srcChannel[y] = srcOffset + y;
1341 for(x = 0; x < area; ++x) {
1342 dst[0] = srcChannel[y][0];
1343 srcChannel[y] += 4;
1344 dst++;
1345 }
1346 }
1347 }
1348 }
1349
MNNExpC8(float * dest,const float * source,const float * parameters,size_t countC8)1350 void MNNExpC8(float* dest, const float* source, const float* parameters, size_t countC8) {
1351 auto count = countC8 * 8;
1352 auto param = parameters[0];
1353 float xLimit = 87;
1354 for (int i = 0; i < count; ++i) {
1355 auto x = -source[i];
1356 x = ALIMAX(x, -xLimit);
1357 x = ALIMIN(x, xLimit);
1358 int div = (x * parameters[1]);
1359 int div2 = (div + 127) << 23;
1360 auto xReamin = x - div * param;
1361 float expBasic = *(float*)(&div2);
1362 auto t = xReamin;
1363 auto expRemain =
1364 ((((parameters[7] * t + parameters[6]) * t + parameters[5]) * t + parameters[4]) * t + parameters[3]) * t +
1365 parameters[2];
1366 dest[i] = expBasic * expRemain;
1367 }
1368 }
1369
MNNReluInt8(int8_t * dst,const int8_t * src,size_t size)1370 void MNNReluInt8(int8_t* dst, const int8_t* src, size_t size) {
1371 int i;
1372 for (i = 0; i < size; ++i) {
1373 if (src[i] < 0) {
1374 dst[i] = 0;
1375 } else {
1376 dst[i] = src[i];
1377 }
1378 }
1379 }
1380 #endif // no MNN_USE_SSE
1381
MNNMaxFloat(float * input,float * maxBuffer,int32_t inputCountUnit)1382 void MNNMaxFloat(float* input, float* maxBuffer, int32_t inputCountUnit) {
1383 for (int i = 0; i < inputCountUnit; i++) {
1384 for (int j = 0; j < UNIT; j++) {
1385 for (int m = 0; m < 2; m++) {
1386 maxBuffer[j] = std::max(input[i * UNIT * 2 + j * 2 + m], maxBuffer[j]);
1387 }
1388 }
1389 }
1390 }
MNNMinFloat(float * input,float * minBuffer,int32_t inputCountUnit)1391 void MNNMinFloat(float* input, float* minBuffer, int32_t inputCountUnit) {
1392 for (int i = 0; i < inputCountUnit; i++) {
1393 for (int j = 0; j < UNIT; j++) {
1394 for (int m = 0; m < 2; m++) {
1395 minBuffer[j] = std::min(input[i * UNIT * 2 + j * 2 + m], minBuffer[j]);
1396 }
1397 }
1398 }
1399 }
MNNScaleAndAddBias(float * dst,const float * src,const float * bias,const float * alpha,size_t planeNumber,size_t biasNumber)1400 void MNNScaleAndAddBias(float* dst, const float* src, const float* bias, const float* alpha, size_t planeNumber,
1401 size_t biasNumber) {
1402 for (int z = 0; z < biasNumber; ++z) {
1403 float* dstZ = dst + planeNumber * 4 * z;
1404 const float* srcZ = src + planeNumber * 4 * z;
1405 auto biasZ = Vec4::load(bias + 4 * z);
1406 auto alphaZ = Vec4::load(alpha + 4 * z);
1407 for (int p = 0; p < planeNumber; ++p) {
1408 float* dstX = dstZ + 4 * p;
1409 const float* srcX = srcZ + 4 * p;
1410 Vec4::save(dstX, (Vec4::load(srcX) * alphaZ) + biasZ);
1411 }
1412 }
1413 }
1414
1415
1416
MNNUInt8ToInt16WithOffsetC4Common(int16_t * dst,const uint8_t * src,size_t zeroPoint,size_t sizeQuad,size_t dstStride,size_t srcStride)1417 void MNNUInt8ToInt16WithOffsetC4Common(int16_t* dst, const uint8_t* src, size_t zeroPoint, size_t sizeQuad,
1418 size_t dstStride, size_t srcStride) {
1419 dstStride /= sizeof(int16_t);
1420 srcStride /= sizeof(uint8_t);
1421 for (int z = 0; z < sizeQuad; ++z) {
1422 auto dstZ = dst + dstStride * z;
1423 auto srcZ = src + srcStride * z;
1424 for (int j = 0; j < 4; ++j) {
1425 dstZ[j] = (int16_t)((int32_t)srcZ[j] - (int32_t)zeroPoint);
1426 }
1427 }
1428 }
1429
MNNUInt8ToInt16WithOffsetC4Fast(int16_t * colAddr,const uint8_t * srcStart,size_t zeroPoint,size_t sizeQuad,size_t depthQuad,size_t dstZStep,size_t srcZStep)1430 void MNNUInt8ToInt16WithOffsetC4Fast(int16_t* colAddr, const uint8_t* srcStart, size_t zeroPoint, size_t sizeQuad,
1431 size_t depthQuad, size_t dstZStep, size_t srcZStep) {
1432 dstZStep /= sizeof(int16_t);
1433 srcZStep /= sizeof(uint8_t);
1434 for (int sz = 0; sz < depthQuad; ++sz) {
1435 auto dstZ = colAddr + sz * dstZStep;
1436 auto srcZ = srcStart + sz * srcZStep;
1437 MNNUInt8ToInt16WithOffsetC4Common(dstZ, srcZ, zeroPoint, sizeQuad, 4 * sizeof(int16_t), 4 * sizeof(uint8_t));
1438 }
1439 }
1440
MNNPowC8(float * dest,const float * source,const float * powfParam,size_t betaInt,size_t countC8)1441 void MNNPowC8(float* dest, const float* source, const float* powfParam, size_t betaInt, size_t countC8) {
1442 const int count = countC8 * 8;
1443 const float powfConstant = powfParam[6];
1444 for (int i = 0; i < count; ++i) {
1445 float result = 1, x, xInv = 1 / source[i];
1446 for (int j = 0; j < betaInt; result *= xInv, ++j)
1447 ;
1448 for (x = source[i]; x >= 1.25; x /= 1.5, result *= powfConstant)
1449 ;
1450 float t = x - 1;
1451 float powRemain =
1452 powfParam[0] +
1453 t * (powfParam[1] + t * (powfParam[2] + t * (powfParam[3] + t * (powfParam[4] + t * powfParam[5]))));
1454 result *= powRemain;
1455 dest[i] = result;
1456 }
1457 }
1458
1459
1460 #endif // no MNN_USE_NEON
1461
1462
MNNPackC4Uint8(uint8_t * dst,const uint8_t * src,size_t area,size_t depth)1463 void MNNPackC4Uint8(uint8_t* dst, const uint8_t* src, size_t area, size_t depth) {
1464 int z, x;
1465 int cur = 0;
1466 memset(dst, 0, area * UP_DIV(depth, 4) * 4 * sizeof(uint8_t));
1467 for (z = 0; z < depth; ++z) {
1468 int plane = z / 4;
1469 uint8_t* dstPlane = plane * area * 4 + dst;
1470 int offset = z % 4;
1471 for (x = 0; x < area; ++x) {
1472 dstPlane[4 * x + offset] = src[cur++];
1473 }
1474 }
1475 }
1476
MNNUnpackC4Uint8(uint8_t * dst,const uint8_t * src,size_t area,size_t depth)1477 void MNNUnpackC4Uint8(uint8_t* dst, const uint8_t* src, size_t area, size_t depth) {
1478 int x;
1479 int z;
1480 int cur = 0;
1481 for (z = 0; z < depth; ++z) {
1482 int plane = z / 4;
1483 const uint8_t* srcPlane = plane * area * 4 + src;
1484 int offset = z % 4;
1485 for (x = 0; x < area; ++x) {
1486 dst[cur++] = srcPlane[4 * x + offset];
1487 }
1488 }
1489 }
1490
MNNUnpackTransposeUint8(uint8_t * dst,const uint8_t * src,size_t area,size_t depth)1491 void MNNUnpackTransposeUint8(uint8_t* dst, const uint8_t* src, size_t area, size_t depth) {
1492 if (depth == 4) {
1493 ::memcpy(dst, src, area * depth * sizeof(uint8_t));
1494 return;
1495 }
1496 #ifdef MNN_USE_NEON
1497 if (depth == 3) {
1498 uint8x16x4_t rgba;
1499 rgba.val[3] = vdupq_n_u8(0);
1500 int sta = 0;
1501 int staC16 = (int)area / 16;
1502 for (int i = 0; i < staC16; sta += 16, ++i) {
1503 auto rgb = vld3q_u8(src + sta * 3);
1504 rgba.val[0] = rgb.val[0];
1505 rgba.val[1] = rgb.val[1];
1506 rgba.val[2] = rgb.val[2];
1507 vst4q_u8(dst + 4 * sta, rgba);
1508 }
1509 sta = staC16 * 16;
1510
1511 for (; sta < area; ++sta) {
1512 auto s = src + sta * 3;
1513 auto d = dst + sta * 4;
1514 d[0] = s[0];
1515 d[1] = s[1];
1516 d[2] = s[2];
1517 d[3] = 0;
1518 }
1519
1520 return;
1521 }
1522 if (depth == 1) {
1523 uint8x16x4_t rgba;
1524 rgba.val[1] = vdupq_n_u8(0);
1525 rgba.val[2] = vdupq_n_u8(0);
1526 rgba.val[3] = vdupq_n_u8(0);
1527 int sta = 0;
1528 for (; sta < area; sta += 16) {
1529 rgba.val[0] = vld1q_u8(src + sta);
1530 vst4q_u8(dst + 4 * sta, rgba);
1531 }
1532
1533 for (; sta < area; ++sta) {
1534 auto s = src + sta;
1535 auto d = dst + sta * 4;
1536 d[0] = s[0];
1537 d[1] = 0;
1538 d[2] = 0;
1539 d[3] = 0;
1540 }
1541
1542 return;
1543 }
1544 #endif
1545 int c = (int)depth;
1546 int cDiv4 = c / 4;
1547 int cAlign = cDiv4 * 4;
1548 for (int hi = 0; hi < area; ++hi) {
1549 auto srcHeight = (src + hi * c);
1550 auto dstHeight = (dst + hi * 4);
1551 for (int ci = 0; ci < cDiv4; ++ci) {
1552 for (int i = 0; i < 4; ++i) {
1553 dstHeight[ci * area * 4 + i] = srcHeight[4 * ci + i];
1554 }
1555 }
1556 }
1557
1558 if (cAlign == c) {
1559 return;
1560 }
1561
1562 int cReamin = c - cAlign;
1563 auto srcAlign = src + cAlign;
1564 auto dstAlign = dst + area * cAlign;
1565
1566 for (int hi = 0; hi < area; ++hi) {
1567 auto srcHeight = srcAlign + hi * c;
1568 auto dstHeight = dstAlign + hi * 4;
1569 for (int i = 0; i < 4; ++i) {
1570 dstHeight[i] = 0;
1571 }
1572 for (int ci = 0; ci < cReamin; ++ci) {
1573 dstHeight[ci] = srcHeight[ci];
1574 }
1575 }
1576 }
1577
MNNUnpackTranspose(float * dst,const float * src,size_t area,size_t depth)1578 void MNNUnpackTranspose(float* dst, const float* src, size_t area, size_t depth) {
1579 #ifdef MNN_USE_NEON
1580 if (1 == depth) {
1581 auto zeroValue = vmovq_n_f32(0.0f);
1582 int areaC4 = (int)area / 4;
1583 int remain = areaC4 * 4;
1584 for (int i = 0; i < areaC4; ++i) {
1585 auto srcCur = src + 4 * i;
1586 auto dstCur = dst + 16 * i;
1587 auto srcValue = vld1q_f32(srcCur);
1588 float32x4x4_t dstValue;
1589 dstValue.val[0] = srcValue;
1590 dstValue.val[1] = zeroValue;
1591 dstValue.val[2] = zeroValue;
1592 dstValue.val[3] = zeroValue;
1593 vst4q_f32(dstCur, dstValue);
1594 }
1595 for (int i = remain; i < area; ++i) {
1596 dst[4 * i + 0] = src[i];
1597 dst[4 * i + 1] = 0.0f;
1598 dst[4 * i + 2] = 0.0f;
1599 dst[4 * i + 3] = 0.0f;
1600 }
1601 return;
1602 }
1603 if (3 == depth) {
1604 auto zeroValue = vmovq_n_f32(0.0f);
1605 int areaC4 = (int)area / 4;
1606 int remain = areaC4 * 4;
1607 for (int i = 0; i < areaC4; ++i) {
1608 auto srcCur = src + 12 * i;
1609 auto dstCur = dst + 16 * i;
1610 auto srcValue = vld3q_f32(srcCur);
1611 float32x4x4_t dstValue;
1612 dstValue.val[0] = srcValue.val[0];
1613 dstValue.val[1] = srcValue.val[1];
1614 dstValue.val[2] = srcValue.val[2];
1615 dstValue.val[3] = zeroValue;
1616 vst4q_f32(dstCur, dstValue);
1617 }
1618 for (int i = remain; i < area; ++i) {
1619 dst[4 * i + 0] = src[3 * i + 0];
1620 dst[4 * i + 1] = src[3 * i + 1];
1621 dst[4 * i + 2] = src[3 * i + 2];
1622 dst[4 * i + 3] = 0.0f;
1623 }
1624 return;
1625 }
1626 #endif
1627 int c = (int)depth;
1628 int cDiv4 = c / 4;
1629 int cAlign = cDiv4 * 4;
1630 for (int hi = 0; hi < area; ++hi) {
1631 const float* srcHeight = src + hi * c;
1632 float* dstHeight = dst + hi * 4;
1633 for (int ci = 0; ci < cDiv4; ++ci) {
1634 Vec4::save(dstHeight + 4 * ci * area, Vec4::load(srcHeight + 4 * ci));
1635 }
1636 }
1637
1638 if (cAlign == c) {
1639 return;
1640 }
1641
1642 int cReamin = c - cAlign;
1643 auto srcAlign = src + cAlign;
1644 auto dstAlign = dst + area * cAlign;
1645
1646 #ifdef MNN_USE_NEON
1647 auto zeroVector = vdupq_n_f32(0.0f);
1648 #endif
1649
1650 for (int hi = 0; hi < area; ++hi) {
1651 const float* srcHeight = srcAlign + hi * c;
1652 float* dstHeight = dstAlign + hi * 4;
1653 #ifdef MNN_USE_NEON
1654 vst1q_f32(dstHeight, zeroVector);
1655 #else
1656 for (int i = 0; i < 4; ++i) {
1657 dstHeight[i] = 0;
1658 }
1659 #endif
1660 for (int ci = 0; ci < cReamin; ++ci) {
1661 dstHeight[ci] = srcHeight[ci];
1662 }
1663 }
1664 }
1665
MNNPackTransposeUint8(uint8_t * dst,const uint8_t * src,size_t area,size_t depth)1666 void MNNPackTransposeUint8(uint8_t* dst, const uint8_t* src, size_t area, size_t depth) {
1667 if (1 == area) {
1668 ::memcpy(dst, src, depth * sizeof(uint8_t));
1669 return;
1670 }
1671 int c = (int)depth;
1672 int cDiv4 = c / 4;
1673 int cAlign = cDiv4 * 4;
1674 if (cAlign == c) {
1675 int32_t* dst32 = (int32_t*)dst;
1676 const int32_t* src32 = (int32_t*)src;
1677 for (int hi = 0; hi < area; ++hi) {
1678 auto srcHeight = src32 + hi;
1679 auto dstHeight = dst32 + hi * cDiv4;
1680 for (int ci = 0; ci < cDiv4; ++ci) {
1681 dstHeight[ci] = srcHeight[ci * area];
1682 }
1683 }
1684 return;
1685 }
1686
1687 for (int hi = 0; hi < area; ++hi) {
1688 auto srcHeight = src + hi * 4;
1689 auto dstHeight = dst + hi * c;
1690 for (int ci = 0; ci < cDiv4; ++ci) {
1691 for (int i = 0; i < 4; ++i) {
1692 dstHeight[ci * 4 + i] = srcHeight[4 * ci * area + i];
1693 }
1694 }
1695 }
1696
1697 int cReamin = c - cAlign;
1698 auto srcAlign = src + area * cAlign;
1699 auto dstAlign = dst + cAlign;
1700
1701 for (int hi = 0; hi < area; ++hi) {
1702 auto srcHeight = srcAlign + hi * 4;
1703 auto dstHeight = dstAlign + hi * c;
1704
1705 for (int ci = 0; ci < cReamin; ++ci) {
1706 dstHeight[ci] = srcHeight[ci];
1707 }
1708 }
1709 }
1710
MNNPackTranspose(float * dst,const float * src,size_t area,size_t depth)1711 void MNNPackTranspose(float* dst, const float* src, size_t area, size_t depth) {
1712 int c = (int)depth;
1713 int cDiv4 = c / 4;
1714 int cAlign = cDiv4 * 4;
1715 for (int hi = 0; hi < area; ++hi) {
1716 const float* srcHeight = src + hi * 4;
1717 float* dstHeight = dst + hi * c;
1718 for (int ci = 0; ci < cDiv4; ++ci) {
1719 Vec4::save(dstHeight + 4 * ci, Vec4::load(srcHeight + 4 * ci * area));
1720 }
1721 }
1722
1723 if (cAlign == c) {
1724 return;
1725 }
1726
1727 int cReamin = c - cAlign;
1728 auto srcAlign = src + area * cAlign;
1729 auto dstAlign = dst + cAlign;
1730
1731 for (int hi = 0; hi < area; ++hi) {
1732 const float* srcHeight = srcAlign + hi * 4;
1733 float* dstHeight = dstAlign + hi * c;
1734
1735 for (int ci = 0; ci < cReamin; ++ci) {
1736 dstHeight[ci] = srcHeight[ci];
1737 }
1738 }
1739 }
1740
MNNExp(float * dst,const float * src,size_t dataSize)1741 void MNNExp(float* dst, const float* src, size_t dataSize) {
1742 int countC8 = (int)dataSize / 8;
1743 if (countC8 > 0) {
1744 // Align to eight so asm is easier to write
1745 static float parameters[] = {
1746 (float)log(2.0f), 1.0f / (float)log(2.0f), 1.0f, 1.0f, 0.5f, 1.0f / 6.0f, 1.0f / 24.0f, 1.0f / 120.0f};
1747 MNNExpC8(dst, src, parameters, countC8);
1748 }
1749 int remain = countC8 * 8;
1750 auto param = log(2.0f);
1751 float xLimit = 87;
1752 for (int i = remain; i < dataSize; i++) {
1753 /*Origin Function*/
1754 //dst[i] = expf(-src[i]);
1755 /*Approciate Function*/
1756
1757 auto x = -src[i];
1758 x = ALIMAX(x, -xLimit);
1759 x = ALIMIN(x, xLimit);
1760
1761 int div = (x / param);
1762 int div2 = (div + 127) << 23;
1763 auto xReamin = x - div * param;
1764 float expBasic = *(float*)(&div2);
1765
1766 auto t = xReamin;
1767 auto expRemain = ((((1.0f / 120 * t + 1.0f / 24) * t + 1.0f / 6) * t + 0.5f) * t + 1.0f) * t + 1.0f;
1768 dst[i] = expBasic * expRemain;
1769 }
1770 }
1771
1772 // Lambert's series with 7 divisions
1773 // reference from
1774 // https://varietyofsound.wordpress.com/2011/02/14/efficient-tanh-computation-using-lamberts-continued-fraction/
tanhf_poly(float value)1775 inline float tanhf_poly(float value) {
1776 if (value > 5.0) {
1777 return 1.0;
1778 } else if (value <= -5.0) {
1779 return -1.0;
1780 } else {
1781 float x2 = value * value;
1782 float a = value * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
1783 float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
1784 return a / b;
1785 }
1786 }
MNNTanh(float * dst,const float * src,size_t dataSize)1787 void MNNTanh(float* dst, const float* src, size_t dataSize) {
1788 /* Origin Code
1789 for (int i = 0; i < dataSize; i++) {
1790 // outputData[i] = 1 - 2 / (expf(2 * inputData[i]) + 1);
1791 dst[i] = tanhf_poly(src[i]);
1792 }
1793 */
1794 for (int i = 0; i < dataSize; ++i) {
1795 dst[i] = src[i] + src[i];
1796 }
1797 MNNExp(dst, dst, dataSize);
1798 for (int i = 0; i < dataSize; i++) {
1799 // outputData[i] = 1 - 2 / (expf(2 * inputData[i]) + 1);
1800 auto expX2 = dst[i];
1801 dst[i] = (1.0f - expX2) / (1.0f + expX2);
1802 }
1803 }
1804
MNNReluWithSlope(float * dst,const float * src,size_t sizeQuad,float slope)1805 void MNNReluWithSlope(float* dst, const float* src, size_t sizeQuad, float slope) {
1806 float slopeValue[4];
1807 for (int i=0; i<4; ++i) {
1808 slopeValue[i] = slope;
1809 }
1810 MNNReluWithSlopeChannel(dst, src, slopeValue, sizeQuad, 1);
1811 }
1812
MNNReluWithSlopeCommon(float * dst,const float * src,size_t size,float slope)1813 void MNNReluWithSlopeCommon(float* dst, const float* src, size_t size, float slope) {
1814 int sizeQuad = size / 4;
1815 int start = 0;
1816 if (sizeQuad > 0) {
1817 MNNReluWithSlope(dst, src, sizeQuad, slope);
1818 start = sizeQuad * 4;
1819 }
1820 for (int j = start; j < size; j++) {
1821 if (src[j] < 0) {
1822 dst[j] = src[j] * slope;
1823 } else {
1824 dst[j] = src[j];
1825 }
1826 }
1827 }
1828
MNNHardSwishCommon(float * dst,const float * src,size_t size)1829 void MNNHardSwishCommon(float* dst, const float* src, size_t size) {
1830 int sizeQuad = size / 4;
1831 int start = 0;
1832 #ifdef MNN_USE_SSE
1833 if (sizeQuad > 0) {
1834 MNNHardSwish(dst, src, sizeQuad);
1835 start = sizeQuad * 4;
1836 }
1837 #endif
1838 #ifdef MNN_USE_NEON
1839 float32x4_t zero = vdupq_n_f32(0.f);
1840 float32x4_t three = vdupq_n_f32(3.f);
1841 float32x4_t six = vdupq_n_f32(6.f);
1842 float32x4_t divsix = vdupq_n_f32(1.0f/6.f);
1843 for (int i = 0; i < sizeQuad; i++) {
1844 auto x = vld1q_f32(src + 4 * i);
1845 auto y = vmulq_f32(vmulq_f32(x, vminq_f32(vmaxq_f32(vaddq_f32(x, three), zero), six)), divsix);
1846 vst1q_f32(dst + 4 * i, y);
1847 }
1848 start = sizeQuad * 4;
1849 #endif
1850 for (int j = start; j < size; j++) {
1851 if (src[j] <= -3) {
1852 dst[j] = 0;
1853 } else if (src[j] >= 3){
1854 dst[j] = src[j];
1855 } else {
1856 dst[j] = src[j] * (src[j] + 3) / 6.f;
1857 }
1858 }
1859 }
1860
MNNGeluCommon(float * dst,const float * src,size_t size)1861 void MNNGeluCommon(float* dst, const float* src, size_t size) {
1862 int sizeQuad = size / 8;
1863 int start = 0;
1864 #ifdef MNN_USE_SSE
1865 if (sizeQuad > 0) {
1866 MNNGelu(dst, src, sizeQuad);
1867 start = sizeQuad * 8;
1868 }
1869 #endif
1870 auto tanhf_poly = [](float value) -> float {
1871 if (value > 5.0f) {
1872 return 1.0f;
1873 } else if (value <= -5.0f) {
1874 return -1.0f;
1875 } else {
1876 float x2 = value * value;
1877 float a = value * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
1878 float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
1879 return a / b;
1880 }
1881 };
1882 for (int i = start; i < size; i++) {
1883 float temp = 0.044715f * src[i] * src[i] * src[i];
1884 temp = 0.79788458f * (temp + src[i]);
1885 dst[i] = (1.0f + tanhf_poly(temp)) * src[i] * 0.5f;
1886 }
1887 }
1888
MNNScaleAndAddBiasScalar(float * dst,const float * src,float bias,float alpha,size_t number)1889 void MNNScaleAndAddBiasScalar(float* dst, const float* src, float bias, float alpha, size_t number) {
1890 int numberC4 = (int)number / 4;
1891 int start = 0;
1892 if (numberC4 > 0) {
1893 float biasC4[4] = {
1894 bias,
1895 bias,
1896 bias,
1897 bias
1898 };
1899 float alphaC4[4] = {
1900 alpha,
1901 alpha,
1902 alpha,
1903 alpha
1904 };
1905 MNNScaleAndAddBias(dst, src, biasC4, alphaC4, numberC4, 1);
1906 start = numberC4 * 4;
1907 }
1908 for (int i=start; i<number; ++i) {
1909 dst[i] = src[i] * alpha + bias;
1910 }
1911 }
MNNAxByClamp(float * C,const float * A,const float * B,size_t width,size_t cStride,size_t aStride,size_t bStride,size_t height,const float * parameters)1912 void MNNAxByClamp(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t bStride, size_t height, const float* parameters) {
1913 int widthC4 = (int)width / 4;
1914 if (widthC4 > 0) {
1915 auto minF = Vec4(parameters[2]);
1916 auto maxF = Vec4(parameters[3]);
1917 auto alpha = Vec4(parameters[0]);
1918 auto beta = Vec4(parameters[1]);
1919 for (int y = 0; y < height; ++y) {
1920 auto a = A + aStride * y;
1921 auto b = B + bStride * y;
1922 auto c = C + cStride * y;
1923 for (int x = 0; x < width; ++x) {
1924 auto av = Vec4::load(a + 4 * x);
1925 auto bv = Vec4::load(b + 4 * x);
1926 auto cv = av * alpha + bv * beta;
1927 cv = Vec4::min(cv, maxF);
1928 cv = Vec4::max(cv, minF);
1929 Vec4::save(c + 4 * x, cv);
1930 }
1931 }
1932 width = width - 4*widthC4;
1933 C = C + widthC4 * 4;
1934 A = A + widthC4 * 4;
1935 B = B + widthC4 * 4;
1936 }
1937 if (width > 0) {
1938 auto minF = parameters[2];
1939 auto maxF = parameters[3];
1940 auto alpha = parameters[0];
1941 auto beta = parameters[1];
1942 for (int y = 0; y < height; ++y) {
1943 auto a = A + aStride * y;
1944 auto b = B + bStride * y;
1945 auto c = C + cStride * y;
1946 for (int x = 0; x < width; ++x) {
1947 auto av = a[x];
1948 auto bv = b[x];
1949 auto cv = av * alpha + bv * beta;
1950 cv = std::min(cv, maxF);
1951 cv = std::max(cv, minF);
1952 c[x] = cv;
1953 }
1954 }
1955 }
1956 }
1957 #ifndef MNN_USE_NEON
MNNAxByClampBroadcastUnit(float * C,const float * A,const float * B,size_t width,size_t cStride,size_t aStride,size_t height,const float * parameters)1958 void MNNAxByClampBroadcastUnit(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters) {
1959 auto minF = Vec4(parameters[2]);
1960 auto maxF = Vec4(parameters[3]);
1961 auto beta = Vec4(parameters[1]);
1962 for (int y = 0; y < height; ++y) {
1963 auto a = A + aStride * y;
1964 auto b = B + 4 * y;
1965 auto bv = Vec4::load(b);
1966 auto c = C + cStride * y;
1967 for (int x = 0; x < width; ++x) {
1968 auto av = Vec4::load(a + 4 * x);
1969 auto cv = av + bv * beta;
1970 cv = Vec4::min(cv, maxF);
1971 cv = Vec4::max(cv, minF);
1972 Vec4::save(c + 4 * x, cv);
1973 }
1974 }
1975 }
MNNVectorTop1Float(float * input,float * maxValue,int32_t * maxIndex,size_t inputCountUnit)1976 void MNNVectorTop1Float(float* input, float* maxValue, int32_t* maxIndex, size_t inputCountUnit) {
1977 float maxV = input[0];
1978 int maxIdx = 0;
1979 for (int i = 0; i < inputCountUnit; i++) {
1980 int offset = i * UNIT;
1981 for (int j = 0; j < UNIT; j++) {
1982 if (input[offset + j] > maxV) {
1983 maxV = input[offset + j];
1984 maxIdx = offset + j;
1985 }
1986 }
1987 }
1988 maxValue[0] = maxV;
1989 maxIndex[0] = maxIdx;
1990 }
1991
MNNVectorTop1Int32(int32_t * input,int32_t * maxValue,int32_t * maxIndex,size_t inputCountUnit)1992 void MNNVectorTop1Int32(int32_t* input, int32_t* maxValue, int32_t* maxIndex, size_t inputCountUnit) {
1993 int32_t maxV = input[0];
1994 int maxIdx = 0;
1995 for (int i = 0; i < inputCountUnit; i++) {
1996 int offset = i * UNIT;
1997 for (int j = 0; j < UNIT; j++) {
1998 if (input[offset + j] > maxV) {
1999 maxV = input[offset + j];
2000 maxIdx = offset + j;
2001 }
2002 }
2003 }
2004 maxValue[0] = maxV;
2005 maxIndex[0] = maxIdx;
2006 }
2007
2008 #endif
2009
MNNComputeMatMulForE_1(const float * A,const float * B,float * C,const float * biasPtr,const MatMulParam * param,size_t tId)2010 void MNNComputeMatMulForE_1(const float* A, const float* B, float* C, const float* biasPtr, const MatMulParam* param, size_t tId) {
2011 auto l = param->l;
2012 auto h = param->h;
2013 auto numberThread = param->numberThread;
2014 auto lC4 = l / 4;
2015 auto lR = lC4 * 4;
2016 if (param->BTranspose) {
2017 for (int y=tId; y<h; y+=numberThread) {
2018 Vec4 sumValue = Vec4(0.0f);
2019 auto by = B + y * l;
2020 for (int x=0; x<lC4; ++x) {
2021 sumValue = sumValue + Vec4::load(A + x * 4) * Vec4::load(by + x * 4);
2022 }
2023 float sumRemain = 0.0f;
2024 for (int x=lR; x<l; ++x) {
2025 sumRemain = sumRemain + A[x] * by[x];
2026 }
2027 if (nullptr != biasPtr) {
2028 sumRemain += biasPtr[y];
2029 }
2030 C[y] = sumRemain + sumValue[0] + sumValue[1] + sumValue[2] + sumValue[3];
2031 }
2032 } else {
2033 auto hC4 = h / 4;
2034 auto hR = hC4 * 4;
2035 for (int y=tId; y<hC4; y+=numberThread) {
2036 auto bs = B + 4 * y;
2037 Vec4 sumValue = Vec4(0.0f);
2038 if (biasPtr != nullptr) {
2039 sumValue = Vec4::load(biasPtr + 4 * y);
2040 }
2041 auto srcY = A + y * l;
2042 for (int x=0; x<l; ++x) {
2043 sumValue = sumValue + Vec4(A[x]) * Vec4::load(bs + h * x);
2044 }
2045 Vec4::save(C + 4 * y, sumValue);
2046 }
2047 for (int y=hR + tId; y<h; y+=numberThread) {
2048 auto bs = B + y;
2049 float sumValue = 0.0f;
2050 if (biasPtr != nullptr) {
2051 sumValue = biasPtr[y];
2052 }
2053 auto srcY = A + y * l;
2054 for (int x=0; x<l; ++x) {
2055 sumValue = sumValue + A[x] * bs[h * x];
2056 }
2057 C[y] = sumValue;
2058 }
2059 }
2060 }
2061
MNNComputeMatMulForH_1(const float * A,const float * B,float * C,const float * biasPtr,const MatMulParam * param,size_t tId)2062 void MNNComputeMatMulForH_1(const float* A, const float* B, float* C, const float* biasPtr, const MatMulParam* param, size_t tId) {
2063 int e = param->e;
2064 int l = param->l;
2065 int numberThread = param->numberThread;
2066 if (param->ATranspose) {
2067 float biasValue = 0.0f;
2068 if (nullptr != biasPtr) {
2069 biasValue = *biasPtr;
2070 }
2071 auto eC4 = e / 4;
2072 auto eR = eC4 * 4;
2073 for (int y=tId; y<eC4; y+=numberThread) {
2074 Vec4 sumValue = Vec4(biasValue);
2075 auto srcY = A + y * 4;
2076 for (int x=0; x<l; ++x) {
2077 sumValue = sumValue + Vec4::load(srcY + x * e) * Vec4(B[x]);
2078 }
2079 Vec4::save(C + 4 * y, sumValue);
2080 }
2081 if (0 == tId) {
2082 for (int y=eR; y<e; ++y) {
2083 float sumValue = biasValue;
2084 auto srcY = A + y;
2085 for (int x=0; x<l; ++x) {
2086 sumValue = sumValue + srcY[x * e] * B[x];
2087 }
2088 C[y] = sumValue;
2089 }
2090 }
2091 return;
2092 }
2093 float biasValue = 0.0f;
2094 if (nullptr != biasPtr) {
2095 biasValue = *biasPtr;
2096 }
2097 auto lC4 = l / 4;
2098 auto lR = lC4 * 4;
2099 for (int y=tId; y<e; y+=numberThread) {
2100 Vec4 sumValue = Vec4(biasValue);
2101 auto srcY = A + y * l;
2102 for (int x=0; x<lC4; ++x) {
2103 sumValue = sumValue + Vec4::load(srcY + 4 * x) * Vec4::load(B + 4 * x);
2104 }
2105 float sumSingle = sumValue[0] + sumValue[1] + sumValue[2] + sumValue[3];
2106 for (int x=lR; x<l; ++x) {
2107 sumSingle += srcY[x] * B[x];
2108 }
2109 C[y] = sumSingle;
2110 }
2111 }
2112
MNNPackC4Int16(int16_t * dst,const int16_t * src,size_t area,size_t depth)2113 void MNNPackC4Int16(int16_t* dst, const int16_t* src, size_t area, size_t depth) {
2114 int z, x;
2115 int cur = 0;
2116 memset(dst, 0, area * UP_DIV(depth, 4) * 4 * sizeof(int16_t));
2117 for (z = 0; z < depth; ++z) {
2118 int plane = z / 4;
2119 int16_t* dstPlane = plane * area * 4 + dst;
2120 int offset = z % 4;
2121 for (x = 0; x < area; ++x) {
2122 dstPlane[4 * x + offset] = src[cur++];
2123 }
2124 }
2125 }
2126
MNNUnpackC4Int16(int16_t * dst,const int16_t * src,size_t area,size_t depth)2127 void MNNUnpackC4Int16(int16_t* dst, const int16_t* src, size_t area, size_t depth) {
2128 int x;
2129 int z;
2130 int cur = 0;
2131 for (z = 0; z < depth; ++z) {
2132 int plane = z / 4;
2133 const int16_t* srcPlane = plane * area * 4 + src;
2134 int offset = z % 4;
2135 for (x = 0; x < area; ++x) {
2136 dst[cur++] = srcPlane[4 * x + offset];
2137 }
2138 }
2139 }
2140
MNNUnpackTransposeInt16(int16_t * dst,const int16_t * src,size_t area,size_t depth)2141 void MNNUnpackTransposeInt16(int16_t* dst, const int16_t* src, size_t area, size_t depth) {
2142 if (depth == 4) {
2143 ::memcpy(dst, src, area * depth * sizeof(int16_t));
2144 return;
2145 }
2146 int c = (int)depth;
2147 int cDiv4 = c / 4;
2148 int cAlign = cDiv4 * 4;
2149 for (int hi = 0; hi < area; ++hi) {
2150 auto srcHeight = (src + hi * c);
2151 auto dstHeight = (dst + hi * 4);
2152 for (int ci = 0; ci < cDiv4; ++ci) {
2153 for (int i = 0; i < 4; ++i) {
2154 dstHeight[ci * area * 4 + i] = srcHeight[4 * ci + i];
2155 }
2156 }
2157 }
2158
2159 if (cAlign == c) {
2160 return;
2161 }
2162
2163 int cReamin = c - cAlign;
2164 auto srcAlign = src + cAlign;
2165 auto dstAlign = dst + area * cAlign;
2166
2167 for (int hi = 0; hi < area; ++hi) {
2168 auto srcHeight = srcAlign + hi * c;
2169 auto dstHeight = dstAlign + hi * 4;
2170 for (int i = 0; i < 4; ++i) {
2171 dstHeight[i] = 0;
2172 }
2173 for (int ci = 0; ci < cReamin; ++ci) {
2174 dstHeight[ci] = srcHeight[ci];
2175 }
2176 }
2177 }
MNNPackTransposeInt16(int16_t * dst,const int16_t * src,size_t area,size_t depth)2178 void MNNPackTransposeInt16(int16_t* dst, const int16_t* src, size_t area, size_t depth) {
2179 if (1 == area) {
2180 ::memcpy(dst, src, depth * sizeof(int16_t));
2181 return;
2182 }
2183 int c = (int)depth;
2184 int cDiv4 = c / 4;
2185 int cAlign = cDiv4 * 4;
2186 if (cAlign == c) {
2187 int64_t* dst32 = (int64_t*)dst;
2188 const int64_t* src32 = (int64_t*)src;
2189 for (int hi = 0; hi < area; ++hi) {
2190 auto srcHeight = src32 + hi;
2191 auto dstHeight = dst32 + hi * cDiv4;
2192 for (int ci = 0; ci < cDiv4; ++ci) {
2193 dstHeight[ci] = srcHeight[ci * area];
2194 }
2195 }
2196 return;
2197 }
2198
2199 for (int hi = 0; hi < area; ++hi) {
2200 auto srcHeight = src + hi * 4;
2201 auto dstHeight = dst + hi * c;
2202 for (int ci = 0; ci < cDiv4; ++ci) {
2203 for (int i = 0; i < 4; ++i) {
2204 dstHeight[ci * 4 + i] = srcHeight[4 * ci * area + i];
2205 }
2206 }
2207 }
2208
2209 int cReamin = c - cAlign;
2210 auto srcAlign = src + area * cAlign;
2211 auto dstAlign = dst + cAlign;
2212
2213 for (int hi = 0; hi < area; ++hi) {
2214 auto srcHeight = srcAlign + hi * 4;
2215 auto dstHeight = dstAlign + hi * c;
2216
2217 for (int ci = 0; ci < cReamin; ++ci) {
2218 dstHeight[ci] = srcHeight[ci];
2219 }
2220 }
2221 }
2222
MNNCopyC4Int16WithStride(const float * sourceF,float * destF,size_t srcStride,size_t dstStride,size_t count)2223 void MNNCopyC4Int16WithStride(const float* sourceF, float* destF, size_t srcStride, size_t dstStride, size_t count) {
2224 auto source = (int16_t*)sourceF;
2225 auto dest = (int16_t*)destF;
2226 for (int i = 0; i < count; ++i) {
2227 auto s = source + i * srcStride;
2228 auto d = dest + i * dstStride;
2229 *(int64_t*)(d) = *((int64_t*)s);
2230 }
2231 }
2232
2233
MNNSin(float * dst,const float * src,size_t dataSize)2234 void MNNSin(float* dst, const float* src, size_t dataSize) {
2235 for (int i = 0; i < dataSize; i++) {
2236 dst[i] = sinf(src[i]);
2237 }
2238 }
2239
MNNSigmoid(float * dst,const float * src,size_t dataSize)2240 void MNNSigmoid(float* dst, const float* src, size_t dataSize) {
2241 MNNExp(dst, src, dataSize);
2242 for (int i = 0; i < dataSize; ++i) {
2243 dst[i] = 1.0f / (1.0f + dst[i]);
2244 }
2245 }
2246
2247 /**
2248 Modified from https://github.com/alibaba/MNN/pull/1359
2249 Thanks for https://github.com/hroken
2250 */
MNNSigmoidLowp(float * dst,const float * src,size_t dataSize)2251 void MNNSigmoidLowp(float* dst, const float* src, size_t dataSize) {
2252 MNNExp(dst, src, dataSize);
2253 #ifdef MNN_USE_NEON
2254 int dataC4 = (int)dataSize / 4;
2255 if(dataC4 > 0) {
2256 // neon optimization for sigmid cpu
2257 float32x4_t value = vdupq_n_f32(1.0f);
2258 float32x4_t out = vld1q_f32(dst);
2259 for (int i = 1; i < dataC4; ++i) {
2260 out = vrecpeq_f32(vaddq_f32(value,out));
2261 vst1q_f32(dst ,out);
2262 dst += 4;
2263 out = vld1q_f32(dst);
2264 }
2265 out = vrecpeq_f32(vaddq_f32(value,out));
2266 vst1q_f32(dst, out);
2267 dataSize = dataSize - 4 * dataC4;
2268 }
2269 #endif
2270 for (int i = 0; i < dataSize; ++i) {
2271 dst[i] = 1.0f / (1.0f + dst[i]);
2272 }
2273 }
2274
MNNMultiAndDestTransformCommon23(float ** cacheLine,const float * weigth,float * dest,int cacheLineSize,int ow,const float * bias,const float * parameters)2275 void MNNMultiAndDestTransformCommon23(float **cacheLine, const float *weigth, float *dest, int cacheLineSize, int ow, const float* bias, const float* parameters) {
2276 int unit = ow / 2;
2277 MNN_ASSERT(cacheLineSize >= 1);
2278 auto biasF = Vec4::load(bias);
2279 auto minF = Vec4(parameters[2]);
2280 auto maxF = Vec4(parameters[3]);
2281 for (int x = 0; x < unit; ++x) {
2282 auto offset = 4 * 4 * x;
2283 int i = 0;
2284 Vec4 m0 = Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0);
2285 Vec4 m1 = Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1);
2286 Vec4 m2 = Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2);
2287 Vec4 m3 = Vec4::load(weigth + i * 16 + 4 * 3) * Vec4::load(cacheLine[i] + offset + 4 * 3);
2288
2289 for (i = 1; i < cacheLineSize; ++i) {
2290 m0 = m0 + Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0);
2291 m1 = m1 + Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1);
2292 m2 = m2 + Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2);
2293 m3 = m3 + Vec4::load(weigth + i * 16 + 4 * 3) * Vec4::load(cacheLine[i] + offset + 4 * 3);
2294 }
2295
2296 auto o0 = m0 + m1 + m2 + biasF;
2297 auto o1 = m1 - m2 + m3 + biasF;
2298 o0 = Vec4::min(maxF, o0);
2299 o1 = Vec4::min(maxF, o1);
2300 o0 = Vec4::max(minF, o0);
2301 o1 = Vec4::max(minF, o1);
2302 Vec4::save(dest + 8 * x + 0 * 4, o0);
2303 Vec4::save(dest + 8 * x + 1 * 4, o1);
2304 }
2305 if (unit * 2 < ow) {
2306 auto offset = 4 * 4 * unit;
2307 int i = 0;
2308 Vec4 m0 = Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0);
2309 Vec4 m1 = Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1);
2310 Vec4 m2 = Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2);
2311
2312 for (i = 1; i < cacheLineSize; ++i) {
2313 m0 = m0 + Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0);
2314 m1 = m1 + Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1);
2315 m2 = m2 + Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2);
2316 }
2317 auto o0 = m0 + m1 + m2 + biasF;
2318 o0 = Vec4::min(maxF, o0);
2319 o0 = Vec4::max(minF, o0);
2320 Vec4::save(dest + 8 * unit + 0 * 4, o0);
2321 }
2322 }
2323 extern "C" {
2324 void MNNConvDwF23SourceTransUnit(const float *source, float *dest, size_t unit);
2325 }
2326
MNNSourceTransformCommonF23(const float * source,float * dest,int unit,int iw,int pad,int su,int eu)2327 void MNNSourceTransformCommonF23(const float *source, float *dest, int unit, int iw, int pad, int su, int eu) {
2328 for (int x = 0; x < su; ++x) {
2329 auto dstX = dest + 4 * 4 * x;
2330 auto sx = x * 2 - (int)pad;
2331 auto ex = sx + 4;
2332
2333 auto clampSx = std::max(sx, 0);
2334 auto clampEx = std::min(ex, (int)iw);
2335
2336 Vec4 v[4] = {0.0f, 0.0f, 0.0f, 0.0f};
2337 for (int i = clampSx; i < clampEx; ++i) {
2338 v[i - sx] = Vec4::load(source + 4 * i);
2339 }
2340 auto m0 = v[0] - v[2];
2341 auto m1 = v[1] + v[2];
2342 auto m2 = v[2] - v[1];
2343 auto m3 = v[3] - v[1];
2344
2345 Vec4::save(dstX + 4 * 0, m0);
2346 Vec4::save(dstX + 4 * 1, m1);
2347 Vec4::save(dstX + 4 * 2, m2);
2348 Vec4::save(dstX + 4 * 3, m3);
2349 }
2350 MNNConvDwF23SourceTransUnit(source + 4 * (su * 2 - pad), dest + 4 * 4 * su, eu - su);
2351
2352 for (int x = eu; x < unit; ++x) {
2353 auto dstX = dest + 4 * 4 * x;
2354 auto sx = x * 2 - (int)pad;
2355 auto ex = sx + 4;
2356
2357 auto clampSx = std::max(sx, 0);
2358 auto clampEx = std::min(ex, (int)iw);
2359
2360 Vec4 v[4] = {0.0f, 0.0f, 0.0f, 0.0f};
2361 for (int i = clampSx; i < clampEx; ++i) {
2362 v[i - sx] = Vec4::load(source + 4 * i);
2363 }
2364 auto m0 = v[0] - v[2];
2365 auto m1 = v[1] + v[2];
2366 auto m2 = v[2] - v[1];
2367 auto m3 = v[3] - v[1];
2368
2369 Vec4::save(dstX + 4 * 0, m0);
2370 Vec4::save(dstX + 4 * 1, m1);
2371 Vec4::save(dstX + 4 * 2, m2);
2372 Vec4::save(dstX + 4 * 3, m3);
2373 }
2374 }
2375
2376 #ifndef MNN_USE_NEON
MNNConvDwF23MulTransUnit(float ** cacheLine,const float * weigth,float * dest,size_t ow,const float * bias,const float * parameters)2377 void MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigth, float *dest, size_t ow, const float* bias, const float* parameters) {
2378 int unit = ow / 2;
2379 auto w00 = Vec4::load(weigth + 0 * 16 + 4 * 0);
2380 auto w01 = Vec4::load(weigth + 0 * 16 + 4 * 1);
2381 auto w02 = Vec4::load(weigth + 0 * 16 + 4 * 2);
2382 auto w03 = Vec4::load(weigth + 0 * 16 + 4 * 3);
2383 auto w10 = Vec4::load(weigth + 1 * 16 + 4 * 0);
2384 auto w11 = Vec4::load(weigth + 1 * 16 + 4 * 1);
2385 auto w12 = Vec4::load(weigth + 1 * 16 + 4 * 2);
2386 auto w13 = Vec4::load(weigth + 1 * 16 + 4 * 3);
2387 auto w20 = Vec4::load(weigth + 2 * 16 + 4 * 0);
2388 auto w21 = Vec4::load(weigth + 2 * 16 + 4 * 1);
2389 auto w22 = Vec4::load(weigth + 2 * 16 + 4 * 2);
2390 auto w23 = Vec4::load(weigth + 2 * 16 + 4 * 3);
2391 auto biasF = Vec4::load(bias);
2392 auto minF = Vec4(parameters[2]);
2393 auto maxF = Vec4(parameters[3]);
2394 for (int x = 0; x < unit; ++x) {
2395 auto offset = 4 * 4 * x;
2396 int i = 0;
2397 Vec4 m0 = w00 * Vec4::load(cacheLine[0] + offset + 4 * 0);
2398 Vec4 m1 = w01 * Vec4::load(cacheLine[0] + offset + 4 * 1);
2399 Vec4 m2 = w02 * Vec4::load(cacheLine[0] + offset + 4 * 2);
2400 Vec4 m3 = w03 * Vec4::load(cacheLine[0] + offset + 4 * 3);
2401
2402 m0 = m0 + w10 * Vec4::load(cacheLine[1] + offset + 4 * 0);
2403 m1 = m1 + w11 * Vec4::load(cacheLine[1] + offset + 4 * 1);
2404 m2 = m2 + w12 * Vec4::load(cacheLine[1] + offset + 4 * 2);
2405 m3 = m3 + w13 * Vec4::load(cacheLine[1] + offset + 4 * 3);
2406
2407 m0 = m0 + w20 * Vec4::load(cacheLine[2] + offset + 4 * 0);
2408 m1 = m1 + w21 * Vec4::load(cacheLine[2] + offset + 4 * 1);
2409 m2 = m2 + w22 * Vec4::load(cacheLine[2] + offset + 4 * 2);
2410 m3 = m3 + w23 * Vec4::load(cacheLine[2] + offset + 4 * 3);
2411
2412 auto o0 = m0 + m1 + m2 + biasF;
2413 auto o1 = m1 - m2 + m3 + biasF;
2414 o0 = Vec4::min(maxF, o0);
2415 o1 = Vec4::min(maxF, o1);
2416 o0 = Vec4::max(minF, o0);
2417 o1 = Vec4::max(minF, o1);
2418 Vec4::save(dest + 8 * x + 0 * 4, o0);
2419 Vec4::save(dest + 8 * x + 1 * 4, o1);
2420 }
2421 if (unit * 2 < ow) {
2422 auto offset = 4 * 4 * unit;
2423 Vec4 m0 = w00 * Vec4::load(cacheLine[0] + offset + 4 * 0);
2424 Vec4 m1 = w01 * Vec4::load(cacheLine[0] + offset + 4 * 1);
2425 Vec4 m2 = w02 * Vec4::load(cacheLine[0] + offset + 4 * 2);
2426
2427 m0 = m0 + w10 * Vec4::load(cacheLine[1] + offset + 4 * 0);
2428 m1 = m1 + w11 * Vec4::load(cacheLine[1] + offset + 4 * 1);
2429 m2 = m2 + w12 * Vec4::load(cacheLine[1] + offset + 4 * 2);
2430
2431 m0 = m0 + w20 * Vec4::load(cacheLine[2] + offset + 4 * 0);
2432 m1 = m1 + w21 * Vec4::load(cacheLine[2] + offset + 4 * 1);
2433 m2 = m2 + w22 * Vec4::load(cacheLine[2] + offset + 4 * 2);
2434 auto o0 = m0 + m1 + m2 + biasF;
2435 o0 = Vec4::min(maxF, o0);
2436 o0 = Vec4::max(minF, o0);
2437 Vec4::save(dest + 8 * unit + 0 * 4, o0);
2438 }
2439 }
MNNConvDwF23SourceTransUnit(const float * source,float * dest,size_t unit)2440 void MNNConvDwF23SourceTransUnit(const float *source, float *dest, size_t unit) {
2441 if (unit <= 0) {
2442 return;
2443 }
2444 Vec4 v0 = Vec4::load(source + 4 * 0);
2445 Vec4 v1 = Vec4::load(source + 4 * 1);
2446 Vec4 v2;
2447 Vec4 v3;
2448 source += 8;
2449
2450 for (int x = 0; x < unit; ++x) {
2451 v2 = Vec4::load(source + 0 * 4);
2452 v3 = Vec4::load(source + 1 * 4);
2453 auto m0 = v0 - v2;
2454 auto m1 = v1 + v2;
2455 auto m2 = v2 - v1;
2456 auto m3 = v3 - v1;
2457
2458 Vec4::save(dest + 4 * 0, m0);
2459 Vec4::save(dest + 4 * 1, m1);
2460 Vec4::save(dest + 4 * 2, m2);
2461 Vec4::save(dest + 4 * 3, m3);
2462
2463 source += 8;
2464 dest += 16;
2465
2466 v0 = v2;
2467 v1 = v3;
2468 }
2469 }
2470 #endif
2471
2472 namespace MNN {
2473
2474 static CoreFunctions* gCoreFunction = nullptr;
2475
MNNCoreFunctionInit()2476 void MNNCoreFunctionInit() {
2477 gCoreFunction = new CoreFunctions;
2478 // MatMul
2479 gCoreFunction->MNNGetMatMulPackMode = MNNGetMatMulPackMode;
2480 gCoreFunction->MNNGetSparseMatMulPackMode = MNNGetSparseMatMulPackMode;
2481 gCoreFunction->MNNPackC4ForMatMul_A = MNNPackC4ForMatMul_A;
2482 gCoreFunction->MNNPackForMatMul_B = MNNPackForMatMul_B;
2483 gCoreFunction->MNNPackedMatMul = MNNPackedMatMul;
2484 gCoreFunction->MNNPackedMatMulRemain = MNNPackedMatMulRemain;
2485
2486 gCoreFunction->MNNPackForSparseMatMul_B = MNNPackForSparseMatMul_B; // sparse packing B
2487 gCoreFunction->MNNPackedSparseMatMulEpx4 = MNNPackedSparseMatMulEpx4;
2488 gCoreFunction->MNNPackedSparseMatMulEpx1 = MNNPackedSparseMatMulEpx1;
2489
2490 gCoreFunction->MNNComputeMatMulForE_1 = MNNComputeMatMulForE_1;
2491 gCoreFunction->MNNComputeMatMulForH_1 = MNNComputeMatMulForH_1;
2492
2493
2494 // Lowp
2495 gCoreFunction->MNNFp32ToLowp = nullptr;
2496 gCoreFunction->MNNLowpToFp32 = nullptr;
2497 gCoreFunction->bytes = 4;// sizeof(float)
2498
2499 // Packed Function
2500 gCoreFunction->pack = 4;
2501 gCoreFunction->MNNPackCUnit = MNNPackC4;
2502 gCoreFunction->MNNUnpackCUnit = MNNUnpackC4;
2503
2504 // FIXME: MNNPackTranspose and MNNUnpackTranspose is reverted
2505 gCoreFunction->MNNUnpackCUnitTranspose = MNNPackTranspose;
2506 gCoreFunction->MNNPackCUnitTranspose = MNNUnpackTranspose;
2507 gCoreFunction->MNNAxByClampBroadcastUnit = MNNAxByClampBroadcastUnit;
2508 gCoreFunction->MNNConvRunForLineDepthwise = MNNConvRunForLineDepthwise;
2509 gCoreFunction->MNNConvRunForUnitDepthWise = MNNConvRunForUnitDepthWise;
2510 gCoreFunction->MNNSourceTransformCommonF23 = MNNSourceTransformCommonF23;
2511 gCoreFunction->MNNConvDwF23MulTransUnit = MNNConvDwF23MulTransUnit;
2512 gCoreFunction->MNNMultiAndDestTransformCommon23 = MNNMultiAndDestTransformCommon23;
2513 gCoreFunction->MNNMatrixAdd = MNNMatrixAdd;
2514 gCoreFunction->MNNMatrixSub = MNNMatrixSub;
2515 gCoreFunction->MNNStrassenMergeCFunction = MNNStrassenMergeCFunction;
2516 gCoreFunction->penalty = 1.5f;
2517 gCoreFunction->MNNScaleAndAddBias = MNNScaleAndAddBias;
2518 gCoreFunction->MNNAddC4WithStride = MNNAddC4WithStride;
2519 gCoreFunction->MNNCopyC4WithStride = MNNCopyC4WithStride;
2520
2521 gCoreFunction->chooseWinoSourceTransform = WinogradFunction::chooseSourceTransform;
2522 gCoreFunction->chooseWinoDestTransform = WinogradFunction::chooseDestTransform;
2523 gCoreFunction->MNNDeconvRunForLineDepthwise = MNNDeconvRunForLineDepthwise;
2524 gCoreFunction->MNNDeconvRunForUnitDepthWise = MNNDeconvRunForUnitDepthWise;
2525 gCoreFunction->MNNSelectBinaryFunctionForFloat = CPUBinary::selectForFloat;
2526 gCoreFunction->MNNSelectUnaryFunctionForFloat = CPUUnary::selectForFloat;
2527 gCoreFunction->MNNReluWithSlopeChannel = MNNReluWithSlopeChannel;
2528 gCoreFunction->MNNPoolingAvg = (decltype(gCoreFunction->MNNPoolingAvg))(poolingAvg<float, Vec4, 4>);
2529 // Set min value as 1 << 24
2530 gCoreFunction->MNNPoolingMax = (decltype(gCoreFunction->MNNPoolingMax))(poolingMax<float, Vec4, 4, -16777216>);
2531
2532 #ifdef MNN_USE_ARMV82
2533 cpuinfo_arm_isa gCPUInfo;
2534 cpuinfo_arm_init(&gCPUInfo);
2535 gCoreFunction->supportFp16arith = gCPUInfo.fp16arith;
2536 gCoreFunction->supportSDot = gCPUInfo.dot;
2537 #endif
2538 MNNFunctionInit();
2539 }
MNNGetCoreFunctions()2540 CoreFunctions* MNNGetCoreFunctions() {
2541 return gCoreFunction;
2542 }
2543 };
2544