1 //
2 //  CommonOptFunction.cpp
3 //  MNN
4 //
5 //  Created by MNN on 2018/09/06.
6 //  Copyright © 2018, Alibaba Group Holding Limited
7 //
8 
9 #include "CommonOptFunction.h"
10 #include "ConvOpt.h"
11 #include "WinogradOptFunction.hpp"
12 #include <string.h>
13 #include <algorithm>
14 #include <math.h>
15 #include "math/Vec.hpp"
16 #include <vector>
17 #include "../CPURuntime.hpp"
18 #include "core/MemoryFormater.h"
19 #include "core/OpCommonUtils.hpp"
20 // TODO: Find better way to optimize it
21 #include "../CPUBinary.hpp"
22 #include "../CPUUnary.hpp"
23 #include "../CPUPool.hpp"
24 #ifndef MNN_USE_SSE
MNNInt8ToInt16(int16_t * dest,const int8_t * source,size_t count)25 void MNNInt8ToInt16(int16_t* dest, const int8_t* source, size_t count) {
26     // Should not be called
27     MNN_ASSERT(false);
28 }
29 #endif
30 
31 /*
32     source: source matrix is h x l
33     transpose: if false, export compressed matrix as h x l, other export as l x h.
34  */
MNNPackForSparseMatMul_B(float * dest,unsigned int * NNZMap,int * dataOffsetMap,int sparseBlockOC,const float * source,size_t h,size_t l,const int eP,bool transpose)35 void MNNPackForSparseMatMul_B(float* dest, unsigned int* NNZMap, int* dataOffsetMap, int sparseBlockOC, const float* source, size_t h, size_t l, const int eP, bool transpose) {
36     // 1. in convolution, source B layout is OC x (IC * KH * KW),
37     //    the dest layout of weight is BCSC(block compressed sparse colum) format, which is OC(!=0) x (IC*KH*KW!=0), as a canceled result, just do BCSR, transpose should be false.
38     // 2. in ordinary sparse MatMul, transpose is corresponding to BCSR or BCSC
39 
40     // BCSR
41     if (transpose) {
42         int rowOffset = 0;
43         for (int i = 0; i < l; i += 1) {
44             *NNZMap = 0;
45             for(int j = 0; j < h; j += sparseBlockOC) {
46                 if(!MNN::OpCommonUtils::checkAllZeros(source + j * l + i, l, sparseBlockOC, 1)) {
47                     *dest = *(source + j * l + l);
48                     dest++;
49                     *NNZMap = *NNZMap + 1;
50                     *dataOffsetMap = rowOffset;
51                     dataOffsetMap++;
52                     rowOffset = 0;
53                 }
54                 rowOffset += eP;
55             }
56             NNZMap++;
57             rowOffset -= h * eP;
58         }
59     } else { // BCSC
60         int columOffset = 0;
61         int i = 0;
62         for (; i + sparseBlockOC <= h; i += sparseBlockOC) {
63             *NNZMap = 0;
64             for(int j = 0; j < l; j += 1) {
65                 if (!MNN::OpCommonUtils::checkAllZeros(source, l, sparseBlockOC, 1)) {
66                     for (int ioc = 0; ioc < sparseBlockOC; ioc++) {
67                         *dest = *(source + ioc * l);
68                         dest++;
69                     }
70                     *NNZMap = *NNZMap + 1;
71                     *dataOffsetMap = columOffset;
72                     dataOffsetMap++;
73                     columOffset = 0;
74                 }
75                 columOffset += eP;
76                 source++;
77             }
78             NNZMap++;
79             source += l * (sparseBlockOC - 1);
80             columOffset -= l * eP;
81         }
82 
83         for (; i < h; i++) {
84             *NNZMap = 0;
85             for(int j = 0; j < l; j++) {
86                 if (*source != 0.0f) {
87                     *dest = *source;
88                     dest++;
89                     *NNZMap = *NNZMap + 1;
90                     *dataOffsetMap = columOffset;
91                     dataOffsetMap++;
92                     columOffset = 0;
93                 }
94                 columOffset += eP;
95                 source++;
96             }
97             NNZMap++;
98             columOffset -= l * eP;
99         }
100 
101         *dataOffsetMap = columOffset; //
102     }
103     return;
104 }
105 
106 #ifndef MNN_USE_NEON
107 
MNNGetMatMulPackMode(int * eP,int * lP,int * hP)108 void MNNGetMatMulPackMode(int* eP, int *lP, int* hP) {
109     *eP = 16;
110     *lP = 1;
111     *hP = 4;
112 }
113 
MNNGetSparseMatMulPackMode(int * eP,int * lP,int * hP)114 void MNNGetSparseMatMulPackMode(int* eP, int *lP, int* hP) {
115     *eP = 16;
116     *lP = 1;
117     *hP = 4;
118     // hp is corresponding to sparse block along right matrix colum dimension. in ramdom sparse, it is 1.
119     return;
120 }
121 
MNNPackForMatMul_B(float * dest,const float * source,size_t h,size_t l,bool transpose)122 void MNNPackForMatMul_B(float* dest, const float* source, size_t h, size_t l, bool transpose) {
123     auto hP = h / 4;
124     auto hR = hP * 4;
125     if (hR != h) {
126         ::memset(dest, 0, UP_DIV(h, 4)*4*l*sizeof(float));
127     }
128     if (!transpose) {
129         for (int y=0; y<hP; ++y) {
130             auto destY = dest + y * 4 * l;
131             auto sourceY = source + y * 4;
132             for (int x=0; x<l; ++x) {
133                 ::memcpy(destY + 4 * x, sourceY + x * h, 4 * sizeof(float));
134             }
135         }
136         auto hRemain = h - hR;
137         if (hRemain > 0) {
138             auto destY = dest + hP * 4 * l;
139             auto sourceY = source + hP * 4;
140             for (int x=0; x<l; ++x) {
141                 ::memcpy(destY + 4 * x, sourceY + x * h, hRemain * sizeof(float));
142             }
143         }
144         return;
145     }
146     MNNPackC4(dest, source, l, h);
147 }
148 
_MNNPackedMatMulRemain(float * C,const float * A,const float * B,size_t eSize,const size_t * parameter,const float * postParameters,const float * bias,int aStride)149 static void _MNNPackedMatMulRemain(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, int aStride) {
150     auto h = parameter[2];
151     auto l = parameter[1];
152     auto cStride = parameter[3] / sizeof(float);
153     auto hRemain = parameter[4];
154     auto bExtraStride = parameter[5] / sizeof(float);
155     auto bStride = bExtraStride + l * 4;
156     auto hC4 = UP_DIV(h, 4);
157     for (int y=0; y<hC4; ++y) {
158         ::memset(C + y * cStride, 0, eSize * 4 * sizeof(float));
159     }
160     float alpha = 1.0f;
161     float beta = 0.0f;
162     float minValue = -std::numeric_limits<float>().max();
163     float maxValue = std::numeric_limits<float>().max();
164     if (nullptr != postParameters) {
165         minValue = postParameters[2];
166         maxValue = postParameters[3];
167         alpha = postParameters[0];
168         beta = postParameters[1];
169     }
170 
171     for (int x=0; x<eSize; ++x) {
172         auto dst = C + 4 * x;
173         auto src = A + x;
174         for (int y=0; y<hC4; ++y) {
175             auto dstY = dst + y * cStride;
176             auto weight = B + y * bStride;
177             float summer[4] = {
178                 0.0f,
179                 0.0f,
180                 0.0f,
181                 0.0f,
182             };
183             if (nullptr != bias) {
184                 for (int v=0; v<4; ++v) {
185                     summer[v] = bias[4 * y + v];
186                 }
187             }
188             for (int z=0; z<l; ++z) {
189                 auto aZ = src + z * aStride;
190                 auto wZ = weight + z * 4;
191                 summer[0] += wZ[0] * aZ[0];
192                 summer[1] += wZ[1] * aZ[0];
193                 summer[2] += wZ[2] * aZ[0];
194                 summer[3] += wZ[3] * aZ[0];
195             }
196             for (int v=0; v<4; ++v) {
197                 auto dstValue = std::min(summer[v], maxValue);
198                 dstValue = std::max(dstValue, minValue);
199                 dstY[v] = dstValue;
200             }
201         }
202     }
203 }
MNNPackedMatMul(float * C,const float * A,const float * B,const size_t * parameter,const float * postParameters,const float * bias)204 void MNNPackedMatMul(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias) {
205     return _MNNPackedMatMulRemain(C, A, B, 16, parameter, postParameters, bias, 16);
206 }
207 
MNNPackedMatMulRemain(float * C,const float * A,const float * B,size_t eSize,const size_t * parameter,const float * postParameters,const float * bias)208 void MNNPackedMatMulRemain(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias) {
209     auto aStride = parameter[0] / sizeof(float);
210     _MNNPackedMatMulRemain(C, A, B, eSize, parameter, postParameters, bias, aStride);
211 }
212 
213 
MNNPackC4ForMatMul_A(float * destOrigin,float const ** sourceGroup,const int32_t * info,const int32_t * el)214 void MNNPackC4ForMatMul_A(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el) {
215     int number = info[0];
216     int eReal = info[1];
217     int eDest = info[2];
218     int offset = info[3];
219     for (int n=0; n<number; ++n) {
220         int e = el[4 * n + 0];
221         int l = el[4 * n + 1];
222         int eOffset = el[4 * n + 2];
223         int lOffset = el[4 * n + 3];
224         auto dest = destOrigin + lOffset * eDest + eOffset;
225         auto source = sourceGroup[n];
226 
227         for (int y=0; y<e; ++y) {
228             auto yR = y % eDest;
229             for (int x=0; x<l; ++x) {
230                 auto xR = x % 4;
231                 auto xC = x / 4;
232                 dest[(x) * eDest + yR] = source[xC * eReal * 4 + y * 4 * offset + xR];
233             }
234         }
235     }
236 }
237 
MNNPackedSparseMatMulEpx1(float * C,const float * A,const float * B,size_t eSize,const size_t * parameter,const float * postParameters,const float * bias,unsigned int * NNZMap,int * dataOffsetMap)238 void MNNPackedSparseMatMulEpx1(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, unsigned int* NNZMap, int* dataOffsetMap) {
239 
240     auto eP = parameter[0] / sizeof(float);
241     MNN_ASSERT((eP & 0x03) == 0); // In sparse calculate, eP should be evenly divided by 4
242     auto h = parameter[2];
243     auto l = parameter[1];
244     auto cStride = parameter[3] / sizeof(float);
245     auto aStride = eP * l;
246     auto hRemain = parameter[4];
247     auto bExtraStride = parameter[5] / sizeof(float);
248     auto bStride = bExtraStride + l * 4;
249     auto hC4 = UP_DIV(h, 4);
250     float minValue = -std::numeric_limits<float>().max();
251     float maxValue = std::numeric_limits<float>().max();
252     if (nullptr != postParameters) {
253         minValue = postParameters[2];
254         maxValue = postParameters[3];
255     }
256     // MNN_PRINT("MNNPackedSparseMatMul eP:%lu, eSize:%lu, l:%lu, h:%lu, cStride:%lu, aStride:%lu\n", eP, eSize, l, h, cStride, aStride);
257 
258     const float* a = A;
259     size_t ie = 0;
260     for (ie = 0; ie < eSize && eP <= eSize; ie += eP) {
261         const int* dataOffset = dataOffsetMap;
262         const int diff = *dataOffset++;
263         a += diff;
264         const float* w = B;
265         float* blockC = C + (ie << 2);
266         const unsigned int* nnz = NNZMap;
267         for (auto ih = 0; ih < h; ih++) {
268             auto ihPack = ih >> 2;
269             auto ihSubIndex = ih & 0x03;
270             auto c = blockC + ihPack * cStride + ihSubIndex;
271             const float initValue = nullptr != bias ? bias[ih] : 0;
272             float acc0 = initValue;
273             float acc1 = initValue;
274             float acc2 = initValue;
275             float acc3 = initValue;
276             float acc4 = initValue;
277             float acc5 = initValue;
278             float acc6 = initValue;
279             float acc7 = initValue;
280             float acc8 = initValue;
281             float acc9 = initValue;
282             float acc10 = initValue;
283             float acc11 = initValue;
284             float acc12 = initValue;
285             float acc13 = initValue;
286             float acc14 = initValue;
287             float acc15 = initValue;
288             const int lElement = *nnz++;
289             for (auto il = 0; il < lElement; il++) {
290 
291                 const int diff = *dataOffset++;
292                 const float a0 = a[0];
293                 const float a1 = a[1];
294                 const float a2 = a[2];
295                 const float a3 = a[3];
296                 const float a4 = a[4];
297                 const float a5 = a[5];
298                 const float a6 = a[6];
299                 const float a7 = a[7];
300                 const float a8 = a[8];
301                 const float a9 = a[9];
302                 const float a10 = a[10];
303                 const float a11 = a[11];
304                 const float a12 = a[12];
305                 const float a13 = a[13];
306                 const float a14 = a[14];
307                 const float a15 = a[15];
308 
309                 const float oneW = *w++;
310 
311                 // MNN_PRINT("16-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-15]:", ie, a - A, w - B - 1, c - C, oneW);
312                 // formatMatrix(a, {16});
313                 // MNN_PRINT("\n");
314                 a = a + diff;
315                 acc0 += a0 * oneW;
316                 acc1 += a1 * oneW;
317                 acc2 += a2 * oneW;
318                 acc3 += a3 * oneW;
319                 acc4 += a4 * oneW;
320                 acc5 += a5 * oneW;
321                 acc6 += a6 * oneW;
322                 acc7 += a7 * oneW;
323                 acc8 += a8 * oneW;
324                 acc9 += a9 * oneW;
325                 acc10 += a10 * oneW;
326                 acc11 += a11 * oneW;
327                 acc12 += a12 * oneW;
328                 acc13 += a13 * oneW;
329                 acc14 += a14 * oneW;
330                 acc15 += a15 * oneW;
331             }
332             acc0  = std::max(std::min(maxValue, acc0), minValue);
333             acc1  = std::max(std::min(maxValue, acc1), minValue);
334             acc2  = std::max(std::min(maxValue, acc2), minValue);
335             acc3  = std::max(std::min(maxValue, acc3), minValue);
336             acc4  = std::max(std::min(maxValue, acc4), minValue);
337             acc5  = std::max(std::min(maxValue, acc5), minValue);
338             acc6  = std::max(std::min(maxValue, acc6), minValue);
339             acc7  = std::max(std::min(maxValue, acc7), minValue);
340             acc8  = std::max(std::min(maxValue, acc8), minValue);
341             acc9  = std::max(std::min(maxValue, acc9), minValue);
342             acc10 = std::max(std::min(maxValue, acc10), minValue);
343             acc11 = std::max(std::min(maxValue, acc11), minValue);
344             acc12 = std::max(std::min(maxValue, acc12), minValue);
345             acc13 = std::max(std::min(maxValue, acc13), minValue);
346             acc14 = std::max(std::min(maxValue, acc14), minValue);
347             acc15 = std::max(std::min(maxValue, acc15), minValue);
348 
349             // how to store faster: st4 / transpose /
350             c[0] = acc0;
351             c[4] = acc1;
352             c[4 * 2] = acc2;
353             c[4 * 3] = acc3;
354             c[4 * 4] = acc4;
355             c[4 * 5] = acc5;
356             c[4 * 6] = acc6;
357             c[4 * 7] = acc7;
358             c[4 * 8] = acc8;
359             c[4 * 9] = acc9;
360             c[4 * 10] = acc10;
361             c[4 * 11] = acc11;
362             c[4 * 12] = acc12;
363             c[4 * 13] = acc13;
364             c[4 * 14] = acc14;
365             c[4 * 15] = acc15;
366         }
367         a += aStride;
368     }
369     // const float* blockA = A + ie * l;
370     if (eSize & 0x08) {
371         const int* dataOffset = dataOffsetMap;
372         const int diff = *dataOffset++;
373         // a = blockA + diff;
374         a += diff;
375         const float* w = B;
376         float* blockC = C + (ie << 2);
377         const unsigned int* nnz = NNZMap;
378         for (auto ih = 0; ih < h; ih++) {
379             auto ihPack = ih >> 2;
380             auto ihSubIndex = ih & 0x03;
381             auto c = blockC + ihPack * cStride + ihSubIndex;
382             const float initValue = nullptr != bias ? bias[ih] : 0;
383             float acc0 = initValue;
384             float acc1 = initValue;
385             float acc2 = initValue;
386             float acc3 = initValue;
387             float acc4 = initValue;
388             float acc5 = initValue;
389             float acc6 = initValue;
390             float acc7 = initValue;
391 
392             const int lElement = *nnz++;
393             for (auto il = 0; il < lElement; il++) {
394                 const int diff = *dataOffset++;
395                 const float a0 = a[0];
396                 const float a1 = a[1];
397                 const float a2 = a[2];
398                 const float a3 = a[3];
399                 const float a4 = a[4];
400                 const float a5 = a[5];
401                 const float a6 = a[6];
402                 const float a7 = a[7];
403                 const float oneW = *w++;
404                 // MNN_PRINT("8-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-7]:", ie, a - A, w - B - 1, c - C, oneW);
405                 // formatMatrix(a, {8});
406                 // MNN_PRINT("\n");
407                 a = a + diff;
408                 acc0 += a0 * oneW;
409                 acc1 += a1 * oneW;
410                 acc2 += a2 * oneW;
411                 acc3 += a3 * oneW;
412                 acc4 += a4 * oneW;
413                 acc5 += a5 * oneW;
414                 acc6 += a6 * oneW;
415                 acc7 += a7 * oneW;
416             }
417             acc0  = std::max(std::min(maxValue, acc0), minValue);
418             acc1  = std::max(std::min(maxValue, acc1), minValue);
419             acc2  = std::max(std::min(maxValue, acc2), minValue);
420             acc3  = std::max(std::min(maxValue, acc3), minValue);
421             acc4  = std::max(std::min(maxValue, acc4), minValue);
422             acc5  = std::max(std::min(maxValue, acc5), minValue);
423             acc6  = std::max(std::min(maxValue, acc6), minValue);
424             acc7  = std::max(std::min(maxValue, acc7), minValue);
425             // how to store faster: st4 / transpose /
426             c[0] = acc0;
427             c[4] = acc1;
428             c[4 * 2] = acc2;
429             c[4 * 3] = acc3;
430             c[4 * 4] = acc4;
431             c[4 * 5] = acc5;
432             c[4 * 6] = acc6;
433             c[4 * 7] = acc7;
434         }
435         ie += 8;
436         a += 8;
437     }
438 
439     if (eSize & 0x04) {
440         const int* dataOffset = dataOffsetMap;
441         const int diff = *dataOffset++;
442         // const float* a = blockA + diff;
443         a += diff;
444         const float* w = B;
445         float* blockC = C + (ie << 2);
446         const unsigned int* nnz = NNZMap;
447         for (auto ih = 0; ih < h; ih++) {
448             auto ihPack = ih >> 2;
449             auto ihSubIndex = ih & 0x03;
450             auto c = blockC + ihPack * cStride + ihSubIndex;
451             const float initValue = nullptr != bias ? bias[ih] : 0;
452             float acc0 = initValue;
453             float acc1 = initValue;
454             float acc2 = initValue;
455             float acc3 = initValue;
456 
457             const int lElement = *nnz++;
458             for (auto il = 0; il < lElement; il++) {
459                 const int diff = *dataOffset++;
460                 const float a0 = a[0];
461                 const float a1 = a[1];
462                 const float a2 = a[2];
463                 const float a3 = a[3];
464                 const float oneW = *w++;
465                 // MNN_PRINT("4-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-3]:", ie, a - A, w - B - 1, c - C, oneW);
466                 // formatMatrix(a, {4});
467                 // MNN_PRINT("\n");
468                 a = a + diff;
469                 acc0 += a0 * oneW;
470                 acc1 += a1 * oneW;
471                 acc2 += a2 * oneW;
472                 acc3 += a3 * oneW;
473             }
474             acc0  = std::max(std::min(maxValue, acc0), minValue);
475             acc1  = std::max(std::min(maxValue, acc1), minValue);
476             acc2  = std::max(std::min(maxValue, acc2), minValue);
477             acc3  = std::max(std::min(maxValue, acc3), minValue);
478             // how to store faster: st4 / transpose /
479             c[0] = acc0;
480             c[4] = acc1;
481             c[4 * 2] = acc2;
482             c[4 * 3] = acc3;
483         }
484         ie += 4;
485         a += 4;
486     }
487     if (eSize & 0x02) {
488         const int* dataOffset = dataOffsetMap;
489         const int diff = *dataOffset++;
490         // const float* a = blockA + diff;
491         a += diff;
492         const float* w = B;
493         float* blockC = C + (ie << 2);
494         const unsigned int* nnz = NNZMap;
495         for (auto ih = 0; ih < h; ih++) {
496             auto ihPack = ih >> 2;
497             auto ihSubIndex = ih & 0x03;
498             auto c = blockC + ihPack * cStride + ihSubIndex;
499             const float initValue = nullptr != bias ? bias[ih] : 0;
500             float acc0 = initValue;
501             float acc1 = initValue;
502 
503             const int lElement = *nnz++;
504             for (auto il = 0; il < lElement; il++) {
505                 const int diff = *dataOffset++;
506                 const float a0 = a[0];
507                 const float a1 = a[1];
508                 const float oneW = *w++;
509                 // MNN_PRINT("2-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-1]:", ie, a - A, w - B - 1, c - C, oneW);
510                 // formatMatrix(a, {2});
511                 // MNN_PRINT("\n");
512                 a = a + diff;
513                 acc0 += a0 * oneW;
514                 acc1 += a1 * oneW;
515             }
516             acc0  = std::max(std::min(maxValue, acc0), minValue);
517             acc1  = std::max(std::min(maxValue, acc1), minValue);
518             // how to store faster: st4 / transpose /
519             c[0] = acc0;
520             c[4] = acc1;
521         }
522         ie += 2;
523         a += 2;
524     }
525     if (eSize & 0x01) {
526         const int* dataOffset = dataOffsetMap;
527         const int diff = *dataOffset++;
528         // const float* a = blockA + diff;
529         a += diff;
530         const float* w = B;
531         float* blockC = C + (ie << 2);
532         const unsigned int* nnz = NNZMap;
533         for (auto ih = 0; ih < h; ih++) {
534             auto ihPack = ih >> 2;
535             auto ihSubIndex = ih & 0x03;
536             auto c = blockC + ihPack * cStride + ihSubIndex;
537             const float initValue = nullptr != bias ? bias[ih] : 0;
538             float acc0 = initValue;
539 
540             const int lElement = *nnz++;
541             for (auto il = 0; il < lElement; il++) {
542                 const int diff = *dataOffset++;
543                 const float a0 = a[0];
544                 const float oneW = *w++;
545 
546                 // MNN_PRINT("1-loop: ie:%zu, a offset:%ld, c offset:%ld, w offset:%ld, w value:%f, a value[0]:", ie, a - A, w - B - 1, c - C, oneW);
547                 // formatMatrix(a, {1});
548                 // MNN_PRINT("\n");
549                 a = a + diff;
550                 acc0 += a0 * oneW;
551             }
552             acc0  = std::max(std::min(maxValue, acc0), minValue);
553             // how to store faster: st4 / transpose /
554             c[0] = acc0;
555         }
556         ie += 1;
557         // a += 1;
558     }
559 
560     return;
561 }
562 
MNNPackedSparseMatMulEpx4(float * C,const float * A,const float * B,size_t eSize,const size_t * parameter,const float * postParameters,const float * bias,unsigned int * NNZMap,int * dataOffsetMap)563 void MNNPackedSparseMatMulEpx4(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, unsigned int* NNZMap, int* dataOffsetMap) {
564 
565     auto eP = parameter[0] / sizeof(float);
566     MNN_ASSERT((eP & 0x03) == 0); // In sparse calculate, eP should be evenly divided by 4
567     auto h = parameter[2];
568     auto l = parameter[1];
569     auto cStride = parameter[3] / sizeof(float);
570     auto aStride = eP * l;
571     auto hRemain = parameter[4];
572     auto bExtraStride = parameter[5] / sizeof(float);
573     auto bStride = bExtraStride + l * 4;
574     auto hC4 = UP_DIV(h, 4);
575     float minValue = -std::numeric_limits<float>().max();
576     float maxValue = std::numeric_limits<float>().max();
577     if (nullptr != postParameters) {
578         minValue = postParameters[2];
579         maxValue = postParameters[3];
580     }
581     // MNN_PRINT("MNNPackedSparseMatMul 16x4 eP:%lu, eSize:%lu, l:%lu, h:%lu, cStride:%lu, aStride:%lu\n", eP, eSize, l, h, cStride, aStride);
582     const int sparseBlockOC = 4;
583     const float* a = A;
584     size_t ie = 0;
585     for (ie = 0; ie < eSize && eP <= eSize; ie += eP) {
586         const int* dataOffset = dataOffsetMap;
587         const int diff = *dataOffset++;
588         a += diff;
589         const float* w = B;
590         float* blockC = C + (ie << 2);
591         const unsigned int* nnz = NNZMap;
592 
593         size_t ih = 0;
594         for (; ih < (h & (~0x03)); ih += sparseBlockOC) {
595             auto ihPack = ih >> 2;
596             auto c = blockC + ihPack * cStride;
597 
598             float initValue[4] = {0, 0, 0, 0};
599             if (nullptr != bias) {
600                 memcpy(initValue, bias + ih, 4 * sizeof(float));
601             }
602             float acc0[4];
603             float acc1[4];
604             float acc2[4];
605             float acc3[4];
606             float acc4[4];
607             float acc5[4];
608             float acc6[4];
609             float acc7[4];
610             float acc8[4];
611             float acc9[4];
612             float acc10[4];
613             float acc11[4];
614             float acc12[4];
615             float acc13[4];
616             float acc14[4];
617             float acc15[4];
618             memcpy(acc0, initValue, 4 * sizeof(float));
619             memcpy(acc1, initValue, 4 * sizeof(float));
620             memcpy(acc2, initValue, 4 * sizeof(float));
621             memcpy(acc3, initValue, 4 * sizeof(float));
622             memcpy(acc4, initValue, 4 * sizeof(float));
623             memcpy(acc5, initValue, 4 * sizeof(float));
624             memcpy(acc6, initValue, 4 * sizeof(float));
625             memcpy(acc7, initValue, 4 * sizeof(float));
626             memcpy(acc8, initValue, 4 * sizeof(float));
627             memcpy(acc9, initValue, 4 * sizeof(float));
628             memcpy(acc10, initValue, 4 * sizeof(float));
629             memcpy(acc11, initValue, 4 * sizeof(float));
630             memcpy(acc12, initValue, 4 * sizeof(float));
631             memcpy(acc13, initValue, 4 * sizeof(float));
632             memcpy(acc14, initValue, 4 * sizeof(float));
633             memcpy(acc15, initValue, 4 * sizeof(float));
634 
635             const int lElement = *nnz++;
636             for (auto il = 0; il < lElement; il++) {
637 
638                 const int diff = *dataOffset++;
639                 const float a0 = a[0];
640                 const float a1 = a[1];
641                 const float a2 = a[2];
642                 const float a3 = a[3];
643                 const float a4 = a[4];
644                 const float a5 = a[5];
645                 const float a6 = a[6];
646                 const float a7 = a[7];
647                 const float a8 = a[8];
648                 const float a9 = a[9];
649                 const float a10 = a[10];
650                 const float a11 = a[11];
651                 const float a12 = a[12];
652                 const float a13 = a[13];
653                 const float a14 = a[14];
654                 const float a15 = a[15];
655 
656                 const float wv[4] = {*w++, *w++, *w++, *w++};
657 
658                 // MNN_PRINT("16-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-15]:", ie, a - A, w - B - 1, c - C, oneW);
659                 // formatMatrix(a, {16});
660                 // MNN_PRINT("\n");
661                 a = a + diff;
662                 for (int lane = 0; lane < 4; lane++) {
663                     acc0[lane] += a0 * wv[lane];
664                     acc1[lane] += a1 * wv[lane];
665                     acc2[lane] += a2 * wv[lane];
666                     acc3[lane] += a3 * wv[lane];
667                     acc4[lane] += a4 * wv[lane];
668                     acc5[lane] += a5 * wv[lane];
669                     acc6[lane] += a6 * wv[lane];
670                     acc7[lane] += a7 * wv[lane];
671                     acc8[lane] += a8 * wv[lane];
672                     acc9[lane] += a9 * wv[lane];
673                     acc10[lane] += a10 * wv[lane];
674                     acc11[lane] += a11 * wv[lane];
675                     acc12[lane] += a12 * wv[lane];
676                     acc13[lane] += a13 * wv[lane];
677                     acc14[lane] += a14 * wv[lane];
678                     acc15[lane] += a15 * wv[lane];
679                 }
680             }
681 
682             for (int lane = 0; lane < 4; lane++) {
683                 acc0[lane]  = std::max(std::min(maxValue, acc0[lane]), minValue);
684                 acc1[lane]  = std::max(std::min(maxValue, acc1[lane]), minValue);
685                 acc2[lane]  = std::max(std::min(maxValue, acc2[lane]), minValue);
686                 acc3[lane]  = std::max(std::min(maxValue, acc3[lane]), minValue);
687                 acc4[lane]  = std::max(std::min(maxValue, acc4[lane]), minValue);
688                 acc5[lane]  = std::max(std::min(maxValue, acc5[lane]), minValue);
689                 acc6[lane]  = std::max(std::min(maxValue, acc6[lane]), minValue);
690                 acc7[lane]  = std::max(std::min(maxValue, acc7[lane]), minValue);
691                 acc8[lane]  = std::max(std::min(maxValue, acc8[lane]), minValue);
692                 acc9[lane]  = std::max(std::min(maxValue, acc9[lane]), minValue);
693                 acc10[lane] = std::max(std::min(maxValue, acc10[lane]), minValue);
694                 acc11[lane] = std::max(std::min(maxValue, acc11[lane]), minValue);
695                 acc12[lane] = std::max(std::min(maxValue, acc12[lane]), minValue);
696                 acc13[lane] = std::max(std::min(maxValue, acc13[lane]), minValue);
697                 acc14[lane] = std::max(std::min(maxValue, acc14[lane]), minValue);
698                 acc15[lane] = std::max(std::min(maxValue, acc15[lane]), minValue);
699             }
700 
701             memcpy(c, acc0, 4 * sizeof(float));  // store continuous c
702             memcpy(c + 4, acc1, 4 * sizeof(float));
703             memcpy(c + 4 * 2, acc2, 4 * sizeof(float));
704             memcpy(c + 4 * 3, acc3, 4 * sizeof(float));
705             memcpy(c + 4 * 4, acc4, 4 * sizeof(float));
706             memcpy(c + 4 * 5, acc5, 4 * sizeof(float));
707             memcpy(c + 4 * 6, acc6, 4 * sizeof(float));
708             memcpy(c + 4 * 7, acc7, 4 * sizeof(float));
709             memcpy(c + 4 * 8, acc8, 4 * sizeof(float));
710             memcpy(c + 4 * 9, acc9, 4 * sizeof(float));
711             memcpy(c + 4 * 10, acc10, 4 * sizeof(float));
712             memcpy(c + 4 * 11, acc11, 4 * sizeof(float));
713             memcpy(c + 4 * 12, acc12, 4 * sizeof(float));
714             memcpy(c + 4 * 13, acc13, 4 * sizeof(float));
715             memcpy(c + 4 * 14, acc14, 4 * sizeof(float));
716             memcpy(c + 4 * 15, acc15, 4 * sizeof(float));
717         }
718 
719         blockC += (h >> 2) * cStride;
720         for (; ih < h; ih++) {
721             auto ihSubIndex = ih & 0x03;
722             auto c = blockC + ihSubIndex;
723             const float initValue = nullptr != bias ? bias[ih] : 0;
724             float acc0 = initValue;
725             float acc1 = initValue;
726             float acc2 = initValue;
727             float acc3 = initValue;
728             float acc4 = initValue;
729             float acc5 = initValue;
730             float acc6 = initValue;
731             float acc7 = initValue;
732             float acc8 = initValue;
733             float acc9 = initValue;
734             float acc10 = initValue;
735             float acc11 = initValue;
736             float acc12 = initValue;
737             float acc13 = initValue;
738             float acc14 = initValue;
739             float acc15 = initValue;
740             const int lElement = *nnz++;
741             for (auto il = 0; il < lElement; il++) {
742 
743                 const int diff = *dataOffset++;
744                 const float a0 = a[0];
745                 const float a1 = a[1];
746                 const float a2 = a[2];
747                 const float a3 = a[3];
748                 const float a4 = a[4];
749                 const float a5 = a[5];
750                 const float a6 = a[6];
751                 const float a7 = a[7];
752                 const float a8 = a[8];
753                 const float a9 = a[9];
754                 const float a10 = a[10];
755                 const float a11 = a[11];
756                 const float a12 = a[12];
757                 const float a13 = a[13];
758                 const float a14 = a[14];
759                 const float a15 = a[15];
760 
761                 const float oneW = *w++;
762 
763                 // MNN_PRINT("16-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-15]:", ie, a - A, w - B - 1, c - C, oneW);
764                 // formatMatrix(a, {16});
765                 // MNN_PRINT("\n");
766                 a = a + diff;
767                 acc0 += a0 * oneW;
768                 acc1 += a1 * oneW;
769                 acc2 += a2 * oneW;
770                 acc3 += a3 * oneW;
771                 acc4 += a4 * oneW;
772                 acc5 += a5 * oneW;
773                 acc6 += a6 * oneW;
774                 acc7 += a7 * oneW;
775                 acc8 += a8 * oneW;
776                 acc9 += a9 * oneW;
777                 acc10 += a10 * oneW;
778                 acc11 += a11 * oneW;
779                 acc12 += a12 * oneW;
780                 acc13 += a13 * oneW;
781                 acc14 += a14 * oneW;
782                 acc15 += a15 * oneW;
783             }
784             acc0  = std::max(std::min(maxValue, acc0), minValue);
785             acc1  = std::max(std::min(maxValue, acc1), minValue);
786             acc2  = std::max(std::min(maxValue, acc2), minValue);
787             acc3  = std::max(std::min(maxValue, acc3), minValue);
788             acc4  = std::max(std::min(maxValue, acc4), minValue);
789             acc5  = std::max(std::min(maxValue, acc5), minValue);
790             acc6  = std::max(std::min(maxValue, acc6), minValue);
791             acc7  = std::max(std::min(maxValue, acc7), minValue);
792             acc8  = std::max(std::min(maxValue, acc8), minValue);
793             acc9  = std::max(std::min(maxValue, acc9), minValue);
794             acc10 = std::max(std::min(maxValue, acc10), minValue);
795             acc11 = std::max(std::min(maxValue, acc11), minValue);
796             acc12 = std::max(std::min(maxValue, acc12), minValue);
797             acc13 = std::max(std::min(maxValue, acc13), minValue);
798             acc14 = std::max(std::min(maxValue, acc14), minValue);
799             acc15 = std::max(std::min(maxValue, acc15), minValue);
800 
801             // how to store faster: st4 / transpose /
802             c[0] = acc0;
803             c[4] = acc1;
804             c[4 * 2] = acc2;
805             c[4 * 3] = acc3;
806             c[4 * 4] = acc4;
807             c[4 * 5] = acc5;
808             c[4 * 6] = acc6;
809             c[4 * 7] = acc7;
810             c[4 * 8] = acc8;
811             c[4 * 9] = acc9;
812             c[4 * 10] = acc10;
813             c[4 * 11] = acc11;
814             c[4 * 12] = acc12;
815             c[4 * 13] = acc13;
816             c[4 * 14] = acc14;
817             c[4 * 15] = acc15;
818         }
819         a += aStride;
820     }
821     // const float* blockA = A + ie * l;
822     if (eSize & 0x08) {
823         const int* dataOffset = dataOffsetMap;
824         const int diff = *dataOffset++;
825         // a = blockA + diff;
826         a += diff;
827         const float* w = B;
828         float* blockC = C + (ie << 2);
829         const unsigned int* nnz = NNZMap;
830 
831         size_t ih = 0;
832         for (; ih < (h & (~0x03)); ih += sparseBlockOC) {
833             auto ihPack = ih >> 2;
834             auto c = blockC + ihPack * cStride;
835             float initValue[4] = {0, 0, 0, 0};
836             if (nullptr != bias) {
837                 memcpy(initValue, bias + ih, 4 * sizeof(float));
838             }
839             float acc0[4];
840             float acc1[4];
841             float acc2[4];
842             float acc3[4];
843             float acc4[4];
844             float acc5[4];
845             float acc6[4];
846             float acc7[4];
847             memcpy(acc0, initValue, 4 * sizeof(float));
848             memcpy(acc1, initValue, 4 * sizeof(float));
849             memcpy(acc2, initValue, 4 * sizeof(float));
850             memcpy(acc3, initValue, 4 * sizeof(float));
851             memcpy(acc4, initValue, 4 * sizeof(float));
852             memcpy(acc5, initValue, 4 * sizeof(float));
853             memcpy(acc6, initValue, 4 * sizeof(float));
854             memcpy(acc7, initValue, 4 * sizeof(float));
855             const int lElement = *nnz++;
856             for (auto il = 0; il < lElement; il++) {
857 
858                 const int diff = *dataOffset++;
859                 const float a0 = a[0];
860                 const float a1 = a[1];
861                 const float a2 = a[2];
862                 const float a3 = a[3];
863                 const float a4 = a[4];
864                 const float a5 = a[5];
865                 const float a6 = a[6];
866                 const float a7 = a[7];
867                 const float wv[4] = {*w++, *w++, *w++, *w++};
868                 // MNN_PRINT("16-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-15]:", ie, a - A, w - B - 1, c - C, oneW);
869                 // formatMatrix(a, {16});
870                 // MNN_PRINT("\n");
871                 a = a + diff;
872                 for (int lane = 0; lane < 4; lane++) {
873                     acc0[lane] += a0 * wv[lane];
874                     acc1[lane] += a1 * wv[lane];
875                     acc2[lane] += a2 * wv[lane];
876                     acc3[lane] += a3 * wv[lane];
877                     acc4[lane] += a4 * wv[lane];
878                     acc5[lane] += a5 * wv[lane];
879                     acc6[lane] += a6 * wv[lane];
880                     acc7[lane] += a7 * wv[lane];
881                 }
882             }
883 
884             for (int lane = 0; lane < 4; lane++) {
885                 acc0[lane]  = std::max(std::min(maxValue, acc0[lane]), minValue);
886                 acc1[lane]  = std::max(std::min(maxValue, acc1[lane]), minValue);
887                 acc2[lane]  = std::max(std::min(maxValue, acc2[lane]), minValue);
888                 acc3[lane]  = std::max(std::min(maxValue, acc3[lane]), minValue);
889                 acc4[lane]  = std::max(std::min(maxValue, acc4[lane]), minValue);
890                 acc5[lane]  = std::max(std::min(maxValue, acc5[lane]), minValue);
891                 acc6[lane]  = std::max(std::min(maxValue, acc6[lane]), minValue);
892                 acc7[lane]  = std::max(std::min(maxValue, acc7[lane]), minValue);
893             }
894 
895             memcpy(c, acc0, 4 * sizeof(float));  // store continuous c
896             memcpy(c + 4, acc1, 4 * sizeof(float));
897             memcpy(c + 4 * 2, acc2, 4 * sizeof(float));
898             memcpy(c + 4 * 3, acc3, 4 * sizeof(float));
899             memcpy(c + 4 * 4, acc4, 4 * sizeof(float));
900             memcpy(c + 4 * 5, acc5, 4 * sizeof(float));
901             memcpy(c + 4 * 6, acc6, 4 * sizeof(float));
902             memcpy(c + 4 * 7, acc7, 4 * sizeof(float));
903         }
904         blockC += (ih >> 2) * cStride;
905         for (; ih < h; ih++) {
906             auto ihSubIndex = ih & 0x03;
907             auto c = blockC + ihSubIndex;
908             const float initValue = nullptr != bias ? bias[ih] : 0;
909             float acc0 = initValue;
910             float acc1 = initValue;
911             float acc2 = initValue;
912             float acc3 = initValue;
913             float acc4 = initValue;
914             float acc5 = initValue;
915             float acc6 = initValue;
916             float acc7 = initValue;
917 
918             const int lElement = *nnz++;
919             for (auto il = 0; il < lElement; il++) {
920                 const int diff = *dataOffset++;
921                 const float a0 = a[0];
922                 const float a1 = a[1];
923                 const float a2 = a[2];
924                 const float a3 = a[3];
925                 const float a4 = a[4];
926                 const float a5 = a[5];
927                 const float a6 = a[6];
928                 const float a7 = a[7];
929                 const float oneW = *w++;
930                 // MNN_PRINT("8-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-7]:", ie, a - A, w - B - 1, c - C, oneW);
931                 // formatMatrix(a, {8});
932                 // MNN_PRINT("\n");
933                 a = a + diff;
934                 acc0 += a0 * oneW;
935                 acc1 += a1 * oneW;
936                 acc2 += a2 * oneW;
937                 acc3 += a3 * oneW;
938                 acc4 += a4 * oneW;
939                 acc5 += a5 * oneW;
940                 acc6 += a6 * oneW;
941                 acc7 += a7 * oneW;
942             }
943             acc0  = std::max(std::min(maxValue, acc0), minValue);
944             acc1  = std::max(std::min(maxValue, acc1), minValue);
945             acc2  = std::max(std::min(maxValue, acc2), minValue);
946             acc3  = std::max(std::min(maxValue, acc3), minValue);
947             acc4  = std::max(std::min(maxValue, acc4), minValue);
948             acc5  = std::max(std::min(maxValue, acc5), minValue);
949             acc6  = std::max(std::min(maxValue, acc6), minValue);
950             acc7  = std::max(std::min(maxValue, acc7), minValue);
951             // how to store faster: st4 / transpose /
952             c[0] = acc0;
953             c[4] = acc1;
954             c[4 * 2] = acc2;
955             c[4 * 3] = acc3;
956             c[4 * 4] = acc4;
957             c[4 * 5] = acc5;
958             c[4 * 6] = acc6;
959             c[4 * 7] = acc7;
960         }
961         ie += 8;
962         a += 8;
963     }
964 
965     if (eSize & 0x04) {
966         const int* dataOffset = dataOffsetMap;
967         const int diff = *dataOffset++;
968         // const float* a = blockA + diff;
969         a += diff;
970         const float* w = B;
971         float* blockC = C + (ie << 2);
972         const unsigned int* nnz = NNZMap;
973 
974         size_t ih = 0;
975         for (; ih < (h & (~0x03)); ih += sparseBlockOC) {
976             auto ihPack = ih >> 2;
977             auto c = blockC + ihPack * cStride;
978             float initValue[4] = {0, 0, 0, 0};
979             if (nullptr != bias) {
980                 memcpy(initValue, bias + ih, 4 * sizeof(float));
981             }
982             float acc0[4];
983             float acc1[4];
984             float acc2[4];
985             float acc3[4];
986             memcpy(acc0, initValue, 4 * sizeof(float));
987             memcpy(acc1, initValue, 4 * sizeof(float));
988             memcpy(acc2, initValue, 4 * sizeof(float));
989             memcpy(acc3, initValue, 4 * sizeof(float));
990 
991             const int lElement = *nnz++;
992             for (auto il = 0; il < lElement; il++) {
993                 const int diff = *dataOffset++;
994                 const float a0 = a[0];
995                 const float a1 = a[1];
996                 const float a2 = a[2];
997                 const float a3 = a[3];
998                 const float wv[4] = {*w++, *w++, *w++, *w++};
999                 // MNN_PRINT("16-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-15]:", ie, a - A, w - B - 1, c - C, oneW);
1000                 // formatMatrix(a, {16});
1001                 // MNN_PRINT("\n");
1002                 a = a + diff;
1003                 for (int lane = 0; lane < 4; lane++) {
1004                     acc0[lane] += a0 * wv[lane];
1005                     acc1[lane] += a1 * wv[lane];
1006                     acc2[lane] += a2 * wv[lane];
1007                     acc3[lane] += a3 * wv[lane];
1008                 }
1009             }
1010 
1011             for (int lane = 0; lane < 4; lane++) {
1012                 acc0[lane]  = std::max(std::min(maxValue, acc0[lane]), minValue);
1013                 acc1[lane]  = std::max(std::min(maxValue, acc1[lane]), minValue);
1014                 acc2[lane]  = std::max(std::min(maxValue, acc2[lane]), minValue);
1015                 acc3[lane]  = std::max(std::min(maxValue, acc3[lane]), minValue);
1016             }
1017 
1018             memcpy(c, acc0, 4 * sizeof(float));  // store continuous c
1019             memcpy(c + 4, acc1, 4 * sizeof(float));
1020             memcpy(c + 4 * 2, acc2, 4 * sizeof(float));
1021             memcpy(c + 4 * 3, acc3, 4 * sizeof(float));
1022         }
1023         blockC += (ih >> 2) * cStride;
1024         for (; ih < h; ih++) {
1025             auto ihSubIndex = ih & 0x03;
1026             auto c = blockC + ihSubIndex;
1027             const float initValue = nullptr != bias ? bias[ih] : 0;
1028             float acc0 = initValue;
1029             float acc1 = initValue;
1030             float acc2 = initValue;
1031             float acc3 = initValue;
1032 
1033             const int lElement = *nnz++;
1034             for (auto il = 0; il < lElement; il++) {
1035                 const int diff = *dataOffset++;
1036                 const float a0 = a[0];
1037                 const float a1 = a[1];
1038                 const float a2 = a[2];
1039                 const float a3 = a[3];
1040                 const float oneW = *w++;
1041                 // MNN_PRINT("4-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-3]:", ie, a - A, w - B - 1, c - C, oneW);
1042                 // formatMatrix(a, {4});
1043                 // MNN_PRINT("\n");
1044                 a = a + diff;
1045                 acc0 += a0 * oneW;
1046                 acc1 += a1 * oneW;
1047                 acc2 += a2 * oneW;
1048                 acc3 += a3 * oneW;
1049             }
1050             acc0  = std::max(std::min(maxValue, acc0), minValue);
1051             acc1  = std::max(std::min(maxValue, acc1), minValue);
1052             acc2  = std::max(std::min(maxValue, acc2), minValue);
1053             acc3  = std::max(std::min(maxValue, acc3), minValue);
1054             // how to store faster: st4 / transpose /
1055             c[0] = acc0;
1056             c[4] = acc1;
1057             c[4 * 2] = acc2;
1058             c[4 * 3] = acc3;
1059         }
1060         ie += 4;
1061         a += 4;
1062     }
1063     if (eSize & 0x02) {
1064         const int* dataOffset = dataOffsetMap;
1065         const int diff = *dataOffset++;
1066         // const float* a = blockA + diff;
1067         a += diff;
1068         const float* w = B;
1069         float* blockC = C + (ie << 2);
1070         const unsigned int* nnz = NNZMap;
1071 
1072         size_t ih = 0;
1073         for (; ih < (h & (~0x03)); ih += sparseBlockOC) {
1074             auto ihPack = ih >> 2;
1075             auto c = blockC + ihPack * cStride;
1076             float initValue[4] = {0, 0, 0, 0};
1077             if (nullptr != bias) {
1078                 memcpy(initValue, bias + ih, 4 * sizeof(float));
1079             }
1080             float acc0[4];
1081             float acc1[4];
1082             memcpy(acc0, initValue, 4 * sizeof(float));
1083             memcpy(acc1, initValue, 4 * sizeof(float));
1084             const int lElement = *nnz++;
1085             for (auto il = 0; il < lElement; il++) {
1086 
1087                 const int diff = *dataOffset++;
1088                 const float a0 = a[0];
1089                 const float a1 = a[1];
1090                 const float wv[4] = {*w++, *w++, *w++, *w++};
1091                 // MNN_PRINT("16-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-15]:", ie, a - A, w - B - 1, c - C, oneW);
1092                 // formatMatrix(a, {16});
1093                 // MNN_PRINT("\n");
1094                 a = a + diff;
1095                 for (int lane = 0; lane < 4; lane++) {
1096                     acc0[lane] += a0 * wv[lane];
1097                     acc1[lane] += a1 * wv[lane];
1098                 }
1099             }
1100 
1101             for (int lane = 0; lane < 4; lane++) {
1102                 acc0[lane]  = std::max(std::min(maxValue, acc0[lane]), minValue);
1103                 acc1[lane]  = std::max(std::min(maxValue, acc1[lane]), minValue);
1104             }
1105 
1106             memcpy(c, acc0, 4 * sizeof(float));  // store continuous c
1107             memcpy(c + 4, acc1, 4 * sizeof(float));
1108         }
1109         blockC += (ih >> 2) * cStride;
1110         for (; ih < h; ih++) {
1111             auto ihPack = ih >> 2;
1112             auto ihSubIndex = ih & 0x03;
1113             auto c = blockC + ihSubIndex;
1114             const float initValue = nullptr != bias ? bias[ih] : 0;
1115             float acc0 = initValue;
1116             float acc1 = initValue;
1117 
1118             const int lElement = *nnz++;
1119             for (auto il = 0; il < lElement; il++) {
1120                 const int diff = *dataOffset++;
1121                 const float a0 = a[0];
1122                 const float a1 = a[1];
1123                 const float oneW = *w++;
1124                 // MNN_PRINT("2-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-1]:", ie, a - A, w - B - 1, c - C, oneW);
1125                 // formatMatrix(a, {2});
1126                 // MNN_PRINT("\n");
1127                 a = a + diff;
1128                 acc0 += a0 * oneW;
1129                 acc1 += a1 * oneW;
1130             }
1131             acc0  = std::max(std::min(maxValue, acc0), minValue);
1132             acc1  = std::max(std::min(maxValue, acc1), minValue);
1133             // how to store faster: st4 / transpose /
1134             c[0] = acc0;
1135             c[4] = acc1;
1136         }
1137         ie += 2;
1138         a += 2;
1139     }
1140     if (eSize & 0x01) {
1141         const int* dataOffset = dataOffsetMap;
1142         const int diff = *dataOffset++;
1143         // const float* a = blockA + diff;
1144         a += diff;
1145         const float* w = B;
1146         float* blockC = C + (ie << 2);
1147         const unsigned int* nnz = NNZMap;
1148 
1149         size_t ih = 0;
1150         for (; ih < (h & (~0x03)); ih += sparseBlockOC) {
1151             auto ihPack = ih >> 2;
1152             auto c = blockC + ihPack * cStride;
1153             float initValue[4] = {0, 0, 0, 0};
1154             if (nullptr != bias) {
1155                 memcpy(initValue, bias + ih, 4 * sizeof(float));
1156             }
1157             float acc0[4];
1158             memcpy(acc0, initValue, 4 * sizeof(float));
1159             const int lElement = *nnz++;
1160             for (auto il = 0; il < lElement; il++) {
1161 
1162                 const int diff = *dataOffset++;
1163                 const float a0 = a[0];
1164                 const float wv[4] = {*w++, *w++, *w++, *w++};
1165                 // MNN_PRINT("16-loop: ie:%zu, a offset:%ld, w offset:%ld, c offset:%ld, w value:%f, a value[0-15]:", ie, a - A, w - B - 1, c - C, oneW);
1166                 // formatMatrix(a, {16});
1167                 // MNN_PRINT("\n");
1168                 a = a + diff;
1169                 for (int lane = 0; lane < 4; lane++) {
1170                     acc0[lane] += a0 * wv[lane];
1171                 }
1172             }
1173 
1174             for (int lane = 0; lane < 4; lane++) {
1175                 acc0[lane]  = std::max(std::min(maxValue, acc0[lane]), minValue);
1176             }
1177             memcpy(c, acc0, 4 * sizeof(float));  // store continuous c
1178         }
1179         blockC += (ih >> 2) * cStride;
1180         for (; ih < h; ih++) {
1181             auto ihSubIndex = ih & 0x03;
1182             auto c = blockC + ihSubIndex;
1183             const float initValue = nullptr != bias ? bias[ih] : 0;
1184             float acc0 = initValue;
1185 
1186             const int lElement = *nnz++;
1187             for (auto il = 0; il < lElement; il++) {
1188                 const int diff = *dataOffset++;
1189                 const float a0 = a[0];
1190                 const float oneW = *w++;
1191 
1192                 // MNN_PRINT("1-loop: ie:%zu, a offset:%ld, c offset:%ld, w offset:%ld, w value:%f, a value[0]:", ie, a - A, w - B - 1, c - C, oneW);
1193                 // formatMatrix(a, {1});
1194                 // MNN_PRINT("\n");
1195                 a = a + diff;
1196                 acc0 += a0 * oneW;
1197             }
1198             acc0  = std::max(std::min(maxValue, acc0), minValue);
1199             // how to store faster: st4 / transpose /
1200             c[0] = acc0;
1201         }
1202         ie += 1;
1203         // a += 1;
1204     }
1205 
1206     return;
1207 }
1208 
1209 #endif
1210 
1211 #ifndef MNN_USE_SSE
1212 #ifndef MNN_USE_NEON
MNNTranspose32Bit(int32_t * dstO,const int32_t * srcO,int32_t * dim)1213 void MNNTranspose32Bit(int32_t* dstO, const int32_t* srcO, int32_t* dim) {
1214     int w = dim[0];
1215     int h = dim[1];
1216     int srcStride = dim[2];
1217     int dstStride = dim[3];
1218     for (int i=0; i<h; ++i) {
1219         auto si = srcO + i;
1220         auto di = dstO + i * dstStride;
1221         for (int j=0; j<w; ++j) {
1222             auto sj = si + j * srcStride;
1223             auto dj = di + j;
1224             *dj = *sj;
1225         }
1226     }
1227 }
1228 #endif
MNNFunctionInit()1229 void MNNFunctionInit() {
1230     // Do nothing
1231 }
1232 #endif
1233 
1234 #ifdef MNN_USE_NEON
1235 #include <arm_neon.h>
1236 #endif
1237 
1238 #define UNIT 4
1239 using Vec4 = MNN::Math::Vec<float, 4>;
1240 
1241 #ifndef MNN_USE_NEON
1242 
1243 #ifndef MNN_USE_SSE
1244 
MNNCopyC4WithStride(const float * source,float * dest,size_t srcStride,size_t dstStride,size_t count)1245 void MNNCopyC4WithStride(const float* source, float* dest, size_t srcStride, size_t dstStride, size_t count) {
1246     for (int i = 0; i < count; ++i) {
1247         auto s = source + i * srcStride;
1248         auto d = dest + i * dstStride;
1249         for (int j = 0; j < 4; ++j) {
1250             d[j] = s[j];
1251         }
1252     }
1253 }
1254 
MNNAddC4WithStride(const float * source,float * dest,size_t srcStride,size_t dstStride,size_t count)1255 void MNNAddC4WithStride(const float* source, float* dest, size_t srcStride, size_t dstStride, size_t count) {
1256     for (int i = 0; i < count; ++i) {
1257         auto s = source + i * srcStride;
1258         auto d = dest + i * dstStride;
1259         for (int j = 0; j < 4; ++j) {
1260             d[j] += s[j];
1261         }
1262     }
1263 }
1264 
MNNReluWithSlopeChannel(float * dst,const float * src,const float * slope,size_t sizeQuad,size_t depthQuad)1265 void MNNReluWithSlopeChannel(float* dst, const float* src, const float* slope, size_t sizeQuad, size_t depthQuad) {
1266     for (int j = 0; j < depthQuad; j++) {
1267         const float* slopeZ = slope + 4 * j;
1268         const float* srcZ   = src + 4 * j * sizeQuad;
1269         float* dstZ         = dst + 4 * j * sizeQuad;
1270         for (int i = 0; i < sizeQuad; i++) {
1271             for (int c = 0; c < 4; c++) {
1272                 if (srcZ[4 * i + c] < 0) {
1273                     dstZ[4 * i + c] = srcZ[4 * i + c] * slopeZ[c];
1274                 } else {
1275                     dstZ[4 * i + c] = srcZ[4 * i + c];
1276                 }
1277             }
1278         }
1279     }
1280 }
1281 
MNNPackC4(float * dst,const float * src,size_t area,size_t depth)1282 void MNNPackC4(float* dst, const float* src, size_t area, size_t depth) {
1283     int depthC4     = depth / 4;
1284     int depthRemain = depthC4 * 4;
1285     int remain      = depth - depthRemain;
1286     int z, x, y;
1287     const float* srcChannel[4];
1288     const float* srcOffset = src;
1289     for(z = 0; z < depthC4; ++z) {
1290         for(y = 0; y < 4; ++y) {
1291             srcChannel[y] = srcOffset + area * y;
1292         }
1293         for(x = 0; x < area; ++x) {
1294             for(y = 0; y < 4; ++y) {
1295                 dst[0] = srcChannel[y][0];
1296                 srcChannel[y]++;
1297                 dst++;
1298             }
1299         }
1300         srcOffset += area * 4;
1301     }
1302     if(remain > 0){
1303         for(y = 0; y < remain; ++y) {
1304             srcChannel[y] = srcOffset + area * y;
1305         }
1306         for(x = 0; x < area; ++x) {
1307             for(y = 0; y < remain; ++y) {
1308                 dst[0] = srcChannel[y][0];
1309                 srcChannel[y]++;
1310                 dst++;
1311             }
1312             for(y = remain; y < 4; ++y) {
1313                 dst[0] = 0;
1314                 dst++;
1315             }
1316         }
1317     }
1318 }
1319 
MNNUnpackC4(float * dst,const float * src,size_t area,size_t depth)1320 void MNNUnpackC4(float* dst, const float* src, size_t area, size_t depth) {
1321     int depthC4     = depth / 4;
1322     int depthRemain = depthC4 * 4;
1323     int remain      = depth - depthRemain;
1324     int z, x, y;
1325     const float* srcChannel[4];
1326     const float* srcOffset = src;
1327     for(z = 0; z < depthC4; ++z) {
1328         for(y = 0; y < 4; ++y) {
1329             srcChannel[y] = srcOffset + y;
1330             for(x = 0; x < area; ++x) {
1331                 dst[0] = srcChannel[y][0];
1332                 srcChannel[y] += 4;
1333                 dst++;
1334             }
1335         }
1336         srcOffset += area * 4;
1337     }
1338     if(remain > 0){
1339         for(y = 0; y < remain; ++y) {
1340             srcChannel[y] = srcOffset + y;
1341             for(x = 0; x < area; ++x) {
1342                 dst[0] = srcChannel[y][0];
1343                 srcChannel[y] += 4;
1344                 dst++;
1345             }
1346         }
1347     }
1348 }
1349 
MNNExpC8(float * dest,const float * source,const float * parameters,size_t countC8)1350 void MNNExpC8(float* dest, const float* source, const float* parameters, size_t countC8) {
1351     auto count = countC8 * 8;
1352     auto param = parameters[0];
1353     float xLimit = 87;
1354     for (int i = 0; i < count; ++i) {
1355         auto x         = -source[i];
1356         x = ALIMAX(x, -xLimit);
1357         x = ALIMIN(x, xLimit);
1358         int div        = (x * parameters[1]);
1359         int div2       = (div + 127) << 23;
1360         auto xReamin   = x - div * param;
1361         float expBasic = *(float*)(&div2);
1362         auto t = xReamin;
1363         auto expRemain =
1364             ((((parameters[7] * t + parameters[6]) * t + parameters[5]) * t + parameters[4]) * t + parameters[3]) * t +
1365             parameters[2];
1366         dest[i] = expBasic * expRemain;
1367     }
1368 }
1369 
MNNReluInt8(int8_t * dst,const int8_t * src,size_t size)1370 void MNNReluInt8(int8_t* dst, const int8_t* src, size_t size) {
1371     int i;
1372     for (i = 0; i < size; ++i) {
1373         if (src[i] < 0) {
1374             dst[i] = 0;
1375         } else {
1376             dst[i] = src[i];
1377         }
1378     }
1379 }
1380 #endif // no MNN_USE_SSE
1381 
MNNMaxFloat(float * input,float * maxBuffer,int32_t inputCountUnit)1382 void MNNMaxFloat(float* input, float* maxBuffer, int32_t inputCountUnit) {
1383     for (int i = 0; i < inputCountUnit; i++) {
1384         for (int j = 0; j < UNIT; j++) {
1385             for (int m = 0; m < 2; m++) {
1386                 maxBuffer[j] = std::max(input[i * UNIT * 2 + j * 2 + m], maxBuffer[j]);
1387             }
1388         }
1389     }
1390 }
MNNMinFloat(float * input,float * minBuffer,int32_t inputCountUnit)1391 void MNNMinFloat(float* input, float* minBuffer, int32_t inputCountUnit) {
1392     for (int i = 0; i < inputCountUnit; i++) {
1393         for (int j = 0; j < UNIT; j++) {
1394             for (int m = 0; m < 2; m++) {
1395                 minBuffer[j] = std::min(input[i * UNIT * 2 + j * 2 + m], minBuffer[j]);
1396             }
1397         }
1398     }
1399 }
MNNScaleAndAddBias(float * dst,const float * src,const float * bias,const float * alpha,size_t planeNumber,size_t biasNumber)1400 void MNNScaleAndAddBias(float* dst, const float* src, const float* bias, const float* alpha, size_t planeNumber,
1401                         size_t biasNumber) {
1402     for (int z = 0; z < biasNumber; ++z) {
1403         float* dstZ         = dst + planeNumber * 4 * z;
1404         const float* srcZ   = src + planeNumber * 4 * z;
1405         auto biasZ = Vec4::load(bias + 4 * z);
1406         auto alphaZ = Vec4::load(alpha + 4 * z);
1407         for (int p = 0; p < planeNumber; ++p) {
1408             float* dstX       = dstZ + 4 * p;
1409             const float* srcX = srcZ + 4 * p;
1410             Vec4::save(dstX, (Vec4::load(srcX) * alphaZ) + biasZ);
1411         }
1412     }
1413 }
1414 
1415 
1416 
MNNUInt8ToInt16WithOffsetC4Common(int16_t * dst,const uint8_t * src,size_t zeroPoint,size_t sizeQuad,size_t dstStride,size_t srcStride)1417 void MNNUInt8ToInt16WithOffsetC4Common(int16_t* dst, const uint8_t* src, size_t zeroPoint, size_t sizeQuad,
1418                                        size_t dstStride, size_t srcStride) {
1419     dstStride /= sizeof(int16_t);
1420     srcStride /= sizeof(uint8_t);
1421     for (int z = 0; z < sizeQuad; ++z) {
1422         auto dstZ = dst + dstStride * z;
1423         auto srcZ = src + srcStride * z;
1424         for (int j = 0; j < 4; ++j) {
1425             dstZ[j] = (int16_t)((int32_t)srcZ[j] - (int32_t)zeroPoint);
1426         }
1427     }
1428 }
1429 
MNNUInt8ToInt16WithOffsetC4Fast(int16_t * colAddr,const uint8_t * srcStart,size_t zeroPoint,size_t sizeQuad,size_t depthQuad,size_t dstZStep,size_t srcZStep)1430 void MNNUInt8ToInt16WithOffsetC4Fast(int16_t* colAddr, const uint8_t* srcStart, size_t zeroPoint, size_t sizeQuad,
1431                                      size_t depthQuad, size_t dstZStep, size_t srcZStep) {
1432     dstZStep /= sizeof(int16_t);
1433     srcZStep /= sizeof(uint8_t);
1434     for (int sz = 0; sz < depthQuad; ++sz) {
1435         auto dstZ = colAddr + sz * dstZStep;
1436         auto srcZ = srcStart + sz * srcZStep;
1437         MNNUInt8ToInt16WithOffsetC4Common(dstZ, srcZ, zeroPoint, sizeQuad, 4 * sizeof(int16_t), 4 * sizeof(uint8_t));
1438     }
1439 }
1440 
MNNPowC8(float * dest,const float * source,const float * powfParam,size_t betaInt,size_t countC8)1441 void MNNPowC8(float* dest, const float* source, const float* powfParam, size_t betaInt, size_t countC8) {
1442     const int count          = countC8 * 8;
1443     const float powfConstant = powfParam[6];
1444     for (int i = 0; i < count; ++i) {
1445         float result = 1, x, xInv = 1 / source[i];
1446         for (int j = 0; j < betaInt; result *= xInv, ++j)
1447             ;
1448         for (x = source[i]; x >= 1.25; x /= 1.5, result *= powfConstant)
1449             ;
1450         float t = x - 1;
1451         float powRemain =
1452             powfParam[0] +
1453             t * (powfParam[1] + t * (powfParam[2] + t * (powfParam[3] + t * (powfParam[4] + t * powfParam[5]))));
1454         result *= powRemain;
1455         dest[i] = result;
1456     }
1457 }
1458 
1459 
1460 #endif // no MNN_USE_NEON
1461 
1462 
MNNPackC4Uint8(uint8_t * dst,const uint8_t * src,size_t area,size_t depth)1463 void MNNPackC4Uint8(uint8_t* dst, const uint8_t* src, size_t area, size_t depth) {
1464     int z, x;
1465     int cur = 0;
1466     memset(dst, 0, area * UP_DIV(depth, 4) * 4 * sizeof(uint8_t));
1467     for (z = 0; z < depth; ++z) {
1468         int plane         = z / 4;
1469         uint8_t* dstPlane = plane * area * 4 + dst;
1470         int offset        = z % 4;
1471         for (x = 0; x < area; ++x) {
1472             dstPlane[4 * x + offset] = src[cur++];
1473         }
1474     }
1475 }
1476 
MNNUnpackC4Uint8(uint8_t * dst,const uint8_t * src,size_t area,size_t depth)1477 void MNNUnpackC4Uint8(uint8_t* dst, const uint8_t* src, size_t area, size_t depth) {
1478     int x;
1479     int z;
1480     int cur = 0;
1481     for (z = 0; z < depth; ++z) {
1482         int plane               = z / 4;
1483         const uint8_t* srcPlane = plane * area * 4 + src;
1484         int offset              = z % 4;
1485         for (x = 0; x < area; ++x) {
1486             dst[cur++] = srcPlane[4 * x + offset];
1487         }
1488     }
1489 }
1490 
MNNUnpackTransposeUint8(uint8_t * dst,const uint8_t * src,size_t area,size_t depth)1491 void MNNUnpackTransposeUint8(uint8_t* dst, const uint8_t* src, size_t area, size_t depth) {
1492     if (depth == 4) {
1493         ::memcpy(dst, src, area * depth * sizeof(uint8_t));
1494         return;
1495     }
1496 #ifdef MNN_USE_NEON
1497     if (depth == 3) {
1498         uint8x16x4_t rgba;
1499         rgba.val[3] = vdupq_n_u8(0);
1500         int sta     = 0;
1501         int staC16  = (int)area / 16;
1502         for (int i = 0; i < staC16; sta += 16, ++i) {
1503             auto rgb    = vld3q_u8(src + sta * 3);
1504             rgba.val[0] = rgb.val[0];
1505             rgba.val[1] = rgb.val[1];
1506             rgba.val[2] = rgb.val[2];
1507             vst4q_u8(dst + 4 * sta, rgba);
1508         }
1509         sta = staC16 * 16;
1510 
1511         for (; sta < area; ++sta) {
1512             auto s = src + sta * 3;
1513             auto d = dst + sta * 4;
1514             d[0]   = s[0];
1515             d[1]   = s[1];
1516             d[2]   = s[2];
1517             d[3]   = 0;
1518         }
1519 
1520         return;
1521     }
1522     if (depth == 1) {
1523         uint8x16x4_t rgba;
1524         rgba.val[1] = vdupq_n_u8(0);
1525         rgba.val[2] = vdupq_n_u8(0);
1526         rgba.val[3] = vdupq_n_u8(0);
1527         int sta     = 0;
1528         for (; sta < area; sta += 16) {
1529             rgba.val[0] = vld1q_u8(src + sta);
1530             vst4q_u8(dst + 4 * sta, rgba);
1531         }
1532 
1533         for (; sta < area; ++sta) {
1534             auto s = src + sta;
1535             auto d = dst + sta * 4;
1536             d[0]   = s[0];
1537             d[1]   = 0;
1538             d[2]   = 0;
1539             d[3]   = 0;
1540         }
1541 
1542         return;
1543     }
1544 #endif
1545     int c      = (int)depth;
1546     int cDiv4  = c / 4;
1547     int cAlign = cDiv4 * 4;
1548     for (int hi = 0; hi < area; ++hi) {
1549         auto srcHeight = (src + hi * c);
1550         auto dstHeight = (dst + hi * 4);
1551         for (int ci = 0; ci < cDiv4; ++ci) {
1552             for (int i = 0; i < 4; ++i) {
1553                 dstHeight[ci * area * 4 + i] = srcHeight[4 * ci + i];
1554             }
1555         }
1556     }
1557 
1558     if (cAlign == c) {
1559         return;
1560     }
1561 
1562     int cReamin   = c - cAlign;
1563     auto srcAlign = src + cAlign;
1564     auto dstAlign = dst + area * cAlign;
1565 
1566     for (int hi = 0; hi < area; ++hi) {
1567         auto srcHeight = srcAlign + hi * c;
1568         auto dstHeight = dstAlign + hi * 4;
1569         for (int i = 0; i < 4; ++i) {
1570             dstHeight[i] = 0;
1571         }
1572         for (int ci = 0; ci < cReamin; ++ci) {
1573             dstHeight[ci] = srcHeight[ci];
1574         }
1575     }
1576 }
1577 
MNNUnpackTranspose(float * dst,const float * src,size_t area,size_t depth)1578 void MNNUnpackTranspose(float* dst, const float* src, size_t area, size_t depth) {
1579 #ifdef MNN_USE_NEON
1580     if (1 == depth) {
1581         auto zeroValue = vmovq_n_f32(0.0f);
1582         int areaC4     = (int)area / 4;
1583         int remain     = areaC4 * 4;
1584         for (int i = 0; i < areaC4; ++i) {
1585             auto srcCur   = src + 4 * i;
1586             auto dstCur   = dst + 16 * i;
1587             auto srcValue = vld1q_f32(srcCur);
1588             float32x4x4_t dstValue;
1589             dstValue.val[0] = srcValue;
1590             dstValue.val[1] = zeroValue;
1591             dstValue.val[2] = zeroValue;
1592             dstValue.val[3] = zeroValue;
1593             vst4q_f32(dstCur, dstValue);
1594         }
1595         for (int i = remain; i < area; ++i) {
1596             dst[4 * i + 0] = src[i];
1597             dst[4 * i + 1] = 0.0f;
1598             dst[4 * i + 2] = 0.0f;
1599             dst[4 * i + 3] = 0.0f;
1600         }
1601         return;
1602     }
1603     if (3 == depth) {
1604         auto zeroValue = vmovq_n_f32(0.0f);
1605         int areaC4     = (int)area / 4;
1606         int remain     = areaC4 * 4;
1607         for (int i = 0; i < areaC4; ++i) {
1608             auto srcCur   = src + 12 * i;
1609             auto dstCur   = dst + 16 * i;
1610             auto srcValue = vld3q_f32(srcCur);
1611             float32x4x4_t dstValue;
1612             dstValue.val[0] = srcValue.val[0];
1613             dstValue.val[1] = srcValue.val[1];
1614             dstValue.val[2] = srcValue.val[2];
1615             dstValue.val[3] = zeroValue;
1616             vst4q_f32(dstCur, dstValue);
1617         }
1618         for (int i = remain; i < area; ++i) {
1619             dst[4 * i + 0] = src[3 * i + 0];
1620             dst[4 * i + 1] = src[3 * i + 1];
1621             dst[4 * i + 2] = src[3 * i + 2];
1622             dst[4 * i + 3] = 0.0f;
1623         }
1624         return;
1625     }
1626 #endif
1627     int c      = (int)depth;
1628     int cDiv4  = c / 4;
1629     int cAlign = cDiv4 * 4;
1630     for (int hi = 0; hi < area; ++hi) {
1631         const float* srcHeight = src + hi * c;
1632         float* dstHeight       = dst + hi * 4;
1633         for (int ci = 0; ci < cDiv4; ++ci) {
1634             Vec4::save(dstHeight + 4 * ci * area, Vec4::load(srcHeight + 4 * ci));
1635         }
1636     }
1637 
1638     if (cAlign == c) {
1639         return;
1640     }
1641 
1642     int cReamin   = c - cAlign;
1643     auto srcAlign = src + cAlign;
1644     auto dstAlign = dst + area * cAlign;
1645 
1646 #ifdef MNN_USE_NEON
1647     auto zeroVector = vdupq_n_f32(0.0f);
1648 #endif
1649 
1650     for (int hi = 0; hi < area; ++hi) {
1651         const float* srcHeight = srcAlign + hi * c;
1652         float* dstHeight       = dstAlign + hi * 4;
1653 #ifdef MNN_USE_NEON
1654         vst1q_f32(dstHeight, zeroVector);
1655 #else
1656         for (int i = 0; i < 4; ++i) {
1657             dstHeight[i] = 0;
1658         }
1659 #endif
1660         for (int ci = 0; ci < cReamin; ++ci) {
1661             dstHeight[ci] = srcHeight[ci];
1662         }
1663     }
1664 }
1665 
MNNPackTransposeUint8(uint8_t * dst,const uint8_t * src,size_t area,size_t depth)1666 void MNNPackTransposeUint8(uint8_t* dst, const uint8_t* src, size_t area, size_t depth) {
1667     if (1 == area) {
1668         ::memcpy(dst, src, depth * sizeof(uint8_t));
1669         return;
1670     }
1671     int c      = (int)depth;
1672     int cDiv4  = c / 4;
1673     int cAlign = cDiv4 * 4;
1674     if (cAlign == c) {
1675         int32_t* dst32       = (int32_t*)dst;
1676         const int32_t* src32 = (int32_t*)src;
1677         for (int hi = 0; hi < area; ++hi) {
1678             auto srcHeight = src32 + hi;
1679             auto dstHeight = dst32 + hi * cDiv4;
1680             for (int ci = 0; ci < cDiv4; ++ci) {
1681                 dstHeight[ci] = srcHeight[ci * area];
1682             }
1683         }
1684         return;
1685     }
1686 
1687     for (int hi = 0; hi < area; ++hi) {
1688         auto srcHeight = src + hi * 4;
1689         auto dstHeight = dst + hi * c;
1690         for (int ci = 0; ci < cDiv4; ++ci) {
1691             for (int i = 0; i < 4; ++i) {
1692                 dstHeight[ci * 4 + i] = srcHeight[4 * ci * area + i];
1693             }
1694         }
1695     }
1696 
1697     int cReamin   = c - cAlign;
1698     auto srcAlign = src + area * cAlign;
1699     auto dstAlign = dst + cAlign;
1700 
1701     for (int hi = 0; hi < area; ++hi) {
1702         auto srcHeight = srcAlign + hi * 4;
1703         auto dstHeight = dstAlign + hi * c;
1704 
1705         for (int ci = 0; ci < cReamin; ++ci) {
1706             dstHeight[ci] = srcHeight[ci];
1707         }
1708     }
1709 }
1710 
MNNPackTranspose(float * dst,const float * src,size_t area,size_t depth)1711 void MNNPackTranspose(float* dst, const float* src, size_t area, size_t depth) {
1712     int c      = (int)depth;
1713     int cDiv4  = c / 4;
1714     int cAlign = cDiv4 * 4;
1715     for (int hi = 0; hi < area; ++hi) {
1716         const float* srcHeight = src + hi * 4;
1717         float* dstHeight       = dst + hi * c;
1718         for (int ci = 0; ci < cDiv4; ++ci) {
1719             Vec4::save(dstHeight + 4 * ci, Vec4::load(srcHeight + 4 * ci * area));
1720         }
1721     }
1722 
1723     if (cAlign == c) {
1724         return;
1725     }
1726 
1727     int cReamin   = c - cAlign;
1728     auto srcAlign = src + area * cAlign;
1729     auto dstAlign = dst + cAlign;
1730 
1731     for (int hi = 0; hi < area; ++hi) {
1732         const float* srcHeight = srcAlign + hi * 4;
1733         float* dstHeight       = dstAlign + hi * c;
1734 
1735         for (int ci = 0; ci < cReamin; ++ci) {
1736             dstHeight[ci] = srcHeight[ci];
1737         }
1738     }
1739 }
1740 
MNNExp(float * dst,const float * src,size_t dataSize)1741 void MNNExp(float* dst, const float* src, size_t dataSize) {
1742     int countC8        = (int)dataSize / 8;
1743     if (countC8 > 0) {
1744         // Align to eight so asm is easier to write
1745         static float parameters[] = {
1746             (float)log(2.0f), 1.0f / (float)log(2.0f), 1.0f, 1.0f, 0.5f, 1.0f / 6.0f, 1.0f / 24.0f, 1.0f / 120.0f};
1747         MNNExpC8(dst, src, parameters, countC8);
1748     }
1749     int remain = countC8 * 8;
1750     auto param = log(2.0f);
1751     float xLimit = 87;
1752     for (int i = remain; i < dataSize; i++) {
1753         /*Origin Function*/
1754         //dst[i] = expf(-src[i]);
1755         /*Approciate Function*/
1756 
1757         auto x         = -src[i];
1758         x = ALIMAX(x, -xLimit);
1759         x = ALIMIN(x, xLimit);
1760 
1761         int div        = (x / param);
1762         int div2       = (div + 127) << 23;
1763         auto xReamin   = x - div * param;
1764         float expBasic = *(float*)(&div2);
1765 
1766         auto t         = xReamin;
1767         auto expRemain = ((((1.0f / 120 * t + 1.0f / 24) * t + 1.0f / 6) * t + 0.5f) * t + 1.0f) * t + 1.0f;
1768         dst[i]  = expBasic * expRemain;
1769     }
1770 }
1771 
1772 // Lambert's series with 7 divisions
1773 // reference from
1774 // https://varietyofsound.wordpress.com/2011/02/14/efficient-tanh-computation-using-lamberts-continued-fraction/
tanhf_poly(float value)1775 inline float tanhf_poly(float value) {
1776     if (value > 5.0) {
1777         return 1.0;
1778     } else if (value <= -5.0) {
1779         return -1.0;
1780     } else {
1781         float x2 = value * value;
1782         float a  = value * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
1783         float b  = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
1784         return a / b;
1785     }
1786 }
MNNTanh(float * dst,const float * src,size_t dataSize)1787 void MNNTanh(float* dst, const float* src, size_t dataSize) {
1788     /* Origin Code
1789     for (int i = 0; i < dataSize; i++) {
1790         // outputData[i] = 1 - 2 / (expf(2 * inputData[i]) + 1);
1791         dst[i] = tanhf_poly(src[i]);
1792     }
1793      */
1794     for (int i = 0; i < dataSize; ++i) {
1795         dst[i] = src[i] + src[i];
1796     }
1797     MNNExp(dst, dst, dataSize);
1798     for (int i = 0; i < dataSize; i++) {
1799         // outputData[i] = 1 - 2 / (expf(2 * inputData[i]) + 1);
1800         auto expX2 = dst[i];
1801         dst[i] = (1.0f - expX2) / (1.0f + expX2);
1802     }
1803 }
1804 
MNNReluWithSlope(float * dst,const float * src,size_t sizeQuad,float slope)1805 void MNNReluWithSlope(float* dst, const float* src, size_t sizeQuad, float slope) {
1806     float slopeValue[4];
1807     for (int i=0; i<4; ++i) {
1808         slopeValue[i] = slope;
1809     }
1810     MNNReluWithSlopeChannel(dst, src, slopeValue, sizeQuad, 1);
1811 }
1812 
MNNReluWithSlopeCommon(float * dst,const float * src,size_t size,float slope)1813 void MNNReluWithSlopeCommon(float* dst, const float* src, size_t size, float slope) {
1814     int sizeQuad = size / 4;
1815     int start = 0;
1816     if (sizeQuad > 0) {
1817         MNNReluWithSlope(dst, src, sizeQuad, slope);
1818         start = sizeQuad * 4;
1819     }
1820     for (int j = start; j < size; j++) {
1821         if (src[j] < 0) {
1822             dst[j] = src[j] * slope;
1823         } else {
1824             dst[j] = src[j];
1825         }
1826     }
1827 }
1828 
MNNHardSwishCommon(float * dst,const float * src,size_t size)1829 void MNNHardSwishCommon(float* dst, const float* src, size_t size) {
1830     int sizeQuad = size / 4;
1831     int start = 0;
1832 #ifdef MNN_USE_SSE
1833     if (sizeQuad > 0) {
1834         MNNHardSwish(dst, src, sizeQuad);
1835         start = sizeQuad * 4;
1836     }
1837 #endif
1838 #ifdef MNN_USE_NEON
1839     float32x4_t zero = vdupq_n_f32(0.f);
1840     float32x4_t three = vdupq_n_f32(3.f);
1841     float32x4_t six = vdupq_n_f32(6.f);
1842     float32x4_t divsix = vdupq_n_f32(1.0f/6.f);
1843     for (int i = 0; i < sizeQuad; i++) {
1844         auto x = vld1q_f32(src + 4 * i);
1845         auto y = vmulq_f32(vmulq_f32(x, vminq_f32(vmaxq_f32(vaddq_f32(x, three), zero), six)), divsix);
1846         vst1q_f32(dst + 4 * i, y);
1847     }
1848     start = sizeQuad * 4;
1849 #endif
1850     for (int j = start; j < size; j++) {
1851         if (src[j] <= -3) {
1852             dst[j] = 0;
1853         } else if (src[j] >= 3){
1854             dst[j] = src[j];
1855         } else {
1856             dst[j] = src[j] * (src[j] + 3) / 6.f;
1857         }
1858     }
1859 }
1860 
MNNGeluCommon(float * dst,const float * src,size_t size)1861 void MNNGeluCommon(float* dst, const float* src, size_t size) {
1862     int sizeQuad = size / 8;
1863     int start = 0;
1864 #ifdef MNN_USE_SSE
1865     if (sizeQuad > 0) {
1866         MNNGelu(dst, src, sizeQuad);
1867         start = sizeQuad * 8;
1868     }
1869 #endif
1870     auto tanhf_poly = [](float value) -> float {
1871         if (value > 5.0f) {
1872             return 1.0f;
1873         } else if (value <= -5.0f) {
1874             return -1.0f;
1875         } else {
1876             float x2 = value * value;
1877             float a  = value * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
1878             float b  = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
1879             return a / b;
1880         }
1881     };
1882     for (int i = start; i < size; i++) {
1883         float temp = 0.044715f * src[i] * src[i] * src[i];
1884         temp = 0.79788458f * (temp + src[i]);
1885         dst[i] = (1.0f + tanhf_poly(temp)) * src[i] * 0.5f;
1886     }
1887 }
1888 
MNNScaleAndAddBiasScalar(float * dst,const float * src,float bias,float alpha,size_t number)1889 void MNNScaleAndAddBiasScalar(float* dst, const float* src, float bias, float alpha, size_t number) {
1890     int numberC4 = (int)number / 4;
1891     int start = 0;
1892     if (numberC4 > 0) {
1893         float biasC4[4] = {
1894             bias,
1895             bias,
1896             bias,
1897             bias
1898         };
1899         float alphaC4[4] = {
1900             alpha,
1901             alpha,
1902             alpha,
1903             alpha
1904         };
1905         MNNScaleAndAddBias(dst, src, biasC4, alphaC4, numberC4, 1);
1906         start = numberC4 * 4;
1907     }
1908     for (int i=start; i<number; ++i) {
1909         dst[i] = src[i] * alpha + bias;
1910     }
1911 }
MNNAxByClamp(float * C,const float * A,const float * B,size_t width,size_t cStride,size_t aStride,size_t bStride,size_t height,const float * parameters)1912 void MNNAxByClamp(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t bStride, size_t height, const float* parameters) {
1913     int widthC4 = (int)width / 4;
1914     if (widthC4 > 0) {
1915         auto minF = Vec4(parameters[2]);
1916         auto maxF = Vec4(parameters[3]);
1917         auto alpha = Vec4(parameters[0]);
1918         auto beta = Vec4(parameters[1]);
1919         for (int y = 0; y < height; ++y) {
1920             auto a = A + aStride * y;
1921             auto b = B + bStride * y;
1922             auto c = C + cStride * y;
1923             for (int x = 0; x < width; ++x) {
1924                 auto av = Vec4::load(a + 4 * x);
1925                 auto bv = Vec4::load(b + 4 * x);
1926                 auto cv = av * alpha + bv * beta;
1927                 cv = Vec4::min(cv, maxF);
1928                 cv = Vec4::max(cv, minF);
1929                 Vec4::save(c + 4 * x, cv);
1930             }
1931         }
1932         width = width - 4*widthC4;
1933         C = C + widthC4 * 4;
1934         A = A + widthC4 * 4;
1935         B = B + widthC4 * 4;
1936     }
1937     if (width > 0) {
1938         auto minF = parameters[2];
1939         auto maxF = parameters[3];
1940         auto alpha = parameters[0];
1941         auto beta = parameters[1];
1942         for (int y = 0; y < height; ++y) {
1943             auto a = A + aStride * y;
1944             auto b = B + bStride * y;
1945             auto c = C + cStride * y;
1946             for (int x = 0; x < width; ++x) {
1947                 auto av = a[x];
1948                 auto bv = b[x];
1949                 auto cv = av * alpha + bv * beta;
1950                 cv = std::min(cv, maxF);
1951                 cv = std::max(cv, minF);
1952                 c[x] = cv;
1953             }
1954         }
1955     }
1956 }
1957 #ifndef MNN_USE_NEON
MNNAxByClampBroadcastUnit(float * C,const float * A,const float * B,size_t width,size_t cStride,size_t aStride,size_t height,const float * parameters)1958 void MNNAxByClampBroadcastUnit(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters) {
1959     auto minF = Vec4(parameters[2]);
1960     auto maxF = Vec4(parameters[3]);
1961     auto beta = Vec4(parameters[1]);
1962     for (int y = 0; y < height; ++y) {
1963         auto a = A + aStride * y;
1964         auto b = B + 4 * y;
1965         auto bv = Vec4::load(b);
1966         auto c = C + cStride * y;
1967         for (int x = 0; x < width; ++x) {
1968             auto av = Vec4::load(a + 4 * x);
1969             auto cv = av + bv * beta;
1970             cv = Vec4::min(cv, maxF);
1971             cv = Vec4::max(cv, minF);
1972             Vec4::save(c + 4 * x, cv);
1973         }
1974     }
1975 }
MNNVectorTop1Float(float * input,float * maxValue,int32_t * maxIndex,size_t inputCountUnit)1976 void MNNVectorTop1Float(float* input, float* maxValue, int32_t* maxIndex, size_t inputCountUnit) {
1977     float maxV = input[0];
1978     int maxIdx = 0;
1979     for (int i = 0; i < inputCountUnit; i++) {
1980         int offset = i * UNIT;
1981         for (int j = 0; j < UNIT; j++) {
1982             if (input[offset + j] > maxV) {
1983                 maxV = input[offset + j];
1984                 maxIdx = offset + j;
1985             }
1986         }
1987     }
1988     maxValue[0] = maxV;
1989     maxIndex[0] = maxIdx;
1990 }
1991 
MNNVectorTop1Int32(int32_t * input,int32_t * maxValue,int32_t * maxIndex,size_t inputCountUnit)1992 void MNNVectorTop1Int32(int32_t* input, int32_t* maxValue, int32_t* maxIndex, size_t inputCountUnit) {
1993     int32_t maxV = input[0];
1994     int maxIdx = 0;
1995     for (int i = 0; i < inputCountUnit; i++) {
1996         int offset = i * UNIT;
1997         for (int j = 0; j < UNIT; j++) {
1998             if (input[offset + j] > maxV) {
1999                 maxV = input[offset + j];
2000                 maxIdx = offset + j;
2001             }
2002         }
2003     }
2004     maxValue[0] = maxV;
2005     maxIndex[0] = maxIdx;
2006 }
2007 
2008 #endif
2009 
MNNComputeMatMulForE_1(const float * A,const float * B,float * C,const float * biasPtr,const MatMulParam * param,size_t tId)2010 void MNNComputeMatMulForE_1(const float* A, const float* B, float* C, const float* biasPtr, const MatMulParam* param, size_t tId) {
2011     auto l = param->l;
2012     auto h = param->h;
2013     auto numberThread = param->numberThread;
2014     auto lC4 = l / 4;
2015     auto lR = lC4 * 4;
2016     if (param->BTranspose) {
2017         for (int y=tId; y<h; y+=numberThread) {
2018             Vec4 sumValue = Vec4(0.0f);
2019             auto by = B + y * l;
2020             for (int x=0; x<lC4; ++x) {
2021                 sumValue = sumValue + Vec4::load(A + x * 4) * Vec4::load(by + x * 4);
2022             }
2023             float sumRemain = 0.0f;
2024             for (int x=lR; x<l; ++x) {
2025                 sumRemain = sumRemain + A[x] * by[x];
2026             }
2027             if (nullptr != biasPtr) {
2028                 sumRemain += biasPtr[y];
2029             }
2030             C[y] = sumRemain + sumValue[0] + sumValue[1] + sumValue[2] + sumValue[3];
2031         }
2032     } else {
2033         auto hC4 = h / 4;
2034         auto hR = hC4 * 4;
2035         for (int y=tId; y<hC4; y+=numberThread) {
2036             auto bs = B + 4 * y;
2037             Vec4 sumValue = Vec4(0.0f);
2038             if (biasPtr != nullptr) {
2039                 sumValue = Vec4::load(biasPtr + 4 * y);
2040             }
2041             auto srcY = A + y * l;
2042             for (int x=0; x<l; ++x) {
2043                 sumValue = sumValue + Vec4(A[x]) * Vec4::load(bs + h * x);
2044             }
2045             Vec4::save(C + 4 * y, sumValue);
2046         }
2047         for (int y=hR + tId; y<h; y+=numberThread) {
2048             auto bs = B + y;
2049             float sumValue = 0.0f;
2050             if (biasPtr != nullptr) {
2051                 sumValue = biasPtr[y];
2052             }
2053             auto srcY = A + y * l;
2054             for (int x=0; x<l; ++x) {
2055                 sumValue = sumValue + A[x] * bs[h * x];
2056             }
2057             C[y] = sumValue;
2058         }
2059     }
2060 }
2061 
MNNComputeMatMulForH_1(const float * A,const float * B,float * C,const float * biasPtr,const MatMulParam * param,size_t tId)2062 void MNNComputeMatMulForH_1(const float* A, const float* B, float* C, const float* biasPtr, const MatMulParam* param, size_t tId) {
2063     int e = param->e;
2064     int l = param->l;
2065     int numberThread = param->numberThread;
2066     if (param->ATranspose) {
2067         float biasValue = 0.0f;
2068         if (nullptr != biasPtr) {
2069             biasValue = *biasPtr;
2070         }
2071         auto eC4 = e / 4;
2072         auto eR = eC4 * 4;
2073         for (int y=tId; y<eC4; y+=numberThread) {
2074             Vec4 sumValue = Vec4(biasValue);
2075             auto srcY = A + y * 4;
2076             for (int x=0; x<l; ++x) {
2077                 sumValue = sumValue + Vec4::load(srcY + x * e) * Vec4(B[x]);
2078             }
2079             Vec4::save(C + 4 * y, sumValue);
2080         }
2081         if (0 == tId) {
2082             for (int y=eR; y<e; ++y) {
2083                 float sumValue = biasValue;
2084                 auto srcY = A + y;
2085                 for (int x=0; x<l; ++x) {
2086                     sumValue = sumValue + srcY[x * e] * B[x];
2087                 }
2088                 C[y] = sumValue;
2089             }
2090         }
2091         return;
2092     }
2093     float biasValue = 0.0f;
2094     if (nullptr != biasPtr) {
2095         biasValue = *biasPtr;
2096     }
2097     auto lC4 = l / 4;
2098     auto lR = lC4 * 4;
2099     for (int y=tId; y<e; y+=numberThread) {
2100         Vec4 sumValue = Vec4(biasValue);
2101         auto srcY = A + y * l;
2102         for (int x=0; x<lC4; ++x) {
2103             sumValue = sumValue + Vec4::load(srcY + 4 * x) * Vec4::load(B + 4 * x);
2104         }
2105         float sumSingle = sumValue[0] + sumValue[1] + sumValue[2] + sumValue[3];
2106         for (int x=lR; x<l; ++x) {
2107             sumSingle += srcY[x] * B[x];
2108         }
2109         C[y] = sumSingle;
2110     }
2111 }
2112 
MNNPackC4Int16(int16_t * dst,const int16_t * src,size_t area,size_t depth)2113 void MNNPackC4Int16(int16_t* dst, const int16_t* src, size_t area, size_t depth) {
2114     int z, x;
2115     int cur = 0;
2116     memset(dst, 0, area * UP_DIV(depth, 4) * 4 * sizeof(int16_t));
2117     for (z = 0; z < depth; ++z) {
2118         int plane       = z / 4;
2119         int16_t* dstPlane = plane * area * 4 + dst;
2120         int offset      = z % 4;
2121         for (x = 0; x < area; ++x) {
2122             dstPlane[4 * x + offset] = src[cur++];
2123         }
2124     }
2125 }
2126 
MNNUnpackC4Int16(int16_t * dst,const int16_t * src,size_t area,size_t depth)2127 void MNNUnpackC4Int16(int16_t* dst, const int16_t* src, size_t area, size_t depth) {
2128     int x;
2129     int z;
2130     int cur = 0;
2131     for (z = 0; z < depth; ++z) {
2132         int plane             = z / 4;
2133         const int16_t* srcPlane = plane * area * 4 + src;
2134         int offset            = z % 4;
2135         for (x = 0; x < area; ++x) {
2136             dst[cur++] = srcPlane[4 * x + offset];
2137         }
2138     }
2139 }
2140 
MNNUnpackTransposeInt16(int16_t * dst,const int16_t * src,size_t area,size_t depth)2141 void MNNUnpackTransposeInt16(int16_t* dst, const int16_t* src, size_t area, size_t depth) {
2142     if (depth == 4) {
2143         ::memcpy(dst, src, area * depth * sizeof(int16_t));
2144         return;
2145     }
2146     int c      = (int)depth;
2147     int cDiv4  = c / 4;
2148     int cAlign = cDiv4 * 4;
2149     for (int hi = 0; hi < area; ++hi) {
2150         auto srcHeight = (src + hi * c);
2151         auto dstHeight = (dst + hi * 4);
2152         for (int ci = 0; ci < cDiv4; ++ci) {
2153             for (int i = 0; i < 4; ++i) {
2154                 dstHeight[ci * area * 4 + i] = srcHeight[4 * ci + i];
2155             }
2156         }
2157     }
2158 
2159     if (cAlign == c) {
2160         return;
2161     }
2162 
2163     int cReamin   = c - cAlign;
2164     auto srcAlign = src + cAlign;
2165     auto dstAlign = dst + area * cAlign;
2166 
2167     for (int hi = 0; hi < area; ++hi) {
2168         auto srcHeight = srcAlign + hi * c;
2169         auto dstHeight = dstAlign + hi * 4;
2170         for (int i = 0; i < 4; ++i) {
2171             dstHeight[i] = 0;
2172         }
2173         for (int ci = 0; ci < cReamin; ++ci) {
2174             dstHeight[ci] = srcHeight[ci];
2175         }
2176     }
2177 }
MNNPackTransposeInt16(int16_t * dst,const int16_t * src,size_t area,size_t depth)2178 void MNNPackTransposeInt16(int16_t* dst, const int16_t* src, size_t area, size_t depth) {
2179     if (1 == area) {
2180         ::memcpy(dst, src, depth * sizeof(int16_t));
2181         return;
2182     }
2183     int c      = (int)depth;
2184     int cDiv4  = c / 4;
2185     int cAlign = cDiv4 * 4;
2186     if (cAlign == c) {
2187         int64_t* dst32       = (int64_t*)dst;
2188         const int64_t* src32 = (int64_t*)src;
2189         for (int hi = 0; hi < area; ++hi) {
2190             auto srcHeight = src32 + hi;
2191             auto dstHeight = dst32 + hi * cDiv4;
2192             for (int ci = 0; ci < cDiv4; ++ci) {
2193                 dstHeight[ci] = srcHeight[ci * area];
2194             }
2195         }
2196         return;
2197     }
2198 
2199     for (int hi = 0; hi < area; ++hi) {
2200         auto srcHeight = src + hi * 4;
2201         auto dstHeight = dst + hi * c;
2202         for (int ci = 0; ci < cDiv4; ++ci) {
2203             for (int i = 0; i < 4; ++i) {
2204                 dstHeight[ci * 4 + i] = srcHeight[4 * ci * area + i];
2205             }
2206         }
2207     }
2208 
2209     int cReamin   = c - cAlign;
2210     auto srcAlign = src + area * cAlign;
2211     auto dstAlign = dst + cAlign;
2212 
2213     for (int hi = 0; hi < area; ++hi) {
2214         auto srcHeight = srcAlign + hi * 4;
2215         auto dstHeight = dstAlign + hi * c;
2216 
2217         for (int ci = 0; ci < cReamin; ++ci) {
2218             dstHeight[ci] = srcHeight[ci];
2219         }
2220     }
2221 }
2222 
MNNCopyC4Int16WithStride(const float * sourceF,float * destF,size_t srcStride,size_t dstStride,size_t count)2223 void MNNCopyC4Int16WithStride(const float* sourceF, float* destF, size_t srcStride, size_t dstStride, size_t count) {
2224     auto source = (int16_t*)sourceF;
2225     auto dest = (int16_t*)destF;
2226     for (int i = 0; i < count; ++i) {
2227         auto s = source + i * srcStride;
2228         auto d = dest + i * dstStride;
2229         *(int64_t*)(d) = *((int64_t*)s);
2230     }
2231 }
2232 
2233 
MNNSin(float * dst,const float * src,size_t dataSize)2234 void MNNSin(float* dst, const float* src, size_t dataSize) {
2235     for (int i = 0; i < dataSize; i++) {
2236         dst[i] = sinf(src[i]);
2237     }
2238 }
2239 
MNNSigmoid(float * dst,const float * src,size_t dataSize)2240 void MNNSigmoid(float* dst, const float* src, size_t dataSize) {
2241     MNNExp(dst, src, dataSize);
2242     for (int i = 0; i < dataSize; ++i) {
2243         dst[i] = 1.0f / (1.0f + dst[i]);
2244     }
2245 }
2246 
2247 /**
2248  Modified from https://github.com/alibaba/MNN/pull/1359
2249  Thanks for https://github.com/hroken
2250  */
MNNSigmoidLowp(float * dst,const float * src,size_t dataSize)2251 void MNNSigmoidLowp(float* dst, const float* src, size_t dataSize) {
2252     MNNExp(dst, src, dataSize);
2253 #ifdef MNN_USE_NEON
2254     int dataC4 = (int)dataSize / 4;
2255     if(dataC4 > 0) {
2256         // neon optimization for sigmid cpu
2257         float32x4_t value = vdupq_n_f32(1.0f);
2258         float32x4_t out = vld1q_f32(dst);
2259         for (int i = 1; i < dataC4; ++i) {
2260             out = vrecpeq_f32(vaddq_f32(value,out));
2261             vst1q_f32(dst ,out);
2262             dst += 4;
2263             out = vld1q_f32(dst);
2264         }
2265         out = vrecpeq_f32(vaddq_f32(value,out));
2266         vst1q_f32(dst, out);
2267         dataSize = dataSize - 4 * dataC4;
2268     }
2269 #endif
2270     for (int i = 0; i < dataSize; ++i) {
2271         dst[i] = 1.0f / (1.0f + dst[i]);
2272     }
2273 }
2274 
MNNMultiAndDestTransformCommon23(float ** cacheLine,const float * weigth,float * dest,int cacheLineSize,int ow,const float * bias,const float * parameters)2275 void MNNMultiAndDestTransformCommon23(float **cacheLine, const float *weigth, float *dest, int cacheLineSize, int ow, const float* bias, const float* parameters) {
2276     int unit = ow / 2;
2277     MNN_ASSERT(cacheLineSize >= 1);
2278     auto biasF = Vec4::load(bias);
2279     auto minF = Vec4(parameters[2]);
2280     auto maxF = Vec4(parameters[3]);
2281     for (int x = 0; x < unit; ++x) {
2282         auto offset = 4 * 4 * x;
2283         int i = 0;
2284         Vec4 m0     = Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0);
2285         Vec4 m1     = Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1);
2286         Vec4 m2     = Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2);
2287         Vec4 m3     = Vec4::load(weigth + i * 16 + 4 * 3) * Vec4::load(cacheLine[i] + offset + 4 * 3);
2288 
2289         for (i = 1; i < cacheLineSize; ++i) {
2290             m0 = m0 + Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0);
2291             m1 = m1 + Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1);
2292             m2 = m2 + Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2);
2293             m3 = m3 + Vec4::load(weigth + i * 16 + 4 * 3) * Vec4::load(cacheLine[i] + offset + 4 * 3);
2294         }
2295 
2296         auto o0 = m0 + m1 + m2 + biasF;
2297         auto o1 = m1 - m2 + m3 + biasF;
2298         o0 = Vec4::min(maxF, o0);
2299         o1 = Vec4::min(maxF, o1);
2300         o0 = Vec4::max(minF, o0);
2301         o1 = Vec4::max(minF, o1);
2302         Vec4::save(dest + 8 * x + 0 * 4, o0);
2303         Vec4::save(dest + 8 * x + 1 * 4, o1);
2304     }
2305     if (unit * 2 < ow) {
2306         auto offset = 4 * 4 * unit;
2307         int i = 0;
2308         Vec4 m0     = Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0);
2309         Vec4 m1     = Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1);
2310         Vec4 m2     = Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2);
2311 
2312         for (i = 1; i < cacheLineSize; ++i) {
2313             m0 = m0 + Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0);
2314             m1 = m1 + Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1);
2315             m2 = m2 + Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2);
2316         }
2317         auto o0 = m0 + m1 + m2 + biasF;
2318         o0 = Vec4::min(maxF, o0);
2319         o0 = Vec4::max(minF, o0);
2320         Vec4::save(dest + 8 * unit + 0 * 4, o0);
2321     }
2322 }
2323 extern "C" {
2324 void MNNConvDwF23SourceTransUnit(const float *source, float *dest, size_t unit);
2325 }
2326 
MNNSourceTransformCommonF23(const float * source,float * dest,int unit,int iw,int pad,int su,int eu)2327 void MNNSourceTransformCommonF23(const float *source, float *dest, int unit, int iw, int pad, int su, int eu) {
2328     for (int x = 0; x < su; ++x) {
2329         auto dstX = dest + 4 * 4 * x;
2330         auto sx   = x * 2 - (int)pad;
2331         auto ex   = sx + 4;
2332 
2333         auto clampSx = std::max(sx, 0);
2334         auto clampEx = std::min(ex, (int)iw);
2335 
2336         Vec4 v[4] = {0.0f, 0.0f, 0.0f, 0.0f};
2337         for (int i = clampSx; i < clampEx; ++i) {
2338             v[i - sx] = Vec4::load(source + 4 * i);
2339         }
2340         auto m0 = v[0] - v[2];
2341         auto m1 = v[1] + v[2];
2342         auto m2 = v[2] - v[1];
2343         auto m3 = v[3] - v[1];
2344 
2345         Vec4::save(dstX + 4 * 0, m0);
2346         Vec4::save(dstX + 4 * 1, m1);
2347         Vec4::save(dstX + 4 * 2, m2);
2348         Vec4::save(dstX + 4 * 3, m3);
2349     }
2350     MNNConvDwF23SourceTransUnit(source + 4 * (su * 2 - pad), dest + 4 * 4 * su, eu - su);
2351 
2352     for (int x = eu; x < unit; ++x) {
2353         auto dstX = dest + 4 * 4 * x;
2354         auto sx   = x * 2 - (int)pad;
2355         auto ex   = sx + 4;
2356 
2357         auto clampSx = std::max(sx, 0);
2358         auto clampEx = std::min(ex, (int)iw);
2359 
2360         Vec4 v[4] = {0.0f, 0.0f, 0.0f, 0.0f};
2361         for (int i = clampSx; i < clampEx; ++i) {
2362             v[i - sx] = Vec4::load(source + 4 * i);
2363         }
2364         auto m0 = v[0] - v[2];
2365         auto m1 = v[1] + v[2];
2366         auto m2 = v[2] - v[1];
2367         auto m3 = v[3] - v[1];
2368 
2369         Vec4::save(dstX + 4 * 0, m0);
2370         Vec4::save(dstX + 4 * 1, m1);
2371         Vec4::save(dstX + 4 * 2, m2);
2372         Vec4::save(dstX + 4 * 3, m3);
2373     }
2374 }
2375 
2376 #ifndef MNN_USE_NEON
MNNConvDwF23MulTransUnit(float ** cacheLine,const float * weigth,float * dest,size_t ow,const float * bias,const float * parameters)2377 void MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigth, float *dest, size_t ow, const float* bias, const float* parameters) {
2378     int unit = ow / 2;
2379     auto w00 = Vec4::load(weigth + 0 * 16 + 4 * 0);
2380     auto w01 = Vec4::load(weigth + 0 * 16 + 4 * 1);
2381     auto w02 = Vec4::load(weigth + 0 * 16 + 4 * 2);
2382     auto w03 = Vec4::load(weigth + 0 * 16 + 4 * 3);
2383     auto w10 = Vec4::load(weigth + 1 * 16 + 4 * 0);
2384     auto w11 = Vec4::load(weigth + 1 * 16 + 4 * 1);
2385     auto w12 = Vec4::load(weigth + 1 * 16 + 4 * 2);
2386     auto w13 = Vec4::load(weigth + 1 * 16 + 4 * 3);
2387     auto w20 = Vec4::load(weigth + 2 * 16 + 4 * 0);
2388     auto w21 = Vec4::load(weigth + 2 * 16 + 4 * 1);
2389     auto w22 = Vec4::load(weigth + 2 * 16 + 4 * 2);
2390     auto w23 = Vec4::load(weigth + 2 * 16 + 4 * 3);
2391     auto biasF = Vec4::load(bias);
2392     auto minF = Vec4(parameters[2]);
2393     auto maxF = Vec4(parameters[3]);
2394     for (int x = 0; x < unit; ++x) {
2395         auto offset = 4 * 4 * x;
2396         int i = 0;
2397         Vec4 m0     = w00 * Vec4::load(cacheLine[0] + offset + 4 * 0);
2398         Vec4 m1     = w01 * Vec4::load(cacheLine[0] + offset + 4 * 1);
2399         Vec4 m2     = w02 * Vec4::load(cacheLine[0] + offset + 4 * 2);
2400         Vec4 m3     = w03 * Vec4::load(cacheLine[0] + offset + 4 * 3);
2401 
2402         m0 = m0 + w10 * Vec4::load(cacheLine[1] + offset + 4 * 0);
2403         m1 = m1 + w11 * Vec4::load(cacheLine[1] + offset + 4 * 1);
2404         m2 = m2 + w12 * Vec4::load(cacheLine[1] + offset + 4 * 2);
2405         m3 = m3 + w13 * Vec4::load(cacheLine[1] + offset + 4 * 3);
2406 
2407         m0 = m0 + w20 * Vec4::load(cacheLine[2] + offset + 4 * 0);
2408         m1 = m1 + w21 * Vec4::load(cacheLine[2] + offset + 4 * 1);
2409         m2 = m2 + w22 * Vec4::load(cacheLine[2] + offset + 4 * 2);
2410         m3 = m3 + w23 * Vec4::load(cacheLine[2] + offset + 4 * 3);
2411 
2412         auto o0 = m0 + m1 + m2 + biasF;
2413         auto o1 = m1 - m2 + m3 + biasF;
2414         o0 = Vec4::min(maxF, o0);
2415         o1 = Vec4::min(maxF, o1);
2416         o0 = Vec4::max(minF, o0);
2417         o1 = Vec4::max(minF, o1);
2418         Vec4::save(dest + 8 * x + 0 * 4, o0);
2419         Vec4::save(dest + 8 * x + 1 * 4, o1);
2420     }
2421     if (unit * 2 < ow) {
2422         auto offset = 4 * 4 * unit;
2423         Vec4 m0     = w00 * Vec4::load(cacheLine[0] + offset + 4 * 0);
2424         Vec4 m1     = w01 * Vec4::load(cacheLine[0] + offset + 4 * 1);
2425         Vec4 m2     = w02 * Vec4::load(cacheLine[0] + offset + 4 * 2);
2426 
2427         m0 = m0 + w10 * Vec4::load(cacheLine[1] + offset + 4 * 0);
2428         m1 = m1 + w11 * Vec4::load(cacheLine[1] + offset + 4 * 1);
2429         m2 = m2 + w12 * Vec4::load(cacheLine[1] + offset + 4 * 2);
2430 
2431         m0 = m0 + w20 * Vec4::load(cacheLine[2] + offset + 4 * 0);
2432         m1 = m1 + w21 * Vec4::load(cacheLine[2] + offset + 4 * 1);
2433         m2 = m2 + w22 * Vec4::load(cacheLine[2] + offset + 4 * 2);
2434         auto o0 = m0 + m1 + m2 + biasF;
2435         o0 = Vec4::min(maxF, o0);
2436         o0 = Vec4::max(minF, o0);
2437         Vec4::save(dest + 8 * unit + 0 * 4, o0);
2438     }
2439 }
MNNConvDwF23SourceTransUnit(const float * source,float * dest,size_t unit)2440 void MNNConvDwF23SourceTransUnit(const float *source, float *dest, size_t unit) {
2441     if (unit <= 0) {
2442         return;
2443     }
2444     Vec4 v0 = Vec4::load(source + 4 * 0);
2445     Vec4 v1 = Vec4::load(source + 4 * 1);
2446     Vec4 v2;
2447     Vec4 v3;
2448     source += 8;
2449 
2450     for (int x = 0; x < unit; ++x) {
2451         v2 = Vec4::load(source + 0 * 4);
2452         v3 = Vec4::load(source + 1 * 4);
2453         auto m0 = v0 - v2;
2454         auto m1 = v1 + v2;
2455         auto m2 = v2 - v1;
2456         auto m3 = v3 - v1;
2457 
2458         Vec4::save(dest + 4 * 0, m0);
2459         Vec4::save(dest + 4 * 1, m1);
2460         Vec4::save(dest + 4 * 2, m2);
2461         Vec4::save(dest + 4 * 3, m3);
2462 
2463         source += 8;
2464         dest += 16;
2465 
2466         v0 = v2;
2467         v1 = v3;
2468     }
2469 }
2470 #endif
2471 
2472 namespace MNN {
2473 
2474 static CoreFunctions* gCoreFunction = nullptr;
2475 
MNNCoreFunctionInit()2476 void MNNCoreFunctionInit() {
2477     gCoreFunction = new CoreFunctions;
2478     // MatMul
2479     gCoreFunction->MNNGetMatMulPackMode = MNNGetMatMulPackMode;
2480     gCoreFunction->MNNGetSparseMatMulPackMode = MNNGetSparseMatMulPackMode;
2481     gCoreFunction->MNNPackC4ForMatMul_A = MNNPackC4ForMatMul_A;
2482     gCoreFunction->MNNPackForMatMul_B = MNNPackForMatMul_B;
2483     gCoreFunction->MNNPackedMatMul = MNNPackedMatMul;
2484     gCoreFunction->MNNPackedMatMulRemain = MNNPackedMatMulRemain;
2485 
2486     gCoreFunction->MNNPackForSparseMatMul_B = MNNPackForSparseMatMul_B; // sparse packing B
2487     gCoreFunction->MNNPackedSparseMatMulEpx4 = MNNPackedSparseMatMulEpx4;
2488     gCoreFunction->MNNPackedSparseMatMulEpx1 = MNNPackedSparseMatMulEpx1;
2489 
2490     gCoreFunction->MNNComputeMatMulForE_1 = MNNComputeMatMulForE_1;
2491     gCoreFunction->MNNComputeMatMulForH_1 = MNNComputeMatMulForH_1;
2492 
2493 
2494     // Lowp
2495     gCoreFunction->MNNFp32ToLowp = nullptr;
2496     gCoreFunction->MNNLowpToFp32 = nullptr;
2497     gCoreFunction->bytes = 4;// sizeof(float)
2498 
2499     // Packed Function
2500     gCoreFunction->pack = 4;
2501     gCoreFunction->MNNPackCUnit = MNNPackC4;
2502     gCoreFunction->MNNUnpackCUnit = MNNUnpackC4;
2503 
2504     // FIXME: MNNPackTranspose and MNNUnpackTranspose is reverted
2505     gCoreFunction->MNNUnpackCUnitTranspose = MNNPackTranspose;
2506     gCoreFunction->MNNPackCUnitTranspose = MNNUnpackTranspose;
2507     gCoreFunction->MNNAxByClampBroadcastUnit = MNNAxByClampBroadcastUnit;
2508     gCoreFunction->MNNConvRunForLineDepthwise = MNNConvRunForLineDepthwise;
2509     gCoreFunction->MNNConvRunForUnitDepthWise = MNNConvRunForUnitDepthWise;
2510     gCoreFunction->MNNSourceTransformCommonF23 = MNNSourceTransformCommonF23;
2511     gCoreFunction->MNNConvDwF23MulTransUnit = MNNConvDwF23MulTransUnit;
2512     gCoreFunction->MNNMultiAndDestTransformCommon23 = MNNMultiAndDestTransformCommon23;
2513     gCoreFunction->MNNMatrixAdd = MNNMatrixAdd;
2514     gCoreFunction->MNNMatrixSub = MNNMatrixSub;
2515     gCoreFunction->MNNStrassenMergeCFunction = MNNStrassenMergeCFunction;
2516     gCoreFunction->penalty = 1.5f;
2517     gCoreFunction->MNNScaleAndAddBias = MNNScaleAndAddBias;
2518     gCoreFunction->MNNAddC4WithStride = MNNAddC4WithStride;
2519     gCoreFunction->MNNCopyC4WithStride = MNNCopyC4WithStride;
2520 
2521     gCoreFunction->chooseWinoSourceTransform = WinogradFunction::chooseSourceTransform;
2522     gCoreFunction->chooseWinoDestTransform = WinogradFunction::chooseDestTransform;
2523     gCoreFunction->MNNDeconvRunForLineDepthwise = MNNDeconvRunForLineDepthwise;
2524     gCoreFunction->MNNDeconvRunForUnitDepthWise = MNNDeconvRunForUnitDepthWise;
2525     gCoreFunction->MNNSelectBinaryFunctionForFloat = CPUBinary::selectForFloat;
2526     gCoreFunction->MNNSelectUnaryFunctionForFloat = CPUUnary::selectForFloat;
2527     gCoreFunction->MNNReluWithSlopeChannel = MNNReluWithSlopeChannel;
2528     gCoreFunction->MNNPoolingAvg = (decltype(gCoreFunction->MNNPoolingAvg))(poolingAvg<float, Vec4, 4>);
2529     // Set min value as 1 << 24
2530     gCoreFunction->MNNPoolingMax = (decltype(gCoreFunction->MNNPoolingMax))(poolingMax<float, Vec4, 4, -16777216>);
2531 
2532 #ifdef MNN_USE_ARMV82
2533     cpuinfo_arm_isa gCPUInfo;
2534     cpuinfo_arm_init(&gCPUInfo);
2535     gCoreFunction->supportFp16arith = gCPUInfo.fp16arith;
2536     gCoreFunction->supportSDot = gCPUInfo.dot;
2537 #endif
2538     MNNFunctionInit();
2539 }
MNNGetCoreFunctions()2540 CoreFunctions* MNNGetCoreFunctions() {
2541     return gCoreFunction;
2542 }
2543 };
2544