1 //
2 //  FunctionSummary.hpp
3 //  MNN
4 //
5 //  Created by MNN on 2019/08/25.
6 //  Copyright © 2018, Alibaba Group Holding Limited
7 //
8 
9 #if defined(_MSC_VER)
10 #include <intrin.h>
11 #else
12 #include <x86intrin.h>
13 #endif
14 #include <MNN/MNNDefine.h>
15 #include <stdint.h>
16 
17 #ifndef _MM_TRANSPOSE4_PS
18 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
19     do {                                          \
20         __m128 tmp3, tmp2, tmp1, tmp0;            \
21         tmp0   = _mm_unpacklo_ps((row0), (row1)); \
22         tmp2   = _mm_unpacklo_ps((row2), (row3)); \
23         tmp1   = _mm_unpackhi_ps((row0), (row1)); \
24         tmp3   = _mm_unpackhi_ps((row2), (row3)); \
25         (row0) = _mm_movelh_ps(tmp0, tmp2);       \
26         (row1) = _mm_movehl_ps(tmp2, tmp0);       \
27         (row2) = _mm_movelh_ps(tmp1, tmp3);       \
28         (row3) = _mm_movehl_ps(tmp3, tmp1);       \
29     } while (0)
30 #endif
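// Usage sketch (illustrative only, not part of this header's API): transpose
// a 4x4 float tile held in four SSE registers in place. `m` is a hypothetical
// row-major buffer of at least 16 floats; after the macro each register holds
// one column of the original tile.
//
//   __m128 r0 = _mm_loadu_ps(m + 0);
//   __m128 r1 = _mm_loadu_ps(m + 4);
//   __m128 r2 = _mm_loadu_ps(m + 8);
//   __m128 r3 = _mm_loadu_ps(m + 12);
//   _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
//   _mm_storeu_ps(m + 0, r0); // r0..r3 now hold columns 0..3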
31 #include "backend/cpu/compute/Int8FunctionsOpt.h"
32 #include "backend/cpu/compute/CommonOptFunction.h"
33 
34 // ========= CommonOptFunction.cpp ===========
35 extern "C" {
36 
37 void _AVX_MNNAxByClampBroadcastUnit(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters);
38 void _AVX_MNNGemmFloatCommon_4(float* dst, const float* src, const float* weight, size_t src_depth_quad,
39                                size_t dst_step, size_t dst_depth_quad, size_t width, size_t weight_depth_offset);
40 void _AVX_MNNGemmFloatUnit_4(float* dstOrigin, const float* src, const float* weight, size_t src_depth_quad,
41                              size_t dst_step, size_t dst_depth_quad, size_t weight_depth_offset);
42 void _AVX_MNNMatrixAdd(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride,
43                        size_t bStride, size_t height);
44 void _AVX_MNNMatrixSub(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride,
45                        size_t bStride, size_t height);
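// MNNMatrixAdd/MNNMatrixSub presumably compute element-wise C = A +/- B over
// `height` rows of `widthC4` packed 4-float columns, with per-matrix row
// strides given in floats. A minimal scalar sketch of the add, assuming that
// layout:
//
//   for (size_t y = 0; y < height; ++y) {
//       for (size_t x = 0; x < widthC4 * 4; ++x) {
//           C[y * cStride + x] = A[y * aStride + x] + B[y * bStride + x];
//       }
//   }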
46 void _AVX_MNNStrassenMergeCFunction(float* c11, float* c12, float* c21, float* c22, float* xAddr, size_t cStride,
47                                     size_t length, size_t hSub);
48 
49 void _AVX_MNNPackedMatMul(float* C, const float* A, const float* B, const size_t* parameter,
50                           const float* postParameters, const float* bias);
51 void _AVX_MNNPackedMatMulRemain(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter,
52                                 const float* postParameters, const float* bias);
53 void _AVX_MNNPackC4ForMatMul_A(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el);
54 
void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
                                     size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
                                     size_t srcHStep, size_t dstHStep);
58 void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst);
59 void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit_Fast(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst);
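// "16x4" presumably names the packing unit (16 input channels by 4 output
// channels per tile); `post` carries the bias/scale/clamp parameters applied
// after int32 accumulation. The _Fast variant is presumably a specialization
// that is valid only under additional constraints on the quantized weights.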
60 
61 void _AVX_MNNExpC8(float* dest, const float* source, const float* parameters, size_t countC8);
62 void _AVX_MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const float* scalep, ssize_t minV, ssize_t maxV, ssize_t zeroPoint);
63 void _AVX_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, size_t sizeQuad, ssize_t zeroPoint);
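// Quantization pair: MNNFloat2Int8 presumably maps sizeQuad * 4 floats to
// int8 as q = clamp(round(x * scale) + zeroPoint, minV, maxV), and
// MNNInt8ScaleToFloat applies the inverse, x = (q - zeroPoint) * scale.
// Per-element sketch under that assumption (scale indexing elided):
//
//   int32_t q = (int32_t)lroundf(src[i] * scalep[/* per-channel index */]) + (int32_t)zeroPoint;
//   dst[i] = (int8_t)(q < minV ? minV : (q > maxV ? maxV : q));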
64 void _AVX_MNNLineDepthWiseInt8AddBiasScaleUnit(int8_t* dstO, const int8_t* srcO, const int8_t* weightO, const QuanPostTreatParameters* parameters, size_t width, size_t src_w_step, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step);
65 void _AVX_MNNComputeMatMulForE_1(const float* A, const float* B, float* C, const float* biasPtr, const MatMulParam* param, size_t tId);
66 
67 void _AVX_MNNPackC4ForMatMul_A_BF16(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el);
68 
void _AVX_MNNGetMatMulPackMode_BF16(int* eP, int* lP, int* hP);
70 void _AVX_MNNPackForMatMul_B_BF16(float* dest, const float* source, size_t h, size_t l, bool transpose);
71 void _AVX_MNNPackedSparseMatMul(float* C, const float* A, const float* B, unsigned int* NNZMap, int* dataOffsetMap, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias);
72 void _AVX_MNNReluWithSlopeChannel(float* dst, const float* src, const float* slope, size_t sizeQuad, size_t depthQuad);
73 void _AVX_MNNComputeMatMulForH_1(const float* A, const float* B, float* C, const float* biasPtr, const MatMulParam* param, size_t tId);
74 
75 void _AVX_MNNPackCUnit(float* dst, const float* src, size_t area, size_t depth);
76 void _AVX_MNNUnpackCUnit(float* dst, const float* src, size_t area, size_t depth);
77 void _AVX_MNNPackCUnitTranspose(float* dst, const float* src, size_t area, size_t depth);
78 void _AVX_MNNUnpackCUnitTranspose(float* dst, const float* src, size_t area, size_t depth);
79 void _AVX_MNNPackForMatMul_B(float* dest, const float* source, size_t h, size_t l, bool transpose);
81 void _AVX_MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
82                                      size_t weight_y_step, size_t dilateX_step, size_t dilateY_step);
void _AVX_MNNMultiAndDestTransformCommon23(float **cacheLine, const float *weight, float *dest, int cacheLineSize, int ow, const float* bias, const float* parameter);
84 void _AVX_MNNSourceTransformCommonF23(const float *source, float *dest, int unit, int iw, int pad, int su, int eu);
void _AVX_MNNConvDwF23MulTransUnit(float **cacheLine, const float *weight, float *dest, size_t ow, const float* bias, const float* parameter);
86 
87 void _AVX_ExtraInit(void* functions);
88 void _AVX_WinogradInit(void* functions);
89 void _AVX_MNNCopyC4WithStride(const float* source, float* dest, size_t srcStride, size_t dstStride, size_t count);
90 void _AVX_MNNAddC4WithStride(const float* source, float* dest, size_t srcStride, size_t dstStride, size_t count);
91 void _AVX_MNNScaleAndAddBias(float* dst, const float* src, const float* bias, const float* alpha, size_t planeNumber, size_t biasNumber);
92 void _AVX_MNNDeconvRunForUnitDepthWise(const float* dst, float* src, const float* weight, size_t fw, size_t fh,
93                                        size_t weight_y_step, size_t dilateX_step, size_t dilateY_step);
94 void _AVX_MNNDeconvRunForLineDepthwise(const float* dst, float* src, const float* weight, size_t width, size_t src_w_setup,
95                                        size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step);
96 
97 void _AVX_MNNGelu(float *dst, const float *src, size_t size);
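// Presumably the tanh approximation of GELU applied elementwise to `size`
// values, i.e. roughly:
//
//   dst[i] = 0.5f * src[i] * (1.0f + tanhf(0.7978845608f * (src[i] + 0.044715f * src[i] * src[i] * src[i])));
//
// (0.7978845608 ~= sqrt(2/pi)); the kernel's exact constants may differ.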
98 }
99