//
//  MNNWinogradMatrixProductRight.S
//  MNN
//
//  Created by MNN on 2018/08/22.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#ifdef __aarch64__

#include "MNNAsmGlobal.h"

.text
.align 5

asm_function MNNWinogradMatrixProductRight
//void MNNWinogradMatrixProductRight(const float* S, const float* B, float* M, size_t w, size_t h, size_t k, size_t length);
//
// A "unit" is `length` consecutive float32x4 vectors (length * 16 bytes).
// For every y in [0, h) and x in [0, w) one output unit of M is cleared and
// then accumulates, for i in [0, k):
//     M_unit += S_unit(y, i) * b(x, i)
// S_unit(y, i) are consecutive units of the current S row (k units per row),
// b(x, i) is the float at B + (x + i * h) * sizeof(float).
// Output units are written back to back, x fastest.
//
// The k loop is unrolled by 4, then by 3, then handled one step at a time.
//
//Auto: x0: S, x1:B, x2: M, x3:w, x4:h, x5:k, x6:length
//
// Register roles:
//   x0  : current S unit pointer
//   x1  : current B pointer (reused as the 2nd S unit pointer in K4/K3)
//   x2  : current M unit pointer
//   x3  : remaining columns   (reused as the 3rd S unit pointer in K4/K3)
//   x4  : remaining rows      (reused as the 4th S unit pointer in K4)
//   x5  : k, x6 : length
//   x8  : unit size in bytes          = length * 16
//   x9  : S step per row in bytes     = k * x8
//   x10 : B step per k in bytes       = h * sizeof(float)
//   x11 : vectors left in the current unit
//   x12 : k steps left for the current output unit
//   v4-v7 : lanes used to save/restore general registers (no stack is used)
//   v0, v31 : B scalars; v16, v17 : accumulators; v20, v21, v1 : S data
//   v30 : zero
// Only caller-saved registers are written.

// Nothing to do for an empty problem. Every loop below decrements before it
// tests, so a zero count would wrap around instead of terminating.
cbz x3, LReturn // w == 0
cbz x4, LReturn // h == 0
cbz x6, LReturn // length == 0

//unit step in bytes
mov x8, #16 // 4*sizeof(float)
mul x8, x6, x8

//srcYUnitStep (bytes)
mul x9, x5, x8

//B's step (bytes)
mov x10, #4
mul x10, x4, x10

LoopY:
    // Remember B base and the column count for the next row
    mov v4.d[0], x1
    mov v4.d[1], x3
    LoopX:
        // Remember the row start of S and the current B column
        mov v5.d[0], x0
        mov v5.d[1], x1

        // Clear the output unit, then rewind x2 to its start
        movi v30.4s, #0
        mov x11, x6
        LoopUnitSetZero:
            st1 {v30.4s}, [x2], #16
            subs x11, x11, #1
            bne LoopUnitSetZero
        sub x2, x2, x8
        mov x12, x5

        LK4:
        cmp x12, #4
        blt LK3
        // x3 / x4 become S pointers inside the loop: save the counters
        mov v6.d[0], x3
        mov v6.d[1], x4
        LoopK4:
            // Gather four B scalars, one per k step
            ld1 {v0.s}[0], [x1], x10
            ld1 {v0.s}[1], [x1], x10
            ld1 {v0.s}[2], [x1], x10
            ld1 {v0.s}[3], [x1], x10
            mov x11, x6
            mov v7.d[0], x1 // B pointer for the next k group

            // x0, x1, x3, x4 -> four consecutive S units
            add x1, x0, x8
            add x3, x1, x8
            add x4, x3, x8

            LoopUnitK4:
                ld1 {v16.4s}, [x2]
                ld1 {v20.4s}, [x0], #16
                fmla v16.4s, v20.4s, v0.s[0]
                ld1 {v21.4s}, [x1], #16
                fmul v17.4s, v21.4s, v0.s[1]
                ld1 {v20.4s}, [x3], #16
                fmla v16.4s, v20.4s, v0.s[2]
                ld1 {v21.4s}, [x4], #16
                fmla v17.4s, v21.4s, v0.s[3]

                fadd v17.4s, v16.4s, v17.4s
                st1 {v17.4s}, [x2], #16
                subs x11, x11, #1
                bne LoopUnitK4
            sub x2, x2, x8
            sub x12, x12, #4
            mov x0, x4 // x4 ended on the unit after the four just consumed

            mov x1, v7.d[0]
            cmp x12, #4
            bge LoopK4
        mov x3, v6.d[0]
        mov x4, v6.d[1]

        LK3:
        cmp x12, #3
        blt LK1
        // x3 becomes an S pointer inside the loop: save the column counter
        mov v6.d[0], x3
        LoopK3:
            // Gather three B scalars, one per k step
            ld1 {v0.s}[0], [x1], x10
            ld1 {v0.s}[1], [x1], x10
            ld1 {v0.s}[2], [x1], x10
            mov x11, x6
            mov v7.d[0], x1 // B pointer for the next k group

            // x0, x1, x3 -> three consecutive S units
            add x1, x0, x8
            add x3, x1, x8

            LoopUnitK3:
                ld1 {v16.4s}, [x2]
                ld1 {v20.4s}, [x0], #16
                fmla v16.4s, v20.4s, v0.s[0]
                ld1 {v21.4s}, [x1], #16
                fmul v17.4s, v21.4s, v0.s[1]
                ld1 {v20.4s}, [x3], #16
                fmla v16.4s, v20.4s, v0.s[2]

                fadd v17.4s, v16.4s, v17.4s
                st1 {v17.4s}, [x2], #16
                subs x11, x11, #1
                bne LoopUnitK3
            sub x2, x2, x8
            sub x12, x12, #3
            // x3 ended on the unit after the three just consumed.
            // (x4 still holds the row counter here, it is not a pointer.)
            mov x0, x3
            mov x1, v7.d[0]
            cmp x12, #3
            bge LoopK3
        mov x3, v6.d[0]

        LK1:
        cmp x12, #0
        beq LKEnd

        LoopK:
            ld1 {v31.s}[0], [x1], x10

            dup v31.4s, v31.s[0]
            mov x11, x6
            LoopUnit:
                ld1 {v0.4s}, [x2]
                ld1 {v1.4s}, [x0], #16
                fmla v0.4s, v1.4s, v31.4s

                st1 {v0.4s}, [x2], #16
                subs x11, x11, #1
                bne LoopUnit
            subs x12, x12, #1

            sub x2, x2, x8 // sub leaves the flags from subs untouched
            bne LoopK
        LKEnd:
        // Back to the row start of S, next B column, next output unit
        mov x0, v5.d[0]
        mov x1, v5.d[1]
        subs x3, x3, #1
        add x2, x2, x8
        add x1, x1, #4 //sizeof(float)

        bne LoopX
    // Next row: B restarts at its base, S advances by k units
    mov x1, v4.d[0]
    mov x3, v4.d[1]
    add x0, x0, x9

    subs x4, x4, #1
    bne LoopY

LReturn:
    ret

#endif