1//
2//  MNNConvRunForUnitDepthWise.S
3//  MNN
4//
5//  Created by MNN on 2019/02/04.
6//  Copyright © 2018, Alibaba Group Holding Limited
7//
8
9#ifdef __aarch64__
10
11#include "MNNAsmGlobal.h"
12
13.text
14.align 5
15
asm_function MNNConvRunForUnitDepthWise
// void MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight,
//                                 size_t fw, size_t fh, size_t weight_y_step,
//                                 size_t dilate_x_step, size_t dilate_y_step)
//
// Multiply-accumulates four float lanes of src against four float lanes of
// weight for every tap of an fh x fw window and stores the four sums to dst.
// If fw or fh is zero, four zeros are stored.
//
// Arguments:
//   x0: dst            x1: src             x2: weight         x3: fw
//   x4: fh             x5: weight_y_step   x6: dilate_x_step  x7: dilate_y_step
// The three step arguments are scaled by 4 here (element count -> bytes for float).
//
// Scratch: x10 (column counter), v0 (accumulator), v1 / v2 (src / weight lanes), flags.
// x1, x2, x4, x5, x6, x7 are modified in place.

    movi    v0.4s, #0                   // accumulator starts at zero, also the result for an empty window
    cbz     x3, UnitEnd                 // fw == 0: nothing to accumulate
    cbz     x4, UnitEnd                 // fh == 0: nothing to accumulate

    // Convert the steps from float counts to byte offsets.
    lsl     x5, x5, #2                  // weight_y_step * 4
    lsl     x6, x6, #2                  // dilate_x_step * 4
    lsl     x7, x7, #2                  // dilate_y_step * 4

    // The inner loop post-increments src by dilate_x_step and weight by 16 bytes
    // on every tap, so the per-row steps are reduced by what one row already consumed.
    msub    x7, x3, x6, x7              // dilate_y_step -= fw * dilate_x_step
    sub     x5, x5, x3, lsl #4          // weight_y_step -= fw * 16 (four floats per tap)

UnitLoopH:
    mov     x10, x3                     // x10 = taps left in this row
UnitLoopW:
    ld1     {v1.4s}, [x1], x6           // v1 = src[0..3], src += dilate_x_step
    ld1     {v2.4s}, [x2], #16          // v2 = weight[0..3], weight += 4 floats
    fmla    v0.4s, v1.4s, v2.4s         // v0 += v1 * v2 (per lane)
    subs    x10, x10, #1
    bne     UnitLoopW

    add     x1, x1, x7                  // src moves to the start of the next window row
    add     x2, x2, x5                  // weight moves to the start of the next weight row
    subs    x4, x4, #1                  // one row done
    bne     UnitLoopH

UnitEnd:
    st1     {v0.4s}, [x0]               // dst[0..3] = accumulated sums
    ret
62
63#endif
64