1//
2//  MNNConvRunForUnitDepthWise.S
3//  MNN
4//
5//  Created by MNN on 2019/02/04.
6//  Copyright © 2018, Alibaba Group Holding Limited
7//
8
9#ifdef __arm__
10#ifndef __aarch64__
11
12#include "MNNAsmGlobal.h"
13
14.text
15.align 5
16
asm_function MNNConvRunForUnitDepthWise
// void MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight,
//                                 size_t fw, size_t fh, size_t weight_y_step,
//                                 size_t dilate_x_step, size_t dilate_y_step)
//
// Accumulates, over an fh x fw window, the element-wise product of four
// floats read from src with four floats read from weight, and stores the
// four sums to dst. If fw or fh is zero, four zeros are stored.
//
// The three step arguments are multiplied by 4 (bytes per float) before
// being used as pointer increments.
//
// Register arguments: r0 = dst, r1 = src, r2 = weight, r3 = fw
// Stack arguments:    fh, weight_y_step, dilate_x_step, dilate_y_step
//
// Register roles inside the routine:
//   r3  fw (read only)
//   r4  fh, counts the rows still to process
//   r5  weight row fix-up in bytes
//   r6  dilate_x_step in bytes
//   r7  src row fix-up in bytes
//   r8  columns still to process in the current row
//   r9  address of the first stack argument
//   q0  accumulator, q1 = src vector, q2 = weight vector

    // Seven words are saved (28 bytes), so the first stack argument
    // is found at sp + 28 afterwards.
    push    {r4-r9, lr}

    // Fetch the four stack arguments in one go:
    // r4 = fh, r5 = weight_y_step, r6 = dilate_x_step, r7 = dilate_y_step
    add     r9, sp, #28
    ldmia   r9, {r4-r7}

    // Clear the accumulator before the early exits so that an empty
    // window still writes zeros to dst.
    veor    q0, q0, q0

    cmp     r3, #0
    beq     DepthWiseUnitStore
    cmp     r4, #0
    beq     DepthWiseUnitStore

    // Convert the element steps to byte steps.
    lsl     r5, r5, #2
    lsl     r6, r6, #2
    lsl     r7, r7, #2

    // The column loop advances src by fw * dilate_x_step bytes per row;
    // keep only the remainder needed to reach the next row.
    mls     r7, r3, r6, r7

    // The column loop advances weight by fw * 16 bytes per row;
    // keep only the remainder needed to reach the next row.
    sub     r5, r5, r3, lsl #4

DepthWiseUnitLoopY:
    // Restart the column counter for this row.
    mov     r8, r3

DepthWiseUnitLoopX:
    // src moves by dilate_x_step bytes, weight by one 16-byte vector.
    vld1.32 {q1}, [r1], r6
    vld1.32 {q2}, [r2]!
    vmla.f32 q0, q1, q2
    subs    r8, r8, #1
    bne     DepthWiseUnitLoopX

    // Step both pointers to the start of the next row.
    add     r1, r1, r7
    add     r2, r2, r5
    subs    r4, r4, #1
    bne     DepthWiseUnitLoopY

DepthWiseUnitStore:
    vst1.32 {q0}, [r0]

    // Restore the saved registers and return by loading the saved lr into pc.
    pop     {r4-r9, pc}
72
73#endif
74#endif
75