1//
2//  NEON_MNNConvRunForUnitDepthWise_BF16.S
3//  MNN
4//
5//  Created by MNN on 2021/03/09.
6//  Copyright © 2018-2021 Alibaba Group Holding Limited
7//
8
9#ifdef __arm__
10#ifndef __aarch64__
11
12#include "MNNAsmGlobal.h"
13
14.text
15.align 5
16
asm_function NEON_MNNConvRunForUnitDepthWise_BF16
// void NEON_MNNConvRunForUnitDepthWise_BF16(float* dst, const float* src, const float* weight,
//         size_t fw, size_t fh, size_t weight_y_step, size_t dilate_x_step, size_t dilate_y_step)
//
// Target: AArch32 + NEON (file is guarded by __arm__ / !__aarch64__).
//
// Accumulates an fh x fw window into a single 4-lane result:
//     acc[0..3] = 0
//     for y in [0, fh): for x in [0, fw):
//         acc[0..3] += widen(src_tap[0..3]) * widen(weight_tap[0..3])
//     dst[0..3] = narrow(acc[0..3])
// NOTE: the prototype says float*, but all three buffers are accessed as
// 16-bit elements (bf16 = upper half of an f32). widen() puts the 16 bits in
// the upper half of a 32-bit lane; narrow() keeps the upper 16 bits of each
// lane without rounding.
//
// All step arguments are counted in 16-bit elements:
//   dilate_x_step : distance between consecutive src taps in a row
//   dilate_y_step : distance between the first src taps of consecutive rows
//   weight_y_step : distance between the first weight taps of consecutive rows
//                   (weight taps inside a row are contiguous, 4 elements each)
// If fw == 0 or fh == 0, four zero elements are written to dst.
//
// In:    r0 = dst, r1 = src, r2 = weight, r3 = fw
//        stack: fh, weight_y_step, dilate_x_step, dilate_y_step
// Regs:  r4 = fw, r5 = remaining rows, r6 = weight row fixup (bytes),
//        r7 = src tap stride (bytes), r8 = src row fixup (bytes),
//        r9 = scratch / remaining taps in the current row
// Clobb: r1, r2, q0-q2, flags (r4-r9 are saved and restored)

    push    {r4-r9, lr}                 // 7 words = 28 bytes, so stack args start at [sp, #28]

    mov     r4, r3                      // r4 = fw
    ldr     r5, [sp, #28]               // r5 = fh
    ldr     r6, [sp, #32]               // r6 = weight_y_step
    ldr     r7, [sp, #36]               // r7 = dilate_x_step
    ldr     r8, [sp, #40]               // r8 = dilate_y_step

    vmov.i32 q0, #0                     // acc = 0; this is also the result for an empty window
    cmp     r4, #0
    beq     UnitEnd
    cmp     r5, #0
    beq     UnitEnd

    // Convert the element-count steps to byte offsets (2 bytes per element).
    mov     r6, r6, lsl #1              // r6 = weight_y_step in bytes
    mov     r7, r7, lsl #1              // r7 = dilate_x_step in bytes
    mov     r8, r8, lsl #1              // r8 = dilate_y_step in bytes

    // The inner loop advances src by fw * dilate_x_step per row, so only the
    // remainder has to be added at the end of each row.
    mul     r9, r4, r7
    sub     r8, r8, r9                  // r8 = dilate_y_step - fw * dilate_x_step (bytes)

    // Likewise each tap consumes 4 weights = 4 * 2 = 8 bytes.
    sub     r6, r6, r4, lsl #3          // r6 = weight_y_step - fw * 8 (bytes)

UnitLoopH:
    mov     r9, r4                      // r9 = taps left in this row (> 0 here)
UnitLoopW:
    vld1.16 {d2}, [r1], r7              // 4 src elements, then step to the next tap
    vld1.16 {d4}, [r2]!                 // 4 weight elements, pointer += 8 bytes
    vshll.s16 q1, d2, #16               // 16-bit -> upper half of 32-bit lane
    vshll.s16 q2, d4, #16

    vmla.f32 q0, q1, q2                 // acc += src * weight (f32 lanes)
    subs    r9, r9, #1
    bne     UnitLoopW

    subs    r5, r5, #1
    add     r1, r1, r8                  // plain add: keeps the flags from subs for bne
    add     r2, r2, r6
    bne     UnitLoopH

UnitEnd:
    vshrn.i32 d0, q0, #16               // keep the upper 16 bits of each lane (truncating)
    vst1.16 {d0}, [r0]                  // store 4 results

    pop     {r4-r9, pc}
75
76#endif
77#endif
78