//
//  NEON_MNNAxByClampBroadcastC4_BF16.S
//  MNN
//
//  Created by MNN on 2021/03/09.
//  Copyright © 2018-2021 Alibaba Group Holding Limited
//
8
#ifdef __arm__
#ifndef __aarch64__

#include "MNNAsmGlobal.h"

.text
.align 5

asm_function NEON_MNNAxByClampBroadcastC4_BF16
// void NEON_MNNAxByClampBroadcastC4_BF16(float* C, const float* A, const float* B,
//                                        size_t width, size_t cStride, size_t aStride,
//                                        size_t height, const float* parameters)
//
// For each of `height` rows, one group of four 16-bit values is read from B and
// then, for each of `width` groups of four 16-bit values in the A row:
//
//     C = min(max(A + B * parameters[1], parameters[2]), parameters[3])
//
// Although the prototype is written with float*, every load/store of A, B and C
// below is 16 bits per element and both strides are scaled by 2 bytes. Each
// 16-bit input is widened by a left shift of 16 so that it occupies the upper
// half of a 32-bit lane (the bf16 layout implied by the function name), the
// arithmetic is done with f32 instructions, and the result is narrowed by
// keeping only the upper 16 bits of each lane (truncation, no rounding).
// `parameters` is read as four 32-bit values; parameters[0] is loaded but unused.
//
// Register usage:
//   r0  C write cursor            r1  A read cursor
//   r2  B read cursor (advances 8 bytes per row)
//   r3  width  (groups of four elements per row)
//   r4  cStride, converted to bytes     r5  aStride, converted to bytes
//   r6  rows remaining                  r7  parameters
//   r8  start of current C row          r9  start of current A row
//   r11 groups remaining in current row r12 scratch
//   q3  parameters[0..3]   q13 widened B row
//   q14 parameters[2] in all lanes (applied with vmax)
//   q15 parameters[3] in all lanes (applied with vmin)
//
// If height is 0 the function returns without touching A, B, C or parameters.

    push {r4-r11, lr}           // 9 words pushed, so stack arguments start at sp + 36
    ldr r4, [sp, #36]           // cStride
    ldr r5, [sp, #40]           // aStride
    ldr r6, [sp, #44]           // height
    ldr r7, [sp, #48]           // parameters

    // The row loop below is bottom-tested: entering it with height == 0 would
    // process one row and then wrap the counter, so bail out up front.
    cmp r6, #0
    beq End

    vld1.32 {q3}, [r7]          // d6 = parameters[0..1], d7 = parameters[2..3]
    vdup.f32 q14, d7[0]
    vdup.f32 q15, d7[1]
    mov r12, #2 //sizeof(int16_t)
    mul r4, r12, r4             // strides: 16-bit elements -> bytes
    mul r5, r12, r5

LoopY:
    mov r8, r0                  // remember the row bases; the cursors are
    mov r9, r1                  // post-incremented by the inner loop
    vld1.16 {d26}, [r2]!        // four 16-bit B values for this row
    vshll.s16 q13, d26, #16     // move them into the upper halves of 32-bit lanes
    mov r11, r3

L1:
    cmp r11, #0
    beq EndLine                 // width == 0: skip the bottom-tested inner loop

L1Loop:
    vld1.16 {d0}, [r1]!         // four 16-bit A values
    vshll.s16 q0, d0, #16
    vmla.f32 q0, q13, d6[1]     // q0 = A + B * parameters[1]
    vmax.f32 q0, q0, q14
    vmin.f32 q0, q0, q15
    vshrn.i32 d0, q0, #16       // keep the upper 16 bits of every lane
    vst1.16 {d0}, [r0]!
    subs r11, r11, #1
    bne L1Loop

EndLine:
    add r0, r8, r4              // next C row = row base + cStride bytes
    add r1, r9, r5              // next A row = row base + aStride bytes

    subs r6, r6, #1
    bne LoopY

End:
    pop {r4-r11, pc}

#endif
#endif
68