//
//  NEON_MNNAxByClampBroadcastC4_BF16.S
//  MNN
//
//  Created by MNN on 2021/03/09.
//  Copyright © 2018-2021 Alibaba Group Holding Limited
//

#ifdef __arm__
#ifndef __aarch64__

#include "MNNAsmGlobal.h"

.text
.align 5

asm_function NEON_MNNAxByClampBroadcastC4_BF16
//void NEON_MNNAxByClampBroadcastC4_BF16(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters)
//
// Target: AArch32 + NEON, GNU-as syntax, preprocessed (.S).
//
// Data are 16-bit elements (bf16 according to the function name). Each element
// is widened to a 32-bit float lane by shifting it into the upper 16 bits, and
// narrowed back by keeping only the upper 16 bits (truncation, no rounding).
//
// For every row y in [0, height):
//     b = widen(4 elements at B), then B advances by 4 elements
//     for every x in [0, width):
//         a = widen(4 elements at A), then A advances by 4 elements
//         v = a + b * parameters[1]
//         v = min(max(v, parameters[2]), parameters[3])
//         store narrow(v) at C, then C advances by 4 elements
//     C = rowStartC + cStride elements, A = rowStartA + aStride elements
//
// parameters[0] is loaded together with the other three floats but never used.
// cStride / aStride are counted in 16-bit elements and converted to bytes here.
//
//Auto: r0: C, r1:A, r2:B, r3:width
//r4:cStride, r5:aStride, r6:height, r7:parameters
// Scratch: r8 = row start of C, r9 = row start of A, r11 = remaining width,
//          r12 = element size in bytes.
// NEON:    q0 = working vector, q3 = parameters, q13 = widened B row,
//          q14 = lower clamp bound, q15 = upper clamp bound.
//          Callee-saved d8-d15 are not touched.
    push {r4-r11, lr}           // 9 registers = 36 bytes, so stack args start at sp + 36
    ldr r4, [sp, #36]           // cStride
    ldr r5, [sp, #40]           // aStride
    ldr r6, [sp, #44]           // height
    ldr r7, [sp, #48]           // parameters

    // The row loop below is bottom-tested; without this guard height == 0 would
    // wrap the counter to 0xFFFFFFFF and run far past the end of every buffer.
    cmp r6, #0
    beq End

    vld1.32 {q3}, [r7]          // d6 = parameters[0..1], d7 = parameters[2..3]
    vdup.f32 q14, d7[0]         // lower clamp bound in every lane
    vdup.f32 q15, d7[1]         // upper clamp bound in every lane
    mov r12, #2                 //sizeof(int16_t)
    mul r4, r12, r4             // cStride: elements -> bytes
    mul r5, r12, r5             // aStride: elements -> bytes

LoopY:
    mov r8, r0                  // remember row start of C
    mov r9, r1                  // remember row start of A
    vld1.16 {d26}, [r2]!        // 4 x 16-bit from B, post-increment by 8 bytes
    vshll.s16 q13, d26, #16     // move each 16-bit value into the top half of a 32-bit lane
    mov r11, r3                 // r11 = width counter for this row

L1:
    cmp r11, #0
    beq EndLine                 // width == 0: nothing to compute for this row

L1Loop:
    vld1.16 {d0}, [r1]!         // 4 x 16-bit from A, post-increment by 8 bytes
    vshll.s16 q0, d0, #16       // widen A the same way as B
    vmla.f32 q0, q13, d6[1]     // q0 = A + B * parameters[1]
    vmax.f32 q0, q0, q14        // clamp from below
    vmin.f32 q0, q0, q15        // clamp from above
    vshrn.i32 d0, q0, #16       // keep the upper 16 bits of every lane
    vst1.16 {d0}, [r0]!         // 4 x 16-bit to C, post-increment by 8 bytes
    subs r11, r11, #1
    bne L1Loop

EndLine:
    add r0, r8, r4              // C = row start + cStride bytes
    add r1, r9, r5              // A = row start + aStride bytes

    subs r6, r6, #1
    bne LoopY

End:
    pop {r4-r11, pc}

#endif
#endif