//
//  NEON_MNNConvRunForUnitDepthWise_BF16.S
//  MNN
//
//  Created by MNN on 2021/03/09.
//  Copyright © 2018-2021 Alibaba Group Holding Limited
//

#ifdef __arm__
#ifndef __aarch64__

#include "MNNAsmGlobal.h"

.text
.align 5

asm_function NEON_MNNConvRunForUnitDepthWise_BF16
// void NEON_MNNConvRunForUnitDepthWise_BF16(float* dst, const float* src, const float* weight,
//                                           size_t fw, size_t fh, size_t weight_y_step,
//                                           size_t dilate_x_step, size_t dilate_y_step)
//
// Target: 32-bit ARM with NEON (guarded by __arm__ && !__aarch64__).
//
// Purpose:
//   Accumulates, over an fh x fw window, the lane-wise product of a 4-lane
//   source vector and a 4-lane weight vector, then stores one 4-lane result.
//   Every 16-bit input element is widened by a left shift of 16 and used as
//   an f32 lane; the f32 sum is narrowed back by a right shift of 16 with no
//   rounding. The prototype declares float pointers, but this code reads and
//   writes 16-bit elements only.
//
// Arguments:
//   r0        dst            4 x 16-bit output
//   r1        src            advanced by dilate_x_step per column
//   r2        weight         advanced by 4 elements per column
//   r3        fw             window width  (kept in r4)
//   [sp, #0]  fh             window height
//   [sp, #4]  weight_y_step  \
//   [sp, #8]  dilate_x_step   > counted in 16-bit elements; scaled to bytes here
//   [sp, #12] dilate_y_step  /
//
// Result:
//   4 x 16-bit values written to dst. All zero bits when fw == 0 or fh == 0.
//
// Registers:
//   r4 = fw, r5 = rows remaining, r6 = weight row fix-up (bytes),
//   r7 = dilate_x_step (bytes), r8 = src row fix-up (bytes),
//   r9 = scratch / columns remaining.
//   q0 = f32 accumulator, q1 = widened src, q2 = widened weight.
//   r4-r9 are saved and restored; q0-q2 are clobbered.

    push    {r4-r9, lr}             // 7 words = 28 bytes, so stack args now start at [sp, #28]

    mov     r4, r3                  // r4 = fw
    add     r9, sp, #28             // r9 -> first stack argument
    ldmia   r9, {r5-r8}             // r5 = fh, r6 = weight_y_step, r7 = dilate_x_step, r8 = dilate_y_step

    vmov.i32 q0, #0                 // accumulator = 0; this is also the result for an empty window

    cmp     r4, #0
    beq     StoreOutput             // fw == 0: nothing to accumulate
    cmp     r5, #0
    beq     StoreOutput             // fh == 0: nothing to accumulate

    // Scale the three steps from 16-bit elements to bytes.
    lsl     r6, r6, #1              // weight_y_step * sizeof(int16_t)
    lsl     r7, r7, #1              // dilate_x_step * sizeof(int16_t)
    lsl     r8, r8, #1              // dilate_y_step * sizeof(int16_t)

    // The column loop already moves src by fw * dilate_x_step bytes and weight
    // by fw * 8 bytes per row, so keep only the remainder as the row fix-up.
    mul     r9, r4, r7
    sub     r8, r8, r9              // r8 = dilate_y_step - fw * dilate_x_step   (bytes)
    sub     r6, r6, r4, lsl #3      // r6 = weight_y_step - fw * 4 * sizeof(int16_t)   (bytes)

LoopFilterRow:
    mov     r9, r4                  // r9 = columns remaining in this row

LoopFilterCol:
    vld1.16 {d2}, [r1], r7          // 4 src elements, then src += dilate_x_step
    vld1.16 {d4}, [r2]!             // 4 weight elements, then weight += 8 bytes
    vshll.s16 q1, d2, #16           // place each 16-bit value in the top half of a 32-bit lane
    vshll.s16 q2, d4, #16
    vmla.f32 q0, q1, q2             // q0 += q1 * q2
    subs    r9, r9, #1
    bne     LoopFilterCol

    add     r1, r1, r8              // src    -> start of next row
    add     r2, r2, r6              // weight -> start of next row
    subs    r5, r5, #1
    bne     LoopFilterRow

StoreOutput:
    vshrn.i32 d0, q0, #16           // keep the top 16 bits of every f32 lane (truncating)
    vst1.16 {d0}, [r0]

    pop     {r4-r9, pc}

#endif
#endif