//
//  MNNConvRunForUnitDepthWise.S
//  MNN
//
//  Created by MNN on 2019/02/04.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#ifdef __arm__
#ifndef __aarch64__

#include "MNNAsmGlobal.h"

.text
.align 5

//-----------------------------------------------------------------------------
// void MNNConvRunForUnitDepthWise(float* dst, const float* src,
//                                 const float* weight,
//                                 size_t fw, size_t fh,
//                                 size_t weight_y_step,
//                                 size_t dilate_x_step,
//                                 size_t dilate_y_step)
//
// Target: 32-bit ARM with NEON (compiled only when __arm__ is defined and
//         __aarch64__ is not).
//
// Computes one 4-float output vector:
//
//     acc = 0
//     for y in [0, fh):
//         for x in [0, fw):
//             acc += src4[y * dilate_y_step + x * dilate_x_step]
//                  * weight4[y * weight_y_step + x * 4]      (lane-wise)
//     dst[0..3] = acc
//
// All three step arguments are scaled by 4 below, i.e. they are consumed as
// counts of 32-bit elements. The weight pointer advances by 16 bytes per
// column.
//
// If fw == 0 or fh == 0 the loops are skipped and four zero words are stored
// to dst.
//
// In:    r0 = dst, r1 = src, r2 = weight, r3 = fw
//        [sp, #0..#12] on entry = fh, weight_y_step, dilate_x_step,
//                                 dilate_y_step
// Out:   16 bytes written at dst
// Saved: r4-r9, lr (pushed and restored)
// Clobb: r1, r2, q0-q2, flags
//-----------------------------------------------------------------------------
asm_function MNNConvRunForUnitDepthWise

    // Seven registers are pushed (28 bytes), so the first stack argument
    // is found at [sp, #28] afterwards.
    push {r4-r9, lr}

    // Register roles for the rest of the function:
    //   r4 = fw               r5 = fh (row counter)
    //   r6 = weight_y_step    r7 = dilate_x_step    r8 = dilate_y_step
    //   r9 = column counter
    mov r4, r3
    ldr r5, [sp, #28]
    ldr r6, [sp, #32]
    ldr r7, [sp, #36]
    ldr r8, [sp, #40]

    // Clear the accumulator first so that the early-exit path still
    // stores a defined (all-zero) result.
    vmov.i32 q0, #0

    cmp r4, #0
    beq DepthWiseUnitStore
    cmp r5, #0
    beq DepthWiseUnitStore

    // Convert the three steps from element counts to byte offsets (x4).
    lsl r6, r6, #2
    lsl r7, r7, #2
    lsl r8, r8, #2

    // The column loop already moves src by fw * dilate_x_step bytes, so the
    // end-of-row fix-up is: dilate_y_step - fw * dilate_x_step.
    mls r8, r4, r7, r8

    // The column loop already moves weight by fw * 16 bytes, so the
    // end-of-row fix-up is: weight_y_step - fw * 16.
    sub r6, r6, r4, lsl #4

DepthWiseUnitRow:
    mov r9, r4                      // restart the column counter at fw

DepthWiseUnitCol:
    vld1.32 {q1}, [r1], r7          // q1 = 4 src values, then src += dilate_x_step
    vld1.32 {q2}, [r2]!             // q2 = 4 weights, then weight += 16
    vmla.f32 q0, q1, q2             // acc += q1 * q2
    subs r9, r9, #1
    bne DepthWiseUnitCol

    // Move both pointers to the start of the next kernel row.
    add r1, r1, r8
    add r2, r2, r6
    subs r5, r5, #1
    bne DepthWiseUnitRow

DepthWiseUnitStore:
    vst1.32 {q0}, [r0]              // write the 4 accumulated values to dst

    pop {r4-r9, pc}

#endif
#endif