//
//  MNNConvDwF23MulTransUnit.S
//  MNN
//
//  Created by MNN on 2019/4/4.
//  Copyright © 2018, Alibaba Group Holding Limited
//
#ifdef __aarch64__

#include "MNNAsmGlobal.h"

.text
.align 5

asm_function MNNConvDwF23MulTransUnit
//void MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigth, float *dest, size_t ow, const float* bias, const float* parameters);
//Auto: x0:cacheLine, x1:weight, x2:dest, x3:ow, x4: bias, x5: parameters
//
// Summary of what the code below does:
//   * cacheLine : three source row pointers are read from [x0], [x0 + 8] and [x0 + 16].
//   * weight    : twelve 4-float vectors are read, four per source row
//                 (row 0 -> v4-v7, row 1 -> v16-v19, row 2 -> v28-v31).
//   * dest      : receives two 4-float vectors per loop iteration and one for a
//                 trailing unit.
//   * ow        : number of 4-float output vectors, processed two at a time.
//   * bias      : one 4-float vector added to every output.
//   * parameters: the 32-bit words at byte offsets 8 and 12 are broadcast and used
//                 as the lower and upper clamp bounds.
//
// For every pair of outputs, with p0..p3 the per-lane sums over the three rows of
// weight * source:
//   out0 = clamp(p0 + p1 + p2 + bias)
//   out1 = clamp(p3 + p1 - p2 + bias)
// NOTE(review): the name suggests this is the multiply and output transform of a
// Winograd F(2,3) depthwise convolution - confirm against the C caller.

// Reserve 64 bytes and leave sp lowered until the epilogue. The saved registers
// must stay at or above sp: memory below sp may be overwritten asynchronously
// (for example by a signal frame), which would corrupt the values restored at End.
// v8-v10 are written below. v11 is never written here but is saved as well.
sub sp, sp, #64
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]

ld1 {v8.4s}, [x4] // bias
ldr w9, [x5, #8]
ldr w10, [x5, #12]
dup v9.4s, w9 // min
dup v10.4s, w10 // max

// x4 and x5 are reused as source row pointers once bias and the clamp bounds
// have been loaded.
ldr x4, [x0, #0]
ldr x5, [x0, #8]
ldr x6, [x0, #16]

// Weights stay resident in registers for the whole routine.
ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], #64
ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x1], #64
ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x1]


L2:
cmp x3, #2
blt L1

// Main loop: two outputs per iteration. Each row pointer advances by 64 bytes.
LoopL2:

// Row 0 starts the four accumulators v0-v3.
ld1 {v20.4s, v21.4s}, [x4], #32
fmul v0.4s, v4.4s, v20.4s
ld1 {v22.4s, v23.4s}, [x4], #32
fmul v1.4s, v5.4s, v21.4s
fmul v2.4s, v6.4s, v22.4s
ld1 {v20.4s, v21.4s}, [x5], #32
fmul v3.4s, v7.4s, v23.4s

// Row 1.
fmla v0.4s, v16.4s, v20.4s
ld1 {v22.4s, v23.4s}, [x5], #32
fmla v1.4s, v17.4s, v21.4s
fmla v2.4s, v18.4s, v22.4s
fmla v3.4s, v19.4s, v23.4s

// Row 2, interleaved with the start of the output combination.
ld1 {v20.4s, v21.4s}, [x6], #32
fmla v0.4s, v28.4s, v20.4s
fmla v1.4s, v29.4s, v21.4s
fadd v0.4s, v1.4s, v0.4s          // v0 = p0 + p1
ld1 {v22.4s, v23.4s}, [x6], #32

fmla v2.4s, v30.4s, v22.4s
fmla v3.4s, v31.4s, v23.4s
fadd v0.4s, v0.4s, v2.4s          // v0 = p0 + p1 + p2

fadd v3.4s, v3.4s, v1.4s          // v3 = p3 + p1
fsub v1.4s, v3.4s, v2.4s          // v1 = p3 + p1 - p2

fadd v0.4s, v0.4s, v8.4s
fadd v1.4s, v1.4s, v8.4s

// Clamp: upper bound first, then lower bound.
fmin v0.4s, v0.4s, v10.4s
fmin v1.4s, v1.4s, v10.4s

fmax v0.4s, v0.4s, v9.4s
fmax v1.4s, v1.4s, v9.4s

st1 {v0.4s, v1.4s}, [x2], #32

sub x3, x3, #2
cmp x3, #2
bge LoopL2


// Tail: one remaining output. Only three source vectors per row are read and
// no pointer is advanced because nothing follows.
L1:
cmp x3, #0
beq End
ld1 {v20.4s, v21.4s, v22.4s}, [x4]
fmul v0.4s, v4.4s, v20.4s
fmul v1.4s, v5.4s, v21.4s
fmul v2.4s, v6.4s, v22.4s
ld1 {v20.4s, v21.4s, v22.4s}, [x5]

fmla v0.4s, v16.4s, v20.4s
fmla v1.4s, v17.4s, v21.4s
fmla v2.4s, v18.4s, v22.4s

ld1 {v20.4s, v21.4s, v22.4s}, [x6]
fmla v0.4s, v28.4s, v20.4s
fmla v1.4s, v29.4s, v21.4s
fadd v0.4s, v1.4s, v0.4s

fmla v2.4s, v30.4s, v22.4s
fadd v0.4s, v0.4s, v2.4s          // v0 = p0 + p1 + p2

fadd v0.4s, v0.4s, v8.4s
fmin v0.4s, v0.4s, v10.4s
fmax v0.4s, v0.4s, v9.4s

st1 {v0.4s}, [x2]
End:

// Restore the saved registers and release the 64-byte frame in one step.
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64

ret
#endif