//
//  MNNPackC4_BF16.S
//  MNN
//
//  Created by MNN on 2021/02/26.
//  Copyright © 2018-2021 Alibaba Group Holding Limited
//

#ifdef __arm__
#ifndef __aarch64__

#include "MNNAsmGlobal.h"

.text
.align 5

// Note on the interleave:
// A register-level 4x4 transpose of 16-bit lanes would need vtrn.16 on
// (d0, d1) and (d2, d3) followed by a swap of the upper 32-bit halves of
// the d registers. There is no instruction for that half swap, so the
// interleave is done by the store itself with vst4.16.

asm_function MNNPackC4_BF16
//-----------------------------------------------------------------------
// void MNNPackC4_BF16(float* dst, const float* src, size_t area, size_t depth)
//
// The float pointers are treated as int16_t*: every element moved by this
// routine is 16 bits wide.
//
// Source : depth planes stored back to back, each plane holding area
//          16-bit elements (plane stride = area * 2 bytes).
// Dest   : for every group of four planes, area packets of four 16-bit
//          values (one value from each plane of the group). When the last
//          group has fewer than four planes, the missing values are
//          written as zero.
//
// In     : r0 = dst, r1 = src, r2 = area, r3 = depth
// Work   : r4 = plane stride in bytes
//          r5, r6, r7 = read pointers of planes 1..3 of the current group
//          r8 = elements left in the current group
//          d0-d3 = data
// Saved  : r4-r8 and lr are pushed on entry and restored on exit
//-----------------------------------------------------------------------

    push        {r4-r8, lr}

    // area * depth == 0 : nothing to pack
    mul         r4, r2, r3
    cmp         r4, #0
    beq         PackC4BF16_Exit

    // r4 = area * sizeof(int16_t)
    add         r4, r2, r2

    // ------------------------------------------------------------------
    // Groups of four complete planes
    // ------------------------------------------------------------------
    cmp         r3, #4
    blt         PackC4BF16_Three

PackC4BF16_FourGroup:
    mov         r8, r2                      // r8 = area
    add         r5, r1, r4                  // plane 1
    add         r6, r5, r4                  // plane 2
    add         r7, r6, r4                  // plane 3
    cmp         r8, #4
    blt         PackC4BF16_FourTail

PackC4BF16_FourVec:
    // four elements from each plane, interleaved on store
    vld1.16     {d0}, [r1]!
    vld1.16     {d1}, [r5]!
    vld1.16     {d2}, [r6]!
    vld1.16     {d3}, [r7]!
    vst4.16     {d0, d1, d2, d3}, [r0]!
    sub         r8, r8, #4
    cmp         r8, #4
    bge         PackC4BF16_FourVec

PackC4BF16_FourTail:
    cmp         r8, #0
    beq         PackC4BF16_FourNext

PackC4BF16_FourScalar:
    // one element from each plane forms one 4 x 16-bit packet
    vld1.16     {d0[0]}, [r1]!
    vld1.16     {d0[1]}, [r5]!
    vld1.16     {d0[2]}, [r6]!
    vld1.16     {d0[3]}, [r7]!
    vst1.16     {d0}, [r0]!
    subs        r8, r8, #1
    bne         PackC4BF16_FourScalar

PackC4BF16_FourNext:
    // r7 has walked across plane 3, so it now sits on the next group
    mov         r1, r7
    sub         r3, r3, #4
    cmp         r3, #4
    bge         PackC4BF16_FourGroup

    // ------------------------------------------------------------------
    // Three planes left: fourth value of every packet is zero
    // ------------------------------------------------------------------
PackC4BF16_Three:
    cmp         r3, #3
    blt         PackC4BF16_Two
    mov         r8, r2
    add         r5, r1, r4
    add         r6, r5, r4
    cmp         r8, #4
    blt         PackC4BF16_ThreeScalar

PackC4BF16_ThreeVec:
    vmov.i16    d3, #0
    vld1.16     {d0}, [r1]!
    vld1.16     {d1}, [r5]!
    vld1.16     {d2}, [r6]!
    vst4.16     {d0, d1, d2, d3}, [r0]!
    sub         r8, r8, #4
    cmp         r8, #4
    bge         PackC4BF16_ThreeVec

    cmp         r8, #0
    beq         PackC4BF16_ThreeDone

PackC4BF16_ThreeScalar:
    vmov.i16    d0, #0                      // lane 3 stays zero
    vld1.16     {d0[0]}, [r1]!
    vld1.16     {d0[1]}, [r5]!
    vld1.16     {d0[2]}, [r6]!
    vst1.16     {d0}, [r0]!
    subs        r8, r8, #1
    bne         PackC4BF16_ThreeScalar

PackC4BF16_ThreeDone:
    sub         r3, r3, #3

    // ------------------------------------------------------------------
    // Two planes left: third and fourth values are zero
    // ------------------------------------------------------------------
PackC4BF16_Two:
    cmp         r3, #2
    blt         PackC4BF16_One
    mov         r8, r2
    add         r5, r1, r4
    cmp         r8, #4
    blt         PackC4BF16_TwoScalar

PackC4BF16_TwoVec:
    vmov.i16    d2, #0
    vmov.i16    d3, #0
    vld1.16     {d0}, [r1]!
    vld1.16     {d1}, [r5]!
    vst4.16     {d0, d1, d2, d3}, [r0]!
    sub         r8, r8, #4
    cmp         r8, #4
    bge         PackC4BF16_TwoVec

    cmp         r8, #0
    beq         PackC4BF16_TwoDone

PackC4BF16_TwoScalar:
    vmov.i16    d0, #0                      // lanes 2 and 3 stay zero
    vld1.16     {d0[0]}, [r1]!
    vld1.16     {d0[1]}, [r5]!
    vst1.16     {d0}, [r0]!
    subs        r8, r8, #1
    bne         PackC4BF16_TwoScalar

PackC4BF16_TwoDone:
    sub         r3, r3, #2

    // ------------------------------------------------------------------
    // One plane left: only the first value of every packet is non-zero
    // ------------------------------------------------------------------
PackC4BF16_One:
    cmp         r3, #0
    beq         PackC4BF16_Exit
    mov         r8, r2
    cmp         r8, #4
    blt         PackC4BF16_OneScalar

PackC4BF16_OneVec:
    vmov.i16    d1, #0
    vmov.i16    d2, #0
    vmov.i16    d3, #0
    vld1.16     {d0}, [r1]!
    vst4.16     {d0, d1, d2, d3}, [r0]!
    sub         r8, r8, #4
    cmp         r8, #4
    bge         PackC4BF16_OneVec

    cmp         r8, #0
    beq         PackC4BF16_Exit

PackC4BF16_OneScalar:
    vmov.i16    d0, #0                      // lanes 1..3 stay zero
    vld1.16     {d0[0]}, [r1]!
    vst1.16     {d0}, [r0]!
    subs        r8, r8, #1
    bne         PackC4BF16_OneScalar

PackC4BF16_Exit:
    pop         {r4-r8, pc}

#endif
#endif