//
//  NEON_MNNUnPackC4_BF16.S
//  MNN
//
//  Created by MNN on 2019/02/02.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#ifdef __aarch64__
#include "MNNAsmGlobal.h"

.text
.align 5

asm_function NEON_MNNUnpackC4_BF16
//-----------------------------------------------------------------------
// void NEON_MNNUnpackC4_BF16(float* dst, const float* src, size_t area, size_t depth)
//
// The float pointers are treated as int16_t*: every element moved is 16 bits.
//
// Source : read sequentially, 4 interleaved 16-bit channel values per pixel
//          (8 bytes per pixel), `area` pixels per group of 4 channels.
//          A trailing group of 1..3 channels still consumes 8 bytes per pixel.
// Dest   : one plane of `area` 16-bit values per channel; consecutive planes
//          are area * 2 bytes apart.
//
// In:    x0 = dst, x1 = src, x2 = area, x3 = depth
// Out:   none
// Clobb: x0, x1, x3-x8, v0-v3, flags (caller-saved registers only, no stack use)
//-----------------------------------------------------------------------

    // Nothing to do when either dimension is zero. Testing each operand
    // separately avoids relying on a product that could wrap around.
    cbz     x2, DownEnd
    cbz     x3, DownEnd

    // Swap the pointers: from here on x0 = src (read cursor), x1 = dst (write cursor).
    mov     x4, x0
    mov     x0, x1
    mov     x1, x4

    // x4 = byte stride between destination planes = area * sizeof(int16_t)
    lsl     x4, x2, #1

// ---- Full groups of 4 channels ----------------------------------------
DownL4:
    cmp     x3, #4
    b.lo    DownL3                      // fewer than 4 channels left

DownL4Loop:
    // x1, x5, x6, x7 = write cursors of the 4 destination planes of this group
    add     x5, x1, x4
    add     x6, x5, x4
    add     x7, x6, x4
    mov     x8, x2                      // x8 = pixels left in this group
    cmp     x8, #4
    b.lo    DownL4AreaRemain

DownL4AreaLoop:
    // De-interleave 4 pixels x 4 channels: vN receives channel N of 4 pixels.
    ld4     {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32    // 16 * sizeof(int16_t)
    st1     {v0.4h}, [x1], #8                          // 4 * sizeof(int16_t)
    st1     {v1.4h}, [x5], #8
    st1     {v2.4h}, [x6], #8
    st1     {v3.4h}, [x7], #8
    sub     x8, x8, #4
    cmp     x8, #4
    b.hs    DownL4AreaLoop

DownL4AreaRemain:
    cbz     x8, DownL4AreaRemainEnd

DownL4AreaRemainLoop:
    // One pixel at a time: lane N of v0 is channel N of that pixel.
    ld1     {v0.4h}, [x0], #8
    st1     {v0.h}[0], [x1], #2
    st1     {v0.h}[1], [x5], #2
    st1     {v0.h}[2], [x6], #2
    st1     {v0.h}[3], [x7], #2
    subs    x8, x8, #1
    b.ne    DownL4AreaRemainLoop

DownL4AreaRemainEnd:
    sub     x3, x3, #4
    // x7 has walked a whole plane, so it now addresses the first plane of
    // the next group.
    mov     x1, x7
    cmp     x3, #4
    b.hs    DownL4Loop

// ---- Trailing group of 3 channels -------------------------------------
DownL3:
    cmp     x3, #3
    b.lo    DownL2
    add     x5, x1, x4
    add     x6, x5, x4
    mov     x8, x2
    cmp     x8, #4
    b.lo    DownL3AreaRemain            // area != 0 was checked on entry, so x8 >= 1 here

DownL3AreaLoop:
    // The source still holds 4 values per pixel; the 4th (v3) is not stored.
    ld4     {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32    // 16 * sizeof(int16_t)
    st1     {v0.4h}, [x1], #8                          // 4 * sizeof(int16_t)
    st1     {v1.4h}, [x5], #8
    st1     {v2.4h}, [x6], #8
    sub     x8, x8, #4
    cmp     x8, #4
    b.hs    DownL3AreaLoop

    cbz     x8, DownL3AreaRemainEnd

DownL3AreaRemain:
    ld1     {v0.4h}, [x0], #8                          // 4 * sizeof(int16_t)
    st1     {v0.h}[0], [x1], #2                        // sizeof(int16_t)
    st1     {v0.h}[1], [x5], #2
    st1     {v0.h}[2], [x6], #2
    subs    x8, x8, #1
    b.ne    DownL3AreaRemain

DownL3AreaRemainEnd:
    sub     x3, x3, #3                  // x3 becomes 0, the checks below fall through to DownEnd

// ---- Trailing group of 2 channels -------------------------------------
DownL2:
    cmp     x3, #2
    b.lo    DownL1
    add     x5, x1, x4
    mov     x8, x2
    cmp     x8, #4
    b.lo    DownL2AreaRemain            // area != 0 was checked on entry, so x8 >= 1 here

DownL2AreaLoop:
    ld4     {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32    // 16 * sizeof(int16_t)
    st1     {v0.4h}, [x1], #8
    st1     {v1.4h}, [x5], #8
    sub     x8, x8, #4
    cmp     x8, #4
    b.hs    DownL2AreaLoop

    cbz     x8, DownL2AreaRemainEnd

DownL2AreaRemain:
    ld1     {v0.4h}, [x0], #8                          // 4 * sizeof(int16_t)
    st1     {v0.h}[0], [x1], #2
    st1     {v0.h}[1], [x5], #2
    subs    x8, x8, #1
    b.ne    DownL2AreaRemain

DownL2AreaRemainEnd:
    sub     x3, x3, #2                  // x3 becomes 0, the check below falls through to DownEnd

// ---- Trailing single channel ------------------------------------------
DownL1:
    cbz     x3, DownEnd
    mov     x8, x2
    cmp     x8, #4
    b.lo    DownL1AreaRemain            // area != 0 was checked on entry, so x8 >= 1 here

DownL1AreaLoop:
    ld4     {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32    // 16 * sizeof(int16_t)
    st1     {v0.4h}, [x1], #8
    sub     x8, x8, #4
    cmp     x8, #4
    b.hs    DownL1AreaLoop

    cbz     x8, DownL1AreaRemainEnd

DownL1AreaRemain:
    // No need to clear v0 first: the load replaces all four lanes.
    ld1     {v0.4h}, [x0], #8                          // 4 * sizeof(int16_t)
    st1     {v0.h}[0], [x1], #2
    subs    x8, x8, #1
    b.ne    DownL1AreaRemain

DownL1AreaRemainEnd:

DownEnd:
    ret

#endif