//
//  MNNPackC8FP16.S
//  MNN
//
//  Created by MNN on 2020/6/30.
//  Copyright © 2020 Alibaba. All rights reserved.
//
#ifdef __aarch64__

#include "MNNAsmGlobal.h"

.text
.align 5
asm_function MNNPackC8FP16
// void MNNPackC8FP16(FLOAT16* dest, const FLOAT16* source, size_t area, size_t depth);
//
// Repack 16-bit elements from planar layout [depth][area] into the
// channel-packed layout [depth / 8][area][8]:
//     dest[(c / 8) * area * 8 + a * 8 + (c % 8)] = source[c * area + a]
//
// Work is done in tiles of 8 channels x 8 area positions. Only complete tiles
// are processed: the trailing (area % 8) positions and the trailing
// (depth % 8) channels are neither read nor written by this routine.
// If area < 8 or depth < 8 the routine returns without touching memory.
//
// Arguments (AAPCS64):
//     x0: dest, x1: source, x2: area, x3: depth
// Internal registers:
//     x4 : areaC8   = area / 8   (tiles per channel block)
//     x5 : depthC8  = depth / 8  (channel blocks left, counts down)
//     x6 : source row stride in bytes   = area * sizeof(FLOAT16)
//     x7 : block stride in bytes        = 8 * area * sizeof(FLOAT16)
//          (identical for source and dest: 8 rows in, area * 8 elements out)
//     x8 : dest write cursor inside the current block
//     x9 : source read cursor, x10: start of the current source tile
//     x12: tiles left in the current block
// Clobbers: x0, x1, x4-x10, x12, v16-v31, condition flags.
// Only caller-saved registers are used, so nothing is spilled to the stack.

    lsr x4, x2, #3                  // areaC8
    lsr x5, x3, #3                  // depthC8
    cbz x4, PackC8FP16End           // no complete tile along area
    cbz x5, PackC8FP16End           // no complete tile along depth
    lsl x6, x2, #1                  // area * 2 bytes
    lsl x7, x2, #4                  // area * 16 bytes

PackC8FP16LoopH:
    mov x8, x0
    mov x9, x1
    mov x12, x4

PackC8FP16LoopL:
    mov x10, x9

    // Load one 8x8 tile: row k (channel k) -> v(16 + k), 8 halfwords each.
    ld1 {v16.8h}, [x9], x6
    ld1 {v17.8h}, [x9], x6
    ld1 {v18.8h}, [x9], x6
    ld1 {v19.8h}, [x9], x6
    ld1 {v20.8h}, [x9], x6
    ld1 {v21.8h}, [x9], x6
    ld1 {v22.8h}, [x9], x6
    ld1 {v23.8h}, [x9], x6

    // 8x8 halfword transpose in three stages. Notation: rK[j] is element j
    // of source row K.
    // Stage 1, 16-bit lanes: interleave neighbouring rows.
    //   v24 = r0[0] r1[0] r0[2] r1[2] r0[4] r1[4] r0[6] r1[6]
    //   v25 = r0[1] r1[1] r0[3] r1[3] r0[5] r1[5] r0[7] r1[7]   (and so on)
    trn1 v24.8h, v16.8h, v17.8h
    trn2 v25.8h, v16.8h, v17.8h
    trn1 v26.8h, v18.8h, v19.8h
    trn2 v27.8h, v18.8h, v19.8h
    trn1 v28.8h, v20.8h, v21.8h
    trn2 v29.8h, v20.8h, v21.8h
    trn1 v30.8h, v22.8h, v23.8h
    trn2 v31.8h, v22.8h, v23.8h

    // Stage 2, 32-bit lanes: combine row pairs into groups of four rows.
    //   v16 = columns 0 and 4 of rows 0-3     v20 = columns 0 and 4 of rows 4-7
    //   v17 = columns 1 and 5 of rows 0-3     v21 = columns 1 and 5 of rows 4-7
    //   v18 = columns 2 and 6 of rows 0-3     v22 = columns 2 and 6 of rows 4-7
    //   v19 = columns 3 and 7 of rows 0-3     v23 = columns 3 and 7 of rows 4-7
    trn1 v16.4s, v24.4s, v26.4s
    trn1 v17.4s, v25.4s, v27.4s
    trn2 v18.4s, v24.4s, v26.4s
    trn2 v19.4s, v25.4s, v27.4s
    trn1 v20.4s, v28.4s, v30.4s
    trn1 v21.4s, v29.4s, v31.4s
    trn2 v22.4s, v28.4s, v30.4s
    trn2 v23.4s, v29.4s, v31.4s

    // Stage 3, 64-bit lanes: join rows 0-3 with rows 4-7.
    //   v(24 + j) = r0[j] r1[j] r2[j] r3[j] r4[j] r5[j] r6[j] r7[j]
    trn1 v24.2d, v16.2d, v20.2d
    trn1 v25.2d, v17.2d, v21.2d
    trn1 v26.2d, v18.2d, v22.2d
    trn1 v27.2d, v19.2d, v23.2d
    trn2 v28.2d, v16.2d, v20.2d
    trn2 v29.2d, v17.2d, v21.2d
    trn2 v30.2d, v18.2d, v22.2d
    trn2 v31.2d, v19.2d, v23.2d

    // Store 8 area positions x 8 channels = 128 bytes, contiguous in dest.
    st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], #64
    st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], #64

    add x9, x10, #16                // next tile: 8 halfwords further along the rows

    subs x12, x12, #1
    bne PackC8FP16LoopL

    add x0, x0, x7                  // next dest block of area * 8 elements
    add x1, x1, x7                  // skip the 8 source rows just consumed
    subs x5, x5, #1
    bne PackC8FP16LoopH

PackC8FP16End:
    ret

#endif