1//
2//  MNNPackC8FP16.S
3//  MNN
4//
5//  Created by MNN on 2020/6/30.
6//  Copyright © 2020 Alibaba. All rights reserved.
7//
8#ifdef __aarch64__
9
10#include "MNNAsmGlobal.h"
11
12.text
13.align 5
asm_function MNNPackC8FP16
// void MNNPackC8FP16(FLOAT16* dest, const FLOAT16* source, size_t area, size_t depth);
//
// Repacks FP16 data from planar [depth][area] layout into the channel-blocked
// layout [depth / 8][area][8]: for every group of 8 consecutive channels, the
// 8 channel values belonging to one area position become contiguous (16 bytes).
//
// Only whole 8x8 tiles are processed: area / 8 tiles per row and depth / 8
// channel groups. Remainders (area % 8, depth % 8) are not touched here.
// If area < 8 or depth < 8 the routine returns without reading or writing.
//
// ABI: AAPCS64, leaf function, no stack usage.
// In:    x0 = dest, x1 = source, x2 = area, x3 = depth
// Clobb: x0, x1, x4-x10, x12, v0-v7, v16-v31, flags
//        (v8-v15 are callee-saved and deliberately left alone)
//
// Register roles:
//   x4  = areaC8  (number of 8-wide tiles per row)
//   x5  = depthC8 (number of 8-channel groups, counts down)
//   x6  = source row stride in bytes   = area * sizeof(FLOAT16)
//   x7  = bytes per 8-channel group    = area * 8 * sizeof(FLOAT16)
//         (identical for source and dest: 8 rows of area halfwords on the
//          source side, area positions of 8 halfwords on the dest side)
//   x8  = dest write cursor inside the current group
//   x9  = source read cursor, walks down the 8 rows of a tile
//   x10 = source address of the current tile in the first row of the group
//   x12 = tiles remaining in the current group

    lsr x4, x2, #3                  // areaC8  = area / 8
    lsr x5, x3, #3                  // depthC8 = depth / 8
    // Both loops test at the bottom, so a zero trip count must be rejected
    // up front or the counters would wrap around.
    cbz x4, PackC8FP16End
    cbz x5, PackC8FP16End

    lsl x6, x2, #1                  // area * 2  bytes: one source row
    lsl x7, x2, #4                  // area * 16 bytes: one 8-channel group

PackC8FP16LoopH:
    mov x8, x0                      // dest cursor   -> start of this group
    mov x10, x1                     // source column -> first tile of this group
    mov x12, x4                     // tiles to do in this group

PackC8FP16LoopL:
    // Load one 8x8 halfword tile: row c (channel c of the group) -> v(16 + c).
    mov x9, x10
    ld1 {v16.8h}, [x9], x6
    ld1 {v17.8h}, [x9], x6
    ld1 {v18.8h}, [x9], x6
    ld1 {v19.8h}, [x9], x6
    ld1 {v20.8h}, [x9], x6
    ld1 {v21.8h}, [x9], x6
    ld1 {v22.8h}, [x9], x6
    ld1 {v23.8h}, [x9], x6

    // 8x8 transpose in three interleave stages (16-bit, 32-bit, 64-bit lanes).
    // Notation: cN = channel N of the group, aM = area position M of the tile.

    // Stage 1: interleave halfwords of adjacent channel pairs.
    //   v0 = [c0a0 c1a0 c0a2 c1a2 c0a4 c1a4 c0a6 c1a6]
    //   v1 = [c0a1 c1a1 c0a3 c1a3 c0a5 c1a5 c0a7 c1a7]   (same for c2/c3, ...)
    trn1 v0.8h, v16.8h, v17.8h
    trn2 v1.8h, v16.8h, v17.8h
    trn1 v2.8h, v18.8h, v19.8h
    trn2 v3.8h, v18.8h, v19.8h
    trn1 v4.8h, v20.8h, v21.8h
    trn2 v5.8h, v20.8h, v21.8h
    trn1 v6.8h, v22.8h, v23.8h
    trn2 v7.8h, v22.8h, v23.8h

    // Stage 2: interleave 32-bit pairs so each 64-bit half holds 4 channels.
    //   v24 = [c0..c3 of a0 | c0..c3 of a4]   v28 = [c4..c7 of a0 | c4..c7 of a4]
    //   v25 = [c0..c3 of a1 | c0..c3 of a5]   v29 = [c4..c7 of a1 | c4..c7 of a5]
    //   v26 = [c0..c3 of a2 | c0..c3 of a6]   v30 = [c4..c7 of a2 | c4..c7 of a6]
    //   v27 = [c0..c3 of a3 | c0..c3 of a7]   v31 = [c4..c7 of a3 | c4..c7 of a7]
    trn1 v24.4s, v0.4s, v2.4s
    trn1 v25.4s, v1.4s, v3.4s
    trn2 v26.4s, v0.4s, v2.4s
    trn2 v27.4s, v1.4s, v3.4s
    trn1 v28.4s, v4.4s, v6.4s
    trn1 v29.4s, v5.4s, v7.4s
    trn2 v30.4s, v4.4s, v6.4s
    trn2 v31.4s, v5.4s, v7.4s

    // Stage 3: join the 64-bit halves; v(16 + m) = [c0..c7] of area position m.
    trn1 v16.2d, v24.2d, v28.2d
    trn1 v17.2d, v25.2d, v29.2d
    trn1 v18.2d, v26.2d, v30.2d
    trn1 v19.2d, v27.2d, v31.2d
    trn2 v20.2d, v24.2d, v28.2d
    trn2 v21.2d, v25.2d, v29.2d
    trn2 v22.2d, v26.2d, v30.2d
    trn2 v23.2d, v27.2d, v31.2d

    // Store 8 area positions x 8 channels = 128 contiguous bytes.
    st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x8], #64
    st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x8], #64

    add x10, x10, #16               // next tile: 8 halfwords further along the row
    subs x12, x12, #1
    bne PackC8FP16LoopL

    add x0, x0, x7                  // next group in dest
    add x1, x1, x7                  // skip the 8 source rows just consumed
    subs x5, x5, #1
    bne PackC8FP16LoopH

PackC8FP16End:
    ret
91
92#endif
93