1/******************************************************************************
2 * Copyright © 2018, VideoLAN and dav1d authors
3 * Copyright © 2015 Martin Storsjo
4 * Copyright © 2015 Janne Grunau
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice, this
11 *    list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 *    this list of conditions and the following disclaimer in the documentation
15 *    and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
21 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 *****************************************************************************/
28
29#ifndef DAV1D_SRC_ARM_64_UTIL_S
30#define DAV1D_SRC_ARM_64_UTIL_S
31
32#include "config.h"
33#include "src/arm/asm.S"
34
// movrel rd, val, offset=0
//
// Materialise the address (val + offset) in rd.
//   rd     - destination general-purpose register (adrp needs an X register)
//   val    - symbol whose address is wanted
//   offset - assemble-time constant added to the symbol address
//
// Which sequence is emitted depends on the build:
//   * __APPLE__      : adrp/add with @PAGE / @PAGEOFF operands
//   * PIC && _WIN32  : adrp/add with :lo12:
//   * PIC            : adrp/add with :lo12:, offset always folded in
//   * otherwise      : literal-pool load (ldr rd, =...)
// On the Apple and Windows paths a negative offset is not folded into the
// symbol expression; the plain symbol address is formed first and the
// magnitude of the offset is subtracted afterwards.
// NOTE(review): presumably those object formats cannot carry a negative
// addend in the relocation - confirm.
.macro  movrel rd, val, offset=0
#ifdef __APPLE__
  .if \offset >= 0
        adrp            \rd, \val+(\offset)@PAGE
        add             \rd, \rd, \val+(\offset)@PAGEOFF
  .else
        adrp            \rd, \val@PAGE
        add             \rd, \rd, \val@PAGEOFF
        // offset is negative here, so -(offset) is its magnitude
        sub             \rd, \rd, -(\offset)
  .endif
#elif defined(PIC) && defined(_WIN32)
  .if \offset >= 0
        adrp            \rd, \val+(\offset)
        add             \rd, \rd, :lo12:\val+(\offset)
  .else
        adrp            \rd, \val
        add             \rd, \rd, :lo12:\val
        // offset is negative here, so -(offset) is its magnitude
        sub             \rd, \rd, -(\offset)
  .endif
#elif defined(PIC)
        adrp            \rd, \val+(\offset)
        add             \rd, \rd, :lo12:\val+(\offset)
#else
        ldr             \rd, =\val+\offset
#endif
.endm
61
// sub_sp space
//
// Lower the stack pointer by the assemble-time constant space.
//
// _WIN32 builds:
//   * space <= 4096        : a single sub on sp.
//   * 4096 < space <= 8192 : sp - 4096 is computed into x16, that address is
//                            touched with a discarded load (ldr xzr), and only
//                            then is sp moved the rest of the way.
//                            Clobbers x16.
//   * space > 8192         : rejected at assemble time with .error.
//   NOTE(review): the intermediate load presumably exists so that the stack
//   guard page is hit page by page - confirm.
//
// Other builds:
//   The A64 add/sub immediate is 12 bits, optionally shifted left by 12, so
//   the adjustment is split into a multiple of 4096 and a remainder below
//   4096; a part that would be zero is not emitted.
//
// The argument is parenthesised wherever it is used so that an expression
// such as "64+32" is evaluated as a whole.
.macro sub_sp space
#ifdef _WIN32
.if (\space) > 8192
        // Here, we'd need to touch two (or more) pages while decrementing
        // the stack pointer.
        .error          "sub_sp doesn't support values over 8K at the moment"
.elseif (\space) > 4096
        sub             x16, sp,  #4096
        ldr             xzr, [x16]
        sub             sp,  x16, #((\space) - 4096)
.else
        sub             sp,  sp,  #(\space)
.endif
#else
.if (\space) >= 4096
        sub             sp,  sp,  #(\space)/4096*4096
.endif
.if ((\space) % 4096) != 0
        sub             sp,  sp,  #(\space)%4096
.endif
#endif
.endm
84
// transpose_8x8b_xtl r0, r1, r2, r3, r4, r5, r6, r7, xtl
//
// Transpose an 8x8 block of bytes and widen every transposed row to eight
// 16-bit lanes.
//   r0-r7 - vector register names (e.g. v0). On entry the low 8 bytes of
//           r0..r7 hold rows a..h; the zip1 steps only read the low halves.
//           On exit rN holds aN bN cN dN eN fN gN hN, one value per .8h lane.
//   xtl   - mnemonic of the widening instruction; it is expanded both as
//           given and with a "2" suffix (e.g. uxtl or sxtl).
// No scratch registers are used; all eight arguments are overwritten.
.macro transpose_8x8b_xtl r0, r1, r2, r3, r4, r5, r6, r7, xtl
        // a0 b0 a1 b1 a2 b2 a3 b3 a4 b4 a5 b5 a6 b6 a7 b7
        zip1            \r0\().16b, \r0\().16b, \r1\().16b
        // c0 d0 c1 d1 c2 d2 c3 d3 c4 d4 c5 d5 c6 d6 c7 d7
        zip1            \r2\().16b, \r2\().16b, \r3\().16b
        // e0 f0 e1 f1 e2 f2 e3 f3 e4 f4 e5 f5 e6 f6 e7 f7
        zip1            \r4\().16b, \r4\().16b, \r5\().16b
        // g0 h0 g1 h1 g2 h2 g3 h3 g4 h4 g5 h5 g6 h6 g7 h7
        zip1            \r6\().16b, \r6\().16b, \r7\().16b

        // a0 b0 c0 d0 a2 b2 c2 d2 a4 b4 c4 d4 a6 b6 c6 d6
        trn1            \r1\().8h,  \r0\().8h,  \r2\().8h
        // a1 b1 c1 d1 a3 b3 c3 d3 a5 b5 c5 d5 a7 b7 c7 d7
        trn2            \r3\().8h,  \r0\().8h,  \r2\().8h
        // e0 f0 g0 h0 e2 f2 g2 h2 e4 f4 g4 h4 e6 f6 g6 h6
        trn1            \r5\().8h,  \r4\().8h,  \r6\().8h
        // e1 f1 g1 h1 e3 f3 g3 h3 e5 f5 g5 h5 e7 f7 g7 h7
        trn2            \r7\().8h,  \r4\().8h,  \r6\().8h

        // a0 b0 c0 d0 e0 f0 g0 h0 a4 b4 c4 d4 e4 f4 g4 h4
        trn1            \r0\().4s,  \r1\().4s,  \r5\().4s
        // a2 b2 c2 d2 e2 f2 g2 h2 a6 b6 c6 d6 e6 f6 g6 h6
        trn2            \r2\().4s,  \r1\().4s,  \r5\().4s
        // a1 b1 c1 d1 e1 f1 g1 h1 a5 b5 c5 d5 e5 f5 g5 h5
        trn1            \r1\().4s,  \r3\().4s,  \r7\().4s
        // a3 b3 c3 d3 e3 f3 g3 h3 a7 b7 c7 d7 e7 f7 g7 h7
        trn2            \r3\().4s,  \r3\().4s,  \r7\().4s

        // Each register now packs two transposed rows (N in the low half,
        // N+4 in the high half). Widen the high half into r(N+4) first,
        // because widening the low half overwrites rN in place.
        \xtl\()2        \r4\().8h,  \r0\().16b
        \xtl            \r0\().8h,  \r0\().8b
        \xtl\()2        \r6\().8h,  \r2\().16b
        \xtl            \r2\().8h,  \r2\().8b
        \xtl\()2        \r5\().8h,  \r1\().16b
        \xtl            \r1\().8h,  \r1\().8b
        \xtl\()2        \r7\().8h,  \r3\().16b
        \xtl            \r3\().8h,  \r3\().8b
.endm
122
// transpose_8x8h r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
//
// In-place transpose of an 8x8 matrix of 16-bit elements.
//   r0-r7  - vector register names; on entry rows a..h (8 halfwords each),
//            on exit rN = aN bN cN dN eN fN gN hN.
//   t8, t9 - scratch vector registers, clobbered. They stay live until the
//            last stage, so they must differ from r0-r7.
//
// Three rounds of trn1/trn2 at 16-, 32- and 64-bit granularity. Registers
// are recycled as soon as their old contents are dead, so the order of the
// instructions within each round matters.
.macro transpose_8x8h r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
        // Round 1 (.8h): pair up neighbouring rows.
        //   t8 = a0 b0 a2 b2 a4 b4 a6 b6
        //   t9 = a1 b1 a3 b3 a5 b5 a7 b7
        //   r1 = c0 d0 c2 d2 c4 d4 c6 d6
        //   r3 = c1 d1 c3 d3 c5 d5 c7 d7
        //   r0 = e0 f0 e2 f2 e4 f4 e6 f6
        //   r5 = e1 f1 e3 f3 e5 f5 e7 f7
        //   r2 = g0 h0 g2 h2 g4 h4 g6 h6
        //   r7 = g1 h1 g3 h3 g5 h5 g7 h7
        trn1            \t8\().8h,  \r0\().8h,  \r1\().8h
        trn2            \t9\().8h,  \r0\().8h,  \r1\().8h
        trn1            \r1\().8h,  \r2\().8h,  \r3\().8h
        trn2            \r3\().8h,  \r2\().8h,  \r3\().8h
        trn1            \r0\().8h,  \r4\().8h,  \r5\().8h
        trn2            \r5\().8h,  \r4\().8h,  \r5\().8h
        trn1            \r2\().8h,  \r6\().8h,  \r7\().8h
        trn2            \r7\().8h,  \r6\().8h,  \r7\().8h

        // Round 2 (.4s): combine the pairs into groups of four rows.
        //   r4 = e0 f0 g0 h0 e4 f4 g4 h4
        //   r2 = e2 f2 g2 h2 e6 f6 g6 h6
        //   r6 = e1 f1 g1 h1 e5 f5 g5 h5
        //   r7 = e3 f3 g3 h3 e7 f7 g7 h7
        //   r5 = a1 b1 c1 d1 a5 b5 c5 d5
        //   t9 = a3 b3 c3 d3 a7 b7 c7 d7
        //   r3 = a0 b0 c0 d0 a4 b4 c4 d4
        //   t8 = a2 b2 c2 d2 a6 b6 c6 d6
        trn1            \r4\().4s,  \r0\().4s,  \r2\().4s
        trn2            \r2\().4s,  \r0\().4s,  \r2\().4s
        trn1            \r6\().4s,  \r5\().4s,  \r7\().4s
        trn2            \r7\().4s,  \r5\().4s,  \r7\().4s
        trn1            \r5\().4s,  \t9\().4s,  \r3\().4s
        trn2            \t9\().4s,  \t9\().4s,  \r3\().4s
        trn1            \r3\().4s,  \t8\().4s,  \r1\().4s
        trn2            \t8\().4s,  \t8\().4s,  \r1\().4s

        // Round 3 (.2d): join the a-d and e-h halves; rN = aN .. hN.
        // r6 is written before r2 because both still read the old r2.
        trn1            \r0\().2d,  \r3\().2d,  \r4\().2d
        trn2            \r4\().2d,  \r3\().2d,  \r4\().2d
        trn1            \r1\().2d,  \r5\().2d,  \r6\().2d
        trn2            \r5\().2d,  \r5\().2d,  \r6\().2d
        trn2            \r6\().2d,  \t8\().2d,  \r2\().2d
        trn1            \r2\().2d,  \t8\().2d,  \r2\().2d
        trn1            \r3\().2d,  \t9\().2d,  \r7\().2d
        trn2            \r7\().2d,  \t9\().2d,  \r7\().2d
.endm
151
// transpose_8x16b r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
//
// Transpose eight rows of sixteen bytes each (two 8x8 byte blocks side by
// side), in place.
//   r0-r7  - vector register names; on entry rows a..h (16 bytes each).
//            On exit the low 8 bytes of rN are aN bN cN dN eN fN gN hN and
//            the high 8 bytes are the same for column N+8.
//   t8, t9 - scratch vector registers, clobbered. They stay live until the
//            last stage, so they must differ from r0-r7.
//
// Three rounds of trn1/trn2 at 8-, 16- and 32-bit granularity. Registers
// are recycled as soon as their old contents are dead, so the order of the
// instructions within each round matters.
.macro transpose_8x16b r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
        // Round 1 (.16b): pair up neighbouring rows.
        //   t8 = a0 b0 a2 b2 ... a14 b14     t9 = a1 b1 a3 b3 ... a15 b15
        //   r1 = c0 d0 c2 d2 ... c14 d14     r3 = c1 d1 c3 d3 ... c15 d15
        //   r0 = e0 f0 e2 f2 ... e14 f14     r5 = e1 f1 e3 f3 ... e15 f15
        //   r2 = g0 h0 g2 h2 ... g14 h14     r7 = g1 h1 g3 h3 ... g15 h15
        trn1            \t8\().16b, \r0\().16b, \r1\().16b
        trn2            \t9\().16b, \r0\().16b, \r1\().16b
        trn1            \r1\().16b, \r2\().16b, \r3\().16b
        trn2            \r3\().16b, \r2\().16b, \r3\().16b
        trn1            \r0\().16b, \r4\().16b, \r5\().16b
        trn2            \r5\().16b, \r4\().16b, \r5\().16b
        trn1            \r2\().16b, \r6\().16b, \r7\().16b
        trn2            \r7\().16b, \r6\().16b, \r7\().16b

        // Round 2 (.8h): combine the pairs into groups of four rows.
        // Each 32-bit lane below is written as the column it holds.
        //   r4 = efgh of columns 0 4 8 12    r2 = efgh of columns 2 6 10 14
        //   r6 = efgh of columns 1 5 9 13    r7 = efgh of columns 3 7 11 15
        //   r5 = abcd of columns 1 5 9 13    t9 = abcd of columns 3 7 11 15
        //   r3 = abcd of columns 0 4 8 12    t8 = abcd of columns 2 6 10 14
        trn1            \r4\().8h,  \r0\().8h,  \r2\().8h
        trn2            \r2\().8h,  \r0\().8h,  \r2\().8h
        trn1            \r6\().8h,  \r5\().8h,  \r7\().8h
        trn2            \r7\().8h,  \r5\().8h,  \r7\().8h
        trn1            \r5\().8h,  \t9\().8h,  \r3\().8h
        trn2            \t9\().8h,  \t9\().8h,  \r3\().8h
        trn1            \r3\().8h,  \t8\().8h,  \r1\().8h
        trn2            \t8\().8h,  \t8\().8h,  \r1\().8h

        // Round 3 (.4s): join the a-d and e-h halves; rN = column N in the
        // low half, column N+8 in the high half.
        // r6 is written before r2 because both still read the old r2.
        trn1            \r0\().4s,  \r3\().4s,  \r4\().4s
        trn2            \r4\().4s,  \r3\().4s,  \r4\().4s
        trn1            \r1\().4s,  \r5\().4s,  \r6\().4s
        trn2            \r5\().4s,  \r5\().4s,  \r6\().4s
        trn2            \r6\().4s,  \t8\().4s,  \r2\().4s
        trn1            \r2\().4s,  \t8\().4s,  \r2\().4s
        trn1            \r3\().4s,  \t9\().4s,  \r7\().4s
        trn2            \r7\().4s,  \t9\().4s,  \r7\().4s
.endm
180
// transpose_4x16b r0, r1, r2, r3, t4, t5, t6, t7
//
// Transpose four rows of sixteen bytes each, i.e. four 4x4 byte tiles side
// by side, each tile transposed within its own 32-bit lane.
//   r0-r3 - vector register names; on entry rows a..d (16 bytes each).
//           On exit 32-bit lane k of rN holds aM bM cM dM with M = 4*k + N.
//   t4-t7 - scratch vector registers, clobbered.
.macro  transpose_4x16b r0, r1, r2, r3, t4, t5, t6, t7
        // Round 1 (.16b): interleave bytes of rows a/b and of rows c/d.
        //   t4 = a0 b0 a2 b2 ... a14 b14     t5 = a1 b1 a3 b3 ... a15 b15
        //   t6 = c0 d0 c2 d2 ... c14 d14     t7 = c1 d1 c3 d3 ... c15 d15
        trn1            \t4\().16b, \r0\().16b, \r1\().16b
        trn2            \t5\().16b, \r0\().16b, \r1\().16b
        trn1            \t6\().16b, \r2\().16b, \r3\().16b
        trn2            \t7\().16b, \r2\().16b, \r3\().16b

        // Round 2 (.8h): merge the a/b pairs with the c/d pairs.
        //   r0 = columns 0 4 8 12     r2 = columns 2 6 10 14
        //   r1 = columns 1 5 9 13     r3 = columns 3 7 11 15
        trn1            \r0\().8h,  \t4\().8h,  \t6\().8h
        trn2            \r2\().8h,  \t4\().8h,  \t6\().8h
        trn1            \r1\().8h,  \t5\().8h,  \t7\().8h
        trn2            \r3\().8h,  \t5\().8h,  \t7\().8h
.endm
192
// transpose_4x4h r0, r1, r2, r3, t4, t5, t6, t7
//
// Transpose a 4x4 matrix of 16-bit elements held in the low 64 bits of four
// vector registers.
//   r0-r3 - vector register names; on entry rows a..d (.4h each),
//           on exit rN = aN bN cN dN.
//   t4-t7 - scratch vector registers, clobbered.
// All accesses use the 64-bit (.4h / .2s) register forms.
.macro  transpose_4x4h  r0, r1, r2, r3, t4, t5, t6, t7
        // Round 1 (.4h): interleave halfwords of rows a/b and of rows c/d.
        //   t4 = a0 b0 a2 b2     t5 = a1 b1 a3 b3
        //   t6 = c0 d0 c2 d2     t7 = c1 d1 c3 d3
        trn1            \t4\().4h,  \r0\().4h,  \r1\().4h
        trn2            \t5\().4h,  \r0\().4h,  \r1\().4h
        trn1            \t6\().4h,  \r2\().4h,  \r3\().4h
        trn2            \t7\().4h,  \r2\().4h,  \r3\().4h

        // Round 2 (.2s): merge the a/b pairs with the c/d pairs.
        //   r0 = a0 b0 c0 d0     r2 = a2 b2 c2 d2
        //   r1 = a1 b1 c1 d1     r3 = a3 b3 c3 d3
        trn1            \r0\().2s,  \t4\().2s,  \t6\().2s
        trn2            \r2\().2s,  \t4\().2s,  \t6\().2s
        trn1            \r1\().2s,  \t5\().2s,  \t7\().2s
        trn2            \r3\().2s,  \t5\().2s,  \t7\().2s
.endm
204
// transpose_4x4s r0, r1, r2, r3, t4, t5, t6, t7
//
// Transpose a 4x4 matrix of 32-bit elements.
//   r0-r3 - vector register names; on entry rows a..d (.4s each),
//           on exit rN = aN bN cN dN.
//   t4-t7 - scratch vector registers, clobbered.
.macro  transpose_4x4s  r0, r1, r2, r3, t4, t5, t6, t7
        // Round 1 (.4s): interleave words of rows a/b and of rows c/d.
        //   t4 = a0 b0 a2 b2     t5 = a1 b1 a3 b3
        //   t6 = c0 d0 c2 d2     t7 = c1 d1 c3 d3
        trn1            \t4\().4s,  \r0\().4s,  \r1\().4s
        trn2            \t5\().4s,  \r0\().4s,  \r1\().4s
        trn1            \t6\().4s,  \r2\().4s,  \r3\().4s
        trn2            \t7\().4s,  \r2\().4s,  \r3\().4s

        // Round 2 (.2d): merge the a/b pairs with the c/d pairs.
        //   r0 = a0 b0 c0 d0     r2 = a2 b2 c2 d2
        //   r1 = a1 b1 c1 d1     r3 = a3 b3 c3 d3
        trn1            \r0\().2d,  \t4\().2d,  \t6\().2d
        trn2            \r2\().2d,  \t4\().2d,  \t6\().2d
        trn1            \r1\().2d,  \t5\().2d,  \t7\().2d
        trn2            \r3\().2d,  \t5\().2d,  \t7\().2d
.endm
216
// transpose_4x8h r0, r1, r2, r3, t4, t5, t6, t7
//
// Transpose four rows of eight 16-bit elements each, i.e. two 4x4 halfword
// tiles side by side, each tile transposed within its own 64-bit half.
//   r0-r3 - vector register names; on entry rows a..d (.8h each).
//           On exit the low half of rN is aN bN cN dN and the high half is
//           the same for column N+4.
//   t4-t7 - scratch vector registers, clobbered.
.macro  transpose_4x8h  r0, r1, r2, r3, t4, t5, t6, t7
        // Round 1 (.8h): interleave halfwords of rows a/b and of rows c/d.
        //   t4 = a0 b0 a2 b2 a4 b4 a6 b6     t5 = a1 b1 a3 b3 a5 b5 a7 b7
        //   t6 = c0 d0 c2 d2 c4 d4 c6 d6     t7 = c1 d1 c3 d3 c5 d5 c7 d7
        trn1            \t4\().8h,  \r0\().8h,  \r1\().8h
        trn2            \t5\().8h,  \r0\().8h,  \r1\().8h
        trn1            \t6\().8h,  \r2\().8h,  \r3\().8h
        trn2            \t7\().8h,  \r2\().8h,  \r3\().8h

        // Round 2 (.4s): merge the a/b pairs with the c/d pairs.
        //   r0 = columns 0 | 4     r2 = columns 2 | 6
        //   r1 = columns 1 | 5     r3 = columns 3 | 7
        trn1            \r0\().4s,  \t4\().4s,  \t6\().4s
        trn2            \r2\().4s,  \t4\().4s,  \t6\().4s
        trn1            \r1\().4s,  \t5\().4s,  \t7\().4s
        trn2            \r3\().4s,  \t5\().4s,  \t7\().4s
.endm
228
229#endif /* DAV1D_SRC_ARM_64_UTIL_S */
230