/*****************************************************************************
 * asm.S: arm utility macros
 *****************************************************************************
 * Copyright (C) 2013-2020 MulticoreWare, Inc
 *
 * Authors: Mans Rullgard <mans@mansr.com>
 *          David Conrad <lessen42@gmail.com>
 *          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at license @ x265.com.
 *****************************************************************************/

.syntax unified

#if   HAVE_NEON
        .arch           armv7-a
#elif HAVE_ARMV6T2
        .arch           armv6t2
#elif HAVE_ARMV6
        .arch           armv6
#endif

.fpu neon

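// When PREFIX is defined (targets whose C symbols carry a leading underscore,
// e.g. Mach-O), EXTERN_ASM prepends that underscore so exported assembly
// symbols match their C-side names.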
#ifdef PREFIX
#   define EXTERN_ASM _
#else
#   define EXTERN_ASM
#endif

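// On non-ELF targets, ELF expands to `@`, the GAS line-comment character for
// ARM, so ELF-only directives (.size, .type, ...) drop out of the assembly.
// FUNC works the same way for the optional .func/.endfunc debug directives.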
#ifdef __ELF__
#   define ELF
#else
#   define ELF @
#endif

#if HAVE_AS_FUNC
#   define FUNC
#else
#   define FUNC @
#endif

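// require8/preserve8 emit EABI build attributes on ELF targets: tag 24
// (Tag_ABI_align_needed) marks code that requires an 8-byte-aligned stack on
// entry, tag 25 (Tag_ABI_align_preserved) marks code that keeps the stack
// 8-byte aligned for its callees.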
.macro require8, val=1
ELF     .eabi_attribute 24, \val
.endm

.macro preserve8, val=1
ELF     .eabi_attribute 25, \val
.endm

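// function/endfunc bracket a routine: `function name` aligns, emits the label
// (with the EXTERN_ASM prefix and .global when export=1), and handles the ELF
// symbol type/size bookkeeping; the matching `endfunc` is defined on the fly
// and purged after use, so each function re-instantiates it.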
.macro function name, export=1
    .macro endfunc
ELF     .size   \name, . - \name
FUNC    .endfunc
        .purgem endfunc
    .endm
        .align  2
.if \export == 1
        .global EXTERN_ASM\name
ELF     .hidden EXTERN_ASM\name
ELF     .type   EXTERN_ASM\name, %function
FUNC    .func   EXTERN_ASM\name
EXTERN_ASM\name:
.else
ELF     .hidden \name
ELF     .type   \name, %function
FUNC    .func   \name
\name:
.endif
.endm

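// Usage sketch (the routine name and body are illustrative only):
//
//     function x265_pixel_avg8_neon
//         vld1.8      {d0}, [r1]!
//         vld1.8      {d1}, [r2]!
//         vrhadd.u8   d0, d0, d1
//         vst1.8      {d0}, [r0]!
//         bx          lr
//     endfunc
//
// C code then calls x265_pixel_avg8_neon; the macro adds any PREFIX
// underscore automatically.
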
// Load the address of symbol \val into \rd: a movw/movt pair when the ISA
// supports it and the build is not position-independent, otherwise a
// literal-pool load.
.macro movrel rd, val
#if HAVE_ARMV6T2 && !defined(PIC)
        movw            \rd, #:lower16:\val
        movt            \rd, #:upper16:\val
#else
        ldr             \rd, =\val
#endif
.endm
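
// For example (hypothetical table name):
//     movrel  r3, scan4x4_table      @ r3 = &scan4x4_table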

// Load the constant \val into \rd.  Like movrel but for immediates: the movt
// is skipped when the value fits in 16 bits.
.macro movconst rd, val
#if HAVE_ARMV6T2
    movw        \rd, #:lower16:\val
.if \val >> 16
    movt        \rd, #:upper16:\val
.endif
#else
    ldr         \rd, =\val
#endif
.endm
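
// For example:
//     movconst r1, 0x2d41            @ fits in 16 bits: single movw
//     movconst r2, 0xfffc0000        @ needs both movw and movt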

#define GLUE(a, b) a ## b
#define JOIN(a, b) GLUE(a, b)
// X(sym) expands to the exported name of sym (EXTERN_ASM ## sym), so assembly
// can reference symbols by their C-visible names.
#define X(s) JOIN(EXTERN_ASM, s)
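
// For example, branching to an exported routine (name is illustrative):
//     bl      X(x265_satd_4x4_neon)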

// Fixed row strides, in bytes, of the encoder's packed source (fenc) and
// reconstruction (fdec) block buffers.
#define FENC_STRIDE 64
#define FDEC_STRIDE 32

// Horizontal reduction: sum every u16 lane of \a (plus \b, if given) via
// pairwise widening adds, leaving the total as a u64 in \dest.  \a is
// clobbered.
.macro HORIZ_ADD dest, a, b
.ifnb \b
    vadd.u16    \a, \a, \b
.endif
    vpaddl.u16  \a, \a
    vpaddl.u32  \dest, \a
.endm
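
// For example, collapsing eight u16 partial sums held in d4 and d5:
//     HORIZ_ADD d0, d4, d5           @ d0[0] = total; d4 is clobbered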

// Butterfly: \sum = \a + \b, \diff = \a - \b.  \sum must not be the same
// register as either input, since it is written before \a and \b are re-read.
.macro SUMSUB_AB sum, diff, a, b
    vadd.s16    \sum,  \a, \b
    vsub.s16    \diff, \a, \b
.endm

// Two butterflies at once: (\s1,\d1) from \a,\b and (\s2,\d2) from \c,\d.
.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
    SUMSUB_AB   \s1, \d1, \a, \b
    SUMSUB_AB   \s2, \d2, \c, \d
.endm

// Absolute value of two s16 vectors, in place.
.macro ABS2 a b
    vabs.s16 \a, \a
    vabs.s16 \b, \b
.endm

// dist = distance in elements (0 for vertical pass, 1/2 for horizontal passes)
// op = sumsub/amax (sum and diff / maximum of absolutes)
// d1/2 = destination registers
// s1/2 = source registers
.macro HADAMARD dist, op, d1, d2, s1, s2
.if \dist == 1
    vtrn.16     \s1, \s2
.else
    vtrn.32     \s1, \s2
.endif
.ifc \op, sumsub
    SUMSUB_AB   \d1, \d2, \s1, \s2
.else
    vabs.s16    \s1, \s1
    vabs.s16    \s2, \s2
    vmax.s16    \d1, \s1, \s2
.endif
.endm
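
// A typical SATD-style sequence pairs a sumsub pass with a final amax pass
// (register choice is illustrative; \d2 is unused by the amax form):
//     HADAMARD 1, sumsub, q0, q1, q2, q3   @ vtrn.16, then sum/diff
//     HADAMARD 2, amax,   q0, q1, q2, q3   @ vtrn.32, then max of absolutes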

// Transpose an 8x8 byte matrix held one row per D register.
.macro TRANSPOSE8x8 r0 r1 r2 r3 r4 r5 r6 r7
    vtrn.32         \r0, \r4
    vtrn.32         \r1, \r5
    vtrn.32         \r2, \r6
    vtrn.32         \r3, \r7
    vtrn.16         \r0, \r2
    vtrn.16         \r1, \r3
    vtrn.16         \r4, \r6
    vtrn.16         \r5, \r7
    vtrn.8          \r0, \r1
    vtrn.8          \r2, \r3
    vtrn.8          \r4, \r5
    vtrn.8          \r6, \r7
.endm

// Transpose 4x4 byte blocks whose rows occupy the 32-bit lanes of \r0-\r3;
// each 32-bit lane position is transposed independently.
.macro TRANSPOSE4x4 r0 r1 r2 r3
    vtrn.16         \r0, \r2
    vtrn.16         \r1, \r3
    vtrn.8          \r0, \r1
    vtrn.8          \r2, \r3
.endm

// Transpose a 4x4 matrix of 16-bit elements, one row per D register.
.macro TRANSPOSE4x4_16  r0, r1, r2, r3
    vtrn.32     \r0, \r2            // r0 = [21 20 01 00], r2 = [23 22 03 02]
    vtrn.32     \r1, \r3            // r1 = [31 30 11 10], r3 = [33 32 13 12]
    vtrn.16     \r0, \r1            // r0 = [30 20 10 00], r1 = [31 21 11 01]
    vtrn.16     \r2, \r3            // r2 = [32 22 12 02], r3 = [33 23 13 03]
.endm
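
// Sketch of one dimension of a 4x4 Hadamard built from these pieces
// (register choice is illustrative):
//     SUMSUB_ABCD d4, d6, d5, d7, d0, d1, d2, d3   @ first butterfly stage
//     SUMSUB_ABCD d0, d2, d1, d3, d4, d5, d6, d7   @ second butterfly stage
//     TRANSPOSE4x4_16 d0, d1, d2, d3               @ rows become columns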

// Transpose two 4x4 16-bit matrices at once: the A and B register sets are
// independent, each following the same pattern as TRANSPOSE4x4_16.
.macro TRANSPOSE4x4x2_16  rA0, rA1, rA2, rA3, rB0, rB1, rB2, rB3
    vtrn.32     \rA0, \rA2          // r0 = [21 20 01 00], r2 = [23 22 03 02]
    vtrn.32     \rA1, \rA3          // r1 = [31 30 11 10], r3 = [33 32 13 12]
    vtrn.32     \rB0, \rB2          // same layout for the B set
    vtrn.32     \rB1, \rB3
    vtrn.16     \rA0, \rA1          // r0 = [30 20 10 00], r1 = [31 21 11 01]
    vtrn.16     \rA2, \rA3          // r2 = [32 22 12 02], r3 = [33 23 13 03]
    vtrn.16     \rB0, \rB1
    vtrn.16     \rB2, \rB3
.endm