/******************************************************************************
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2020, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#include "src/arm/asm.S"
#include "util.S"

// The exported functions in this file have the following signature:
// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob,
//                int bitdepth_max);

// Most of the functions use the following register layout:
// x0-x3  external parameters
// x4     function pointer to first transform
// x5     function pointer to second transform
// x6     output parameter for helper function
// x7     input parameter for helper function
// x8     input stride for helper function
// x9-x12 scratch variables for helper functions
// x13    pointer to list of eob thresholds
// x14    return pointer for helper function
// x15    return pointer for main function

// The SIMD registers most often use the following layout:
// v0-v1   multiplication coefficients
// v2-v7   scratch registers
// v8-v15  unused
// v16-v31 inputs/outputs of transforms

const idct_coeffs, align=4
        // idct4
        .int            2896, 2896*8*(1<<16), 1567, 3784
        // idct8
        .int            799, 4017, 3406, 2276
        // idct16
        .int            401, 4076, 3166, 2598
        .int            1931, 3612, 3920, 1189
        // idct32
        .int            201, 4091, 3035, 2751
        .int            1751, 3703, 3857, 1380
        .int            995, 3973, 3513, 2106
        .int            2440, 3290, 4052, 601
endconst

const idct64_coeffs, align=4
        .int            101*8*(1<<16), 4095*8*(1<<16), 2967*8*(1<<16), -2824*8*(1<<16)
        .int            1660*8*(1<<16), 3745*8*(1<<16), 3822*8*(1<<16), -1474*8*(1<<16)
        .int            4076, 401, 4017, 799

        .int            4036*8*(1<<16), -700*8*(1<<16), 2359*8*(1<<16), 3349*8*(1<<16)
        .int            3461*8*(1<<16), -2191*8*(1<<16), 897*8*(1<<16), 3996*8*(1<<16)
        .int            -3166, -2598, -799, -4017

        .int            501*8*(1<<16), 4065*8*(1<<16), 3229*8*(1<<16), -2520*8*(1<<16)
        .int            2019*8*(1<<16), 3564*8*(1<<16), 3948*8*(1<<16), -1092*8*(1<<16)
        .int            3612, 1931, 2276, 3406

        .int            4085*8*(1<<16), -301*8*(1<<16), 2675*8*(1<<16), 3102*8*(1<<16)
        .int            3659*8*(1<<16), -1842*8*(1<<16), 1285*8*(1<<16), 3889*8*(1<<16)
        .int            -3920, -1189, -3406, -2276
endconst

const iadst4_coeffs, align=4
        .int            1321, 3803, 2482, 3344
endconst

const iadst8_coeffs, align=4
        .int            4076, 401, 3612, 1931
        .int            2598, 3166, 1189, 3920
        // idct_coeffs
        .int            2896, 0, 1567, 3784
endconst

const iadst16_coeffs, align=4
        .int            4091, 201, 3973, 995
        .int            3703, 1751, 3290, 2440
        .int            2751, 3035, 2106, 3513
        .int            1380, 3857, 601, 4052
endconst

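// mul_mla/mul_mls compute, per 32 bit lane, d = s0*c0 + s1*c1 and
// d = s0*c0 - s1*c1; this is the rotation (butterfly) primitive that all
// the transforms below are built from.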
.macro mul_mla d, s0, s1, c0, c1
        mul             \d\().4s, \s0\().4s, \c0
        mla             \d\().4s, \s1\().4s, \c1
.endm

.macro mul_mls d, s0, s1, c0, c1
        mul             \d\().4s, \s0\().4s, \c0
        mls             \d\().4s, \s1\().4s, \c1
.endm

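// scale_input multiplies each register by \c with sqrdmulh, treating \c as
// a Q31-style constant; the callers pass 2896*8*(1<<16), roughly 1/sqrt(2)
// in 0.31 fixed point, to fold the rectangular transform scaling into the
// coefficient load.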
.macro scale_input sz, c, r0, r1, r2, r3, r4, r5, r6, r7
        sqrdmulh        \r0\sz,  \r0\sz,  \c
        sqrdmulh        \r1\sz,  \r1\sz,  \c
        sqrdmulh        \r2\sz,  \r2\sz,  \c
        sqrdmulh        \r3\sz,  \r3\sz,  \c
.ifnb \r4
        sqrdmulh        \r4\sz,  \r4\sz,  \c
        sqrdmulh        \r5\sz,  \r5\sz,  \c
        sqrdmulh        \r6\sz,  \r6\sz,  \c
        sqrdmulh        \r7\sz,  \r7\sz,  \c
.endif
.endm

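// load_add_store executes one step each of the load/shift/add/clamp/store
// pipeline; the load_add_store_* wrappers below pass staggered register
// arguments so that successive rows overlap and memory latency is hidden.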
.macro load_add_store load, shift, addsrc, adddst, max, min, store, dst, src, shiftbits=4
.ifnb \load
        ld1             {\load},  [\src], x1
.endif
.ifnb \shift
        srshr           \shift,  \shift,  #\shiftbits
.endif
.ifnb \addsrc
        sqadd           \adddst, \adddst, \addsrc
.endif
.ifnb \max
        smax            \max,  \max,  v6.8h
.endif
.ifnb \min
        smin            \min,  \min,  v7.8h
.endif
.ifnb \store
        st1             {\store},  [\dst], x1
.endif
.endm
.macro load_add_store_8x16 dst, src
        mov             \src, \dst
        movi            v6.8h,   #0
        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff
        load_add_store  v2.8h, v16.8h,      ,       ,       ,       ,       , \dst, \src
        load_add_store  v3.8h, v17.8h,      ,       ,       ,       ,       , \dst, \src
        load_add_store  v4.8h, v18.8h, v2.8h, v16.8h,       ,       ,       , \dst, \src
        load_add_store  v5.8h, v19.8h, v3.8h, v17.8h, v16.8h,       ,       , \dst, \src
        load_add_store  v2.8h, v20.8h, v4.8h, v18.8h, v17.8h, v16.8h,       , \dst, \src
        load_add_store  v3.8h, v21.8h, v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src
        load_add_store  v4.8h, v22.8h, v2.8h, v20.8h, v19.8h, v18.8h, v17.8h, \dst, \src
        load_add_store  v5.8h, v23.8h, v3.8h, v21.8h, v20.8h, v19.8h, v18.8h, \dst, \src
        load_add_store  v2.8h, v24.8h, v4.8h, v22.8h, v21.8h, v20.8h, v19.8h, \dst, \src
        load_add_store  v3.8h, v25.8h, v5.8h, v23.8h, v22.8h, v21.8h, v20.8h, \dst, \src
        load_add_store  v4.8h, v26.8h, v2.8h, v24.8h, v23.8h, v22.8h, v21.8h, \dst, \src
        load_add_store  v5.8h, v27.8h, v3.8h, v25.8h, v24.8h, v23.8h, v22.8h, \dst, \src
        load_add_store  v2.8h, v28.8h, v4.8h, v26.8h, v25.8h, v24.8h, v23.8h, \dst, \src
        load_add_store  v3.8h, v29.8h, v5.8h, v27.8h, v26.8h, v25.8h, v24.8h, \dst, \src
        load_add_store  v4.8h, v30.8h, v2.8h, v28.8h, v27.8h, v26.8h, v25.8h, \dst, \src
        load_add_store  v5.8h, v31.8h, v3.8h, v29.8h, v28.8h, v27.8h, v26.8h, \dst, \src
        load_add_store       ,       , v4.8h, v30.8h, v29.8h, v28.8h, v27.8h, \dst, \src
        load_add_store       ,       , v5.8h, v31.8h, v30.8h, v29.8h, v28.8h, \dst, \src
        load_add_store       ,       ,      ,       , v31.8h, v30.8h, v29.8h, \dst, \src
        load_add_store       ,       ,      ,       ,       , v31.8h, v30.8h, \dst, \src
        load_add_store       ,       ,      ,       ,       ,       , v31.8h, \dst, \src
.endm
.macro load_add_store_8x8 dst, src, shiftbits=4
        mov             \src, \dst
        movi            v6.8h,   #0
        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff
        load_add_store  v2.8h, v16.8h,      ,       ,       ,       ,       , \dst, \src, \shiftbits
        load_add_store  v3.8h, v17.8h,      ,       ,       ,       ,       , \dst, \src, \shiftbits
        load_add_store  v4.8h, v18.8h, v2.8h, v16.8h,       ,       ,       , \dst, \src, \shiftbits
        load_add_store  v5.8h, v19.8h, v3.8h, v17.8h, v16.8h,       ,       , \dst, \src, \shiftbits
        load_add_store  v2.8h, v20.8h, v4.8h, v18.8h, v17.8h, v16.8h,       , \dst, \src, \shiftbits
        load_add_store  v3.8h, v21.8h, v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits
        load_add_store  v4.8h, v22.8h, v2.8h, v20.8h, v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits
        load_add_store  v5.8h, v23.8h, v3.8h, v21.8h, v20.8h, v19.8h, v18.8h, \dst, \src, \shiftbits
        load_add_store       ,       , v4.8h, v22.8h, v21.8h, v20.8h, v19.8h, \dst, \src, \shiftbits
        load_add_store       ,       , v5.8h, v23.8h, v22.8h, v21.8h, v20.8h, \dst, \src, \shiftbits
        load_add_store       ,       ,      ,       , v23.8h, v22.8h, v21.8h, \dst, \src, \shiftbits
        load_add_store       ,       ,      ,       ,       , v23.8h, v22.8h, \dst, \src, \shiftbits
        load_add_store       ,       ,      ,       ,       ,       , v23.8h, \dst, \src, \shiftbits
.endm
.macro load_add_store_8x4 dst, src, shiftbits=4
        mov             \src, \dst
        movi            v6.8h,   #0
        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff
        load_add_store  v2.8h, v16.8h,      ,       ,       ,       ,       , \dst, \src, \shiftbits
        load_add_store  v3.8h, v17.8h,      ,       ,       ,       ,       , \dst, \src, \shiftbits
        load_add_store  v4.8h, v18.8h, v2.8h, v16.8h,       ,       ,       , \dst, \src, \shiftbits
        load_add_store  v5.8h, v19.8h, v3.8h, v17.8h, v16.8h,       ,       , \dst, \src, \shiftbits
        load_add_store       ,       , v4.8h, v18.8h, v17.8h, v16.8h,       , \dst, \src, \shiftbits
        load_add_store       ,       , v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits
        load_add_store       ,       ,      ,       , v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits
        load_add_store       ,       ,      ,       ,       , v19.8h, v18.8h, \dst, \src, \shiftbits
        load_add_store       ,       ,      ,       ,       ,       , v19.8h, \dst, \src, \shiftbits
.endm
.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, max, min, store, dst, src
.ifnb \load
        ld1             {\load}[0],  [\src], x1
.endif
.ifnb \inssrc
        ins             \insdst\().d[1],   \inssrc\().d[0]
.endif
.ifnb \shift
        srshr           \shift,  \shift,  #4
.endif
.ifnb \load
        ld1             {\load}[1],  [\src], x1
.endif
.ifnb \addsrc
        sqadd           \adddst, \adddst, \addsrc
.endif
.ifnb \store
        st1             {\store}[0],  [\dst], x1
.endif
.ifnb \max
        smax            \max,  \max,  v6.8h
.endif
.ifnb \min
        smin            \min,  \min,  v7.8h
.endif
.ifnb \store
        st1             {\store}[1],  [\dst], x1
.endif
.endm
.macro load_add_store_4x16 dst, src
        mov             \src, \dst
        movi            v6.8h,   #0
        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff
        load_add_store4 v0.d, v17, v16,       ,      ,       ,       ,       ,      , \dst, \src
        load_add_store4 v1.d, v19, v18,       ,      ,       ,       ,       ,      , \dst, \src
        load_add_store4 v2.d, v21, v20, v16.8h,      ,       ,       ,       ,      , \dst, \src
        load_add_store4 v3.d, v23, v22, v18.8h, v0.8h, v16.8h,       ,       ,      , \dst, \src
        load_add_store4 v0.d, v25, v24, v20.8h, v1.8h, v18.8h, v16.8h,       ,      , \dst, \src
        load_add_store4 v1.d, v27, v26, v22.8h, v2.8h, v20.8h, v18.8h, v16.8h,      , \dst, \src
        load_add_store4 v2.d, v29, v28, v24.8h, v3.8h, v22.8h, v20.8h, v18.8h, v16.d, \dst, \src
        load_add_store4 v3.d, v31, v30, v26.8h, v0.8h, v24.8h, v22.8h, v20.8h, v18.d, \dst, \src
        load_add_store4     ,    ,    , v28.8h, v1.8h, v26.8h, v24.8h, v22.8h, v20.d, \dst, \src
        load_add_store4     ,    ,    , v30.8h, v2.8h, v28.8h, v26.8h, v24.8h, v22.d, \dst, \src
        load_add_store4     ,    ,    ,       , v3.8h, v30.8h, v28.8h, v26.8h, v24.d, \dst, \src
        load_add_store4     ,    ,    ,       ,      ,       , v30.8h, v28.8h, v26.d, \dst, \src
        load_add_store4     ,    ,    ,       ,      ,       ,       , v30.8h, v28.d, \dst, \src
        load_add_store4     ,    ,    ,       ,      ,       ,       ,       , v30.d, \dst, \src
.endm
.macro load_add_store_4x8 dst, src
        mov             \src, \dst
        movi            v6.8h,   #0
        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff
        load_add_store4 v0.d, v17, v16,       ,      ,       ,       ,       ,      , \dst, \src
        load_add_store4 v1.d, v19, v18,       ,      ,       ,       ,       ,      , \dst, \src
        load_add_store4 v2.d, v21, v20, v16.8h,      ,       ,       ,       ,      , \dst, \src
        load_add_store4 v3.d, v23, v22, v18.8h, v0.8h, v16.8h,       ,       ,      , \dst, \src
        load_add_store4     ,    ,    , v20.8h, v1.8h, v18.8h, v16.8h,       ,      , \dst, \src
        load_add_store4     ,    ,    , v22.8h, v2.8h, v20.8h, v18.8h, v16.8h,      , \dst, \src
        load_add_store4     ,    ,    ,       , v3.8h, v22.8h, v20.8h, v18.8h, v16.d, \dst, \src
        load_add_store4     ,    ,    ,       ,      ,       , v22.8h, v20.8h, v18.d, \dst, \src
        load_add_store4     ,    ,    ,       ,      ,       ,       , v22.8h, v20.d, \dst, \src
        load_add_store4     ,    ,    ,       ,      ,       ,       ,       , v22.d, \dst, \src
.endm

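// idct_dc handles the eob == 0 case: with only a DC coefficient present,
// the transform degenerates to adding one rounded constant to every pixel.
// The constant is scaled by 1/sqrt(2) once (or twice, for 2:1 rectangular
// blocks), then broadcast by the per-width idct_dc_w* helpers below.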
.macro idct_dc w, h, shift
        cbnz            w3,  1f
        movz            w16, #2896*8, lsl #16
        ld1r            {v16.4s}, [x2]
        dup             v0.2s,   w16
        sqrdmulh        v20.4s,  v16.4s,  v0.s[0]
        str             wzr, [x2]
.if (\w == 2*\h) || (2*\w == \h)
        sqrdmulh        v20.4s,  v20.4s,  v0.s[0]
.endif
.if \shift > 0
        sqrshrn         v16.4h,  v20.4s,  #\shift
        sqrshrn2        v16.8h,  v20.4s,  #\shift
.else
        sqxtn           v16.4h,  v20.4s
        sqxtn2          v16.8h,  v20.4s
.endif
        sqrdmulh        v16.8h,  v16.8h,  v0.h[1]
        srshr           v16.8h,  v16.8h,  #4
        mov             w4,  #\h
        b               idct_dc_w\w\()_neon
1:
.endm

function idct_dc_w4_neon
        movi            v30.8h,  #0
        mvni            v31.8h,  #0xfc, lsl #8 // 0x3ff
1:
        ld1             {v0.d}[0], [x0], x1
        ld1             {v0.d}[1], [x0], x1
        ld1             {v1.d}[0], [x0], x1
        subs            w4,  w4,  #4
        ld1             {v1.d}[1], [x0], x1
        sqadd           v0.8h,   v0.8h,   v16.8h
        sub             x0,  x0,  x1, lsl #2
        sqadd           v1.8h,   v1.8h,   v16.8h
        smax            v0.8h,   v0.8h,   v30.8h
        smax            v1.8h,   v1.8h,   v30.8h
        smin            v0.8h,   v0.8h,   v31.8h
        st1             {v0.d}[0], [x0], x1
        smin            v1.8h,   v1.8h,   v31.8h
        st1             {v0.d}[1], [x0], x1
        st1             {v1.d}[0], [x0], x1
        st1             {v1.d}[1], [x0], x1
        b.gt            1b
        ret
endfunc

function idct_dc_w8_neon
        movi            v30.8h,  #0
        mvni            v31.8h,  #0xfc, lsl #8 // 0x3ff
1:
        ld1             {v0.8h}, [x0], x1
        subs            w4,  w4,  #4
        ld1             {v1.8h}, [x0], x1
        sqadd           v0.8h,   v0.8h,   v16.8h
        ld1             {v2.8h}, [x0], x1
        sqadd           v1.8h,   v1.8h,   v16.8h
        ld1             {v3.8h}, [x0], x1
        sqadd           v2.8h,   v2.8h,   v16.8h
        sqadd           v3.8h,   v3.8h,   v16.8h
        sub             x0,  x0,  x1, lsl #2
        smax            v0.8h,   v0.8h,   v30.8h
        smax            v1.8h,   v1.8h,   v30.8h
        smax            v2.8h,   v2.8h,   v30.8h
        smax            v3.8h,   v3.8h,   v30.8h
        smin            v0.8h,   v0.8h,   v31.8h
        smin            v1.8h,   v1.8h,   v31.8h
        st1             {v0.8h}, [x0], x1
        smin            v2.8h,   v2.8h,   v31.8h
        st1             {v1.8h}, [x0], x1
        smin            v3.8h,   v3.8h,   v31.8h
        st1             {v2.8h}, [x0], x1
        st1             {v3.8h}, [x0], x1
        b.gt            1b
        ret
endfunc

function idct_dc_w16_neon
        movi            v30.8h,  #0
        mvni            v31.8h,  #0xfc, lsl #8 // 0x3ff
1:
        ld1             {v0.8h, v1.8h}, [x0], x1
        subs            w4,  w4,  #2
        ld1             {v2.8h, v3.8h}, [x0], x1
        sqadd           v0.8h,   v0.8h,   v16.8h
        sqadd           v1.8h,   v1.8h,   v16.8h
        sub             x0,  x0,  x1, lsl #1
        sqadd           v2.8h,   v2.8h,   v16.8h
        sqadd           v3.8h,   v3.8h,   v16.8h
        smax            v0.8h,   v0.8h,   v30.8h
        smax            v1.8h,   v1.8h,   v30.8h
        smax            v2.8h,   v2.8h,   v30.8h
        smax            v3.8h,   v3.8h,   v30.8h
        smin            v0.8h,   v0.8h,   v31.8h
        smin            v1.8h,   v1.8h,   v31.8h
        smin            v2.8h,   v2.8h,   v31.8h
        st1             {v0.8h, v1.8h}, [x0], x1
        smin            v3.8h,   v3.8h,   v31.8h
        st1             {v2.8h, v3.8h}, [x0], x1
        b.gt            1b
        ret
endfunc

function idct_dc_w32_neon
        movi            v30.8h,  #0
        mvni            v31.8h,  #0xfc, lsl #8 // 0x3ff
1:
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
        subs            w4,  w4,  #1
        sqadd           v0.8h,   v0.8h,   v16.8h
        sqadd           v1.8h,   v1.8h,   v16.8h
        sqadd           v2.8h,   v2.8h,   v16.8h
        sqadd           v3.8h,   v3.8h,   v16.8h
        smax            v0.8h,   v0.8h,   v30.8h
        smax            v1.8h,   v1.8h,   v30.8h
        smax            v2.8h,   v2.8h,   v30.8h
        smax            v3.8h,   v3.8h,   v30.8h
        smin            v0.8h,   v0.8h,   v31.8h
        smin            v1.8h,   v1.8h,   v31.8h
        smin            v2.8h,   v2.8h,   v31.8h
        smin            v3.8h,   v3.8h,   v31.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        b.gt            1b
        ret
endfunc

function idct_dc_w64_neon
        movi            v30.8h,  #0
        mvni            v31.8h,  #0xfc, lsl #8 // 0x3ff
        sub             x1,  x1,  #64
1:
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        subs            w4,  w4,  #1
        sqadd           v0.8h,   v0.8h,   v16.8h
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0]
        sqadd           v1.8h,   v1.8h,   v16.8h
        sub             x0,  x0,  #64
        sqadd           v2.8h,   v2.8h,   v16.8h
        sqadd           v3.8h,   v3.8h,   v16.8h
        sqadd           v4.8h,   v4.8h,   v16.8h
        sqadd           v5.8h,   v5.8h,   v16.8h
        sqadd           v6.8h,   v6.8h,   v16.8h
        sqadd           v7.8h,   v7.8h,   v16.8h
        smax            v0.8h,   v0.8h,   v30.8h
        smax            v1.8h,   v1.8h,   v30.8h
        smax            v2.8h,   v2.8h,   v30.8h
        smax            v3.8h,   v3.8h,   v30.8h
        smax            v4.8h,   v4.8h,   v30.8h
        smax            v5.8h,   v5.8h,   v30.8h
        smax            v6.8h,   v6.8h,   v30.8h
        smax            v7.8h,   v7.8h,   v30.8h
        smin            v0.8h,   v0.8h,   v31.8h
        smin            v1.8h,   v1.8h,   v31.8h
        smin            v2.8h,   v2.8h,   v31.8h
        smin            v3.8h,   v3.8h,   v31.8h
        smin            v4.8h,   v4.8h,   v31.8h
        smin            v5.8h,   v5.8h,   v31.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        smin            v6.8h,   v6.8h,   v31.8h
        smin            v7.8h,   v7.8h,   v31.8h
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
        b.gt            1b
        ret
endfunc

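// iwht4 is the 4 point inverse Walsh-Hadamard transform used for lossless
// blocks; it needs only adds, subtracts and a single shift, with no
// rounding constants.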
.macro iwht4
        add             v16.4s,  v16.4s,  v17.4s
        sub             v21.4s,  v18.4s,  v19.4s
        sub             v20.4s,  v16.4s,  v21.4s
        sshr            v20.4s,  v20.4s,  #1
        sub             v18.4s,  v20.4s,  v17.4s
        sub             v17.4s,  v20.4s,  v19.4s
        add             v19.4s,  v21.4s,  v18.4s
        sub             v16.4s,  v16.4s,  v17.4s
.endm

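// idct_4 computes a 4 point inverse DCT across 4 lanes in parallel: the
// even inputs (r0, r2) are rotated by 2896/4096 (cos(pi/4)) and the odd
// inputs (r1, r3) by the (1567, 3784)/4096 pair, followed by the usual
// butterfly combine.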
.macro idct_4 r0, r1, r2, r3
        mul_mla         v6,  \r1, \r3, v0.s[3], v0.s[2]
        mul_mls         v4,  \r1, \r3, v0.s[2], v0.s[3]
        mul_mla         v2,  \r0, \r2, v0.s[0], v0.s[0]
        mul_mls         v3,  \r0, \r2, v0.s[0], v0.s[0]
        srshr           v6.4s,  v6.4s,  #12
        srshr           v7.4s,  v4.4s,  #12
        srshr           v2.4s,  v2.4s,  #12
        srshr           v3.4s,  v3.4s,  #12
        sqadd           \r0\().4s,  v2.4s,   v6.4s
        sqsub           \r3\().4s,  v2.4s,   v6.4s
        sqadd           \r1\().4s,  v3.4s,   v7.4s
        sqsub           \r2\().4s,  v3.4s,   v7.4s
.endm

function inv_dct_4s_x4_neon
        movrel          x16, idct_coeffs
        ld1             {v0.4s}, [x16]
        idct_4          v16, v17, v18, v19
        ret
endfunc

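// iadst_4x4 evaluates the 4 point inverse ADST directly as a small matrix
// multiply with the (1321, 3803, 2482, 3344)/4096 basis rather than via a
// butterfly decomposition; the flipadst variant below just reverses the
// output registers.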
.macro iadst_4x4 o0, o1, o2, o3
        movrel          x16, iadst4_coeffs
        ld1             {v0.4s}, [x16]

        sub             v3.4s,   v16.4s,  v18.4s
        mul             v4.4s,   v16.4s,  v0.s[0]
        mla             v4.4s,   v18.4s,  v0.s[1]
        mla             v4.4s,   v19.4s,  v0.s[2]
        mul             v7.4s,   v17.4s,  v0.s[3]
        add             v3.4s,   v3.4s,   v19.4s
        mul             v5.4s,   v16.4s,  v0.s[2]
        mls             v5.4s,   v18.4s,  v0.s[0]
        mls             v5.4s,   v19.4s,  v0.s[1]

        add             \o3\().4s, v4.4s,     v5.4s
        mul             \o2\().4s, v3.4s,     v0.s[3]
        add             \o0\().4s, v4.4s,     v7.4s
        add             \o1\().4s, v5.4s,     v7.4s
        sub             \o3\().4s, \o3\().4s, v7.4s

        srshr           \o0\().4s, \o0\().4s, #12
        srshr           \o2\().4s, \o2\().4s, #12
        srshr           \o1\().4s, \o1\().4s, #12
        srshr           \o3\().4s, \o3\().4s, #12
.endm

function inv_adst_4s_x4_neon
        iadst_4x4       v16, v17, v18, v19
        ret
endfunc

function inv_flipadst_4s_x4_neon
        iadst_4x4       v19, v18, v17, v16
        ret
endfunc

function inv_identity_4s_x4_neon
        movz            w16, #(5793-4096)*8, lsl #16
        dup             v0.2s,   w16
        sqrdmulh        v4.4s,   v16.4s,  v0.s[0]
        sqrdmulh        v5.4s,   v17.4s,  v0.s[0]
        sqrdmulh        v6.4s,   v18.4s,  v0.s[0]
        sqrdmulh        v7.4s,   v19.4s,  v0.s[0]
        sqadd           v16.4s,  v16.4s,  v4.4s
        sqadd           v17.4s,  v17.4s,  v5.4s
        sqadd           v18.4s,  v18.4s,  v6.4s
        sqadd           v19.4s,  v19.4s,  v7.4s
        ret
endfunc

function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1
        mov             x15, x30
        movi            v30.4s,  #0
        movi            v31.4s,  #0
        ld1             {v16.4s,v17.4s,v18.4s,v19.4s}, [x2]
        st1             {v30.4s, v31.4s}, [x2], #32

        sshr            v16.4s,  v16.4s,  #2
        sshr            v17.4s,  v17.4s,  #2
        sshr            v18.4s,  v18.4s,  #2
        sshr            v19.4s,  v19.4s,  #2

        iwht4

        st1             {v30.4s, v31.4s}, [x2], #32
        transpose_4x4s  v16, v17, v18, v19, v20, v21, v22, v23

        iwht4

        ld1             {v0.d}[0], [x0], x1
        sqxtn           v16.4h,  v16.4s
        ld1             {v0.d}[1], [x0], x1
        sqxtn2          v16.8h,  v17.4s
        ld1             {v1.d}[0], [x0], x1
        sqxtn           v18.4h,  v18.4s
        ld1             {v1.d}[1], [x0], x1
        sqxtn2          v18.8h,  v19.4s

        b               L(itx_4x4_end)
endfunc

function inv_txfm_add_4x4_neon
        movi            v30.4s,  #0
        movi            v31.4s,  #0
        ld1             {v16.4s,v17.4s,v18.4s,v19.4s}, [x2]
        st1             {v30.4s, v31.4s}, [x2], #32

        blr             x4

        st1             {v30.4s, v31.4s}, [x2], #32
        sqxtn           v16.4h,  v16.4s
        sqxtn           v17.4h,  v17.4s
        sqxtn           v18.4h,  v18.4s
        sqxtn           v19.4h,  v19.4s
        transpose_4x4h  v16, v17, v18, v19, v20, v21, v22, v23

        blr             x5

        ld1             {v0.d}[0], [x0], x1
        ld1             {v0.d}[1], [x0], x1
        ins             v16.d[1], v17.d[0]
        ins             v18.d[1], v19.d[0]
        ld1             {v1.d}[0], [x0], x1
        ld1             {v1.d}[1], [x0], x1
        srshr           v16.8h,  v16.8h,  #4
        srshr           v18.8h,  v18.8h,  #4

L(itx_4x4_end):
        mvni            v31.8h,  #0xfc, lsl #8 // 0x3ff
        sub             x0,  x0,  x1, lsl #2
        sqadd           v16.8h,  v16.8h,  v0.8h
        sqadd           v18.8h,  v18.8h,  v1.8h
        smax            v16.8h,  v16.8h,  v30.8h
        smax            v18.8h,  v18.8h,  v30.8h
        smin            v16.8h,  v16.8h,  v31.8h
        st1             {v16.d}[0], [x0], x1
        smin            v18.8h,  v18.8h,  v31.8h
        st1             {v16.d}[1], [x0], x1
        st1             {v18.d}[0], [x0], x1
        st1             {v18.d}[1], [x0], x1

        br              x15
endfunc

.macro def_fn_4x4 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon, export=1
        mov             x15, x30

.ifc \txfm1\()_\txfm2, dct_dct
        cbnz            w3,  1f
        movz            w16, #2896*8, lsl #16
        ld1r            {v16.4s}, [x2]
        dup             v4.2s,   w16
        str             wzr, [x2]
        sqrdmulh        v16.4s,  v16.4s,  v4.s[0]
        ld1             {v0.d}[0], [x0], x1
        sqxtn           v20.4h,  v16.4s
        sqxtn2          v20.8h,  v16.4s
        ld1             {v0.d}[1], [x0], x1
        sqrdmulh        v20.8h,  v20.8h,  v4.h[1]
        ld1             {v1.d}[0], [x0], x1
        srshr           v16.8h,  v20.8h,  #4
        ld1             {v1.d}[1], [x0], x1
        srshr           v18.8h,  v20.8h,  #4
        movi            v30.8h,  #0
        b               L(itx_4x4_end)
1:
.endif
        adr             x4,  inv_\txfm1\()_4s_x4_neon
        movrel          x5,  X(inv_\txfm2\()_4h_x4_neon)
        b               inv_txfm_add_4x4_neon
endfunc
.endm

def_fn_4x4 dct, dct
def_fn_4x4 identity, identity
def_fn_4x4 dct, adst
def_fn_4x4 dct, flipadst
def_fn_4x4 dct, identity
def_fn_4x4 adst, dct
def_fn_4x4 adst, adst
def_fn_4x4 adst, flipadst
def_fn_4x4 flipadst, dct
def_fn_4x4 flipadst, adst
def_fn_4x4 flipadst, flipadst
def_fn_4x4 identity, dct

def_fn_4x4 adst, identity
def_fn_4x4 flipadst, identity
def_fn_4x4 identity, adst
def_fn_4x4 identity, flipadst

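// idct_8 splits into an even half, which is just idct_4 on r0/r2/r4/r6,
// and an odd half built from the (799, 4017) and (3406, 2276) rotations
// plus a final cos(pi/4) stage producing t5/t6.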
.macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7
        idct_4          \r0, \r2, \r4, \r6

        mul_mls         v2,  \r1, \r7, v1.s[0], v1.s[1]  // -> t4a
        mul_mla         v4,  \r1, \r7, v1.s[1], v1.s[0]  // -> t7a
        mul_mls         v6,  \r5, \r3, v1.s[2], v1.s[3]  // -> t5a
        mul_mla         v7,  \r5, \r3, v1.s[3], v1.s[2]  // -> t6a
        srshr           \r1\().4s, v2.4s,  #12           // t4a
        srshr           \r7\().4s, v4.4s,  #12           // t7a
        srshr           \r3\().4s, v6.4s,  #12           // t5a
        srshr           \r5\().4s, v7.4s,  #12           // t6a

        sqadd           v2.4s,     \r1\().4s,  \r3\().4s // t4
        sqsub           \r1\().4s, \r1\().4s,  \r3\().4s // t5a
        sqadd           v3.4s,     \r7\().4s,  \r5\().4s // t7
        sqsub           \r3\().4s, \r7\().4s,  \r5\().4s // t6a

        mul_mls         v4,  \r3, \r1, v0.s[0], v0.s[0]  // -> t5
        mul_mla         v6,  \r3, \r1, v0.s[0], v0.s[0]  // -> t6
        srshr           v4.4s,  v4.4s,  #12              // t5
        srshr           v5.4s,  v6.4s,  #12              // t6

        sqsub           \r7\().4s,  \r0\().4s,  v3.4s    // out7
        sqadd           \r0\().4s,  \r0\().4s,  v3.4s    // out0
        sqadd           \r1\().4s,  \r2\().4s,  v5.4s    // out1
        sqsub           v6.4s,      \r2\().4s,  v5.4s    // out6
        sqadd           \r2\().4s,  \r4\().4s,  v4.4s    // out2
        sqsub           \r5\().4s,  \r4\().4s,  v4.4s    // out5
        sqadd           \r3\().4s,  \r6\().4s,  v2.4s    // out3
        sqsub           \r4\().4s,  \r6\().4s,  v2.4s    // out4
        mov             \r6\().16b, v6.16b               // out6
.endm

function inv_dct_4s_x8_neon
        movrel          x16, idct_coeffs
        ld1             {v0.4s, v1.4s}, [x16]
        idct_8          v16, v17, v18, v19, v20, v21, v22, v23
        ret
endfunc

.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7
        movrel          x16, iadst8_coeffs
        ld1             {v0.4s, v1.4s}, [x16], #32

        mul_mla         v2,  v23, v16, v0.s[0], v0.s[1]
        mul_mls         v4,  v23, v16, v0.s[1], v0.s[0]
        mul_mla         v6,  v21, v18, v0.s[2], v0.s[3]
        srshr           v16.4s, v2.4s,  #12  // t0a
        srshr           v23.4s, v4.4s,  #12  // t1a
        mul_mls         v2,  v21, v18, v0.s[3], v0.s[2]
        mul_mla         v4,  v19, v20, v1.s[0], v1.s[1]
        srshr           v18.4s, v6.4s,  #12  // t2a
        srshr           v21.4s, v2.4s,  #12  // t3a
        mul_mls         v6,  v19, v20, v1.s[1], v1.s[0]
        mul_mla         v2,  v17, v22, v1.s[2], v1.s[3]
        srshr           v20.4s, v4.4s,  #12  // t4a
        srshr           v19.4s, v6.4s,  #12  // t5a
        mul_mls         v4,  v17, v22, v1.s[3], v1.s[2]
        srshr           v22.4s, v2.4s,  #12  // t6a
        srshr           v17.4s, v4.4s,  #12  // t7a

        ld1             {v0.4s}, [x16]

        sqadd           v2.4s,   v16.4s,  v20.4s // t0
        sqsub           v3.4s,   v16.4s,  v20.4s // t4
        sqadd           v4.4s,   v23.4s,  v19.4s // t1
        sqsub           v5.4s,   v23.4s,  v19.4s // t5
        sqadd           v6.4s,   v18.4s,  v22.4s // t2
        sqsub           v7.4s,   v18.4s,  v22.4s // t6
        sqadd           v18.4s,  v21.4s,  v17.4s // t3
        sqsub           v19.4s,  v21.4s,  v17.4s // t7

        mul_mla         v16, v3,  v5,  v0.s[3], v0.s[2]
        mul_mls         v20, v3,  v5,  v0.s[2], v0.s[3]
        mul_mls         v22, v19, v7,  v0.s[3], v0.s[2]

        srshr           v3.4s,  v16.4s, #12  // t4a
        srshr           v5.4s,  v20.4s, #12  // t5a

        mul_mla         v16, v19, v7,  v0.s[2], v0.s[3]

        srshr           v7.4s,  v22.4s, #12  // t6a
        srshr           v19.4s, v16.4s, #12  // t7a

        sqadd           \o0\().4s, v2.4s, v6.4s  // out0
        sqsub           v2.4s,     v2.4s, v6.4s  // t2
        sqadd           \o7\().4s, v4.4s, v18.4s // out7
        sqsub           v4.4s,     v4.4s, v18.4s // t3
        sqneg           \o7\().4s, \o7\().4s     // out7

        sqadd           \o1\().4s, v3.4s, v7.4s  // out1
        sqsub           v3.4s,     v3.4s, v7.4s  // t6
        sqadd           \o6\().4s, v5.4s, v19.4s // out6
        sqsub           v5.4s,     v5.4s, v19.4s // t7
        sqneg           \o1\().4s, \o1\().4s     // out1

        mul_mla         v18, v2,  v4,  v0.s[0], v0.s[0] // -> out3 (v19 or v20)
        mul_mls         v6,  v2,  v4,  v0.s[0], v0.s[0] // -> out4 (v20 or v19)
        mul_mls         v20, v3,  v5,  v0.s[0], v0.s[0] // -> out5 (v21 or v18)
        srshr           v2.4s,  v18.4s, #12 // out3
        mul_mla         v18, v3,  v5,  v0.s[0], v0.s[0] // -> out2 (v18 or v21)
        srshr           v3.4s,  v20.4s, #12 // out5
        srshr           \o2\().4s, v18.4s, #12 // out2 (v18 or v21)
        srshr           \o4\().4s, v6.4s,  #12 // out4 (v20 or v19)

        sqneg           \o3\().4s, v2.4s     // out3
        sqneg           \o5\().4s, v3.4s     // out5
.endm

function inv_adst_4s_x8_neon
        iadst_8         v16, v17, v18, v19, v20, v21, v22, v23
        ret
endfunc

function inv_flipadst_4s_x8_neon
        iadst_8         v23, v22, v21, v20, v19, v18, v17, v16
        ret
endfunc

function inv_identity_4s_x8_neon
        sqshl           v16.4s,  v16.4s,  #1
        sqshl           v17.4s,  v17.4s,  #1
        sqshl           v18.4s,  v18.4s,  #1
        sqshl           v19.4s,  v19.4s,  #1
        sqshl           v20.4s,  v20.4s,  #1
        sqshl           v21.4s,  v21.4s,  #1
        sqshl           v22.4s,  v22.4s,  #1
        sqshl           v23.4s,  v23.4s,  #1
        ret
endfunc

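// Two pass 8x8: the first transform (x4) runs on the input in two 4 lane
// halves, the results are narrowed to 16 bit and transposed, and the
// second transform (x5) plus load_add_store_8x8 finish the block. When the
// eob is below the threshold in w13, the half of the coefficients at
// offset 16 is known to be all zero, so that pass is replaced by zeroing
// the registers.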
function inv_txfm_add_8x8_neon
        movi            v31.4s,  #0

        cmp             w3,  w13
        mov             x11, #32
        b.lt            1f

        add             x6,  x2,  #16
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        ld1             {\i},     [x6]
        st1             {v31.4s}, [x6], x11
.endr

        blr             x4

        sqrshrn         v24.4h,  v16.4s,  #1
        sqrshrn         v25.4h,  v17.4s,  #1
        sqrshrn         v26.4h,  v18.4s,  #1
        sqrshrn         v27.4h,  v19.4s,  #1
        sqrshrn2        v24.8h,  v20.4s,  #1
        sqrshrn2        v25.8h,  v21.4s,  #1
        sqrshrn2        v26.8h,  v22.4s,  #1
        sqrshrn2        v27.8h,  v23.4s,  #1

        transpose_4x8h  v24, v25, v26, v27, v2, v3, v4, v5

        b               2f

1:
.irp i, v24.8h, v25.8h, v26.8h, v27.8h
        movi            \i,  #0
.endr

2:

.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        ld1             {\i},     [x2]
        st1             {v31.4s}, [x2], x11
.endr

        blr             x4

        sqrshrn         v16.4h,  v16.4s,  #1
        sqrshrn         v17.4h,  v17.4s,  #1
        sqrshrn         v18.4h,  v18.4s,  #1
        sqrshrn         v19.4h,  v19.4s,  #1
        sqrshrn2        v16.8h,  v20.4s,  #1
        sqrshrn2        v17.8h,  v21.4s,  #1
        sqrshrn2        v18.8h,  v22.4s,  #1
        sqrshrn2        v19.8h,  v23.4s,  #1

        transpose_4x8h  v16, v17, v18, v19, v20, v21, v22, v23

        mov             v20.16b, v24.16b
        mov             v21.16b, v25.16b
        mov             v22.16b, v26.16b
        mov             v23.16b, v27.16b

        blr             x5

        load_add_store_8x8 x0, x7
        br              x15
endfunc

.macro def_fn_8x8 txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_16bpc_neon, export=1
        mov             x15, x30

.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc         8,   8,   1
.endif
        movrel          x5,  X(inv_\txfm2\()_8h_x8_neon)
        mov             w13, #\eob_half
        adr             x4,  inv_\txfm1\()_4s_x8_neon
        b               inv_txfm_add_8x8_neon
endfunc
.endm

def_fn_8x8 dct, dct, 10
def_fn_8x8 identity, identity, 10
def_fn_8x8 dct, adst, 10
def_fn_8x8 dct, flipadst, 10
def_fn_8x8 dct, identity, 4
def_fn_8x8 adst, dct, 10
def_fn_8x8 adst, adst, 10
def_fn_8x8 adst, flipadst, 10
def_fn_8x8 flipadst, dct, 10
def_fn_8x8 flipadst, adst, 10
def_fn_8x8 flipadst, flipadst, 10
def_fn_8x8 identity, dct, 4
def_fn_8x8 adst, identity, 4
def_fn_8x8 flipadst, identity, 4
def_fn_8x8 identity, adst, 4
def_fn_8x8 identity, flipadst, 4

function inv_txfm_add_8x4_neon
        movi            v28.4s,  #0
        movi            v29.4s,  #0
        movi            v30.4s,  #0
        movi            v31.4s,  #0
        ld1             {v16.4s,v17.4s,v18.4s,v19.4s}, [x2]
        st1             {v28.4s,v29.4s,v30.4s,v31.4s}, [x2], #64
        movz            w16, #2896*8, lsl #16
        dup             v0.2s,   w16
        ld1             {v20.4s,v21.4s,v22.4s,v23.4s}, [x2]
        st1             {v28.4s,v29.4s,v30.4s,v31.4s}, [x2]

        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23

        blr             x4

        sqxtn           v16.4h,  v16.4s
        sqxtn           v17.4h,  v17.4s
        sqxtn           v18.4h,  v18.4s
        sqxtn           v19.4h,  v19.4s
        sqxtn           v20.4h,  v20.4s
        sqxtn           v21.4h,  v21.4s
        sqxtn           v22.4h,  v22.4s
        sqxtn           v23.4h,  v23.4s

        transpose_4x4h  v16, v17, v18, v19, v4,  v5,  v6,  v7
        transpose_4x4h  v20, v21, v22, v23, v4,  v5,  v6,  v7
        ins             v16.d[1], v20.d[0]
        ins             v17.d[1], v21.d[0]
        ins             v18.d[1], v22.d[0]
        ins             v19.d[1], v23.d[0]

        blr             x5

        load_add_store_8x4 x0, x7
        br              x15
endfunc

function inv_txfm_add_4x8_neon
        movz            w16, #2896*8, lsl #16
        movi            v31.4s,  #0
        dup             v30.2s,  w16

        cmp             w3,  w13
        mov             x11, #32
        b.lt            1f

        add             x6,  x2,  #16
.irp i, v16.4s, v17.4s, v18.4s, v19.4s
        ld1             {\i},     [x6]
        st1             {v31.4s}, [x6], x11
.endr
        scale_input     .4s, v30.s[0], v16, v17, v18, v19
        blr             x4
        sqxtn           v20.4h,  v16.4s
        sqxtn           v21.4h,  v17.4s
        sqxtn           v22.4h,  v18.4s
        sqxtn           v23.4h,  v19.4s
        transpose_4x4h  v20, v21, v22, v23, v4,  v5,  v6,  v7

        b               2f

1:
.irp i, v20, v21, v22, v23
        movi            \i\().4h, #0
.endr

2:

.irp i, v16.4s, v17.4s, v18.4s, v19.4s
        ld1             {\i},     [x2]
        st1             {v31.4s}, [x2], x11
.endr
        scale_input     .4s, v30.s[0], v16, v17, v18, v19
        blr             x4
        sqxtn           v16.4h,  v16.4s
        sqxtn           v17.4h,  v17.4s
        sqxtn           v18.4h,  v18.4s
        sqxtn           v19.4h,  v19.4s
        transpose_4x4h  v16, v17, v18, v19, v4,  v5,  v6,  v7

        blr             x5

        load_add_store_4x8 x0, x7
        br              x15
endfunc

.macro def_fn_48 w, h, txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
        mov             x15, x30

.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc         \w,  \h,  0
.endif
        adr             x4,  inv_\txfm1\()_4s_x\w\()_neon
.if \w == 4
        mov             w13, #\eob_half
.endif
        movrel          x5,  X(inv_\txfm2\()_\w\()h_x\h\()_neon)
        b               inv_txfm_add_\w\()x\h\()_neon
endfunc
.endm

.macro def_fns_48 w, h
def_fn_48 \w, \h, dct, dct, 13
def_fn_48 \w, \h, identity, identity, 13
def_fn_48 \w, \h, dct, adst, 13
def_fn_48 \w, \h, dct, flipadst, 13
def_fn_48 \w, \h, dct, identity, 4
def_fn_48 \w, \h, adst, dct, 13
def_fn_48 \w, \h, adst, adst, 13
def_fn_48 \w, \h, adst, flipadst, 13
def_fn_48 \w, \h, flipadst, dct, 13
def_fn_48 \w, \h, flipadst, adst, 13
def_fn_48 \w, \h, flipadst, flipadst, 13
def_fn_48 \w, \h, identity, dct, 16
def_fn_48 \w, \h, adst, identity, 4
def_fn_48 \w, \h, flipadst, identity, 4
def_fn_48 \w, \h, identity, adst, 16
def_fn_48 \w, \h, identity, flipadst, 16
.endm

def_fns_48 4, 8
def_fns_48 8, 4


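// inv_dct_4s_x16 uses the same even/odd recursion as idct_8: the even
// inputs go through idct_8, while the odd inputs (v17, v19, ..., v31) pass
// through two rotation layers and a final cos(pi/4) stage before being
// folded into the outputs.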
992function inv_dct_4s_x16_neon
993        movrel          x16, idct_coeffs
994        ld1             {v0.4s, v1.4s}, [x16], #32
995
996        idct_8          v16, v18, v20, v22, v24, v26, v28, v30
997
998        ld1             {v0.4s, v1.4s}, [x16]
999        sub             x16, x16, #32
1000
1001        mul_mls         v2,  v17, v31, v0.s[0], v0.s[1] // -> t8a
1002        mul_mla         v4,  v17, v31, v0.s[1], v0.s[0] // -> t15a
1003        mul_mls         v6,  v25, v23, v0.s[2], v0.s[3] // -> t9a
1004        srshr           v17.4s, v2.4s,  #12             // t8a
1005        srshr           v31.4s, v4.4s,  #12             // t15a
1006        mul_mla         v2,  v25, v23, v0.s[3], v0.s[2] // -> t14a
1007        mul_mls         v4,  v21, v27, v1.s[0], v1.s[1] // -> t10a
1008        srshr           v23.4s, v6.4s,  #12             // t9a
1009        srshr           v25.4s, v2.4s,  #12             // t14a
1010        mul_mla         v6,  v21, v27, v1.s[1], v1.s[0] // -> t13a
1011        mul_mls         v2,  v29, v19, v1.s[2], v1.s[3] // -> t11a
1012        srshr           v21.4s, v4.4s,  #12             // t10a
1013        srshr           v27.4s, v6.4s,  #12             // t13a
1014        mul_mla         v4,  v29, v19, v1.s[3], v1.s[2] // -> t12a
1015        srshr           v19.4s, v2.4s,  #12             // t11a
1016        srshr           v29.4s, v4.4s,  #12             // t12a
1017
1018        ld1             {v0.4s}, [x16]
1019
1020        sqsub           v2.4s,   v17.4s,  v23.4s  // t9
1021        sqadd           v17.4s,  v17.4s,  v23.4s  // t8
1022        sqsub           v3.4s,   v31.4s,  v25.4s  // t14
1023        sqadd           v31.4s,  v31.4s,  v25.4s  // t15
1024        sqsub           v23.4s,  v19.4s,  v21.4s  // t10
1025        sqadd           v19.4s,  v19.4s,  v21.4s  // t11
1026        sqadd           v25.4s,  v29.4s,  v27.4s  // t12
1027        sqsub           v29.4s,  v29.4s,  v27.4s  // t13
1028
1029        mul_mls         v4,  v3,  v2,  v0.s[2], v0.s[3] // -> t9a
1030        mul_mla         v6,  v3,  v2,  v0.s[3], v0.s[2] // -> t14a
1031        srshr           v21.4s, v4.4s,  #12             // t9a
1032        srshr           v27.4s, v6.4s,  #12             // t14a
1033
1034        mul_mls         v4,  v29, v23, v0.s[2], v0.s[3] // -> t13a
1035        mul_mla         v6,  v29, v23, v0.s[3], v0.s[2] // -> t10a
1036        srshr           v29.4s, v4.4s,  #12             // t13a
1037        neg             v6.4s,   v6.4s
1038        srshr           v23.4s, v6.4s,  #12             // t10a
1039
1040        sqsub           v2.4s,   v17.4s,  v19.4s  // t11a
1041        sqadd           v17.4s,  v17.4s,  v19.4s  // t8a
1042        sqsub           v3.4s,   v31.4s,  v25.4s  // t12a
1043        sqadd           v31.4s,  v31.4s,  v25.4s  // t15a
1044        sqadd           v19.4s,  v21.4s,  v23.4s  // t9
1045        sqsub           v21.4s,  v21.4s,  v23.4s  // t10
1046        sqsub           v25.4s,  v27.4s,  v29.4s  // t13
1047        sqadd           v27.4s,  v27.4s,  v29.4s  // t14
1048
1049        mul_mls         v4,  v3,  v2,  v0.s[0], v0.s[0] // -> t11
1050        mul_mla         v6,  v3,  v2,  v0.s[0], v0.s[0] // -> t12
1051        mul_mls         v2,  v25, v21, v0.s[0], v0.s[0] // -> t10a
1052
1053        srshr           v4.4s,  v4.4s,  #12   // t11
1054        srshr           v5.4s,  v6.4s,  #12   // t12
1055        mul_mla         v6,  v25, v21, v0.s[0], v0.s[0] // -> t10a
1056        srshr           v2.4s,  v2.4s,  #12   // t10a
1057        srshr           v3.4s,  v6.4s,  #12   // t13a
1058
1059        sqadd           v6.4s,   v16.4s,  v31.4s  // out0
1060        sqsub           v31.4s,  v16.4s,  v31.4s  // out15
1061        mov             v16.16b, v6.16b
1062        sqadd           v23.4s,  v30.4s,  v17.4s  // out7
1063        sqsub           v7.4s,   v30.4s,  v17.4s  // out8
1064        sqadd           v17.4s,  v18.4s,  v27.4s  // out1
1065        sqsub           v30.4s,  v18.4s,  v27.4s  // out14
1066        sqadd           v18.4s,  v20.4s,  v3.4s   // out2
1067        sqsub           v29.4s,  v20.4s,  v3.4s   // out13
1068        sqadd           v3.4s,   v28.4s,  v19.4s  // out6
1069        sqsub           v25.4s,  v28.4s,  v19.4s  // out9
1070        sqadd           v19.4s,  v22.4s,  v5.4s   // out3
1071        sqsub           v28.4s,  v22.4s,  v5.4s   // out12
1072        sqadd           v20.4s,  v24.4s,  v4.4s   // out4
1073        sqsub           v27.4s,  v24.4s,  v4.4s   // out11
1074        sqadd           v21.4s,  v26.4s,  v2.4s   // out5
1075        sqsub           v26.4s,  v26.4s,  v2.4s   // out10
1076        mov             v24.16b, v7.16b
1077        mov             v22.16b, v3.16b
1078
1079        ret
1080endfunc
1081
1082.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15
1083        movrel          x16, iadst16_coeffs
1084        ld1             {v0.4s, v1.4s}, [x16], #32
1085
1086        mul_mla         v2,  v31, v16, v0.s[0], v0.s[1] // -> t0
1087        mul_mls         v4,  v31, v16, v0.s[1], v0.s[0] // -> t1
1088        mul_mla         v6,  v29, v18, v0.s[2], v0.s[3] // -> t2
1089        srshr           v16.4s, v2.4s,  #12             // t0
1090        srshr           v31.4s, v4.4s,  #12             // t1
1091        mul_mls         v2,  v29, v18, v0.s[3], v0.s[2] // -> t3
1092        mul_mla         v4,  v27, v20, v1.s[0], v1.s[1] // -> t4
1093        srshr           v18.4s, v6.4s,  #12             // t2
1094        srshr           v29.4s, v2.4s,  #12             // t3
1095        mul_mls         v6,  v27, v20, v1.s[1], v1.s[0] // -> t5
1096        mul_mla         v2,  v25, v22, v1.s[2], v1.s[3] // -> t6
1097        srshr           v20.4s, v4.4s,  #12             // t4
1098        srshr           v27.4s, v6.4s,  #12             // t5
1099        mul_mls         v4,  v25, v22, v1.s[3], v1.s[2] // -> t7
1100        ld1             {v0.4s, v1.4s}, [x16]
1101        movrel          x16, idct_coeffs
1102        mul_mla         v6,  v23, v24, v0.s[0], v0.s[1] // -> t8
1103        srshr           v22.4s, v2.4s,  #12             // t6
1104        srshr           v25.4s, v4.4s,  #12             // t7
1105        mul_mls         v2,  v23, v24, v0.s[1], v0.s[0] // -> t9
1106        mul_mla         v4,  v21, v26, v0.s[2], v0.s[3] // -> t10
1107        srshr           v23.4s, v6.4s,  #12             // t8
1108        srshr           v24.4s, v2.4s,  #12             // t9
1109        mul_mls         v6,  v21, v26, v0.s[3], v0.s[2] // -> t11
1110        mul_mla         v2,  v19, v28, v1.s[0], v1.s[1] // -> t12
1111        srshr           v21.4s, v4.4s,  #12             // t10
1112        srshr           v26.4s, v6.4s,  #12             // t11
1113        mul_mls         v4,  v19, v28, v1.s[1], v1.s[0] // -> t13
1114        mul_mla         v6,  v17, v30, v1.s[2], v1.s[3] // -> t14
1115        srshr           v19.4s, v2.4s,  #12             // t12
1116        srshr           v28.4s, v4.4s,  #12             // t13
1117        mul_mls         v2,  v17, v30, v1.s[3], v1.s[2] // -> t15
1118        srshr           v17.4s, v6.4s,  #12             // t14
1119        srshr           v30.4s, v2.4s,  #12             // t15
1120
1121        ld1             {v0.4s, v1.4s}, [x16]
1122
1123        sqsub           v2.4s,   v16.4s,  v23.4s // t8a
1124        sqadd           v16.4s,  v16.4s,  v23.4s // t0a
1125        sqsub           v3.4s,   v31.4s,  v24.4s // t9a
1126        sqadd           v31.4s,  v31.4s,  v24.4s // t1a
1127        sqadd           v23.4s,  v18.4s,  v21.4s // t2a
1128        sqsub           v18.4s,  v18.4s,  v21.4s // t10a
1129        sqadd           v24.4s,  v29.4s,  v26.4s // t3a
1130        sqsub           v29.4s,  v29.4s,  v26.4s // t11a
1131        sqadd           v21.4s,  v20.4s,  v19.4s // t4a
1132        sqsub           v20.4s,  v20.4s,  v19.4s // t12a
1133        sqadd           v26.4s,  v27.4s,  v28.4s // t5a
1134        sqsub           v27.4s,  v27.4s,  v28.4s // t13a
1135        sqadd           v19.4s,  v22.4s,  v17.4s // t6a
1136        sqsub           v22.4s,  v22.4s,  v17.4s // t14a
1137        sqadd           v28.4s,  v25.4s,  v30.4s // t7a
1138        sqsub           v25.4s,  v25.4s,  v30.4s // t15a
1139
1140        mul_mla         v4,  v2,  v3,  v1.s[1], v1.s[0] // -> t8
1141        mul_mls         v6,  v2,  v3,  v1.s[0], v1.s[1] // -> t9
1142        mul_mla         v2,  v18, v29, v1.s[3], v1.s[2] // -> t10
1143        srshr           v17.4s, v4.4s,  #12             // t8
1144        srshr           v30.4s, v6.4s,  #12             // t9
1145        mul_mls         v4,  v18, v29, v1.s[2], v1.s[3] // -> t11
1146        mul_mls         v6,  v27, v20, v1.s[1], v1.s[0] // -> t12
1147        srshr           v18.4s, v2.4s,  #12             // t10
1148        srshr           v29.4s, v4.4s,  #12             // t11
1149        mul_mla         v2,  v27, v20, v1.s[0], v1.s[1] // -> t13
1150        mul_mls         v4,  v25, v22, v1.s[3], v1.s[2] // -> t14
1151        srshr           v27.4s, v6.4s,  #12             // t12
1152        srshr           v20.4s, v2.4s,  #12             // t13
1153        mul_mla         v6,  v25, v22, v1.s[2], v1.s[3] // -> t15
1154        srshr           v25.4s, v4.4s,  #12             // t14
1155        srshr           v22.4s, v6.4s,  #12             // t15
1156
1157        sqsub           v2.4s,   v16.4s,  v21.4s // t4
1158        sqadd           v16.4s,  v16.4s,  v21.4s // t0
1159        sqsub           v3.4s,   v31.4s,  v26.4s // t5
1160        sqadd           v31.4s,  v31.4s,  v26.4s // t1
1161        sqadd           v21.4s,  v23.4s,  v19.4s // t2
1162        sqsub           v23.4s,  v23.4s,  v19.4s // t6
1163        sqadd           v26.4s,  v24.4s,  v28.4s // t3
1164        sqsub           v24.4s,  v24.4s,  v28.4s // t7
1165        sqadd           v19.4s,  v17.4s,  v27.4s // t8a
1166        sqsub           v17.4s,  v17.4s,  v27.4s // t12a
1167        sqadd           v28.4s,  v30.4s,  v20.4s // t9a
1168        sqsub           v30.4s,  v30.4s,  v20.4s // t13a
1169        sqadd           v27.4s,  v18.4s,  v25.4s // t10a
1170        sqsub           v18.4s,  v18.4s,  v25.4s // t14a
1171        sqadd           v20.4s,  v29.4s,  v22.4s // t11a
1172        sqsub           v29.4s,  v29.4s,  v22.4s // t15a
1173
1174        mul_mla         v4,  v2,  v3,  v0.s[3], v0.s[2] // -> t4a
1175        mul_mls         v6,  v2,  v3,  v0.s[2], v0.s[3] // -> t5a
1176        mul_mls         v2,  v24, v23, v0.s[3], v0.s[2] // -> t6a
1177        srshr           v22.4s, v4.4s,  #12             // t4a
1178        srshr           v25.4s, v6.4s,  #12             // t5a
1179        mul_mla         v4,  v24, v23, v0.s[2], v0.s[3] // -> t7a
1180        mul_mla         v6,  v17, v30, v0.s[3], v0.s[2] // -> t12
1181        srshr           v24.4s, v2.4s,  #12             // t6a
1182        srshr           v23.4s, v4.4s,  #12             // t7a
1183        mul_mls         v2,  v17, v30, v0.s[2], v0.s[3] // -> t13
1184        mul_mls         v4,  v29, v18, v0.s[3], v0.s[2] // -> t14
1185        srshr           v17.4s, v6.4s,  #12             // t12
1186        mul_mla         v6,  v29, v18, v0.s[2], v0.s[3] // -> t15
1187        srshr           v29.4s, v2.4s,  #12             // t13
1188        srshr           v30.4s, v4.4s,  #12             // t14
1189        srshr           v18.4s, v6.4s,  #12             // t15
1190
1191        sqsub           v2.4s,   v16.4s,  v21.4s // t2a
1192.ifc \o0, v16
1193        sqadd           \o0\().4s,  v16.4s,  v21.4s // out0
1194        sqsub           v21.4s,     v31.4s,  v26.4s // t3a
1195        sqadd           \o15\().4s, v31.4s,  v26.4s // out15
1196.else
1197        sqadd           v4.4s,      v16.4s,  v21.4s // out0
1198        sqsub           v21.4s,     v31.4s,  v26.4s // t3a
1199        sqadd           \o15\().4s, v31.4s,  v26.4s // out15
1200        mov             \o0\().16b, v4.16b
1201.endif
1202        sqneg           \o15\().4s, \o15\().4s      // out15
1203
1204        sqsub           v3.4s,      v29.4s,  v18.4s // t15a
1205        sqadd           \o13\().4s, v29.4s,  v18.4s // out13
1206        sqadd           \o2\().4s,  v17.4s,  v30.4s // out2
1207        sqsub           v26.4s,     v17.4s,  v30.4s // t14a
1208        sqneg           \o13\().4s, \o13\().4s      // out13
1209
1210        sqadd           \o1\().4s,  v19.4s,  v27.4s // out1
1211        sqsub           v27.4s,     v19.4s,  v27.4s // t10
1212        sqadd           \o14\().4s, v28.4s,  v20.4s // out14
1213        sqsub           v20.4s,     v28.4s,  v20.4s // t11
1214        sqneg           \o1\().4s,  \o1\().4s       // out1
1215
1216        sqadd           \o3\().4s,  v22.4s,  v24.4s // out3
1217        sqsub           v22.4s,     v22.4s,  v24.4s // t6
1218        sqadd           \o12\().4s, v25.4s,  v23.4s // out12
1219        sqsub           v23.4s,     v25.4s,  v23.4s // t7
1220        sqneg           \o3\().4s,  \o3\().4s       // out3
1221
1222        mul_mls         v24, v2,  v21, v0.s[0], v0.s[0] // -> out8 (v24 or v23)
1223        mul_mla         v4,  v2,  v21, v0.s[0], v0.s[0] // -> out7 (v23 or v24)
1224        mul_mla         v6,  v26, v3,  v0.s[0], v0.s[0] // -> out5 (v21 or v26)
1225
1226        srshr           v24.4s, v24.4s, #12             // out8
1227        srshr           v4.4s,  v4.4s,  #12             // out7
1228        srshr           v5.4s,  v6.4s,  #12             // out5
1229        mul_mls         v6,  v26, v3,  v0.s[0], v0.s[0] // -> out10 (v26 or v21)
1230        mul_mla         v2,  v22, v23, v0.s[0], v0.s[0] // -> out4 (v20 or v27)
1231        srshr           v26.4s, v6.4s,  #12             // out10
1232
1233        mul_mls         v6,  v22, v23, v0.s[0], v0.s[0] // -> out11 (v27 or v20)
1234        mul_mla         v22, v27, v20, v0.s[0], v0.s[0] // -> out6 (v22 or v25)
1235        mul_mls         v21, v27, v20, v0.s[0], v0.s[0] // -> out9 (v25 or v22)
1236
1237        srshr           \o4\().4s,   v2.4s,  #12        // out4
1238        srshr           v6.4s,       v6.4s,  #12        // out11
1239        srshr           v7.4s,       v21.4s, #12        // out9
1240        srshr           \o6\().4s,   v22.4s, #12        // out6
1241
1242.ifc \o8, v23
1243        mov             \o8\().16b,  v24.16b
1244        mov             \o10\().16b, v26.16b
1245.endif
1246
1247        sqneg           \o7\().4s,   v4.4s // out7
1248        sqneg           \o5\().4s,   v5.4s // out5
1249        sqneg           \o11\().4s,  v6.4s // out11
1250        sqneg           \o9\().4s,   v7.4s // out9
1251.endm
1252
function inv_adst_4s_x16_neon
        iadst_16        v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
        ret
endfunc

function inv_flipadst_4s_x16_neon
        iadst_16        v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16
        ret
endfunc

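// 16-point identity transform; scales by 2*5793/4096 (~2*sqrt(2)),
// computed as x*2 + sqrdmulh(x, 2*(5793-4096)*8 << 16), i.e.
// x*2 + x*(5793-4096)/2048.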
function inv_identity_4s_x16_neon
        movz            w16, #2*(5793-4096)*8, lsl #16
        dup             v0.2s,   w16
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        sqrdmulh        v2.4s,      v\i\().4s,  v0.s[0]
        sqadd           v\i\().4s,  v\i\().4s,  v\i\().4s
        sqadd           v\i\().4s,  v\i\().4s,  v2.4s
.endr
        ret
endfunc

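// identity_4x16 applies the same x*2 + sqrdmulh(x, c) identity scaling as
// above, with the coefficient taken from a register; identity_4x16_shift1
// folds a rounding right shift by 1 into the scaling, giving
// x + (sqrdmulh(x, c) >> 1).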
.macro identity_4x16_shift1 c
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        sqrdmulh        v3.4s,   \i,      \c
        srshr           v3.4s,   v3.4s,   #1
        sqadd           \i,      \i,      v3.4s
.endr
.endm

.macro identity_4x16 c
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        sqrdmulh        v3.4s,   \i,      \c
        sqadd           \i,      \i,      \i
        sqadd           \i,      \i,      v3.4s
.endr
.endm

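// Horizontal pass helper for 16x4 slices: loads 32 bit coefficients from
// x7 (stride x8) while clearing them, runs the first transform (x4),
// narrows to 16 bit with a rounding right shift and stores the transposed
// result at x6. The scale=1/shift=1 variant also premultiplies the input
// by 2896/4096 (~1/sqrt(2)), for the rectangular transform sizes.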
.macro def_horz_16 scale=0, shift=2, suffix
function inv_txfm_horz\suffix\()_16x4_neon
        mov             x14, x30
        movi            v7.4s,  #0
.if \scale
        movz            w16, #2896*8, lsl #16
        dup             v0.2s,   w16
.endif
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        ld1             {\i}, [x7]
        st1             {v7.4s}, [x7], x8
.endr
.if \scale
        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
        scale_input     .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
.endif
        blr             x4
        sqrshrn         v16.4h,  v16.4s,  #\shift
        sqrshrn         v17.4h,  v17.4s,  #\shift
        sqrshrn         v18.4h,  v18.4s,  #\shift
        sqrshrn         v19.4h,  v19.4s,  #\shift
        sqrshrn2        v16.8h,  v20.4s,  #\shift
        sqrshrn2        v17.8h,  v21.4s,  #\shift
        sqrshrn2        v18.8h,  v22.4s,  #\shift
        sqrshrn2        v19.8h,  v23.4s,  #\shift
        sqrshrn         v20.4h,  v24.4s,  #\shift
        sqrshrn         v21.4h,  v25.4s,  #\shift
        sqrshrn         v22.4h,  v26.4s,  #\shift
        sqrshrn         v23.4h,  v27.4s,  #\shift
        sqrshrn2        v20.8h,  v28.4s,  #\shift
        sqrshrn2        v21.8h,  v29.4s,  #\shift
        sqrshrn2        v22.8h,  v30.4s,  #\shift
        sqrshrn2        v23.8h,  v31.4s,  #\shift
        transpose_4x8h  v16, v17, v18, v19, v4,  v5,  v6,  v7
        transpose_4x8h  v20, v21, v22, v23, v4,  v5,  v6,  v7

.irp i, v16.8h, v20.8h, v17.8h, v21.8h, v18.8h, v22.8h, v19.8h, v23.8h
        st1             {\i}, [x6], #16
.endr

        br              x14
endfunc
.endm

def_horz_16 scale=0, shift=2
def_horz_16 scale=1, shift=1, suffix=_scale

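// Vertical pass helper: loads 16 rows of 8 16 bit coefficients from x7
// (stride x8), runs the second transform (x5) and adds the result to the
// destination at x6.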
function inv_txfm_add_vert_8x16_neon
        mov             x14, x30
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        ld1             {v\i\().8h}, [x7], x8
.endr
        blr             x5
        load_add_store_8x16 x6, x7
        br              x14
endfunc

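// The 16x16 transform is done as four 16x4 horizontal slices into a
// 512 byte buffer on the stack, followed by two 8x16 vertical slices.
// The eob thresholds from x13 indicate how many horizontal slices carry
// nonzero coefficients; the remainder of the buffer is only zero filled.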
function inv_txfm_add_16x16_neon
        mov             x15, x30
        sub             sp,  sp,  #512
        ldrh            w12, [x13], #2
.irp i, 0, 4, 8, 12
        add             x6,  sp,  #(\i*16*2)
.if \i > 0
        mov             w8,  #(16 - \i)
        cmp             w3,  w12
        b.lt            1f
.if \i < 12
        ldrh            w12, [x13], #2
.endif
.endif
        add             x7,  x2,  #(\i*4)
        mov             x8,  #16*4
        bl              inv_txfm_horz_16x4_neon
.endr
        b               3f
1:
        movi            v4.8h,  #0
        movi            v5.8h,  #0
        movi            v6.8h,  #0
        movi            v7.8h,  #0
2:
        subs            w8,  w8,  #4
.rept 2
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b
3:
.irp i, 0, 8
        add             x6,  x0,  #(\i*2)
        add             x7,  sp,  #(\i*2)
        mov             x8,  #32
        bl              inv_txfm_add_vert_8x16_neon
.endr

        add             sp,  sp,  #512
        br              x15
endfunc

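// eob thresholds: if eob is below entry i, the horizontal slices from
// row 4*(i+1) onwards are known to be all zero and are skipped.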
const eob_16x16
        .short 10, 36, 78, 256
endconst

const eob_16x16_identity
        .short 4, 8, 12, 256
endconst

.macro def_fn_16x16 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_16bpc_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc         16,  16,  2
.endif
        adr             x4,  inv_\txfm1\()_4s_x16_neon
        movrel          x5,  X(inv_\txfm2\()_8h_x16_neon)
.ifc \txfm1, identity
.ifc \txfm2, identity
        movrel          x13, eob_16x16
.else
        movrel          x13, eob_16x16_identity
.endif
.else
.ifc \txfm2, identity
        movrel          x13, eob_16x16_identity
.else
        movrel          x13, eob_16x16
.endif
.endif
        b               inv_txfm_add_16x16_neon
endfunc
.endm

def_fn_16x16 dct, dct
def_fn_16x16 identity, identity
def_fn_16x16 dct, adst
def_fn_16x16 dct, flipadst
def_fn_16x16 dct, identity
def_fn_16x16 adst, dct
def_fn_16x16 adst, adst
def_fn_16x16 adst, flipadst
def_fn_16x16 flipadst, dct
def_fn_16x16 flipadst, adst
def_fn_16x16 flipadst, flipadst
def_fn_16x16 identity, dct

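// The 16x4 transform fits entirely in registers: one 16-point first pass
// over all sixteen 4s registers, then the second pass is run twice, once
// per 8-column half.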
function inv_txfm_add_16x4_neon
        mov             x15, x30
        movi            v4.4s,  #0

.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        ld1             {\i},    [x2]
        st1             {v4.4s}, [x2], #16
.endr

        blr             x4

        sqrshrn         v16.4h,  v16.4s,  #1
        sqrshrn         v17.4h,  v17.4s,  #1
        sqrshrn         v18.4h,  v18.4s,  #1
        sqrshrn         v19.4h,  v19.4s,  #1
        sqrshrn2        v16.8h,  v20.4s,  #1
        sqrshrn2        v17.8h,  v21.4s,  #1
        sqrshrn2        v18.8h,  v22.4s,  #1
        sqrshrn2        v19.8h,  v23.4s,  #1
        transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5
        blr             x5
        mov             x6,  x0
        load_add_store_8x4 x6, x7

        sqrshrn         v16.4h,  v24.4s,  #1
        sqrshrn         v17.4h,  v25.4s,  #1
        sqrshrn         v18.4h,  v26.4s,  #1
        sqrshrn         v19.4h,  v27.4s,  #1
        sqrshrn2        v16.8h,  v28.4s,  #1
        sqrshrn2        v17.8h,  v29.4s,  #1
        sqrshrn2        v18.8h,  v30.4s,  #1
        sqrshrn2        v19.8h,  v31.4s,  #1
        transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5
        blr             x5
        add             x6,  x0,  #16
        load_add_store_8x4 x6, x7

        br              x15
endfunc

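// The first pass is run on four groups of four 4s registers, starting at
// x2 + 48 and working back towards x2; the eob thresholds from x13 allow
// zeroing groups that are known to contain no coefficients.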
function inv_txfm_add_4x16_neon
        ldrh            w12, [x13, #4]
        mov             x15, x30

        mov             x11, #64

        cmp             w3,  w12
        ldrh            w12, [x13, #2]
        b.lt            1f

        add             x6,  x2,  #48
        movi            v2.4s,   #0
.irp i, v16.4s, v17.4s, v18.4s, v19.4s
        ld1             {\i},    [x6]
        st1             {v2.4s}, [x6], x11
.endr
        blr             x4
        rshrn           v28.4h,  v16.4s,  #1
        rshrn           v29.4h,  v17.4s,  #1
        rshrn           v30.4h,  v18.4s,  #1
        rshrn           v31.4h,  v19.4s,  #1
        transpose_4x4h  v28, v29, v30, v31, v4,  v5,  v6,  v7

        b               2f
1:
.irp i, v28.4h, v29.4h, v30.4h, v31.4h
        movi            \i,  #0
.endr
2:
        cmp             w3,  w12
        ldrh            w12, [x13, #0]
        b.lt            1f

        add             x6,  x2,  #32
        movi            v2.4s,   #0
.irp i, v16.4s, v17.4s, v18.4s, v19.4s
        ld1             {\i},    [x6]
        st1             {v2.4s}, [x6], x11
.endr
        blr             x4
        rshrn           v24.4h,  v16.4s,  #1
        rshrn           v25.4h,  v17.4s,  #1
        rshrn           v26.4h,  v18.4s,  #1
        rshrn           v27.4h,  v19.4s,  #1
        transpose_4x4h  v24, v25, v26, v27, v4,  v5,  v6,  v7

        b               2f
1:
.irp i, v24.4h, v25.4h, v26.4h, v27.4h
        movi            \i,  #0
.endr
2:
        cmp             w3,  w12
        b.lt            1f

        add             x6,  x2,  #16
        movi            v2.4s,   #0
.irp i, v16.4s, v17.4s, v18.4s, v19.4s
        ld1             {\i},    [x6]
        st1             {v2.4s}, [x6], x11
.endr
        blr             x4
        rshrn           v20.4h,  v16.4s,  #1
        rshrn           v21.4h,  v17.4s,  #1
        rshrn           v22.4h,  v18.4s,  #1
        rshrn           v23.4h,  v19.4s,  #1
        transpose_4x4h  v20, v21, v22, v23, v4,  v5,  v6,  v7

        b               2f
1:
.irp i, v20.4h, v21.4h, v22.4h, v23.4h
        movi            \i,  #0
.endr
2:

        movi            v2.4s,   #0
.irp i, v16.4s, v17.4s, v18.4s, v19.4s
        ld1             {\i},    [x2]
        st1             {v2.4s}, [x2], x11
.endr
        blr             x4
        rshrn           v16.4h,  v16.4s,  #1
        rshrn           v17.4h,  v17.4s,  #1
        rshrn           v18.4h,  v18.4s,  #1
        rshrn           v19.4h,  v19.4s,  #1
        transpose_4x8h  v16, v17, v18, v19, v4,  v5,  v6,  v7

        blr             x5

        load_add_store_4x16 x0, x6

        br              x15
endfunc

const eob_4x16
        .short 13, 29, 45, 64
endconst

const eob_4x16_identity1
        .short 16, 32, 48, 64
endconst

const eob_4x16_identity2
        .short 4, 8, 12, 64
endconst

.macro def_fn_416 w, h, txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc         \w,  \h,  1
.endif
.if \w == 4
        adr             x4,  inv_\txfm1\()_4s_x\w\()_neon
        movrel          x5,  X(inv_\txfm2\()_4h_x\h\()_neon)
.ifc \txfm1, identity
.ifc \txfm2, identity
        movrel          x13, eob_4x16
.else
        movrel          x13, eob_4x16_identity1
.endif
.else
.ifc \txfm2, identity
        movrel          x13, eob_4x16_identity2
.else
        movrel          x13, eob_4x16
.endif
.endif
.else
        adr             x4,  inv_\txfm1\()_4s_x\w\()_neon
        movrel          x5,  X(inv_\txfm2\()_8h_x\h\()_neon)
.endif
        b               inv_txfm_add_\w\()x\h\()_neon
endfunc
.endm

.macro def_fns_416 w, h
def_fn_416 \w, \h, dct, dct
def_fn_416 \w, \h, identity, identity
def_fn_416 \w, \h, dct, adst
def_fn_416 \w, \h, dct, flipadst
def_fn_416 \w, \h, dct, identity
def_fn_416 \w, \h, adst, dct
def_fn_416 \w, \h, adst, adst
def_fn_416 \w, \h, adst, flipadst
def_fn_416 \w, \h, flipadst, dct
def_fn_416 \w, \h, flipadst, adst
def_fn_416 \w, \h, flipadst, flipadst
def_fn_416 \w, \h, identity, dct
def_fn_416 \w, \h, adst, identity
def_fn_416 \w, \h, flipadst, identity
def_fn_416 \w, \h, identity, adst
def_fn_416 \w, \h, identity, flipadst
.endm

def_fns_416 4, 16
def_fns_416 16, 4


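// The 16x8 transform is done fully in registers; the callee-saved
// d8-d15 are spilled so that both eight-column halves of the scaled
// (2896/4096, rectangular size) first pass can stay live at once.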
function inv_txfm_add_16x8_neon
        mov             x15, x30
        stp             d8,  d9,  [sp, #-0x40]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
        stp             d14, d15, [sp, #0x30]

        cmp             w3,  w13
        mov             x11, #32
        b.lt            1f

        movi            v4.4s,  #0
        movz            w16, #2896*8, lsl #16
        dup             v0.2s,   w16

        add             x6,  x2,  #16
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        ld1             {\i},    [x6]
        st1             {v4.4s}, [x6], x11
.endr

        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
        scale_input     .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
        blr             x4

        sqrshrn         v8.4h,   v16.4s,  #1
        sqrshrn         v9.4h,   v17.4s,  #1
        sqrshrn         v10.4h,  v18.4s,  #1
        sqrshrn         v11.4h,  v19.4s,  #1
        sqrshrn2        v8.8h,   v20.4s,  #1
        sqrshrn2        v9.8h,   v21.4s,  #1
        sqrshrn2        v10.8h,  v22.4s,  #1
        sqrshrn2        v11.8h,  v23.4s,  #1
        sqrshrn         v12.4h,  v24.4s,  #1
        sqrshrn         v13.4h,  v25.4s,  #1
        sqrshrn         v14.4h,  v26.4s,  #1
        sqrshrn         v15.4h,  v27.4s,  #1
        sqrshrn2        v12.8h,  v28.4s,  #1
        sqrshrn2        v13.8h,  v29.4s,  #1
        sqrshrn2        v14.8h,  v30.4s,  #1
        sqrshrn2        v15.8h,  v31.4s,  #1

        transpose_4x8h  v8,  v9,  v10, v11, v2,  v3,  v4,  v5
        transpose_4x8h  v12, v13, v14, v15, v2,  v3,  v4,  v5

        b               2f
1:
.irp i, v8.8h, v9.8h, v10.8h, v11.8h, v12.8h, v13.8h, v14.8h, v15.8h
        movi            \i,  #0
.endr
2:
        movz            w16, #2896*8, lsl #16
        dup             v0.2s,   w16

        movi            v4.4s,  #0
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        ld1             {\i},    [x2]
        st1             {v4.4s}, [x2], x11
.endr

        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
        scale_input     .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
        blr             x4

        sqrshrn         v16.4h,  v16.4s,  #1
        sqrshrn         v17.4h,  v17.4s,  #1
        sqrshrn         v18.4h,  v18.4s,  #1
        sqrshrn         v19.4h,  v19.4s,  #1
        sqrshrn2        v16.8h,  v20.4s,  #1
        sqrshrn2        v17.8h,  v21.4s,  #1
        sqrshrn2        v18.8h,  v22.4s,  #1
        sqrshrn2        v19.8h,  v23.4s,  #1

        mov             v20.16b, v8.16b
        mov             v21.16b, v9.16b
        mov             v22.16b, v10.16b
        mov             v23.16b, v11.16b

        transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5

        sqrshrn         v8.4h,   v24.4s,  #1
        sqrshrn         v9.4h,   v25.4s,  #1
        sqrshrn         v10.4h,  v26.4s,  #1
        sqrshrn         v11.4h,  v27.4s,  #1
        sqrshrn2        v8.8h,   v28.4s,  #1
        sqrshrn2        v9.8h,   v29.4s,  #1
        sqrshrn2        v10.8h,  v30.4s,  #1
        sqrshrn2        v11.8h,  v31.4s,  #1

        transpose_4x8h  v8,  v9, v10, v11, v2,  v3,  v4,  v5

        blr             x5

        mov             x6,  x0
        load_add_store_8x8 x6, x7

        mov             v16.16b, v8.16b
        mov             v17.16b, v9.16b
        mov             v18.16b, v10.16b
        mov             v19.16b, v11.16b
        mov             v20.16b, v12.16b
        mov             v21.16b, v13.16b
        mov             v22.16b, v14.16b
        mov             v23.16b, v15.16b

        blr             x5

        add             x0,  x0,  #16
        load_add_store_8x8 x0, x7

        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x40
        br              x15
endfunc

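// Like 4x16, but with groups of eight 4s registers: each group is loaded
// starting from the highest quarter (x2 + 48), scaled by 2896/4096,
// transformed and narrowed, with groups past the eob thresholds zeroed
// instead.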
function inv_txfm_add_8x16_neon
        mov             x15, x30
        stp             d8,  d9,  [sp, #-0x20]!
        stp             d10, d11, [sp, #0x10]
        ldrh            w12, [x13, #4]

        mov             x11, #64

        cmp             w3,  w12
        ldrh            w12, [x13, #2]
        b.lt            1f

        add             x6,  x2,  #48
        movi            v4.4s,   #0
        movz            w16, #2896*8, lsl #16
        dup             v0.2s,   w16
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        ld1             {\i},    [x6]
        st1             {v4.4s}, [x6], x11
.endr
        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
        blr             x4

        sqrshrn         v28.4h,  v16.4s,  #1
        sqrshrn         v29.4h,  v17.4s,  #1
        sqrshrn         v30.4h,  v18.4s,  #1
        sqrshrn         v31.4h,  v19.4s,  #1
        sqrshrn2        v28.8h,  v20.4s,  #1
        sqrshrn2        v29.8h,  v21.4s,  #1
        sqrshrn2        v30.8h,  v22.4s,  #1
        sqrshrn2        v31.8h,  v23.4s,  #1
        transpose_4x8h  v28, v29, v30, v31, v2, v3, v4, v5

        b               2f

1:
.irp i, v28.8h, v29.8h, v30.8h, v31.8h
        movi            \i,  #0
.endr

2:
        cmp             w3,  w12
        ldrh            w12, [x13, #0]
        b.lt            1f

        add             x6,  x2,  #32
        movi            v4.4s,   #0
        movz            w16, #2896*8, lsl #16
        dup             v0.2s,   w16
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        ld1             {\i},    [x6]
        st1             {v4.4s}, [x6], x11
.endr
        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
        blr             x4

        sqrshrn         v24.4h,  v16.4s,  #1
        sqrshrn         v25.4h,  v17.4s,  #1
        sqrshrn         v26.4h,  v18.4s,  #1
        sqrshrn         v27.4h,  v19.4s,  #1
        sqrshrn2        v24.8h,  v20.4s,  #1
        sqrshrn2        v25.8h,  v21.4s,  #1
        sqrshrn2        v26.8h,  v22.4s,  #1
        sqrshrn2        v27.8h,  v23.4s,  #1
        transpose_4x8h  v24, v25, v26, v27, v2, v3, v4, v5

        b               2f

1:
.irp i, v24.8h, v25.8h, v26.8h, v27.8h
        movi            \i,  #0
.endr

2:
        cmp             w3,  w12
        b.lt            1f

        add             x6,  x2,  #16
        movi            v4.4s,   #0
        movz            w16, #2896*8, lsl #16
        dup             v0.2s,   w16
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        ld1             {\i},    [x6]
        st1             {v4.4s}, [x6], x11
.endr
        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
        blr             x4

        sqrshrn         v8.4h,   v16.4s,  #1
        sqrshrn         v9.4h,   v17.4s,  #1
        sqrshrn         v10.4h,  v18.4s,  #1
        sqrshrn         v11.4h,  v19.4s,  #1
        sqrshrn2        v8.8h,   v20.4s,  #1
        sqrshrn2        v9.8h,   v21.4s,  #1
        sqrshrn2        v10.8h,  v22.4s,  #1
        sqrshrn2        v11.8h,  v23.4s,  #1
        transpose_4x8h  v8,  v9,  v10, v11, v2, v3, v4, v5

        b               2f

1:
.irp i, v8.8h, v9.8h, v10.8h, v11.8h
        movi            \i,  #0
.endr

2:
        movi            v4.4s,   #0
        movz            w16, #2896*8, lsl #16
        dup             v0.2s,   w16
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        ld1             {\i},    [x2]
        st1             {v4.4s}, [x2], x11
.endr
        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
        blr             x4

        sqrshrn         v16.4h,  v16.4s,  #1
        sqrshrn         v17.4h,  v17.4s,  #1
        sqrshrn         v18.4h,  v18.4s,  #1
        sqrshrn         v19.4h,  v19.4s,  #1
        sqrshrn2        v16.8h,  v20.4s,  #1
        sqrshrn2        v17.8h,  v21.4s,  #1
        sqrshrn2        v18.8h,  v22.4s,  #1
        sqrshrn2        v19.8h,  v23.4s,  #1
        transpose_4x8h  v16, v17, v18, v19, v2, v3, v4, v5

        mov             v20.16b, v8.16b
        mov             v21.16b, v9.16b
        mov             v22.16b, v10.16b
        mov             v23.16b, v11.16b

        blr             x5

        load_add_store_8x16 x0, x6

        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x20

        br              x15
endfunc

const eob_8x16
        .short 10, 43, 75, 128
endconst

const eob_8x16_identity1
        .short 4, 64, 96, 128
endconst

const eob_8x16_identity2
        .short 4, 8, 12, 128
endconst

.macro def_fn_816 w, h, txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc         \w,  \h,  1
.endif
        adr             x4,  inv_\txfm1\()_4s_x\w\()_neon
        movrel          x5,  X(inv_\txfm2\()_8h_x\h\()_neon)
.ifc \txfm1, identity
.ifc \txfm2, identity
        movrel          x13, eob_8x16
.else
        movrel          x13, eob_8x16_identity1
.endif
.else
.ifc \txfm2, identity
        movrel          x13, eob_8x16_identity2
.else
        movrel          x13, eob_8x16
.endif
.endif
.if \h == 8
        ldrh            w13, [x13]
.endif
        b               inv_txfm_add_\w\()x\h\()_neon
endfunc
.endm

.macro def_fns_816 w, h
def_fn_816 \w, \h, dct, dct
def_fn_816 \w, \h, identity, identity
def_fn_816 \w, \h, dct, adst
def_fn_816 \w, \h, dct, flipadst
def_fn_816 \w, \h, dct, identity
def_fn_816 \w, \h, adst, dct
def_fn_816 \w, \h, adst, adst
def_fn_816 \w, \h, adst, flipadst
def_fn_816 \w, \h, flipadst, dct
def_fn_816 \w, \h, flipadst, adst
def_fn_816 \w, \h, flipadst, flipadst
def_fn_816 \w, \h, identity, dct
def_fn_816 \w, \h, adst, identity
def_fn_816 \w, \h, flipadst, identity
def_fn_816 \w, \h, identity, adst
def_fn_816 \w, \h, identity, flipadst
.endm

def_fns_816 8, 16
def_fns_816 16, 8

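// Odd half of a 32-point DCT: v16-v31 hold the 16 odd-indexed input
// coefficients, and out16-out31 are returned in v16-v31. The caller
// combines them with the even half (inv_dct_4s_x16_neon) in a final
// butterfly pass.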
function inv_dct32_odd_4s_x16_neon
        movrel          x16, idct_coeffs, 4*16
        ld1             {v0.4s, v1.4s}, [x16], #32

        mul_mls         v2,  v16, v31, v0.s[0], v0.s[1] // -> t16a
        mul_mla         v4,  v16, v31, v0.s[1], v0.s[0] // -> t31a
        mul_mls         v6,  v24, v23, v0.s[2], v0.s[3] // -> t17a
        srshr           v16.4s, v2.4s,  #12             // t16a
        srshr           v31.4s, v4.4s,  #12             // t31a
        mul_mla         v2,  v24, v23, v0.s[3], v0.s[2] // -> t30a
        mul_mls         v4,  v20, v27, v1.s[0], v1.s[1] // -> t18a
        srshr           v24.4s, v6.4s,  #12             // t17a
        srshr           v23.4s, v2.4s,  #12             // t30a
        mul_mla         v6,  v20, v27, v1.s[1], v1.s[0] // -> t29a
        mul_mls         v2,  v28, v19, v1.s[2], v1.s[3] // -> t19a
        srshr           v20.4s, v4.4s,  #12             // t18a
        srshr           v27.4s, v6.4s,  #12             // t29a
        mul_mla         v4,  v28, v19, v1.s[3], v1.s[2] // -> t28a
        ld1             {v0.4s, v1.4s}, [x16]
        sub             x16, x16, #4*24
        mul_mls         v6,  v18, v29, v0.s[0], v0.s[1] // -> t20a
        srshr           v28.4s, v2.4s,  #12             // t19a
        srshr           v19.4s, v4.4s,  #12             // t28a
        mul_mla         v2,  v18, v29, v0.s[1], v0.s[0] // -> t27a
        mul_mls         v4,  v26, v21, v0.s[2], v0.s[3] // -> t21a
        srshr           v18.4s, v6.4s,  #12             // t20a
        srshr           v29.4s, v2.4s,  #12             // t27a
        mul_mla         v6,  v26, v21, v0.s[3], v0.s[2] // -> t26a
        mul_mls         v2,  v22, v25, v1.s[0], v1.s[1] // -> t22a
        srshr           v26.4s, v4.4s,  #12             // t21a
        srshr           v21.4s, v6.4s,  #12             // t26a
        mul_mla         v4,  v22, v25, v1.s[1], v1.s[0] // -> t25a
        mul_mls         v6,  v30, v17, v1.s[2], v1.s[3] // -> t23a
        srshr           v22.4s, v2.4s,  #12             // t22a
        srshr           v25.4s, v4.4s,  #12             // t25a
        mul_mla         v2,  v30, v17, v1.s[3], v1.s[2] // -> t24a
        srshr           v30.4s, v6.4s,  #12             // t23a
        srshr           v17.4s, v2.4s,  #12             // t24a

        ld1             {v0.4s, v1.4s}, [x16]

        sqsub           v2.4s,   v16.4s,  v24.4s // t17
        sqadd           v16.4s,  v16.4s,  v24.4s // t16
        sqsub           v3.4s,   v31.4s,  v23.4s // t30
        sqadd           v31.4s,  v31.4s,  v23.4s // t31
        sqsub           v24.4s,  v28.4s,  v20.4s // t18
        sqadd           v28.4s,  v28.4s,  v20.4s // t19
        sqadd           v23.4s,  v18.4s,  v26.4s // t20
        sqsub           v18.4s,  v18.4s,  v26.4s // t21
        sqsub           v20.4s,  v30.4s,  v22.4s // t22
        sqadd           v30.4s,  v30.4s,  v22.4s // t23
        sqadd           v26.4s,  v17.4s,  v25.4s // t24
        sqsub           v17.4s,  v17.4s,  v25.4s // t25
        sqsub           v22.4s,  v29.4s,  v21.4s // t26
        sqadd           v29.4s,  v29.4s,  v21.4s // t27
        sqadd           v25.4s,  v19.4s,  v27.4s // t28
        sqsub           v19.4s,  v19.4s,  v27.4s // t29

        mul_mls         v4,  v3,  v2,  v1.s[0], v1.s[1] // -> t17a
        mul_mla         v6,  v3,  v2,  v1.s[1], v1.s[0] // -> t30a
        mul_mla         v2,  v19, v24, v1.s[1], v1.s[0] // -> t18a
        srshr           v21.4s, v4.4s,  #12             // t17a
        srshr           v27.4s, v6.4s,  #12             // t30a
        neg             v2.4s,   v2.4s                  // -> t18a
        mul_mls         v4,  v19, v24, v1.s[0], v1.s[1] // -> t29a
        mul_mls         v6,  v22, v18, v1.s[2], v1.s[3] // -> t21a
        srshr           v19.4s, v2.4s,  #12             // t18a
        srshr           v24.4s, v4.4s,  #12             // t29a
        mul_mla         v2,  v22, v18, v1.s[3], v1.s[2] // -> t26a
        mul_mla         v4,  v17, v20, v1.s[3], v1.s[2] // -> t22a
        srshr           v22.4s, v6.4s,  #12             // t21a
        srshr           v18.4s, v2.4s,  #12             // t26a
        neg             v4.4s,   v4.4s                  // -> t22a
        mul_mls         v6,  v17, v20, v1.s[2], v1.s[3] // -> t25a
        srshr           v17.4s, v4.4s,  #12             // t22a
        srshr           v20.4s, v6.4s,  #12             // t25a

        sqsub           v2.4s,   v27.4s,  v24.4s // t29
        sqadd           v27.4s,  v27.4s,  v24.4s // t30
        sqsub           v3.4s,   v21.4s,  v19.4s // t18
        sqadd           v21.4s,  v21.4s,  v19.4s // t17
        sqsub           v24.4s,  v16.4s,  v28.4s // t19a
        sqadd           v16.4s,  v16.4s,  v28.4s // t16a
        sqsub           v19.4s,  v30.4s,  v23.4s // t20a
        sqadd           v30.4s,  v30.4s,  v23.4s // t23a
        sqsub           v28.4s,  v17.4s,  v22.4s // t21
        sqadd           v17.4s,  v17.4s,  v22.4s // t22
        sqadd           v23.4s,  v26.4s,  v29.4s // t24a
        sqsub           v26.4s,  v26.4s,  v29.4s // t27a
        sqadd           v22.4s,  v20.4s,  v18.4s // t25
        sqsub           v20.4s,  v20.4s,  v18.4s // t26
        sqsub           v29.4s,  v31.4s,  v25.4s // t28a
        sqadd           v31.4s,  v31.4s,  v25.4s // t31a

        mul_mls         v4,  v2,  v3,  v0.s[2], v0.s[3] // -> t18a
        mul_mla         v6,  v2,  v3,  v0.s[3], v0.s[2] // -> t29a
        mul_mls         v2,  v29, v24, v0.s[2], v0.s[3] // -> t19
        srshr           v18.4s, v4.4s,  #12             // t18a
        srshr           v25.4s, v6.4s,  #12             // t29a
        mul_mla         v4,  v29, v24, v0.s[3], v0.s[2] // -> t28
        mul_mla         v6,  v26, v19, v0.s[3], v0.s[2] // -> t20
        srshr           v29.4s, v2.4s,  #12             // t19
        srshr           v24.4s, v4.4s,  #12             // t28
        neg             v6.4s,   v6.4s                  // -> t20
        mul_mls         v2,  v26, v19, v0.s[2], v0.s[3] // -> t27
        mul_mla         v4,  v20, v28, v0.s[3], v0.s[2] // -> t21a
        srshr           v26.4s, v6.4s,  #12             // t20
        srshr           v19.4s, v2.4s,  #12             // t27
        neg             v4.4s,   v4.4s                  // -> t21a
        mul_mls         v6,  v20, v28, v0.s[2], v0.s[3] // -> t26a
        srshr           v20.4s, v4.4s,  #12             // t21a
        srshr           v28.4s, v6.4s,  #12             // t26a

        sqsub           v2.4s,   v16.4s,  v30.4s // t23
        sqadd           v16.4s,  v16.4s,  v30.4s // t16 = out16
        sqsub           v3.4s,   v31.4s,  v23.4s // t24
        sqadd           v31.4s,  v31.4s,  v23.4s // t31 = out31
        sqsub           v23.4s,  v21.4s,  v17.4s // t22a
        sqadd           v17.4s,  v21.4s,  v17.4s // t17a = out17
        sqadd           v30.4s,  v27.4s,  v22.4s // t30a = out30
        sqsub           v21.4s,  v27.4s,  v22.4s // t25a
        sqsub           v27.4s,  v18.4s,  v20.4s // t21
        sqadd           v18.4s,  v18.4s,  v20.4s // t18 = out18
        sqadd           v4.4s,   v29.4s,  v26.4s // t19a = out19
        sqsub           v26.4s,  v29.4s,  v26.4s // t20a
        sqadd           v29.4s,  v25.4s,  v28.4s // t29 = out29
        sqsub           v25.4s,  v25.4s,  v28.4s // t26
        sqadd           v28.4s,  v24.4s,  v19.4s // t28a = out28
        sqsub           v24.4s,  v24.4s,  v19.4s // t27a
        mov             v19.16b, v4.16b          // out19

        mul_mls         v4,  v24, v26, v0.s[0], v0.s[0] // -> t20
        mul_mla         v6,  v24, v26, v0.s[0], v0.s[0] // -> t27
        srshr           v20.4s, v4.4s,  #12             // t20
        srshr           v22.4s, v6.4s,  #12             // t27

        mul_mla         v4,  v25, v27, v0.s[0], v0.s[0] // -> t26a
        mul_mls         v6,  v25, v27, v0.s[0], v0.s[0] // -> t21a
        mov             v27.16b,  v22.16b               // t27
        srshr           v26.4s, v4.4s,  #12             // t26a

        mul_mls         v24, v21, v23, v0.s[0], v0.s[0] // -> t22
        mul_mla         v4,  v21, v23, v0.s[0], v0.s[0] // -> t25
        srshr           v21.4s, v6.4s,  #12             // t21a
        srshr           v22.4s, v24.4s, #12             // t22
        srshr           v25.4s, v4.4s,  #12             // t25

        mul_mls         v4,  v3,  v2,  v0.s[0], v0.s[0] // -> t23a
        mul_mla         v6,  v3,  v2,  v0.s[0], v0.s[0] // -> t24a
        srshr           v23.4s, v4.4s,  #12             // t23a
        srshr           v24.4s, v6.4s,  #12             // t24a

        ret
endfunc

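// Horizontal 32x4 DCT helper: the even input coefficients go through the
// 16-point DCT and are stored to x6, then the odd coefficients go through
// inv_dct32_odd_4s_x16_neon and store2 runs the final butterflies against
// the stored even half, narrowing and storing all 32 outputs (the
// mirrored upper half is reversed with rev64).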
.macro def_horz_32 scale=0, shift=2, suffix
function inv_txfm_horz\suffix\()_dct_32x4_neon
        mov             x14, x30
        movi            v7.4s,  #0
        lsl             x8,  x8,  #1
.if \scale
        movz            w16, #2896*8, lsl #16
        dup             v0.2s,   w16
.endif

.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        ld1             {\i}, [x7]
        st1             {v7.4s}, [x7], x8
.endr
        sub             x7,  x7,  x8, lsl #4
        add             x7,  x7,  x8, lsr #1
.if \scale
        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
        scale_input     .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
.endif
        bl              inv_dct_4s_x16_neon
        transpose_4x4s  v16, v17, v18, v19, v2,  v3,  v4,  v5
        transpose_4x4s  v20, v21, v22, v23, v2,  v3,  v4,  v5
        transpose_4x4s  v24, v25, v26, v27, v2,  v3,  v4,  v5
        transpose_4x4s  v28, v29, v30, v31, v2,  v3,  v4,  v5

.macro store1 r0, r1, r2, r3
        st1             {\r0}, [x6], #16
        st1             {\r1}, [x6], #16
        st1             {\r2}, [x6], #16
        st1             {\r3}, [x6], #16
.endm
        store1          v16.4s,  v20.4s,  v24.4s,  v28.4s
        store1          v17.4s,  v21.4s,  v25.4s,  v29.4s
        store1          v18.4s,  v22.4s,  v26.4s,  v30.4s
        store1          v19.4s,  v23.4s,  v27.4s,  v31.4s
.purgem store1
        sub             x6,  x6,  #64*4

        movi            v7.4s,  #0
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        ld1             {\i}, [x7]
        st1             {v7.4s}, [x7], x8
.endr
.if \scale
        // This relies on the fact that the idct also leaves the right coeff in v0.s[1]
        scale_input     .4s, v0.s[1], v16, v17, v18, v19, v20, v21, v22, v23
        scale_input     .4s, v0.s[1], v24, v25, v26, v27, v28, v29, v30, v31
.endif
        bl              inv_dct32_odd_4s_x16_neon
        transpose_4x4s  v31, v30, v29, v28, v2,  v3,  v4,  v5
        transpose_4x4s  v27, v26, v25, v24, v2,  v3,  v4,  v5
        transpose_4x4s  v23, v22, v21, v20, v2,  v3,  v4,  v5
        transpose_4x4s  v19, v18, v17, v16, v2,  v3,  v4,  v5
.macro store2 r0, r1, r2, r3, shift
        ld1             {v0.4s, v1.4s, v2.4s, v3.4s}, [x6]
        sqsub           v4.4s,   v0.4s,   \r0
        sqadd           v0.4s,   v0.4s,   \r0
        sqsub           v5.4s,   v1.4s,   \r1
        sqadd           v1.4s,   v1.4s,   \r1
        sqsub           v6.4s,   v2.4s,   \r2
        sqadd           v2.4s,   v2.4s,   \r2
        sqsub           v7.4s,   v3.4s,   \r3
        sqadd           v3.4s,   v3.4s,   \r3
        sqrshrn         v0.4h,   v0.4s,   #\shift
        sqrshrn2        v0.8h,   v1.4s,   #\shift
        sqrshrn         v1.4h,   v2.4s,   #\shift
        sqrshrn2        v1.8h,   v3.4s,   #\shift
        sqrshrn         v2.4h,   v7.4s,   #\shift
        sqrshrn2        v2.8h,   v6.4s,   #\shift
        sqrshrn         v3.4h,   v5.4s,   #\shift
        sqrshrn2        v3.8h,   v4.4s,   #\shift
        st1             {v0.8h, v1.8h}, [x6], #32
        rev64           v2.8h,   v2.8h
        rev64           v3.8h,   v3.8h
        st1             {v2.8h, v3.8h}, [x6], #32
.endm

        store2          v31.4s,  v27.4s,  v23.4s,  v19.4s,  \shift
        store2          v30.4s,  v26.4s,  v22.4s,  v18.4s,  \shift
        store2          v29.4s,  v25.4s,  v21.4s,  v17.4s,  \shift
        store2          v28.4s,  v24.4s,  v20.4s,  v16.4s,  \shift
.purgem store2
        br              x14
endfunc
.endm

def_horz_32 scale=0, shift=2
def_horz_32 scale=1, shift=1, suffix=_scale

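// Vertical 8x32 DCT helper: even rows are transformed with the 16-point
// DCT and written back in place, odd rows with the dct32 odd half; the
// combine macro then fuses the final butterflies with the rounding shift,
// the add to the destination and the clamp to the 10 bit pixel range
// (0..0x3ff).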
function inv_txfm_add_vert_dct_8x32_neon
        mov             x14, x30
        lsl             x8,  x8,  #1

.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        ld1             {v\i\().8h}, [x7], x8
.endr
        sub             x7,  x7,  x8, lsl #4

        bl              X(inv_dct_8h_x16_neon)

.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        st1             {v\i\().8h}, [x7], x8
.endr
        sub             x7,  x7,  x8, lsl #4
        add             x7,  x7,  x8, lsr #1

.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        ld1             {v\i\().8h}, [x7], x8
.endr
        sub             x7,  x7,  x8, lsl #4
        sub             x7,  x7,  x8, lsr #1
        bl              X(inv_dct32_odd_8h_x16_neon)

        neg             x9,  x8
        mov             x10, x6
        movi            v0.8h,   #0
        mvni            v1.8h,   #0xfc, lsl #8 // 0x3ff
.macro combine r0, r1, r2, r3, op, stride
        ld1             {v5.8h}, [x7],    \stride
        ld1             {v2.8h}, [x10],   x1
        ld1             {v6.8h}, [x7],    \stride
        ld1             {v3.8h}, [x10],   x1
        \op             v5.8h,   v5.8h,   \r0
        ld1             {v7.8h}, [x7],    \stride
        ld1             {v4.8h}, [x10],   x1
        srshr           v5.8h,   v5.8h,   #4
        \op             v6.8h,   v6.8h,   \r1
        sqadd           v5.8h,   v5.8h,   v2.8h
        srshr           v6.8h,   v6.8h,   #4
        \op             v7.8h,   v7.8h,   \r2
        smax            v2.8h,   v5.8h,   v0.8h
        ld1             {v5.8h}, [x7],    \stride
        sqadd           v6.8h,   v6.8h,   v3.8h
        smin            v2.8h,   v2.8h,   v1.8h
        srshr           v7.8h,   v7.8h,   #4
        \op             v5.8h,   v5.8h,   \r3
        st1             {v2.8h}, [x6],    x1
        ld1             {v2.8h}, [x10],   x1
        smax            v3.8h,   v6.8h,   v0.8h
        sqadd           v7.8h,   v7.8h,   v4.8h
        smin            v3.8h,   v3.8h,   v1.8h
        srshr           v5.8h,   v5.8h,   #4
        st1             {v3.8h}, [x6],    x1
        smax            v4.8h,   v7.8h,   v0.8h
        sqadd           v5.8h,   v5.8h,   v2.8h
        smin            v4.8h,   v4.8h,   v1.8h
        st1             {v4.8h}, [x6],    x1
        smax            v2.8h,   v5.8h,   v0.8h
        smin            v2.8h,   v2.8h,   v1.8h
        st1             {v2.8h}, [x6],    x1
.endm
        combine         v31.8h, v30.8h, v29.8h, v28.8h, sqadd, x8
        combine         v27.8h, v26.8h, v25.8h, v24.8h, sqadd, x8
        combine         v23.8h, v22.8h, v21.8h, v20.8h, sqadd, x8
        combine         v19.8h, v18.8h, v17.8h, v16.8h, sqadd, x8
        sub             x7,  x7,  x8
        combine         v16.8h, v17.8h, v18.8h, v19.8h, sqsub, x9
        combine         v20.8h, v21.8h, v22.8h, v23.8h, sqsub, x9
        combine         v24.8h, v25.8h, v26.8h, v27.8h, sqsub, x9
        combine         v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9
.purgem combine

        br              x14
endfunc

const eob_32x32
        .short 10, 36, 78, 136, 210, 300, 406, 1024
endconst

const eob_16x32
        .short 10, 36, 78, 151, 215, 279, 343, 512
endconst

const eob_16x32_shortside
        .short 10, 36, 78, 512
endconst

const eob_8x32
        .short 10, 43, 75, 107, 139, 171, 203, 256
endconst

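// The 32x32 identity transform operates directly on 8x8 blocks of
// coefficients; as each block covers 8 lines, only every other entry of
// eob_32x32 is read (hence the offset 2 and stride 4).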
function inv_txfm_add_identity_identity_32x32_16bpc_neon, export=1
        movi            v0.8h,  #0
        movi            v1.8h,  #0
        movrel          x13, eob_32x32, 2

        mov             x8,  #4*32
1:
        mov             w9,  #0
        movrel          x12, eob_32x32, 2
2:
        add             w9,  w9,  #8
        ld1             {v16.4s, v17.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v18.4s, v19.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v20.4s, v21.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v22.4s, v23.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v24.4s, v25.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v26.4s, v27.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v28.4s, v29.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v30.4s, v31.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        sqxtn           v16.4h,  v16.4s
        sqxtn2          v16.8h,  v17.4s
        sqxtn           v17.4h,  v18.4s
        sqxtn2          v17.8h,  v19.4s
        sqxtn           v18.4h,  v20.4s
        sqxtn2          v18.8h,  v21.4s
        sqxtn           v19.4h,  v22.4s
        sqxtn2          v19.8h,  v23.4s
        sqxtn           v20.4h,  v24.4s
        sqxtn2          v20.8h,  v25.4s
        sqxtn           v21.4h,  v26.4s
        sqxtn2          v21.8h,  v27.4s
        sqxtn           v22.4h,  v28.4s
        sqxtn2          v22.8h,  v29.4s
        sqxtn           v23.4h,  v30.4s
        sqxtn2          v23.8h,  v31.4s
        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5

        load_add_store_8x8 x0, x7, shiftbits=2
        ldrh            w11, [x12], #4
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #2*8
        cmp             w3,  w11
        b.ge            2b

        ldrh            w11, [x13], #4
        cmp             w3,  w11
        b.lt            9f

        sub             x0,  x0,  w9, uxtw #1
        add             x0,  x0,  x1, lsl #3
        msub            x2,  x8,  x9,  x2
        add             x2,  x2,  #4*8
        b               1b
9:
        ret
endfunc

.macro shift_16_regs op, shift
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        \op             \i,  \i,  #\shift
.endr
.endm

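// 16x32 and 32x16 identity transforms; the rectangular 2896/4096 scaling
// (v2.s[0]) is applied on load, while the 16-point identity scaling
// (v2.s[1]) is applied with a folded-in halving shift for 16x32, or
// after doubling for 32x16, matching the different final shifts
// (shiftbits=2 vs 4).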
.macro def_identity_1632 w, h, wshort, hshort
function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1
        movz            w16, #2896*8, lsl #16
        movz            w17, #2*(5793-4096)*8, lsl #16
        movi            v0.4s,   #0
        movi            v1.4s,   #0
        movrel          x13, eob_16x32\hshort, 2

        mov             x8,  #4*\h
1:
        mov             w9,  #0
        movrel          x12, eob_16x32\wshort, 2
2:
        add             w9,  w9,  #8
        ld1             {v16.4s, v17.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        dup             v2.2s,   w16
        ld1             {v18.4s, v19.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        mov             v2.s[1], w17
        ld1             {v20.4s, v21.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v22.4s, v23.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v24.4s, v25.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v26.4s, v27.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v28.4s, v29.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v30.4s, v31.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        scale_input     .4s, v2.s[0], v16, v17, v18, v19, v20, v21, v22, v23
        scale_input     .4s, v2.s[0], v24, v25, v26, v27, v28, v29, v30, v31

.if \w == 16
        // 16x32
        identity_4x16_shift1 v2.s[1]
.else
        // 32x16
        shift_16_regs   sqshl, 1
        identity_4x16   v2.s[1]
.endif
        sqxtn           v16.4h,  v16.4s
        sqxtn2          v16.8h,  v17.4s
        sqxtn           v17.4h,  v18.4s
        sqxtn2          v17.8h,  v19.4s
        sqxtn           v18.4h,  v20.4s
        sqxtn2          v18.8h,  v21.4s
        sqxtn           v19.4h,  v22.4s
        sqxtn2          v19.8h,  v23.4s
        sqxtn           v20.4h,  v24.4s
        sqxtn2          v20.8h,  v25.4s
        sqxtn           v21.4h,  v26.4s
        sqxtn2          v21.8h,  v27.4s
        sqxtn           v22.4h,  v28.4s
        sqxtn2          v22.8h,  v29.4s
        sqxtn           v23.4h,  v30.4s
        sqxtn2          v23.8h,  v31.4s

        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5

.if \w == 16
        load_add_store_8x8 x0, x7, shiftbits=2
.else
        load_add_store_8x8 x0, x7, shiftbits=4
.endif
        ldrh            w11, [x12], #4
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #16
        cmp             w3,  w11
        b.ge            2b

        ldrh            w11, [x13], #4
        cmp             w3,  w11
        b.lt            9f

        sub             x0,  x0,  w9, uxtw #1
        add             x0,  x0,  x1, lsl #3
        msub            x2,  x8,  x9,  x2
        add             x2,  x2,  #4*8
        b               1b
9:
        ret
endfunc
.endm

def_identity_1632 16, 32, _shortside,
def_identity_1632 32, 16, , _shortside

.macro def_identity_832 w, h
function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1
        movi            v0.4s,  #0
        movi            v1.4s,  #0
        // Working on 8x8 blocks, read every other entry from eob_8x32
        movrel          x13, eob_8x32, 2

        mov             w8,  #4*\h
1:
        // Working on 8x8 blocks, read every other entry from eob_8x32
        ldrh            w12, [x13], #4
        ld1             {v16.4s, v17.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v18.4s, v19.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v20.4s, v21.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v22.4s, v23.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v24.4s, v25.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v26.4s, v27.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v28.4s, v29.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8
        ld1             {v30.4s, v31.4s}, [x2]
        st1             {v0.4s, v1.4s},   [x2], x8

.if \w == 8
        sqrshrn         v16.4h,  v16.4s,  #1
        sqrshrn2        v16.8h,  v17.4s,  #1
        sqrshrn         v17.4h,  v18.4s,  #1
        sqrshrn2        v17.8h,  v19.4s,  #1
        sqrshrn         v18.4h,  v20.4s,  #1
        sqrshrn2        v18.8h,  v21.4s,  #1
        sqrshrn         v19.4h,  v22.4s,  #1
        sqrshrn2        v19.8h,  v23.4s,  #1
        sqrshrn         v20.4h,  v24.4s,  #1
        sqrshrn2        v20.8h,  v25.4s,  #1
        sqrshrn         v21.4h,  v26.4s,  #1
        sqrshrn2        v21.8h,  v27.4s,  #1
        sqrshrn         v22.4h,  v28.4s,  #1
        sqrshrn2        v22.8h,  v29.4s,  #1
        sqrshrn         v23.4h,  v30.4s,  #1
        sqrshrn2        v23.8h,  v31.4s,  #1
.else
        sqxtn           v16.4h,  v16.4s
        sqxtn2          v16.8h,  v17.4s
        sqxtn           v17.4h,  v18.4s
        sqxtn2          v17.8h,  v19.4s
        sqxtn           v18.4h,  v20.4s
        sqxtn2          v18.8h,  v21.4s
        sqxtn           v19.4h,  v22.4s
        sqxtn2          v19.8h,  v23.4s
        sqxtn           v20.4h,  v24.4s
        sqxtn2          v20.8h,  v25.4s
        sqxtn           v21.4h,  v26.4s
        sqxtn2          v21.8h,  v27.4s
        sqxtn           v22.4h,  v28.4s
        sqxtn2          v22.8h,  v29.4s
        sqxtn           v23.4h,  v30.4s
        sqxtn2          v23.8h,  v31.4s
.endif

        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5

        cmp             w3,  w12
.if \w == 8
        load_add_store_8x8 x0, x7, shiftbits=2
.else
        load_add_store_8x8 x0, x7, shiftbits=3
.endif

        b.lt            9f
.if \w == 8
        sub             x2,  x2,  x8, lsl #3
        add             x2,  x2,  #4*8
.else
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #2*8
.endif
        b               1b

9:
        ret
endfunc
.endm

def_identity_832 8, 32
def_identity_832 32, 8

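// Full 32x32 DCT: eight 32x4 horizontal slices into a 2048 byte stack
// buffer (slices past the eob thresholds are only zero filled), then
// four 8x32 vertical slices added to the destination.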
function inv_txfm_add_dct_dct_32x32_16bpc_neon, export=1
        idct_dc         32,  32,  2

        mov             x15, x30
        sub             sp,  sp,  #2048
        movrel          x13, eob_32x32
        ldrh            w12, [x13], #2

.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add             x6,  sp,  #(\i*32*2)
.if \i > 0
        mov             w8,  #(32 - \i)
        cmp             w3,  w12
        b.lt            1f
.if \i < 28
        ldrh            w12, [x13], #2
.endif
.endif
        add             x7,  x2,  #(\i*4)
        mov             x8,  #32*4
        bl              inv_txfm_horz_dct_32x4_neon
.endr
        b               3f

1:
        movi            v4.8h,  #0
        movi            v5.8h,  #0
        movi            v6.8h,  #0
        movi            v7.8h,  #0
2:
        subs            w8,  w8,  #4
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
.irp i, 0, 8, 16, 24
        add             x6,  x0,  #(\i*2)
        add             x7,  sp,  #(\i*2)
        mov             x8,  #32*2
        bl              inv_txfm_add_vert_dct_8x32_neon
.endr

        add             sp,  sp,  #2048
        br              x15
endfunc

function inv_txfm_add_dct_dct_16x32_16bpc_neon, export=1
        idct_dc         16,  32,  1

        mov             x15, x30
        sub             sp,  sp,  #1024
        movrel          x13, eob_16x32
        ldrh            w12, [x13], #2
        adr             x4,  inv_dct_4s_x16_neon

.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add             x6,  sp,  #(\i*16*2)
        add             x7,  x2,  #(\i*4)
.if \i > 0
        mov             w8,  #(32 - \i)
        cmp             w3,  w12
        b.lt            1f
.if \i < 28
        ldrh            w12, [x13], #2
.endif
.endif
        mov             x8,  #4*32
        bl              inv_txfm_horz_scale_16x4_neon
.endr
        b               3f

1:
        movi            v4.8h,  #0
        movi            v5.8h,  #0
        movi            v6.8h,  #0
        movi            v7.8h,  #0
2:
        subs            w8,  w8,  #4
.rept 2
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
.irp i, 0, 8
        add             x6,  x0,  #(\i*2)
        add             x7,  sp,  #(\i*2)
        mov             x8,  #16*2
        bl              inv_txfm_add_vert_dct_8x32_neon
.endr

        add             sp,  sp,  #1024
        br              x15
endfunc

function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1
        idct_dc         32,  16,  1

        mov             x15, x30
        sub             sp,  sp,  #1024

        movrel          x13, eob_16x32
        movrel          x5,  X(inv_dct_8h_x16_neon)
        ldrh            w12, [x13], #2

.irp i, 0, 4, 8, 12
        add             x6,  sp,  #(\i*32*2)
        add             x7,  x2,  #(\i*4)
.if \i > 0
        mov             w8,  #(16 - \i)
        cmp             w3,  w12
        b.lt            1f
        ldrh            w12, [x13], #2
.endif
        mov             x8,  #4*16
        bl              inv_txfm_horz_scale_dct_32x4_neon
.endr
        b               3f

1:
        movi            v4.8h,  #0
        movi            v5.8h,  #0
        movi            v6.8h,  #0
        movi            v7.8h,  #0
2:
        subs            w8,  w8,  #4
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
.irp i, 0, 8, 16, 24
        add             x6,  x0,  #(\i*2)
        add             x7,  sp,  #(\i*2)
        mov             x8,  #32*2
        bl              inv_txfm_add_vert_8x16_neon
.endr

        add             sp,  sp,  #1024
        br              x15
endfunc

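// 8x32 DCT: the 8-point first pass is done inline, four lines at a time,
// with an eob check after each group; the shared 8x32 vertical helper
// handles the second pass.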
function inv_txfm_add_dct_dct_8x32_16bpc_neon, export=1
        idct_dc         8,   32, 2

        mov             x15, x30
        sub             sp,  sp,  #512

        movrel          x13, eob_8x32

        movi            v28.4s,  #0
        mov             x8,  #4*32
        mov             w9,  #32
        mov             x6,  sp
        mov             x7,  x2
1:
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        ld1             {v\i\().4s}, [x7]
        st1             {v28.4s}, [x7], x8
.endr
        ldrh            w12, [x13], #2
        sub             w9,  w9,  #4
        sub             x7,  x7,  x8, lsl #3
        add             x7,  x7,  #4*4

        bl              inv_dct_4s_x8_neon

        sqrshrn         v16.4h,  v16.4s,  #2
        sqrshrn         v17.4h,  v17.4s,  #2
        sqrshrn         v18.4h,  v18.4s,  #2
        sqrshrn         v19.4h,  v19.4s,  #2
        sqrshrn2        v16.8h,  v20.4s,  #2
        sqrshrn2        v17.8h,  v21.4s,  #2
        sqrshrn2        v18.8h,  v22.4s,  #2
        sqrshrn2        v19.8h,  v23.4s,  #2

        transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5

        cmp             w3,  w12
        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64

        b.ge            1b
        cbz             w9,  3f

        movi            v29.8h,  #0
        movi            v30.8h,  #0
        movi            v31.8h,  #0
2:
        subs            w9,  w9,  #4
        st1             {v28.8h,v29.8h,v30.8h,v31.8h}, [x6], #64
        b.gt            2b

3:
        mov             x6,  x0
        mov             x7,  sp
        mov             x8,  #8*2
        bl              inv_txfm_add_vert_dct_8x32_neon

        add             sp,  sp,  #512
        br              x15
endfunc

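// The first pass above narrows its 32 bit intermediates to 16 bit with a
// rounding right shift by 2 (sqrshrn/sqrshrn2) before transposing and
// storing them. Ignoring the saturation that sqrshrn additionally applies,
// each lane is in C roughly:
//
//   int16_t narrow(int32_t v) {
//       return (int16_t)((v + 2) >> 2); // +2 = rounding bias for >> 2
//   }
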
function inv_txfm_add_dct_dct_32x8_16bpc_neon, export=1
        idct_dc         32,  8,   2

        mov             x15, x30
        sub             sp,  sp,  #512

.irp i, 0, 4
        add             x6,  sp,  #(\i*32*2)
        add             x7,  x2,  #(\i*4)
.if \i > 0
        cmp             w3,  #10
        b.lt            1f
.endif
        mov             x8,  #8*4
        bl              inv_txfm_horz_dct_32x4_neon
.endr
        b               2f

1:
        movi            v4.8h,   #0
        movi            v5.8h,   #0
        movi            v6.8h,   #0
        movi            v7.8h,   #0
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr

2:
        mov             x8,  #2*32
        mov             w9,  #0
1:
        add             x6,  x0,  x9, lsl #1
        add             x7,  sp,  x9, lsl #1 // sp + 2*x9, the current column offset

.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        ld1             {v\i\().8h}, [x7], x8
.endr
        add             w9,  w9,  #8

        bl              X(inv_dct_8h_x8_neon)

        cmp             w9,  #32

        load_add_store_8x8 x6, x7

        b.lt            1b

        add             sp,  sp,  #512
        br              x15
endfunc

function inv_dct64_step1_neon
        // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
        // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
        // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
        // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a

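        // The sqrdmulh multiplies below use coefficients pre-shifted left
        // by 19 bits (c*8*(1<<16) in idct64_coeffs), so each one computes
        // a rounding, saturating (in * c) >> 12. As a C sketch of one lane:
        //
        //   int32_t t63a = (int32_t)(((int64_t)in1 * (c << 19) * 2 +
        //                             (1LL << 31)) >> 32);
        //   // i.e. a rounded (in1 * c) >> 12, with saturation
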
        ld1             {v0.4s, v1.4s}, [x17], #32

        sqrdmulh        v23.4s,  v16.4s,  v0.s[1]       // t63a
        sqrdmulh        v16.4s,  v16.4s,  v0.s[0]       // t32a
        sqrdmulh        v22.4s,  v17.4s,  v0.s[2]       // t62a
        sqrdmulh        v17.4s,  v17.4s,  v0.s[3]       // t33a
        sqrdmulh        v21.4s,  v18.4s,  v1.s[1]       // t61a
        sqrdmulh        v18.4s,  v18.4s,  v1.s[0]       // t34a
        sqrdmulh        v20.4s,  v19.4s,  v1.s[2]       // t60a
        sqrdmulh        v19.4s,  v19.4s,  v1.s[3]       // t35a

        ld1             {v0.4s}, [x17], #16

        sqadd           v24.4s,  v16.4s,  v17.4s        // t32
        sqsub           v25.4s,  v16.4s,  v17.4s        // t33
        sqsub           v26.4s,  v19.4s,  v18.4s        // t34
        sqadd           v27.4s,  v19.4s,  v18.4s        // t35
        sqadd           v28.4s,  v20.4s,  v21.4s        // t60
        sqsub           v29.4s,  v20.4s,  v21.4s        // t61
        sqsub           v30.4s,  v23.4s,  v22.4s        // t62
        sqadd           v31.4s,  v23.4s,  v22.4s        // t63

        mul_mla         v2,  v29, v26, v0.s[0], v0.s[1] // -> t34a
        mul_mls         v4,  v29, v26, v0.s[1], v0.s[0] // -> t61a
        neg             v2.4s,   v2.4s                  // t34a
        mul_mls         v6,  v30, v25, v0.s[1], v0.s[0] // -> t33a
        srshr           v26.4s, v2.4s,  #12             // t34a
        mul_mla         v2,  v30, v25, v0.s[0], v0.s[1] // -> t62a
        srshr           v29.4s, v4.4s,  #12             // t61a
        srshr           v25.4s, v6.4s,  #12             // t33a
        srshr           v30.4s, v2.4s,  #12             // t62a

        sqadd           v16.4s,  v24.4s,  v27.4s        // t32a
        sqsub           v19.4s,  v24.4s,  v27.4s        // t35a
        sqadd           v17.4s,  v25.4s,  v26.4s        // t33
        sqsub           v18.4s,  v25.4s,  v26.4s        // t34
        sqsub           v20.4s,  v31.4s,  v28.4s        // t60a
        sqadd           v23.4s,  v31.4s,  v28.4s        // t63a
        sqsub           v21.4s,  v30.4s,  v29.4s        // t61
        sqadd           v22.4s,  v30.4s,  v29.4s        // t62

        mul_mla         v2,  v21, v18, v0.s[2], v0.s[3] // -> t61a
        mul_mls         v4,  v21, v18, v0.s[3], v0.s[2] // -> t34a
        mul_mla         v6,  v20, v19, v0.s[2], v0.s[3] // -> t60
        srshr           v21.4s, v2.4s,  #12             // t61a
        srshr           v18.4s, v4.4s,  #12             // t34a
        mul_mls         v2,  v20, v19, v0.s[3], v0.s[2] // -> t35
        srshr           v20.4s, v6.4s,  #12             // t60
        srshr           v19.4s, v2.4s,  #12             // t35

        st1             {v16.4s, v17.4s, v18.4s, v19.4s}, [x6], #64
        st1             {v20.4s, v21.4s, v22.4s, v23.4s}, [x6], #64

        ret
endfunc

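// inv_dct64_step2_neon pairs up the values written by the four step1 calls,
// walking x6 forward from the start of the buffer and x9 backward from
// entry 7, one 4-lane entry per direction per iteration. Loosely, in C
// (control flow only, with hypothetical names):
//
//   int32_t (*lo)[4] = &buf[0], (*hi)[4] = &buf[7];
//   while (lo < hi) {
//       // reads and writes lo[0], lo[8], lo[16], lo[24]
//       // and              hi[0], hi[8], hi[16], hi[24]
//       step2_butterflies(lo, hi);
//       lo++, hi--;
//   }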
function inv_dct64_step2_neon
        movrel          x16, idct_coeffs
        ld1             {v0.4s}, [x16]
1:
        // t32a/33/34a/35/60/61a/62/63a
        // t56a/57/58a/59/36/37a/38/39a
        // t40a/41/42a/43/52/53a/54/55a
        // t48a/49/50a/51/44/45a/46/47a
        ldr             q16, [x6, #4*4*0]  // t32a
        ldr             q17, [x9, #4*4*8]  // t39a
        ldr             q18, [x9, #4*4*0]  // t63a
        ldr             q19, [x6, #4*4*8]  // t56a
        ldr             q20, [x6, #4*4*16] // t40a
        ldr             q21, [x9, #4*4*24] // t47a
        ldr             q22, [x9, #4*4*16] // t55a
        ldr             q23, [x6, #4*4*24] // t48a

        sqadd           v24.4s,  v16.4s, v17.4s         // t32
        sqsub           v25.4s,  v16.4s, v17.4s         // t39
        sqadd           v26.4s,  v18.4s, v19.4s         // t63
        sqsub           v27.4s,  v18.4s, v19.4s         // t56
        sqsub           v28.4s,  v21.4s, v20.4s         // t40
        sqadd           v29.4s,  v21.4s, v20.4s         // t47
        sqadd           v30.4s,  v23.4s, v22.4s         // t48
        sqsub           v31.4s,  v23.4s, v22.4s         // t55

        mul_mla         v2,  v27, v25, v0.s[3], v0.s[2] // -> t56a
        mul_mls         v4,  v27, v25, v0.s[2], v0.s[3] // -> t39a
        mul_mla         v6,  v31, v28, v0.s[3], v0.s[2] // -> t40a
        srshr           v25.4s, v2.4s,  #12             // t56a
        srshr           v27.4s, v4.4s,  #12             // t39a
        neg             v6.4s,   v6.4s                  // t40a
        mul_mls         v2,  v31, v28, v0.s[2], v0.s[3] // -> t55a
        srshr           v31.4s, v6.4s,  #12             // t40a
        srshr           v28.4s, v2.4s,  #12             // t55a

        sqadd           v16.4s,  v24.4s,  v29.4s        // t32a
        sqsub           v19.4s,  v24.4s,  v29.4s        // t47a
        sqadd           v17.4s,  v27.4s,  v31.4s        // t39
        sqsub           v18.4s,  v27.4s,  v31.4s        // t40
        sqsub           v20.4s,  v26.4s,  v30.4s        // t48a
        sqadd           v23.4s,  v26.4s,  v30.4s        // t63a
        sqsub           v21.4s,  v25.4s,  v28.4s        // t55
        sqadd           v22.4s,  v25.4s,  v28.4s        // t56

        mul_mls         v2,  v21, v18, v0.s[0], v0.s[0] // -> t40a
        mul_mla         v4,  v21, v18, v0.s[0], v0.s[0] // -> t55a
        mul_mls         v6,  v20, v19, v0.s[0], v0.s[0] // -> t47
        srshr           v18.4s, v2.4s,  #12             // t40a
        srshr           v21.4s, v4.4s,  #12             // t55a
        mul_mla         v2,  v20, v19, v0.s[0], v0.s[0] // -> t48
        srshr           v19.4s, v6.4s,  #12             // t47
        srshr           v20.4s, v2.4s,  #12             // t48

        str             q16, [x6, #4*4*0]  // t32a
        str             q17, [x9, #4*4*0]  // t39
        str             q18, [x6, #4*4*8]  // t40a
        str             q19, [x9, #4*4*8]  // t47
        str             q20, [x6, #4*4*16] // t48
        str             q21, [x9, #4*4*16] // t55a
        str             q22, [x6, #4*4*24] // t56
        str             q23, [x9, #4*4*24] // t63a

        add             x6,  x6,  #4*4
        sub             x9,  x9,  #4*4
        cmp             x6,  x9
        b.lt            1b
        ret
endfunc

.macro load8 src, strd, zero, clear
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
.if \clear
        ld1             {\i}, [\src]
        st1             {\zero}, [\src], \strd
.else
        ld1             {\i}, [\src], \strd
.endif
.endr
.endm

.macro store16 dst
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        st1             {\i}, [\dst], #16
.endr
.endm

.macro clear_upper8
.irp i, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        movi            \i,  #0
.endr
.endm

.macro movi_if reg, val, cond
.if \cond
        movi            \reg, \val
.endif
.endm

.macro movz16dup_if reg, gpr, val, cond
.if \cond
        movz            \gpr, \val, lsl #16
        dup             \reg, \gpr
.endif
.endm

.macro st1_if regs, dst, cond
.if \cond
        st1             \regs, \dst
.endif
.endm

.macro str_if reg, dst, cond
.if \cond
        str             \reg, \dst
.endif
.endm

.macro stroff_if reg, dst, dstoff, cond
.if \cond
        str             \reg, \dst, \dstoff
.endif
.endm

.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7
.if \cond
        scale_input     .4s, \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
.endif
.endm
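
// The *_if macros above emit their instruction only when the assemble-time
// condition is nonzero; they are the assembler analogue of wrapping a
// statement in C's #if, e.g. for the coefficient-clearing variant:
//
//   #if CLEAR
//       memset(coef, 0, 16); // zero the coefficients as they are consumed
//   #endif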

.macro def_dct64_func suffix, clear=0, scale=0
function inv_txfm_dct\suffix\()_4s_x64_neon
        mov             x14, x30
        mov             x6,  sp
        lsl             x8,  x8,  #2

        movz16dup_if    v0.2s, w16, #2896*8, \scale
        movi_if         v7.4s,  #0, \clear
        load8           x7,  x8,  v7.4s, \clear
        clear_upper8
        sub             x7,  x7,  x8, lsl #3
        add             x7,  x7,  x8, lsr #1
        scale_if        \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23

        bl              inv_dct_4s_x16_neon

        store16         x6

        movz16dup_if    v0.2s, w16, #2896*8, \scale
        movi_if         v7.4s,  #0, \clear
        load8           x7,  x8,  v7.4s, \clear
        clear_upper8
        sub             x7,  x7,  x8, lsl #3
        lsr             x8,  x8,  #1
        sub             x7,  x7,  x8, lsr #1
        scale_if        \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23

        bl              inv_dct32_odd_4s_x16_neon

        add             x10, x6,  #16*15
        sub             x6,  x6,  #16*16

        mov             x9,  #-16

.macro store_addsub r0, r1, r2, r3
        ld1             {v2.4s}, [x6], #16
        ld1             {v3.4s}, [x6], #16
        sqadd           v6.4s,  v2.4s,  \r0
        sqsub           \r0,    v2.4s,  \r0
        ld1             {v4.4s}, [x6], #16
        sqadd           v7.4s,  v3.4s,  \r1
        sqsub           \r1,    v3.4s,  \r1
        ld1             {v5.4s}, [x6], #16
        sqadd           v2.4s,  v4.4s,  \r2
        sub             x6,  x6,  #16*4
        sqsub           \r2,    v4.4s,  \r2
        st1             {v6.4s}, [x6], #16
        st1             {\r0},   [x10], x9
        sqadd           v3.4s,  v5.4s,  \r3
        sqsub           \r3,    v5.4s,  \r3
        st1             {v7.4s}, [x6], #16
        st1             {\r1},   [x10], x9
        st1             {v2.4s}, [x6], #16
        st1             {\r2},   [x10], x9
        st1             {v3.4s}, [x6], #16
        st1             {\r3},   [x10], x9
.endm
        store_addsub    v31.4s, v30.4s, v29.4s, v28.4s
        store_addsub    v27.4s, v26.4s, v25.4s, v24.4s
        store_addsub    v23.4s, v22.4s, v21.4s, v20.4s
        store_addsub    v19.4s, v18.4s, v17.4s, v16.4s
.purgem store_addsub

        add             x6,  x6,  #4*4*16

        movrel          x17, idct64_coeffs
        movz16dup_if    v0.2s, w16, #2896*8, \scale
        movi_if         v7.4s,  #0, \clear
        add             x9,  x7,  x8, lsl #4 // offset 16
        add             x10, x7,  x8, lsl #3 // offset 8
        sub             x9,  x9,  x8         // offset 15
        sub             x11, x10, x8         // offset 7
        ld1             {v16.4s}, [x7]  // in1  (offset 0)
        ld1             {v17.4s}, [x9]  // in31 (offset 15)
        ld1             {v18.4s}, [x10] // in17 (offset 8)
        ld1             {v19.4s}, [x11] // in15 (offset 7)
        st1_if          {v7.4s}, [x7],  \clear
        st1_if          {v7.4s}, [x9],  \clear
        st1_if          {v7.4s}, [x10], \clear
        st1_if          {v7.4s}, [x11], \clear
        scale_if        \scale, v0.s[0], v16, v17, v18, v19
        bl              inv_dct64_step1_neon
        movz16dup_if    v0.2s, w16, #2896*8, \scale
        movi_if         v7.4s,  #0, \clear
        add             x7,  x7,  x8, lsl #2 // offset 4
        sub             x9,  x9,  x8, lsl #2 // offset 11
        sub             x10, x7,  x8         // offset 3
        add             x11, x9,  x8         // offset 12
        ld1             {v16.4s}, [x10] // in7  (offset 3)
        ld1             {v17.4s}, [x11] // in25 (offset 12)
        ld1             {v18.4s}, [x9]  // in23 (offset 11)
        ld1             {v19.4s}, [x7]  // in9  (offset 4)
        st1_if          {v7.4s}, [x7],  \clear
        st1_if          {v7.4s}, [x9],  \clear
        st1_if          {v7.4s}, [x10], \clear
        st1_if          {v7.4s}, [x11], \clear
        scale_if        \scale, v0.s[0], v16, v17, v18, v19
        bl              inv_dct64_step1_neon
        movz16dup_if    v0.2s, w16, #2896*8, \scale
        movi_if         v7.4s,  #0, \clear
        sub             x10, x10, x8, lsl #1 // offset 1
        sub             x9,  x9,  x8, lsl #1 // offset 9
        add             x7,  x7,  x8         // offset 5
        add             x11, x11, x8         // offset 13
        ldr             q16, [x10, x8] // in5  (offset 2)
        ldr             q17, [x11]     // in27 (offset 13)
        ldr             q18, [x9,  x8] // in21 (offset 10)
        ldr             q19, [x7]      // in11 (offset 5)
        stroff_if       q7,  [x10, x8], \clear
        str_if          q7,  [x11],     \clear
        stroff_if       q7,  [x9,  x8], \clear
        str_if          q7,  [x7],      \clear
        scale_if        \scale, v0.s[0], v16, v17, v18, v19
        bl              inv_dct64_step1_neon
        movz16dup_if    v0.2s, w16, #2896*8, \scale
        movi_if         v7.4s,  #0, \clear
        ldr             q16, [x10]     // in3  (offset 1)
        ldr             q17, [x11, x8] // in29 (offset 14)
        ldr             q18, [x9]      // in19 (offset 9)
        ldr             q19, [x7,  x8] // in13 (offset 6)
        str_if          q7,  [x10],     \clear
        stroff_if       q7,  [x11, x8], \clear
        str_if          q7,  [x9],      \clear
        stroff_if       q7,  [x7,  x8], \clear
        scale_if        \scale, v0.s[0], v16, v17, v18, v19
        bl              inv_dct64_step1_neon

        sub             x6,  x6,  #4*4*32
        add             x9,  x6,  #4*4*7

        bl              inv_dct64_step2_neon

        br              x14
endfunc
.endm

def_dct64_func _clear, clear=1
def_dct64_func _clear_scale, clear=1, scale=1
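
// The dct64 above is built the usual way: a 16-point dct on the even-even
// inputs, the odd half of a 32-point dct on the even-odd inputs, and a
// dedicated odd part (inv_dct64_step1/step2) for the 32 odd inputs. The
// buffer ends up holding 64 intermediates t[0..63]; the final
// out[i] = t[i] +/- t[63-i] mirroring is left to the horz/vert helpers
// that consume it. Schematically, in C (a structural sketch only, with
// hypothetical names):
//
//   dct16(in + 0*s, 4*s, ev);         // inputs 0, 4, 8, ..., 60
//   dct32_odd(in + 2*s, 4*s, od);     // inputs 2, 6, 10, ..., 62
//   butterfly(ev, od, t);             // t[0..31]: the even half
//   dct64_odd(in + 1*s, 2*s, t + 32); // inputs 1, 3, ..., 63 (step1+step2)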
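// inv_txfm_horz_dct_64x4_neon performs that final mirroring on the 64-entry
// buffer, applies the inter-pass rounding shift (srshl by the negative
// shift in w12) and narrows to 16 bit. Per output pair, roughly (C sketch,
// ignoring the saturation of the narrowing step):
//
//   row[j]      = (int16_t)round_shift(t[j] + t[63 - j], shift);
//   row[63 - j] = (int16_t)round_shift(t[j] - t[63 - j], shift);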
function inv_txfm_horz_dct_64x4_neon
        mov             x14, x30

        mov             x7,  sp
        add             x8,  sp,  #4*4*(64 - 4)
        add             x9,  x6,  #2*56
        mov             x10, #2*64
        mov             x11, #-4*4*4

        dup             v7.4s,  w12
1:
        ld1             {v16.4s, v17.4s, v18.4s, v19.4s}, [x7], #64
        ld1             {v28.4s, v29.4s, v30.4s, v31.4s}, [x8], x11
        ld1             {v20.4s, v21.4s, v22.4s, v23.4s}, [x7], #64
        ld1             {v24.4s, v25.4s, v26.4s, v27.4s}, [x8], x11
        transpose_4x4s  v16, v17, v18, v19, v2,  v3,  v4,  v5
        transpose_4x4s  v20, v21, v22, v23, v2,  v3,  v4,  v5
        transpose_4x4s  v31, v30, v29, v28, v2,  v3,  v4,  v5
        transpose_4x4s  v27, v26, v25, v24, v2,  v3,  v4,  v5

.macro store_addsub src0, src1, src2, src3
        sqsub           v1.4s,   \src0,   \src1
        sqadd           v0.4s,   \src0,   \src1
        sqsub           v3.4s,   \src2,   \src3
        srshl           v1.4s,   v1.4s,   v7.4s
        sqadd           v2.4s,   \src2,   \src3
        srshl           v3.4s,   v3.4s,   v7.4s
        srshl           v0.4s,   v0.4s,   v7.4s
        srshl           v2.4s,   v2.4s,   v7.4s
        sqxtn           v3.4h,   v3.4s
        sqxtn2          v3.8h,   v1.4s
        sqxtn           v0.4h,   v0.4s
        sqxtn2          v0.8h,   v2.4s
        rev64           v3.8h,   v3.8h
        st1             {v0.8h},  [x6], x10
        st1             {v3.8h},  [x9], x10
.endm
        store_addsub    v16.4s,  v31.4s,  v20.4s,  v27.4s
        store_addsub    v17.4s,  v30.4s,  v21.4s,  v26.4s
        store_addsub    v18.4s,  v29.4s,  v22.4s,  v25.4s
        store_addsub    v19.4s,  v28.4s,  v23.4s,  v24.4s
.purgem store_addsub
        sub             x6,  x6,  x10, lsl #2
        sub             x9,  x9,  x10, lsl #2
        add             x6,  x6,  #16
        sub             x9,  x9,  #16

        cmp             x7,  x8
        b.lt            1b
        br              x14
endfunc

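// inv_txfm_add_vert_dct_8x64_neon does the same mirrored butterfly
// vertically, then adds the rounded result to the destination and clamps
// to the pixel range. Per lane, roughly (C sketch; 0x3ff matches the mvni
// constant below, and the saturating adds are elided):
//
//   int32_t px = dst[x] + ((res + 8) >> 4); // srshr #4 rounding
//   dst[x] = px < 0 ? 0 : px > 0x3ff ? 0x3ff : px;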
function inv_txfm_add_vert_dct_8x64_neon
        mov             x14, x30

        mov             x7,  sp
        add             x8,  sp,  #2*8*(64 - 4)
        add             x9,  x6,  x1, lsl #6
        sub             x9,  x9,  x1
        neg             x10, x1
        mov             x11, #-2*8*4

1:
        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64
        ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11
        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
        ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11

        movi            v6.8h,   #0
        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff
.macro add_dest_addsub src0, src1, src2, src3
        ld1             {v0.8h}, [x6], x1
        ld1             {v1.8h}, [x9], x10
        sqadd           v4.8h,   \src0,   \src1
        ld1             {v2.8h}, [x6]
        sqsub           \src0,   \src0,   \src1
        ld1             {v3.8h}, [x9]
        sqadd           v5.8h,   \src2,   \src3
        sqsub           \src2,   \src2,   \src3
        sub             x6,  x6,  x1
        sub             x9,  x9,  x10
        srshr           v4.8h,   v4.8h,   #4
        srshr           v5.8h,   v5.8h,   #4
        srshr           \src0,   \src0,   #4
        sqadd           v0.8h,   v0.8h,   v4.8h
        srshr           \src2,   \src2,   #4
        sqadd           v1.8h,   v1.8h,   \src0
        sqadd           v2.8h,   v2.8h,   v5.8h
        smax            v0.8h,   v0.8h,   v6.8h
        sqadd           v3.8h,   v3.8h,   \src2
        smax            v1.8h,   v1.8h,   v6.8h
        smin            v0.8h,   v0.8h,   v7.8h
        smax            v2.8h,   v2.8h,   v6.8h
        smin            v1.8h,   v1.8h,   v7.8h
        st1             {v0.8h}, [x6], x1
        smax            v3.8h,   v3.8h,   v6.8h
        smin            v2.8h,   v2.8h,   v7.8h
        st1             {v1.8h}, [x9], x10
        smin            v3.8h,   v3.8h,   v7.8h
        st1             {v2.8h}, [x6], x1
        st1             {v3.8h}, [x9], x10
.endm
        add_dest_addsub v16.8h,  v31.8h,  v17.8h,  v30.8h
        add_dest_addsub v18.8h,  v29.8h,  v19.8h,  v28.8h
        add_dest_addsub v20.8h,  v27.8h,  v21.8h,  v26.8h
        add_dest_addsub v22.8h,  v25.8h,  v23.8h,  v24.8h
.purgem add_dest_addsub
        cmp             x7,  x8
        b.lt            1b

        br              x14
endfunc

.macro sub_sp space
#ifdef _WIN32
.if \space > 8192
        // Here, we'd need to touch two (or more) pages while decrementing
        // the stack pointer.
        .error          "sub_sp doesn't support values over 8K at the moment"
.elseif \space > 4096
        sub             x16, sp,  #4096
        ldr             xzr, [x16]
        sub             sp,  x16, #(\space - 4096)
.else
        sub             sp,  sp,  #\space
.endif
#else
.if \space >= 4096
        sub             sp,  sp,  #(\space)/4096*4096
.endif
.if (\space % 4096) != 0
        sub             sp,  sp,  #(\space)%4096
.endif
#endif
.endm
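
// On Windows, the stack has to be grown at most one page at a time: the
// guard page mechanism only commits new stack memory when the page next to
// the committed region is touched, so moving sp down by more than 4 KB in
// one step could jump straight past the guard page. The ldr xzr above
// probes the intermediate page first. C compilers do the equivalent
// automatically for large frames, e.g.:
//
//   void f(void) {
//       char buf[8192]; // MSVC emits __chkstk probes for this frame
//       buf[0] = 0;     // raw asm has to probe by hand, as in sub_sp
//   }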

function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1
        idct_dc         64,  64,  2

        mov             x15, x30

        sub_sp          64*32*2+64*4*4
        add             x5,  sp, #64*4*4

        movrel          x13, eob_32x32

.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add             x6,  x5,  #(\i*64*2)
.if \i > 0
        mov             w8,  #(32 - \i)
        cmp             w3,  w12
        b.lt            1f
.endif
        add             x7,  x2,  #(\i*4)
        mov             x8,  #32*4
        mov             x12, #-2 // shift
        bl              inv_txfm_dct_clear_4s_x64_neon
        add             x6,  x5,  #(\i*64*2)
        bl              inv_txfm_horz_dct_64x4_neon
.if \i < 28
        ldrh            w12, [x13], #2
.endif
.endr
        b               3f

1:
        movi            v4.8h,  #0
        movi            v5.8h,  #0
        movi            v6.8h,  #0
        movi            v7.8h,  #0
2:
        subs            w8,  w8,  #2
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
.irp i, 0, 8, 16, 24, 32, 40, 48, 56
        add             x7,  x5,  #(\i*2)
        mov             x8,  #64*2
        bl              X(inv_txfm_dct_8h_x64_neon)
        add             x6,  x0,  #(\i*2)
        bl              inv_txfm_add_vert_dct_8x64_neon
.endr

        add             sp,  x5,  #64*32*2
        br              x15
endfunc

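// The 64-point wrappers split their stack allocation into two regions: sp
// itself holds the per-strip 32 bit dct64 workspace, while x5 (x4 in the
// 64x16 function) points at the 16 bit first-pass output consumed by the
// second pass. For 64x64, the layout is roughly:
//
//   int32_t scratch[64 * 4]; // at sp: one 4-row strip of intermediates
//   int16_t tmp[32][64];     // at sp + 64*4*4 (= x5): first-pass output
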
function inv_txfm_add_dct_dct_64x32_16bpc_neon, export=1
        idct_dc         64,  32,  1

        mov             x15, x30

        sub_sp          64*32*2+64*4*4
        add             x5,  sp, #64*4*4

        movrel          x13, eob_32x32

.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add             x6,  x5,  #(\i*64*2)
.if \i > 0
        mov             w8,  #(32 - \i)
        cmp             w3,  w12
        b.lt            1f
.endif
        add             x7,  x2,  #(\i*4)
        mov             x8,  #32*4
        mov             x12, #-1 // shift
        bl              inv_txfm_dct_clear_scale_4s_x64_neon
        add             x6,  x5,  #(\i*64*2)
        bl              inv_txfm_horz_dct_64x4_neon
.if \i < 28
        ldrh            w12, [x13], #2
.endif
.endr
        b               3f

1:
        movi            v4.8h,  #0
        movi            v5.8h,  #0
        movi            v6.8h,  #0
        movi            v7.8h,  #0
2:
        subs            w8,  w8,  #2
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
.irp i, 0, 8, 16, 24, 32, 40, 48, 56
        add             x6,  x0,  #(\i*2)
        add             x7,  x5,  #(\i*2)
        mov             x8,  #64*2
        bl              inv_txfm_add_vert_dct_8x32_neon
.endr

        add             sp,  x5,  #64*32*2
        br              x15
endfunc

function inv_txfm_add_dct_dct_32x64_16bpc_neon, export=1
        idct_dc         32,  64,  1

        mov             x15, x30

        sub_sp          32*32*2+64*8*2
        add             x5,  sp, #64*8*2

        movrel          x13, eob_32x32
        ldrh            w12, [x13], #2

.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add             x6,  x5,  #(\i*32*2)
.if \i > 0
        mov             w8,  #(32 - \i)
        cmp             w3,  w12
        b.lt            1f
        ldrh            w12, [x13], #2
.endif
        add             x7,  x2,  #(\i*4)
        mov             x8,  #32*4
        bl              inv_txfm_horz_scale_dct_32x4_neon
.endr
        b               3f

1:
        movi            v4.8h,  #0
        movi            v5.8h,  #0
        movi            v6.8h,  #0
        movi            v7.8h,  #0
2:
        subs            w8,  w8,  #4
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
.irp i, 0, 8, 16, 24
        add             x7,  x5,  #(\i*2)
        mov             x8,  #32*2
        bl              X(inv_txfm_dct_8h_x64_neon)
        add             x6,  x0,  #(\i*2)
        bl              inv_txfm_add_vert_dct_8x64_neon
.endr

        add             sp,  x5,  #32*32*2
        br              x15
endfunc

function inv_txfm_add_dct_dct_64x16_16bpc_neon, export=1
        idct_dc         64,  16,  2

        mov             x15, x30

        sub_sp          64*16*2+64*4*4
        add             x4,  sp, #64*4*4

        movrel          x13, eob_16x32

.irp i, 0, 4, 8, 12
        add             x6,  x4,  #(\i*64*2)
.if \i > 0
        mov             w8,  #(16 - \i)
        cmp             w3,  w12
        b.lt            1f
.endif
        add             x7,  x2,  #(\i*4)
        mov             x8,  #16*4
        mov             x12, #-2 // shift
        bl              inv_txfm_dct_clear_4s_x64_neon
        add             x6,  x4,  #(\i*64*2)
        bl              inv_txfm_horz_dct_64x4_neon
.if \i < 12
        ldrh            w12, [x13], #2
.endif
.endr
        b               3f

1:
        movi            v4.8h,  #0
        movi            v5.8h,  #0
        movi            v6.8h,  #0
        movi            v7.8h,  #0
2:
        subs            w8,  w8,  #2
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
        movrel          x5,  X(inv_dct_8h_x16_neon)
.irp i, 0, 8, 16, 24, 32, 40, 48, 56
        add             x6,  x0,  #(\i*2)
        add             x7,  x4,  #(\i*2)
        mov             x8,  #64*2
        bl              inv_txfm_add_vert_8x16_neon
.endr

        add             sp,  x4,  #64*16*2
        br              x15
endfunc

function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1
        idct_dc         16,  64,  2

        mov             x15, x30

        sub_sp          16*32*2+64*8*2
        add             x5,  sp, #64*8*2

        movrel          x13, eob_16x32
        ldrh            w12, [x13], #2

        adr             x4,  inv_dct_4s_x16_neon
.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add             x6,  x5,  #(\i*16*2)
.if \i > 0
        mov             w8,  #(32 - \i)
        cmp             w3,  w12
        b.lt            1f
        ldrh            w12, [x13], #2
.endif
        add             x7,  x2,  #(\i*4)
        mov             x8,  #32*4
        bl              inv_txfm_horz_16x4_neon
.endr
        b               3f

1:
        movi            v4.8h,  #0
        movi            v5.8h,  #0
        movi            v6.8h,  #0
        movi            v7.8h,  #0
2:
        subs            w8,  w8,  #4
.rept 2
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
.irp i, 0, 8
        add             x7,  x5,  #(\i*2)
        mov             x8,  #16*2
        bl              X(inv_txfm_dct_8h_x64_neon)
        add             x6,  x0,  #(\i*2)
        bl              inv_txfm_add_vert_dct_8x64_neon
.endr

        add             sp,  x5,  #16*32*2
        br              x15
endfunc
