1/****************************************************************************
2 * dct-a.S: aarch64 transform and zigzag
3 *****************************************************************************
4 * Copyright (C) 2009-2021 x264 project
5 *
6 * Authors: David Conrad <lessen42@gmail.com>
7 *          Janne Grunau <janne-x264@jannau.net>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
22 *
23 * This program is also available under a commercial proprietary license.
24 * For more information, contact us at licensing@x264.com.
25 *****************************************************************************/
26
27#include "asm.S"
28
// tbl byte indices used by zigzag_scan_4x4_frame_neon.
// Every pair (2*i, 2*i+1) selects the two bytes of 16-bit coefficient i out of
// the 32-byte 4x4 block. The first row is the index vector for output
// coefficients 0-7, the second row the one for output coefficients 8-15.
const scan4x4_frame, align=4
.byte    0,  1,   8,  9,   2,  3,   4,  5,  10, 11,  16, 17,  24, 25,  18, 19
.byte   12, 13,   6,  7,  14, 15,  20, 21,  26, 27,  28, 29,  22, 23,  30, 31
endconst
35
// tbl byte indices used by zigzag_scan_4x4_field_neon.
// A single 16-byte index vector: it only reorders the first eight 16-bit
// coefficients (byte pairs 2*i, 2*i+1 of the first source register).
const scan4x4_field, align=4
.byte    0,  1,   2,  3,   8,  9,   4,  5,   6,  7,  10, 11,  12, 13,  14, 15
endconst
40
// tbl byte indices used by the frame variants generated by zigzag_sub_4x4.
// The 4x4 pixel block is gathered row by row into one 16-byte register, so an
// entry value is row*4 + col; entry k names the pixel that becomes output k.
const sub4x4_frame, align=4
.byte    0,  1,  4,  8,  5,  2,  3,  6
.byte    9, 12, 13, 10,  7, 11, 14, 15
endconst
47
// tbl byte indices used by the field variants generated by zigzag_sub_4x4.
// The 4x4 pixel block is gathered row by row into one 16-byte register, so an
// entry value is row*4 + col; entry k names the pixel that becomes output k.
const sub4x4_field, align=4
.byte    0,  4,  1,  8, 12,  5,  9, 13
.byte    2,  6, 10, 14,  3,  7, 11, 15
endconst
54
55// sum = a + (b>>shift)   sub = (a>>shift) - b
// Element-wise on whatever vector arrangement the operands carry; the shifts
// are arithmetic (sshr). t0 and t1 are scratch registers and are overwritten;
// they must be distinct from a and b.
// Both shifts are taken before either result is written, so sum may be the
// same register as a, and sub may be the same register as a or b. sum must
// not be the same register as b, because b is read again for sub.
56.macro SUMSUB_SHR shift sum sub a b t0 t1
57    sshr        \t0,  \b, #\shift
58    sshr        \t1,  \a, #\shift
59    add         \sum, \a, \t0
60    sub         \sub, \t1, \b
61.endm
62
63// sum = (a>>shift) + b   sub = a - (b>>shift)
// Element-wise on whatever vector arrangement the operands carry; the shifts
// are arithmetic (sshr). t0 and t1 are scratch registers and are overwritten;
// they must be distinct from a and b.
// Both shifts are taken before either result is written, so sum may be the
// same register as b, and sub may be the same register as a or b. sum must
// not be the same register as a, because a is read again for sub.
64.macro SUMSUB_SHR2 shift sum sub a b t0 t1
65    sshr        \t0,  \a, #\shift
66    sshr        \t1,  \b, #\shift
67    add         \sum, \t0, \b
68    sub         \sub, \a, \t1
69.endm
70
71// a += 1.5*ma   b -= 1.5*mb
// 1.5*m is formed as m + (m >> 1) with an arithmetic shift, so the half is
// rounded towards minus infinity. a and b are updated in place; ma and mb are
// only read. t0 and t1 are scratch registers and are overwritten.
72.macro SUMSUB_15 a b ma mb t0 t1
73    sshr        \t0, \ma, #1
74    sshr        \t1, \mb, #1
75    add         \t0, \t0, \ma
76    add         \t1, \t1, \mb
77    add         \a,  \a,  \t0
78    sub         \b,  \b,  \t1
79.endm
80
81
// dct4x4dc_neon: in-place transform of the 4x4 block of 16-bit values at x0
// (32 bytes are loaded from [x0] and 32 bytes are stored back to [x0]).
// The block goes through add/subtract butterfly stages interleaved with
// transposes; the last stage halves its results with rounding.
// SUMSUB_AB and transpose are macros that are not defined in this part of the
// file (presumably a sum/difference pair and a trn1/trn2 pair from asm.S -
// confirm there).
// Uses v0-v7, v16, v17 and v31.
82function dct4x4dc_neon, export=1
83    ld1        {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
84    movi        v31.4h, #1                      // rounding bias for the shsub lanes below
85    SUMSUB_AB   v4.4h,  v5.4h,  v0.4h,  v1.4h
86    SUMSUB_AB   v6.4h,  v7.4h,  v2.4h,  v3.4h
87    SUMSUB_AB   v0.4h,  v2.4h,  v4.4h,  v6.4h
88    SUMSUB_AB   v3.4h,  v1.4h,  v5.4h,  v7.4h
89    transpose   v4.4h,  v6.4h,  v0.4h,  v2.4h
90    transpose   v5.4h,  v7.4h,  v1.4h,  v3.4h
91    SUMSUB_AB   v0.4h,  v2.4h,  v4.4h,  v6.4h
92    SUMSUB_AB   v1.4h,  v3.4h,  v5.4h,  v7.4h
93    transpose   v4.2s,  v5.2s,  v0.2s,  v1.2s
94    transpose   v6.2s,  v7.2s,  v2.2s,  v3.2s
95    add         v16.4h, v4.4h,  v31.4h          // pre-add 1 so that shsub yields (a - b + 1) >> 1
96    add         v17.4h, v6.4h,  v31.4h
97    srhadd      v0.4h,  v4.4h,  v5.4h           // (a + b + 1) >> 1
98    shsub       v1.4h,  v16.4h, v5.4h
99    shsub       v2.4h,  v17.4h, v7.4h
100    srhadd      v3.4h,  v6.4h,  v7.4h
101    st1        {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
102    ret
103endfunc
104
// idct4x4dc_neon: in-place transform of the 4x4 block of 16-bit values at x0
// (32 bytes are loaded from [x0] and 32 bytes are stored back to [x0]).
// Same butterfly/transpose sequence as dct4x4dc_neon, except that the final
// stage is a plain SUMSUB_AB with no halving or rounding.
// SUMSUB_AB and transpose are macros that are not defined in this part of the
// file (presumably from asm.S - confirm there).
// Uses v0-v7.
105function idct4x4dc_neon, export=1
106    ld1        {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
107    SUMSUB_AB   v4.4h,  v5.4h,  v0.4h,  v1.4h
108    SUMSUB_AB   v6.4h,  v7.4h,  v2.4h,  v3.4h
109    SUMSUB_AB   v0.4h,  v2.4h,  v4.4h,  v6.4h
110    SUMSUB_AB   v3.4h,  v1.4h,  v5.4h,  v7.4h
111    transpose   v4.4h,  v6.4h,  v0.4h,  v2.4h
112    transpose   v5.4h,  v7.4h,  v1.4h,  v3.4h
113    SUMSUB_AB   v0.4h,  v2.4h,  v4.4h,  v6.4h
114    SUMSUB_AB   v1.4h,  v3.4h,  v5.4h,  v7.4h
115    transpose   v4.2s,  v5.2s,  v0.2s,  v1.2s
116    transpose   v6.2s,  v7.2s,  v2.2s,  v3.2s
117    SUMSUB_AB   v0.4h,  v1.4h,  v4.4h,  v5.4h   // final stage, results stay unscaled
118    SUMSUB_AB   v3.4h,  v2.4h,  v6.4h,  v7.4h
119    st1        {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
120    ret
121endfunc
122
// DCT_1D: one 4-point forward transform pass over four vectors.
//   v4..v7 : inputs in0..in3; all four are overwritten as temporaries
//   v0..v3 : outputs
// Assuming "SUMSUB_AB d0, d1, a, b" yields d0 = a + b and d1 = a - b (the macro
// is defined outside this part of the file - confirm), the results are:
//   v0 = (in0 + in3) + (in1 + in2)
//   v1 = 2*(in0 - in3) + (in1 - in2)
//   v2 = (in0 + in3) - (in1 + in2)
//   v3 = (in0 - in3) - 2*(in1 - in2)
// v1 and v3 first hold the two sums and are rewritten at the end, so the
// statement order must be kept.
123.macro DCT_1D v0 v1 v2 v3 v4 v5 v6 v7
124    SUMSUB_AB   \v1, \v6, \v5, \v6
125    SUMSUB_AB   \v3, \v7, \v4, \v7
126    add         \v0, \v3, \v1
127    add         \v4, \v7, \v7
128    add         \v5, \v6, \v6
129    sub         \v2, \v3, \v1
130    add         \v1, \v4, \v6
131    sub         \v3, \v7, \v5
132.endm
133
// sub4x4_dct_neon: forward 4x4 transform of a pixel difference.
//   x0 : output, 16 coefficients of 16 bits each (32 bytes stored)
//   x1 : first pixel block, 4 bytes per row, rows FENC_STRIDE apart
//   x2 : second pixel block, 4 bytes per row, rows FDEC_STRIDE apart
// Each row is widened to 16 bits as (x1 row) - (x2 row); DCT_1D is applied,
// the block is transposed, and DCT_1D is applied again.
// Only lane s[0] of each source register is loaded, so the upper four lanes of
// the usubl results are not meaningful; they are never read because both
// DCT_1D passes operate on .4h.
// x1 and x2 are left advanced by four rows. Uses x3, x4, v0-v7, v16-v19.
134function sub4x4_dct_neon, export=1
135    mov         x3, #FENC_STRIDE
136    mov         x4, #FDEC_STRIDE
137    ld1        {v0.s}[0], [x1], x3
138    ld1        {v1.s}[0], [x2], x4
139    ld1        {v2.s}[0], [x1], x3
140    usubl       v16.8h, v0.8b,  v1.8b           // row 0 difference
141    ld1        {v3.s}[0], [x2], x4
142    ld1        {v4.s}[0], [x1], x3
143    usubl       v17.8h, v2.8b,  v3.8b           // row 1 difference
144    ld1        {v5.s}[0], [x2], x4
145    ld1        {v6.s}[0], [x1], x3
146    usubl       v18.8h, v4.8b,  v5.8b           // row 2 difference
147    ld1        {v7.s}[0], [x2], x4
148    usubl       v19.8h, v6.8b,  v7.8b           // row 3 difference
149
150    DCT_1D      v0.4h, v1.4h, v2.4h, v3.4h, v16.4h, v17.4h, v18.4h, v19.4h
151    transpose4x4.h v0, v1, v2, v3, v4, v5, v6, v7
152    DCT_1D      v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h
153    st1        {v4.4h,v5.4h,v6.4h,v7.4h}, [x0]
154    ret
155endfunc
156
// sub8x4_dct_neon: file-local helper (not exported) that transforms an
// 8-wide, 4-row pixel difference and stores 64 bytes of coefficients.
//   x0 : output pointer, post-incremented by 64 bytes in total
//   x1 : first pixel block, row stride in x3 (must be set by the caller)
//   x2 : second pixel block, row stride in x4 (must be set by the caller)
// x1 and x2 are left advanced by four rows, which lets callers invoke this
// helper back to back.
// The first pass uses DCT_1D; the second pass is the same arithmetic written
// out inline on .8h. zip1/zip2 on .2d then regroup the 64-bit halves so that
// the low halves of the four result rows are stored as the first 32 bytes and
// the high halves as the following 32 bytes.
// Uses v0-v7, v16-v19, v21, v22.
157function sub8x4_dct_neon
158    ld1        {v0.8b}, [x1], x3
159    ld1        {v1.8b}, [x2], x4
160    usubl       v16.8h, v0.8b,  v1.8b
161    ld1        {v2.8b}, [x1], x3
162    ld1        {v3.8b}, [x2], x4
163    usubl       v17.8h, v2.8b,  v3.8b
164    ld1        {v4.8b}, [x1], x3
165    ld1        {v5.8b}, [x2], x4
166    usubl       v18.8h, v4.8b,  v5.8b
167    ld1        {v6.8b}, [x1], x3
168    ld1        {v7.8b}, [x2], x4
169    usubl       v19.8h, v6.8b,  v7.8b
170
171    DCT_1D      v0.8h, v1.8h, v2.8h, v3.8h, v16.8h, v17.8h, v18.8h, v19.8h
172    transpose4x8.h  v0, v1, v2, v3, v4, v5, v6, v7
173
174    SUMSUB_AB   v16.8h, v19.8h, v0.8h,  v3.8h
175    SUMSUB_AB   v17.8h, v18.8h, v1.8h,  v2.8h
176    add         v22.8h, v19.8h, v19.8h          // 2 * v19
177    add         v21.8h, v18.8h, v18.8h          // 2 * v18
178    add         v0.8h,  v16.8h, v17.8h
179    sub         v1.8h,  v16.8h, v17.8h
180
181    add         v2.8h,  v22.8h, v18.8h
182    sub         v3.8h,  v19.8h, v21.8h
183
184    zip1        v4.2d,  v0.2d,  v2.2d           // low halves of v0, v2
185    zip2        v6.2d,  v0.2d,  v2.2d           // high halves of v0, v2
186    zip1        v5.2d,  v1.2d,  v3.2d           // low halves of v1, v3
187    zip2        v7.2d,  v1.2d,  v3.2d           // high halves of v1, v3
188
189    st1        {v4.8h}, [x0], #16
190    st1        {v5.8h}, [x0], #16
191    st1        {v6.8h}, [x0], #16
192    st1        {v7.8h}, [x0], #16
193    ret
194endfunc
195
// sub8x8_dct_neon: transform an 8x8 pixel difference as two 8x4 strips.
//   x0 : coefficient output (each strip stores 64 bytes and advances x0)
//   x1 : first pixel block, rows FENC_STRIDE apart
//   x2 : second pixel block, rows FDEC_STRIDE apart
// sub8x4_dct_neon leaves x0/x1/x2 pointing at the next strip, so the second
// strip needs no pointer fix-up and is reached by a tail call.
function sub8x8_dct_neon, export=1
    mov         x4,  #FDEC_STRIDE       // row stride for the x2 loads
    mov         x3,  #FENC_STRIDE       // row stride for the x1 loads
    mov         x5,  x30                // keep our return address across the bl
    bl          sub8x4_dct_neon         // rows 0-3
    mov         x30, x5
    b           sub8x4_dct_neon         // rows 4-7, returns straight to our caller
endfunc
204
// sub16x16_dct_neon: transform a 16x16 pixel difference as eight 8x4 strips.
//   x0 : coefficient output (each strip stores 64 bytes and advances x0)
//   x1 : first pixel block, rows FENC_STRIDE apart
//   x2 : second pixel block, rows FDEC_STRIDE apart
// Every pair of sub8x4_dct_neon calls covers one 8x8 quadrant and leaves x1/x2
// eight rows further down; the pointer adjustments between the pairs select
// the next quadrant. Quadrant order: top-left, top-right, bottom-left,
// bottom-right.
function sub16x16_dct_neon, export=1
    mov         x4,  #FDEC_STRIDE       // row stride for the x2 loads
    mov         x3,  #FENC_STRIDE       // row stride for the x1 loads
    mov         x5,  x30                // keep our return address across the bl calls

    // top-left quadrant
    bl          sub8x4_dct_neon
    bl          sub8x4_dct_neon

    // up eight rows, right eight columns: top-right quadrant
    sub         x2, x2, #8*FDEC_STRIDE-8
    sub         x1, x1, #8*FENC_STRIDE-8
    bl          sub8x4_dct_neon
    bl          sub8x4_dct_neon

    // left eight columns (already eight rows down): bottom-left quadrant
    sub         x2, x2, #8
    sub         x1, x1, #8
    bl          sub8x4_dct_neon
    bl          sub8x4_dct_neon

    // up eight rows, right eight columns: bottom-right quadrant
    sub         x2, x2, #8*FDEC_STRIDE-8
    sub         x1, x1, #8*FENC_STRIDE-8
    bl          sub8x4_dct_neon
    mov         x30, x5
    b           sub8x4_dct_neon         // last strip, returns straight to our caller
endfunc
225
226
// DCT8_1D: one 8-point forward transform pass.
//   inputs  : v0-v7 (.8h)
//   outputs : v0-v7 (.8h)
//   scratch : v16-v31, all overwritten
// The "type" argument (callers pass row or col) is not referenced anywhere in
// the body; both invocations expand to the same code.
// The 1.5*x terms are formed as x + (x >> 1) with an arithmetic shift.
227.macro DCT8_1D type
228    SUMSUB_AB   v18.8h, v17.8h, v3.8h,  v4.8h   // s34/d34
229    SUMSUB_AB   v19.8h, v16.8h, v2.8h,  v5.8h   // s25/d25
230    SUMSUB_AB   v22.8h, v21.8h, v1.8h,  v6.8h   // s16/d16
231    SUMSUB_AB   v23.8h, v20.8h, v0.8h,  v7.8h   // s07/d07
232
233    SUMSUB_AB   v24.8h, v26.8h,  v23.8h, v18.8h  // a0/a2
234    SUMSUB_AB   v25.8h, v27.8h,  v22.8h, v19.8h  // a1/a3
235
236    SUMSUB_AB   v30.8h, v29.8h,  v20.8h, v17.8h  // a6/a5
237    sshr        v23.8h, v21.8h, #1
238    sshr        v18.8h, v16.8h, #1
239    add         v23.8h, v23.8h, v21.8h          // 1.5 * d16
240    add         v18.8h, v18.8h, v16.8h          // 1.5 * d25
241    sub         v30.8h, v30.8h, v23.8h
242    sub         v29.8h, v29.8h, v18.8h
243
244    SUMSUB_AB   v28.8h, v31.8h,  v21.8h, v16.8h   // a4/a7
245    sshr        v22.8h, v20.8h, #1
246    sshr        v19.8h, v17.8h, #1
247    add         v22.8h, v22.8h, v20.8h          // 1.5 * d07
248    add         v19.8h, v19.8h, v17.8h          // 1.5 * d34
249    add         v22.8h, v28.8h, v22.8h
250    add         v31.8h, v31.8h, v19.8h
251
252    SUMSUB_AB      v0.8h,  v4.8h,  v24.8h, v25.8h
253    SUMSUB_SHR  2, v1.8h,  v7.8h,  v22.8h, v31.8h, v16.8h, v17.8h
254    SUMSUB_SHR  1, v2.8h,  v6.8h,  v26.8h, v27.8h, v18.8h, v19.8h
255    SUMSUB_SHR2 2, v3.8h,  v5.8h,  v30.8h, v29.8h, v20.8h, v21.8h
256.endm
257
// sub8x8_dct8_neon: forward 8x8 transform of a pixel difference.
//   x0 : output, 64 coefficients of 16 bits each; post-incremented by 128
//   x1 : first pixel block, 8 bytes per row, rows FENC_STRIDE apart
//   x2 : second pixel block, 8 bytes per row, rows FDEC_STRIDE apart
// The eight rows are widened to 16 bits as (x1 row) - (x2 row) into v0-v7,
// then DCT8_1D, an 8x8 transpose and a second DCT8_1D are applied.
// x1 and x2 are left advanced by eight rows; x3 and x4 are overwritten with
// the strides. Uses v0-v7 and v16-v31.
258function sub8x8_dct8_neon, export=1
259    mov         x3, #FENC_STRIDE
260    mov         x4, #FDEC_STRIDE
261    ld1        {v16.8b}, [x1], x3
262    ld1        {v17.8b}, [x2], x4
263    ld1        {v18.8b}, [x1], x3
264    ld1        {v19.8b}, [x2], x4
265    usubl       v0.8h,  v16.8b, v17.8b          // row 0 difference
266    ld1        {v20.8b}, [x1], x3
267    ld1        {v21.8b}, [x2], x4
268    usubl       v1.8h,  v18.8b, v19.8b          // row 1 difference
269    ld1        {v22.8b}, [x1], x3
270    ld1        {v23.8b}, [x2], x4
271    usubl       v2.8h,  v20.8b, v21.8b          // row 2 difference
272    ld1        {v24.8b}, [x1], x3
273    ld1        {v25.8b}, [x2], x4
274    usubl       v3.8h,  v22.8b, v23.8b          // row 3 difference
275    ld1        {v26.8b}, [x1], x3
276    ld1        {v27.8b}, [x2], x4
277    usubl       v4.8h,  v24.8b, v25.8b          // row 4 difference
278    ld1        {v28.8b}, [x1], x3
279    ld1        {v29.8b}, [x2], x4
280    usubl       v5.8h,  v26.8b, v27.8b          // row 5 difference
281    ld1        {v30.8b}, [x1], x3
282    ld1        {v31.8b}, [x2], x4
283    usubl       v6.8h,  v28.8b, v29.8b          // row 6 difference
284    usubl       v7.8h,  v30.8b, v31.8b          // row 7 difference
285
286    DCT8_1D row
287    transpose8x8.h v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
288    DCT8_1D col
289
290    st1        {v0.8h,v1.8h,v2.8h,v3.8h}, [x0], #64
291    st1        {v4.8h,v5.8h,v6.8h,v7.8h}, [x0], #64
292    ret
293endfunc
294
// sub16x16_dct8_neon: transform a 16x16 pixel difference as four 8x8 blocks.
//   x0 : coefficient output (each 8x8 call stores 128 bytes and advances x0)
//   x1 : first pixel block, rows FENC_STRIDE apart
//   x2 : second pixel block, rows FDEC_STRIDE apart
// sub8x8_dct8_neon leaves x1/x2 eight rows further down; the adjustments
// between the calls select the next quadrant. Quadrant order: top-left,
// top-right, bottom-left, bottom-right.
function sub16x16_dct8_neon, export=1
    mov         x7,  x30                // keep our return address across the bl calls

    // top-left quadrant
    bl          X(sub8x8_dct8_neon)

    // up eight rows, right eight columns: top-right quadrant
    sub         x2,  x2,  #FDEC_STRIDE*8 - 8
    sub         x1,  x1,  #FENC_STRIDE*8 - 8
    bl          X(sub8x8_dct8_neon)

    // left eight columns (already eight rows down): bottom-left quadrant
    sub         x2,  x2,  #8
    sub         x1,  x1,  #8
    bl          X(sub8x8_dct8_neon)

    // up eight rows, right eight columns: bottom-right quadrant
    sub         x2,  x2,  #FDEC_STRIDE*8 - 8
    sub         x1,  x1,  #FENC_STRIDE*8 - 8
    mov         x30, x7
    b           X(sub8x8_dct8_neon)     // last block, returns straight to our caller
endfunc
309
310
311// First part of IDCT (minus final SUMSUB_BA)
//   d0..d3 : inputs (only read)
//   d4, d5 : written by "SUMSUB_AB d4, d5, d0, d2" (presumably d0 + d2 and
//            d0 - d2; the macro is defined outside this part of the file)
//   d6     = (d3 >> 1) + d1
//   d7     = (d1 >> 1) - d3
// The shifts are arithmetic. Callers finish the pass with two more SUMSUB_AB
// on d4/d6 and d5/d7.
312.macro IDCT_1D d4 d5 d6 d7 d0 d1 d2 d3
313    SUMSUB_AB   \d4, \d5, \d0, \d2
314    sshr        \d7, \d1, #1
315    sshr        \d6, \d3, #1
316    sub         \d7, \d7, \d3
317    add         \d6, \d6, \d1
318.endm
319
// add4x4_idct_neon: inverse 4x4 transform, result added to existing pixels.
//   x0 : 4x4 pixel block, 4 bytes per row, rows FDEC_STRIDE apart (read, then
//        rewritten in place)
//   x1 : 16 coefficients of 16 bits each (32 bytes, only read)
// Two IDCT_1D passes with a transpose in between, a rounding right shift by 6
// (srshr), widening add of the current pixels, and unsigned saturation back to
// 8 bits (sqxtun).
// After the second pass v3 carries row 2 and v2 carries row 3. This is why
// row 2 pixels are loaded into v31 (paired with v3) and row 3 pixels into v30
// (paired with v2), and why v3 is stored before v2.
// The pixel loads are interleaved with the arithmetic; x0 is rewound by four
// rows before the stores. Uses x2, v0-v7, v16-v19, v28-v31.
320function add4x4_idct_neon, export=1
321    mov         x2, #FDEC_STRIDE
322    ld1        {v0.4h,v1.4h,v2.4h,v3.4h}, [x1]
323
324    IDCT_1D     v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h
325    ld1        {v28.s}[0], [x0], x2            // pixel row 0
326    SUMSUB_AB   v0.4h, v2.4h, v4.4h, v6.4h
327    SUMSUB_AB   v1.4h, v3.4h, v5.4h, v7.4h
328
329    transpose4x4.h v0, v1, v3, v2, v16, v17, v18, v19
330
331    IDCT_1D     v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v3.4h, v2.4h
332    ld1        {v29.s}[0], [x0], x2            // pixel row 1
333    SUMSUB_AB   v0.4h, v2.4h, v4.4h, v6.4h
334    SUMSUB_AB   v1.4h, v3.4h, v5.4h, v7.4h
335
336    srshr       v0.4h,  v0.4h,  #6
337    srshr       v1.4h,  v1.4h,  #6
338    ld1        {v31.s}[0], [x0], x2            // pixel row 2, pairs with v3
339    srshr       v2.4h,  v2.4h,  #6
340    srshr       v3.4h,  v3.4h,  #6
341    ld1        {v30.s}[0], [x0], x2            // pixel row 3, pairs with v2
342
343    sub         x0,  x0,  x2,  lsl #2           // back to row 0
344    uaddw       v0.8h,  v0.8h,  v28.8b
345    uaddw       v1.8h,  v1.8h,  v29.8b
346    uaddw       v2.8h,  v2.8h,  v30.8b
347    uaddw       v3.8h,  v3.8h,  v31.8b
348    sqxtun      v0.8b,  v0.8h
349    sqxtun      v1.8b,  v1.8h
350    sqxtun      v2.8b,  v2.8h
351    sqxtun      v3.8b,  v3.8h
352
353    st1        {v0.s}[0], [x0], x2
354    st1        {v1.s}[0], [x0], x2
355    st1        {v3.s}[0], [x0], x2
356    st1        {v2.s}[0], [x0], x2
357    ret
358endfunc
359
// add8x4_idct_neon: inverse transform of 64 bytes of coefficients, added to
// an 8-wide, 4-row pixel strip.
//   x0 : pixel strip, 8 bytes per row (read, then rewritten in place)
//   x1 : coefficients, 64 bytes; post-incremented by 64
//   x2 : pixel row stride. It is NOT set here; the callers in this file load
//        FDEC_STRIDE into x2 before branching to this function.
// The 64-bit halves of the loaded registers are regrouped first (transpose on
// .2d), then two IDCT_1D passes run on .8h with a transpose in between,
// followed by a rounding shift by 6, a widening add of the pixels and unsigned
// saturation to 8 bits.
// x0 is rewound by four rows before the stores, so on return it points four
// rows below its entry value, ready for the next strip.
// Uses v0-v7, v16-v23, v28-v31.
360function add8x4_idct_neon, export=1
361    ld1        {v0.8h,v1.8h}, [x1], #32
362    ld1        {v2.8h,v3.8h}, [x1], #32
363    transpose   v20.2d, v21.2d, v0.2d, v2.2d
364    transpose   v22.2d, v23.2d, v1.2d, v3.2d
365    IDCT_1D     v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
366    SUMSUB_AB   v0.8h,  v3.8h,  v16.8h, v18.8h
367    SUMSUB_AB   v1.8h,  v2.8h,  v17.8h, v19.8h
368
369    transpose4x8.h v0, v1, v2, v3, v4, v5, v6, v7
370
371    IDCT_1D     v16.8h, v17.8h, v18.8h, v19.8h, v0.8h, v1.8h, v2.8h, v3.8h
372    SUMSUB_AB   v0.8h,  v3.8h,  v16.8h, v18.8h
373    SUMSUB_AB   v1.8h,  v2.8h,  v17.8h, v19.8h
374
375    srshr       v0.8h,  v0.8h,  #6
376    ld1        {v28.8b}, [x0], x2
377    srshr       v1.8h,  v1.8h,  #6
378    ld1        {v29.8b}, [x0], x2
379    srshr       v2.8h,  v2.8h,  #6
380    ld1        {v30.8b}, [x0], x2
381    srshr       v3.8h,  v3.8h,  #6
382    ld1        {v31.8b}, [x0], x2
383
384    sub         x0,  x0,  x2,  lsl #2           // back to the first row of the strip
385    uaddw       v0.8h,  v0.8h,  v28.8b
386    uaddw       v1.8h,  v1.8h,  v29.8b
387    uaddw       v2.8h,  v2.8h,  v30.8b
388    uaddw       v3.8h,  v3.8h,  v31.8b
389
390    sqxtun      v0.8b,  v0.8h
391    sqxtun      v1.8b,  v1.8h
392    st1        {v0.8b}, [x0], x2
393    sqxtun      v2.8b,  v2.8h
394    st1        {v1.8b}, [x0], x2
395    sqxtun      v3.8b,  v3.8h
396    st1        {v2.8b}, [x0], x2
397    st1        {v3.8b}, [x0], x2
398    ret
399endfunc
400
// add8x8_idct_neon: inverse-transform and add an 8x8 block as two 8x4 strips.
//   x0 : pixel block, rows FDEC_STRIDE apart
//   x1 : coefficients (each strip consumes 64 bytes and advances x1)
// add8x4_idct_neon leaves x0 four rows further down, so the second strip needs
// no pointer fix-up and is reached by a tail call.
function add8x8_idct_neon, export=1
    mov             x5,  x30                    // keep our return address across the bl
    mov             x2,  #FDEC_STRIDE           // row stride expected by add8x4_idct_neon
    bl              X(add8x4_idct_neon)         // rows 0-3
    mov             x30, x5
    b               X(add8x4_idct_neon)         // rows 4-7, returns straight to our caller
endfunc
408
// add16x16_idct_neon: inverse-transform and add a 16x16 block as eight 8x4
// strips.
//   x0 : pixel block, rows FDEC_STRIDE apart
//   x1 : coefficients (each strip consumes 64 bytes and advances x1)
// Every pair of add8x4_idct_neon calls covers one 8x8 quadrant and leaves x0
// eight rows further down; the adjustments between the pairs select the next
// quadrant. Quadrant order: top-left, top-right, bottom-left, bottom-right.
function add16x16_idct_neon, export=1
    mov             x5,  x30                    // keep our return address across the bl calls
    mov             x2,  #FDEC_STRIDE           // row stride expected by add8x4_idct_neon

    // top-left quadrant
    bl              X(add8x4_idct_neon)
    bl              X(add8x4_idct_neon)

    // up eight rows, right eight columns: top-right quadrant
    sub             x0, x0, #8*FDEC_STRIDE-8
    bl              X(add8x4_idct_neon)
    bl              X(add8x4_idct_neon)

    // left eight columns (already eight rows down): bottom-left quadrant
    sub             x0, x0, #8
    bl              X(add8x4_idct_neon)
    bl              X(add8x4_idct_neon)

    // up eight rows, right eight columns: bottom-right quadrant
    sub             x0, x0, #8*FDEC_STRIDE-8
    bl              X(add8x4_idct_neon)
    mov             x30, x5
    b               X(add8x4_idct_neon)         // last strip, returns straight to our caller
endfunc
425
// IDCT8_1D: one 8-point inverse transform pass.
//   inputs  : v16-v23 (.8h)
//   outputs : v16-v23 (.8h)
//   scratch : v0-v3, overwritten
// With type == row, v22 and v23 are not expected to be loaded yet: the macro
// itself loads them from [x1] (post-incrementing x1 by 32) after the first
// butterfly. With any other type, all eight inputs must already be in place.
// Several registers are reused as temporaries while still holding inputs for
// later lines, so the statement order must be kept.
426.macro IDCT8_1D type
427    SUMSUB_AB   v0.8h,  v1.8h,  v16.8h, v20.8h          // a0/a2
428.ifc \type, row
429    ld1        {v22.8h,v23.8h}, [x1], #32
430.endif
431    SUMSUB_SHR  1, v2.8h,  v3.8h,  v18.8h, v22.8h, v16.8h, v20.8h   // a6/a4
432    SUMSUB_AB   v16.8h, v18.8h, v21.8h, v19.8h
433    SUMSUB_15   v16.8h, v18.8h, v17.8h, v23.8h, v20.8h, v22.8h      // a7/a1
434    SUMSUB_AB   v22.8h, v23.8h, v23.8h, v17.8h
435    SUMSUB_15   v23.8h, v22.8h, v21.8h, v19.8h, v20.8h, v17.8h      // a5/a3
436
437    SUMSUB_SHR  2, v21.8h, v22.8h, v22.8h, v23.8h, v19.8h, v17.8h   // b3/b5
438    SUMSUB_SHR2 2, v20.8h, v23.8h, v16.8h, v18.8h, v19.8h, v17.8h   // b1/b7
439
440    SUMSUB_AB   v18.8h, v2.8h,  v0.8h,  v2.8h           // b0/b6
441    SUMSUB_AB   v19.8h, v3.8h,  v1.8h,  v3.8h           // b2/b4
442
443    SUMSUB_AB   v16.8h, v23.8h, v18.8h, v23.8h
444    SUMSUB_AB   v17.8h, v22.8h, v19.8h, v22.8h
445    SUMSUB_AB   v18.8h, v21.8h, v3.8h,  v21.8h
446    SUMSUB_AB   v19.8h, v20.8h, v2.8h,  v20.8h
447.endm
448
// add8x8_idct8_neon: inverse 8x8 transform, result added to existing pixels.
//   x0 : 8x8 pixel block, 8 bytes per row, rows FDEC_STRIDE apart (read, then
//        rewritten in place)
//   x1 : 64 coefficients of 16 bits each; post-incremented by 128 in total
//        (96 bytes here, the last 32 inside the row pass of IDCT8_1D)
// Row pass, 8x8 transpose, column pass, rounding shift by 6 (srshr), widening
// add of the pixels and unsigned saturation back to 8 bits (sqxtun).
// x0 is rewound by eight rows before the stores, so on return it points eight
// rows below its entry value.
// Uses x2, v0-v7, v16-v23, v30, v31.
449function add8x8_idct8_neon, export=1
450    mov         x2,  #FDEC_STRIDE
451    ld1        {v16.8h,v17.8h}, [x1], #32
452    ld1        {v18.8h,v19.8h}, [x1], #32
453    ld1        {v20.8h,v21.8h}, [x1], #32
454
455    IDCT8_1D    row
456
457    transpose8x8.h v16, v17, v18, v19, v20, v21, v22, v23, v30, v31
458
459    IDCT8_1D    col
460
461    ld1        {v0.8b}, [x0], x2
462    srshr       v16.8h, v16.8h, #6
463    ld1        {v1.8b}, [x0], x2
464    srshr       v17.8h, v17.8h, #6
465    ld1        {v2.8b}, [x0], x2
466    srshr       v18.8h, v18.8h, #6
467    ld1        {v3.8b}, [x0], x2
468    srshr       v19.8h, v19.8h, #6
469    ld1        {v4.8b}, [x0], x2
470    srshr       v20.8h, v20.8h, #6
471    ld1        {v5.8b}, [x0], x2
472    srshr       v21.8h, v21.8h, #6
473    ld1        {v6.8b}, [x0], x2
474    srshr       v22.8h, v22.8h, #6
475    ld1        {v7.8b}, [x0], x2
476    srshr       v23.8h, v23.8h, #6
477    sub         x0,  x0,  x2,  lsl #3           // back to row 0
478
479    uaddw       v16.8h, v16.8h, v0.8b
480    uaddw       v17.8h, v17.8h, v1.8b
481    uaddw       v18.8h, v18.8h, v2.8b
482    sqxtun      v0.8b,  v16.8h
483    sqxtun      v1.8b,  v17.8h
484    sqxtun      v2.8b,  v18.8h
485    uaddw       v19.8h, v19.8h, v3.8b
486    st1        {v0.8b}, [x0], x2
487    uaddw       v20.8h, v20.8h, v4.8b
488    st1        {v1.8b}, [x0], x2
489    uaddw       v21.8h, v21.8h, v5.8b
490    st1        {v2.8b}, [x0], x2
491    sqxtun      v3.8b,  v19.8h
492    sqxtun      v4.8b,  v20.8h
493    uaddw       v22.8h, v22.8h, v6.8b
494    uaddw       v23.8h, v23.8h, v7.8b
495    st1        {v3.8b}, [x0], x2
496    sqxtun      v5.8b,  v21.8h
497    st1        {v4.8b}, [x0], x2
498    sqxtun      v6.8b,  v22.8h
499    sqxtun      v7.8b,  v23.8h
500    st1        {v5.8b}, [x0], x2
501    st1        {v6.8b}, [x0], x2
502    st1        {v7.8b}, [x0], x2
503    ret
504endfunc
505
// add16x16_idct8_neon: inverse-transform and add a 16x16 block as four 8x8
// blocks.
//   x0 : pixel block, rows FDEC_STRIDE apart
//   x1 : coefficients (each 8x8 call consumes 128 bytes and advances x1)
// add8x8_idct8_neon leaves x0 eight rows further down; the adjustments between
// the calls select the next quadrant. Quadrant order: top-left, top-right,
// bottom-left, bottom-right.
function add16x16_idct8_neon, export=1
    mov             x7,  x30                    // keep our return address across the bl calls

    // top-left quadrant
    bl              X(add8x8_idct8_neon)

    // up eight rows, right eight columns: top-right quadrant
    sub             x0,  x0,  #8*FDEC_STRIDE-8
    bl              X(add8x8_idct8_neon)

    // left eight columns (already eight rows down): bottom-left quadrant
    sub             x0,  x0,  #8
    bl              X(add8x8_idct8_neon)

    // up eight rows, right eight columns: bottom-right quadrant
    mov             x30, x7
    sub             x0,  x0,  #8*FDEC_STRIDE-8
    b               X(add8x8_idct8_neon)        // last block, returns straight to our caller
endfunc
517
// add8x8_idct_dc_neon: add four DC values to an 8x8 pixel block.
//   x0 : 8x8 pixel block, 8 bytes per row, rows FDEC_STRIDE apart (read, then
//        rewritten in place)
//   x1 : four 16-bit DC values (8 bytes, only read)
// Each DC value is rounded and shifted right by 6 (srshr) and broadcast over
// four columns: dc0 | dc1 applies to rows 0-3, dc2 | dc3 to rows 4-7.
// The signed add is done with unsigned saturating byte arithmetic: sqxtun
// clamps the value and its negation to 0..255, so v20/v21 hold the positive
// part and v22/v23 the magnitude of the negative part; uqadd applies the
// former and uqsub the latter.
// Uses x2, v0-v7, v16, v20-v23.
518function add8x8_idct_dc_neon, export=1
519    mov         x2,  #FDEC_STRIDE
520    ld1        {v16.4h}, [x1]
521    ld1        {v0.8b}, [x0], x2
522    srshr       v16.4h, v16.4h, #6
523    ld1        {v1.8b}, [x0], x2
524    dup         v20.8h, v16.h[0]
525    dup         v21.8h, v16.h[1]
526    ld1        {v2.8b}, [x0], x2
527    dup         v22.8h, v16.h[2]
528    dup         v23.8h, v16.h[3]
529    ld1        {v3.8b}, [x0], x2
530    trn1        v20.2d, v20.2d,  v21.2d         // dc0 x4 | dc1 x4
531    ld1        {v4.8b}, [x0], x2
532    trn1        v21.2d, v22.2d,  v23.2d         // dc2 x4 | dc3 x4
533    ld1        {v5.8b}, [x0], x2
534    neg         v22.8h, v20.8h
535    ld1        {v6.8b}, [x0], x2
536    neg         v23.8h, v21.8h
537    ld1        {v7.8b}, [x0], x2
538
539    sub         x0,  x0,  #8*FDEC_STRIDE        // back to row 0
540
541    sqxtun      v20.8b,  v20.8h                 // positive part, rows 0-3
542    sqxtun      v21.8b,  v21.8h                 // positive part, rows 4-7
543    sqxtun      v22.8b,  v22.8h                 // negative magnitude, rows 0-3
544    sqxtun      v23.8b,  v23.8h                 // negative magnitude, rows 4-7
545
546    uqadd       v0.8b,  v0.8b,  v20.8b
547    uqadd       v1.8b,  v1.8b,  v20.8b
548    uqadd       v2.8b,  v2.8b,  v20.8b
549    uqadd       v3.8b,  v3.8b,  v20.8b
550    uqadd       v4.8b,  v4.8b,  v21.8b
551    uqadd       v5.8b,  v5.8b,  v21.8b
552    uqadd       v6.8b,  v6.8b,  v21.8b
553    uqadd       v7.8b,  v7.8b,  v21.8b
554    uqsub       v0.8b,  v0.8b,  v22.8b
555    uqsub       v1.8b,  v1.8b,  v22.8b
556    uqsub       v2.8b,  v2.8b,  v22.8b
557    uqsub       v3.8b,  v3.8b,  v22.8b
558    uqsub       v4.8b,  v4.8b,  v23.8b
559    uqsub       v5.8b,  v5.8b,  v23.8b
560    uqsub       v6.8b,  v6.8b,  v23.8b
561    uqsub       v7.8b,  v7.8b,  v23.8b
562
563    st1        {v0.8b}, [x0], x2
564    st1        {v1.8b}, [x0], x2
565    st1        {v2.8b}, [x0], x2
566    st1        {v3.8b}, [x0], x2
567    st1        {v4.8b}, [x0], x2
568    st1        {v5.8b}, [x0], x2
569    st1        {v6.8b}, [x0], x2
570    st1        {v7.8b}, [x0], x2
571    ret
572endfunc
573
// ADD16x4_IDCT_DC: add four DC values to a 16-wide, 4-row pixel strip.
//   dc : register with a lane-size suffix (for example v0.h); lanes 0-3 hold
//        the four DC values, each applied to four adjacent columns
//   x0 : read pointer, post-incremented by x3 per row
//   x2 : write pointer, post-incremented by x3 per row
//   x3 : row stride
// Reads and writes use separate pointers, so the caller must start x2 at the
// same address as x0 for an in-place update.
// The signed add is done with unsigned saturating bytes: v20 holds the DC
// values clamped to 0..255, v21 the clamped negations; uqadd then uqsub.
// Overwrites v4-v7, v20, v21, v24-v27.
574.macro ADD16x4_IDCT_DC dc
575    ld1         {v4.16b}, [x0], x3
576    dup         v24.8h,  \dc[0]
577    dup         v25.8h,  \dc[1]
578    ld1         {v5.16b}, [x0], x3
579    dup         v26.8h,  \dc[2]
580    dup         v27.8h,  \dc[3]
581    ld1         {v6.16b}, [x0], x3
582    trn1        v24.2d,  v24.2d,  v25.2d
583    ld1         {v7.16b}, [x0], x3
584    trn1        v25.2d,  v26.2d,  v27.2d
585    neg         v26.8h,  v24.8h
586    neg         v27.8h,  v25.8h
587
588    sqxtun      v20.8b,  v24.8h
589    sqxtun      v21.8b,  v26.8h
590    sqxtun2     v20.16b, v25.8h
591    sqxtun2     v21.16b, v27.8h
592
593    uqadd        v4.16b, v4.16b, v20.16b
594    uqadd        v5.16b, v5.16b, v20.16b
595    uqadd        v6.16b, v6.16b, v20.16b
596    uqadd        v7.16b, v7.16b, v20.16b
597
598    uqsub        v4.16b, v4.16b, v21.16b
599    uqsub        v5.16b, v5.16b, v21.16b
600    uqsub        v6.16b, v6.16b, v21.16b
601    st1         {v4.16b}, [x2], x3
602    uqsub        v7.16b, v7.16b, v21.16b
603    st1         {v5.16b}, [x2], x3
604    st1         {v6.16b}, [x2], x3
605    st1         {v7.16b}, [x2], x3
606.endm
607
// add16x16_idct_dc_neon: add sixteen DC values to a 16x16 pixel block.
//   x0 : 16x16 pixel block, 16 bytes per row, rows FDEC_STRIDE apart
//   x1 : sixteen 16-bit DC values (32 bytes, only read)
// Each DC value is rounded and shifted right by 6, then ADD16x4_IDCT_DC is
// expanded once per 4-row strip with the four DC values of that strip.
// x2 is a second copy of the pixel pointer because the macro reads through x0
// and writes through x2. The macro does not touch v0-v3, so all four scaling
// shifts can be done before the first strip.
function add16x16_idct_dc_neon, export=1
    mov         x3,  #FDEC_STRIDE       // row stride used by the macro
    mov         x2,  x0                 // write pointer trails the read pointer

    ld1        {v0.4h,v1.4h,v2.4h,v3.4h}, [x1]
    srshr       v0.4h,  v0.4h,  #6
    srshr       v1.4h,  v1.4h,  #6
    srshr       v2.4h,  v2.4h,  #6
    srshr       v3.4h,  v3.4h,  #6

    ADD16x4_IDCT_DC v0.h                // rows 0-3
    ADD16x4_IDCT_DC v1.h                // rows 4-7
    ADD16x4_IDCT_DC v2.h                // rows 8-11
    ADD16x4_IDCT_DC v3.h                // rows 12-15
    ret
endfunc
624
// sub4x4x2_dct_dc: column sums of an 8-wide, 4-row pixel difference.
//   dst     : receives, per column, the sum over four rows of
//             (x1 pixel) - (x2 pixel) as eight 16-bit lanes
//   t0..t7  : scratch registers, all overwritten
//   x1, x3  : first pixel block and its row stride
//   x2, x4  : second pixel block and its row stride
// Register names are passed without an arrangement suffix (v0, not v0.8h).
// x1 and x2 are left advanced by four rows.
625.macro sub4x4x2_dct_dc, dst, t0, t1, t2, t3, t4, t5, t6, t7
626    ld1        {\t0\().8b}, [x1], x3
627    ld1        {\t1\().8b}, [x2], x4
628    ld1        {\t2\().8b}, [x1], x3
629    ld1        {\t3\().8b}, [x2], x4
630    usubl       \t0\().8h,  \t0\().8b,  \t1\().8b
631    ld1        {\t4\().8b}, [x1], x3
632    ld1        {\t5\().8b}, [x2], x4
633    usubl       \t1\().8h,  \t2\().8b,  \t3\().8b
634    ld1        {\t6\().8b}, [x1], x3
635    ld1        {\t7\().8b}, [x2], x4
636    add         \dst\().8h, \t0\().8h,  \t1\().8h
637    usubl       \t2\().8h,  \t4\().8b,  \t5\().8b
638    usubl       \t3\().8h,  \t6\().8b,  \t7\().8b
639    add         \dst\().8h, \dst\().8h, \t2\().8h
640    add         \dst\().8h, \dst\().8h, \t3\().8h
641.endm
642
// sub8x8_dct_dc_neon: DC-only transform of an 8x8 pixel difference.
//   x0 : output, four 16-bit values (8 bytes stored)
//   x1 : first pixel block, 8 bytes per row, rows FENC_STRIDE apart
//   x2 : second pixel block, 8 bytes per row, rows FDEC_STRIDE apart
// The two macro expansions leave the column sums of rows 0-3 in v0 and of
// rows 4-7 in v1. The 64-bit halves are then combined with butterflies and the
// remaining lanes are reduced with pairwise adds.
// NOTE(review): the order of the four stored values follows from the
// transpose/SUMSUB_AB macros, which are defined outside this part of the file;
// confirm against the C reference if the ordering matters.
// Uses x3, x4, v0-v3, v16-v31.
643function sub8x8_dct_dc_neon, export=1
644    mov             x3,  #FENC_STRIDE
645    mov             x4,  #FDEC_STRIDE
646
647    sub4x4x2_dct_dc  v0, v16, v17, v18, v19, v20, v21, v22, v23
648    sub4x4x2_dct_dc  v1, v24, v25, v26, v27, v28, v29, v30, v31
649
650    transpose   v2.2d,  v3.2d,  v0.2d,  v1.2d
651    SUMSUB_AB   v0.8h,  v1.8h,  v2.8h,  v3.8h
652    transpose   v2.2d,  v3.2d,  v0.2d,  v1.2d
653    SUMSUB_AB   v0.8h,  v1.8h,  v2.8h,  v3.8h
654    transpose   v2.2d,  v3.2d,  v0.2d,  v1.2d
655
656    addp        v0.8h,  v2.8h,  v3.8h           // pairwise sums, 8 -> 4 per source
657    addp        v0.8h,  v0.8h,  v0.8h           // pairwise sums again; low 4 lanes are the result
658
659    st1        {v0.4h}, [x0]
660    ret
661endfunc
662
// sub8x16_dct_dc_neon: DC-only transform of an 8-wide, 16-row pixel
// difference.
//   x0 : output, eight 16-bit values (16 bytes stored)
//   x1 : first pixel block, 8 bytes per row, rows FENC_STRIDE apart
//   x2 : second pixel block, 8 bytes per row, rows FDEC_STRIDE apart
// The four macro expansions leave the column sums of rows 0-3, 4-7, 8-11 and
// 12-15 in v0, v1, v2 and v3. They are reduced with pairwise adds and combined
// with three butterfly stages.
// NOTE(review): the order of the eight stored values follows from the
// transpose/SUMSUB_AB macros, which are defined outside this part of the file;
// confirm against the C reference if the ordering matters.
// Uses x3, x4, v0-v5, v16-v31.
663function sub8x16_dct_dc_neon, export=1
664    mov             x3,  #FENC_STRIDE
665    mov             x4,  #FDEC_STRIDE
666    sub4x4x2_dct_dc  v0, v16, v17, v18, v19, v20, v21, v22, v23
667    sub4x4x2_dct_dc  v1, v24, v25, v26, v27, v28, v29, v30, v31
668    sub4x4x2_dct_dc  v2, v16, v17, v18, v19, v20, v21, v22, v23
669    sub4x4x2_dct_dc  v3, v24, v25, v26, v27, v28, v29, v30, v31
670
671    addp             v4.8h,  v0.8h,  v2.8h      // pair sums of rows 0-3 | rows 8-11
672    addp             v5.8h,  v1.8h,  v3.8h      // pair sums of rows 4-7 | rows 12-15
673
674    transpose   v2.4s,  v3.4s,  v4.4s,  v5.4s
675    SUMSUB_AB   v0.8h,  v1.8h,  v2.8h,  v3.8h
676
677    transpose   v2.4s,  v3.4s,  v0.4s,  v1.4s
678    SUMSUB_AB   v0.8h,  v1.8h,  v2.8h,  v3.8h
679
680    transpose   v2.2d,  v3.2d,  v0.2d,  v1.2d
681    SUMSUB_AB   v0.8h,  v1.8h,  v2.8h,  v3.8h
682
683    trn1        v2.2d,  v0.2d,  v1.2d
684    trn2        v3.2d,  v1.2d,  v0.2d
685
686    addp        v0.8h,  v2.8h,  v3.8h
687
688    st1        {v0.8h}, [x0]
689    ret
690endfunc
691
// zigzag_interleave_8x8_cavlc_neon: split 64 coefficients into four
// interleaved streams of 16 and record which streams are non-zero.
//   x0 : destination, 64 coefficients of 16 bits (128 bytes stored)
//   x1 : source, 64 coefficients of 16 bits (128 bytes read)
//   x2 : flag bytes; offsets 0, 1, 8 and 9 are written
// ld4 de-interleaves with a stride of four, so v0/v4 hold source elements
// 0, 4, 8, ..., v1/v5 hold 1, 5, 9, ... and so on. Stream k is stored as the
// k-th block of 16 coefficients.
// The flags come from an unsigned max over each stream: umax/umaxp reduce the
// data until each 32-bit lane of v16 covers one stream, cmhs against 1 turns
// non-zero into all-ones, and the and with 1 leaves 0 or 1 in the low byte.
// Uses x3, v0-v7, v16-v19, v31.
692function zigzag_interleave_8x8_cavlc_neon, export=1
693    mov        x3,  #7                          // step from flag offset 1 to offset 8
694    movi       v31.4s, #1
695    ld4        {v0.8h,v1.8h,v2.8h,v3.8h}, [x1],  #64
696    ld4        {v4.8h,v5.8h,v6.8h,v7.8h}, [x1],  #64
697    umax       v16.8h, v0.8h,  v4.8h
698    umax       v17.8h, v1.8h,  v5.8h
699    umax       v18.8h, v2.8h,  v6.8h
700    umax       v19.8h, v3.8h,  v7.8h
701    st1        {v0.8h}, [x0],  #16
702    st1        {v4.8h}, [x0],  #16
703    umaxp      v16.8h, v16.8h, v17.8h
704    umaxp      v18.8h, v18.8h, v19.8h
705    st1        {v1.8h}, [x0],  #16
706    st1        {v5.8h}, [x0],  #16
707    umaxp      v16.8h, v16.8h, v18.8h           // 32-bit lane k now covers stream k
708    st1        {v2.8h}, [x0],  #16
709    st1        {v6.8h}, [x0],  #16
710    cmhs       v16.4s, v16.4s, v31.4s           // all-ones where the lane is non-zero
711    st1        {v3.8h}, [x0],  #16
712    and        v16.16b, v16.16b, v31.16b
713    st1        {v7.8h}, [x0],  #16
714    st1        {v16.b}[0],    [x2],  #1         // stream 0 flag -> offset 0
715    st1        {v16.b}[4],    [x2],  x3         // stream 1 flag -> offset 1
716    st1        {v16.b}[8],    [x2],  #1         // stream 2 flag -> offset 8
717    st1        {v16.b}[12],   [x2]              // stream 3 flag -> offset 9
718    ret
719endfunc
720
// zigzag_scan_4x4_frame_neon: reorder a 4x4 block of 16-bit coefficients.
//   x0 : destination, 16 coefficients (32 bytes stored)
//   x1 : source, 16 coefficients (32 bytes read)
// The permutation is a byte-level table lookup: the 32 source bytes form the
// table and scan4x4_frame supplies the byte indices for both output halves.
function zigzag_scan_4x4_frame_neon, export=1
    ld1        {v0.16b,v1.16b}, [x1]            // source block, used as the tbl table
    movrel      x2, scan4x4_frame
    ld1        {v2.16b,v3.16b}, [x2]            // byte indices for outputs 0-7 and 8-15
    tbl         v4.16b, {v0.16b,v1.16b}, v2.16b
    tbl         v5.16b, {v0.16b,v1.16b}, v3.16b
    st1        {v4.16b,v5.16b}, [x0]
    ret
endfunc
730
// zigzag_sub_4x4: generates zigzag_sub_4x4<ac>_<f>_neon.
//   f  : frame or field; selects the sub4x4_<f> permutation table
//   ac : empty or "ac"; the ac variants split off the first coefficient
// Generated function:
//   x0 : output, 16 coefficients of 16 bits (32 bytes stored)
//   x1 : first 4x4 pixel block, rows FENC_STRIDE apart
//   x2 : second 4x4 pixel block, rows FDEC_STRIDE apart; after being read it
//        is overwritten with the pixels from x1
//   x3 : (ac variants only) address where the first coefficient is stored as
//        a 16-bit value; that coefficient is then zeroed in the output
//   w0 : returns 1 if any stored coefficient is non-zero, otherwise 0
// Both pixel blocks are gathered into one register each, permuted with tbl,
// and subtracted with widening. The non-zero test takes the unsigned max of
// all 16 results (umax, umaxv) and compares it with zero.
// Uses x4-x7, x9, v0-v7, v16.
731.macro zigzag_sub_4x4 f ac
732function zigzag_sub_4x4\ac\()_\f\()_neon, export=1
733    mov         x9,  #FENC_STRIDE
734    mov         x4,  #FDEC_STRIDE
735    movrel      x5,  sub4x4_\f
736    mov         x6,  x2                         // keep the start of the x2 block for the copy
737    ld1        {v0.s}[0], [x1], x9
738    ld1        {v0.s}[1], [x1], x9
739    ld1        {v0.s}[2], [x1], x9
740    ld1        {v0.s}[3], [x1], x9
741    ld1        {v16.16b}, [x5]
742    ld1        {v1.s}[0], [x2], x4
743    ld1        {v1.s}[1], [x2], x4
744    ld1        {v1.s}[2], [x2], x4
745    ld1        {v1.s}[3], [x2], x4
746    tbl         v2.16b, {v0.16b}, v16.16b
747    tbl         v3.16b, {v1.16b}, v16.16b
748    st1        {v0.s}[0], [x6], x4
749    usubl       v4.8h,  v2.8b,  v3.8b
750.ifc \ac, ac
751    dup         h7, v4.h[0]
752    ins         v4.h[0], wzr
753    fmov        w5,  s7
754    strh        w5,  [x3]
755.endif
756    usubl2      v5.8h,  v2.16b, v3.16b
757    st1        {v0.s}[1], [x6], x4
758    umax        v6.8h,  v4.8h,  v5.8h
759    umaxv       h6,  v6.8h
760    st1        {v0.s}[2], [x6], x4
761    fmov        w7,  s6
762    st1        {v0.s}[3], [x6], x4
763    cmp         w7, #0
764    st1        {v4.8h,v5.8h},   [x0]
765    cset        w0, ne
766    ret
767endfunc
768.endm
769
// Instantiate the four variants: field and frame, each with and without the
// ac handling.
770zigzag_sub_4x4 field
771zigzag_sub_4x4 field, ac
772zigzag_sub_4x4 frame
773zigzag_sub_4x4 frame, ac
774
// zigzag_scan_4x4_field_neon: reorder a 4x4 block of 16-bit coefficients.
//   x0 : destination, 16 coefficients (32 bytes stored)
//   x1 : source, 16 coefficients (32 bytes read)
// Only the first eight coefficients are permuted (byte-level tbl with the
// scan4x4_field indices); the last eight are copied through unchanged.
function zigzag_scan_4x4_field_neon, export=1
    ld1        {v0.8h,v1.8h},   [x1]            // v0: coefficients 0-7, v1: 8-15
    movrel      x2, scan4x4_field
    ld1        {v2.16b},        [x2]            // byte indices for the first half
    tbl         v0.16b, {v0.16b}, v2.16b
    st1        {v0.8h,v1.8h},   [x0]
    ret
endfunc
783
// zigzag_scan_8x8_frame_neon: reorder an 8x8 block of 16-bit coefficients.
//   x0 : destination, 64 coefficients (128 bytes stored)
//   x1 : source, 64 coefficients (128 bytes read into v0-v7)
// A tbl lookup can address at most four registers (64 bytes), so each group
// of eight outputs is looked up in a four-register window of the source, with
// the scan8x8_frame indices expressed relative to that window. An index that
// falls outside the window makes tbl write zero for that byte; the lane moves
// after the lookups fill in exactly those outputs from the correct source
// register.
// Uses x2, v0-v7, v16-v31.
784function zigzag_scan_8x8_frame_neon, export=1
785    movrel      x2,  scan8x8_frame
786    ld1        {v0.8h,v1.8h},   [x1], #32
787    ld1        {v2.8h,v3.8h},   [x1], #32
788    ld1        {v4.8h,v5.8h},   [x1], #32
789    ld1        {v6.8h,v7.8h},   [x1]
790    ld1        {v16.16b,v17.16b}, [x2], #32
791    ld1        {v18.16b,v19.16b}, [x2], #32
792    ld1        {v20.16b,v21.16b}, [x2], #32
793    ld1        {v22.16b,v23.16b}, [x2], #32
794    tbl         v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b
795    tbl         v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
796    tbl         v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b
797    tbl         v27.16b, {v3.16b,v4.16b,v5.16b,v6.16b}, v19.16b
798    tbl         v28.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v20.16b
799    tbl         v29.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v21.16b
800    tbl         v30.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v22.16b
801    tbl         v31.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v23.16b
802    mov         v25.h[6], v4.h[0]               // outputs whose source lies outside the window
803    mov         v25.h[7], v5.h[0]
804    mov         v26.h[0], v4.h[1]
805    mov         v27.h[4], v7.h[0]
806    mov         v28.h[7], v4.h[4]
807    mov         v29.h[7], v3.h[6]
808    mov         v30.h[0], v2.h[7]
809    mov         v30.h[1], v3.h[7]
810    st1        {v24.8h,v25.8h}, [x0], #32
811    st1        {v26.8h,v27.8h}, [x0], #32
812    st1        {v28.8h,v29.8h}, [x0], #32
813    st1        {v30.8h,v31.8h}, [x0]
814    ret
815endfunc
816
// tbl byte indices used by zigzag_scan_8x8_frame_neon: 64 entries of two
// bytes each, 128 bytes in total.
//   Z(z)   expands to the two byte offsets of 16-bit element z.
//   T(x,y) maps position (x, y) to element x*8 + y, minus the first element of
//          the four-register window used for that lookup. T is redefined
//          whenever the window changes:
//            x     : window starts at v0 (elements 0-31)
//            x - 3 : window starts at v3 (elements 24-55)
//            x - 0 : window starts at v0 again
//            x - 4 : window starts at v4 (elements 32-63)
// Entries that land outside the 64-byte window (negative or 64 and above) are
// deliberate: tbl yields zero for them and the function overwrites those
// lanes afterwards.
// Z stays defined, and T stays defined with the x - 4 mapping, after this
// table; both are preprocessor state visible to the rest of the file.
817#define Z(z)   2*(z), 2*(z)+1
818#define T(x,y) Z(x*8+y)
819const scan8x8_frame, align=5
820    .byte T(0,0), T(1,0), T(0,1), T(0,2)
821    .byte T(1,1), T(2,0), T(3,0), T(2,1)
822    .byte T(1,2), T(0,3), T(0,4), T(1,3)
823    .byte T(2,2), T(3,1), T(4,0), T(5,0)
824    .byte T(4,1), T(3,2), T(2,3), T(1,4)
825    .byte T(0,5), T(0,6), T(1,5), T(2,4)
826#undef T
827#define T(x,y) Z((x-3)*8+y)
828    .byte T(3,3), T(4,2), T(5,1), T(6,0)
829    .byte T(7,0), T(6,1), T(5,2), T(4,3)
830#undef T
831#define T(x,y) Z((x-0)*8+y)
832    .byte T(3,4), T(2,5), T(1,6), T(0,7)
833    .byte T(1,7), T(2,6), T(3,5), T(4,4)
834#undef T
835#define T(x,y) Z((x-4)*8+y)
836    .byte T(5,3), T(6,2), T(7,1), T(7,2)
837    .byte T(6,3), T(5,4), T(4,5), T(3,6)
838    .byte T(2,7), T(3,7), T(4,6), T(5,5)
839    .byte T(6,4), T(7,3), T(7,4), T(6,5)
840    .byte T(5,6), T(4,7), T(5,7), T(6,6)
841    .byte T(7,5), T(7,6), T(6,7), T(7,7)
842endconst
843
// zigzag_scan_8x8_field_neon: reorder an 8x8 block of 16-bit coefficients.
//   x0 : destination, 64 coefficients (128 bytes stored)
//   x1 : source, 64 coefficients (128 bytes read into v0-v7)
// Seven index vectors (112 bytes) are read from scan8x8_field, one per tbl
// lookup; each lookup uses a window of two to four source registers.
// The last eight outputs need no table: the two ext instructions assemble
// v31 from the top 32 bits of v6 (elements 54, 55) followed by the top 96
// bits of v7 (elements 58-63).
// Uses x2, v0-v7, v16-v22, v24-v31.
844function zigzag_scan_8x8_field_neon, export=1
845    movrel      x2,  scan8x8_field
846    ld1        {v0.8h,v1.8h},   [x1], #32
847    ld1        {v2.8h,v3.8h},   [x1], #32
848    ld1        {v4.8h,v5.8h},   [x1], #32
849    ld1        {v6.8h,v7.8h},   [x1]
850    ld1        {v16.16b,v17.16b}, [x2], #32
851    ld1        {v18.16b,v19.16b}, [x2], #32
852    ld1        {v20.16b,v21.16b}, [x2], #32
853    ld1        {v22.16b}, [x2]
854    ext         v31.16b, v7.16b, v7.16b, #4     // v7 rotated down by 4 bytes
855    tbl         v24.16b, {v0.16b,v1.16b},               v16.16b
856    tbl         v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
857    tbl         v26.16b, {v1.16b,v2.16b,v3.16b,v4.16b}, v18.16b
858    tbl         v27.16b, {v2.16b,v3.16b,v4.16b,v5.16b}, v19.16b
859    tbl         v28.16b, {v3.16b,v4.16b,v5.16b,v6.16b}, v20.16b
860    tbl         v29.16b, {v4.16b,v5.16b,v6.16b},        v21.16b
861    tbl         v30.16b, {v5.16b,v6.16b,v7.16b},        v22.16b
862    ext         v31.16b, v6.16b, v31.16b, #12   // last 4 bytes of v6, then bytes 4-15 of v7
863    st1        {v24.8h,v25.8h}, [x0], #32
864    st1        {v26.8h,v27.8h}, [x0], #32
865    st1        {v28.8h,v29.8h}, [x0], #32
866    st1        {v30.8h,v31.8h}, [x0]
867    ret
868endfunc
869
// zigzag_sub8x8: generates zigzag_sub_8x8_<f>_neon.
//   f : frame or field; selects the sub8x8_<f> permutation table (the table
//       is not defined in this part of the file)
// Generated function:
//   x0 : output, 64 coefficients of 16 bits (128 bytes stored)
//   x1 : first 8x8 pixel block, rows FENC_STRIDE apart
//   x2 : second 8x8 pixel block, rows FDEC_STRIDE apart; after being read it
//        is overwritten with the pixels from x1
//   w0 : returns 1 if any coefficient is non-zero, otherwise 0
// Each pixel block is gathered into four registers (two rows per register)
// and used as a 64-byte tbl table; the same four index vectors permute both
// blocks, and the permuted pixels are subtracted with widening.
// The non-zero test takes the unsigned max of all 64 results (umax, umaxv)
// and compares it with zero.
// Uses x4-x7, x9, v0-v7, v16-v31.
870.macro zigzag_sub8x8 f
871function zigzag_sub_8x8_\f\()_neon, export=1
872    movrel      x4,  sub8x8_\f
873    mov         x5,  #FENC_STRIDE
874    mov         x6,  #FDEC_STRIDE
875    mov         x7,  x2                         // keep the start of the x2 block for the copy
876    ld1        {v0.d}[0], [x1], x5
877    ld1        {v0.d}[1], [x1], x5
878    ld1        {v1.d}[0], [x1], x5
879    ld1        {v1.d}[1], [x1], x5
880    ld1        {v2.d}[0], [x1], x5
881    ld1        {v2.d}[1], [x1], x5
882    ld1        {v3.d}[0], [x1], x5
883    ld1        {v3.d}[1], [x1]
884    ld1        {v4.d}[0], [x2], x6
885    ld1        {v4.d}[1], [x2], x6
886    ld1        {v5.d}[0], [x2], x6
887    ld1        {v5.d}[1], [x2], x6
888    ld1        {v6.d}[0], [x2], x6
889    ld1        {v6.d}[1], [x2], x6
890    ld1        {v7.d}[0], [x2], x6
891    ld1        {v7.d}[1], [x2]
892    ld1        {v16.16b,v17.16b}, [x4], #32
893    ld1        {v18.16b,v19.16b}, [x4], #32
894    tbl         v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b
895    tbl         v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
896    tbl         v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b
897    tbl         v27.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v19.16b
898    tbl         v28.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v16.16b
899    tbl         v29.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v17.16b
900    tbl         v30.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v18.16b
901    tbl         v31.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v19.16b
902    usubl       v4.8h,  v24.8b,  v28.8b
903    usubl2      v5.8h,  v24.16b, v28.16b
904    usubl       v6.8h,  v25.8b,  v29.8b
905    usubl2      v7.8h,  v25.16b, v29.16b
906    usubl       v16.8h, v26.8b,  v30.8b
907    usubl2      v17.8h, v26.16b, v30.16b
908    usubl       v18.8h, v27.8b,  v31.8b
909    usubl2      v19.8h, v27.16b, v31.16b
910    umax        v20.8h, v4.8h,   v5.8h
911    umax        v21.8h, v6.8h,   v7.8h
912    umax        v22.8h, v16.8h,  v17.8h
913    umax        v23.8h, v18.8h,  v19.8h
914    umax        v20.8h, v20.8h,  v21.8h
915    umax        v21.8h, v22.8h,  v23.8h
916    umax        v20.8h, v20.8h,  v21.8h
917    umaxv       h22,    v20.8h
918    st1        {v0.d}[0], [x7], x6
919    st1        {v0.d}[1], [x7], x6
920    st1        {v1.d}[0], [x7], x6
921    st1        {v1.d}[1], [x7], x6
922    st1        {v2.d}[0], [x7], x6
923    st1        {v2.d}[1], [x7], x6
924    st1        {v3.d}[0], [x7], x6
925    st1        {v3.d}[1], [x7]
926    st1        {v4.8h,v5.8h},   [x0], #32
927    st1        {v6.8h,v7.8h},   [x0], #32
928    st1        {v16.8h,v17.8h}, [x0], #32
929    st1        {v18.8h,v19.8h}, [x0]
930    fmov        w9,  s22
931    cmp         w9, #0
932    cset        w0, ne
933    ret
934endfunc
935.endm
936
// Instantiate the field and frame variants.
937zigzag_sub8x8 field
938zigzag_sub8x8 frame
939
// Index table scan8x8_field: 56 entries, each written through the T macro,
// which forwards x*8+y (with x rebased as described below) to the Z macro.
// Z is defined earlier in this file, outside this excerpt; presumably it
// turns an element index into the byte indices of one 16-bit element
// (confirm at its definition).
//
// T is redefined between groups so that x is counted relative to a later
// starting row: x, then x-1, x-2, x-3, x-4 and x-5.
// NOTE(review): this looks like it matches the tbl instructions at the end
// of the function just before zigzag_sub8x8, whose table lists start at
// v0, v0, v1, v2, v3, v4 and v5 for index registers v16-v22. The loads of
// v16-v19 are not visible from here, so confirm against that function.
#undef T
#define T(x,y) Z(x*8+y)
const scan8x8_field, align=5
    // First 16 entries: x is used as is (no rebasing).
    .byte T(0,0), T(0,1), T(0,2), T(1,0)
    .byte T(1,1), T(0,3), T(0,4), T(1,2)
    .byte T(2,0), T(1,3), T(0,5), T(0,6)
    .byte T(0,7), T(1,4), T(2,1), T(3,0)
#undef T
#define T(x,y) Z((x-1)*8+y)
    // Next 8 entries: x counted from row 1.
    .byte T(2,2), T(1,5), T(1,6), T(1,7)
    .byte T(2,3), T(3,1), T(4,0), T(3,2)
#undef T
#define T(x,y) Z((x-2)*8+y)
    // Next 8 entries: x counted from row 2.
    .byte T(2,4), T(2,5), T(2,6), T(2,7)
    .byte T(3,3), T(4,1), T(5,0), T(4,2)
#undef T
#define T(x,y) Z((x-3)*8+y)
    // Next 8 entries: x counted from row 3.
    .byte T(3,4), T(3,5), T(3,6), T(3,7)
    .byte T(4,3), T(5,1), T(6,0), T(5,2)
#undef T
#define T(x,y) Z((x-4)*8+y)
    // Next 8 entries: x counted from row 4.
    .byte T(4,4), T(4,5), T(4,6), T(4,7)
    .byte T(5,3), T(6,1), T(6,2), T(5,4)
#undef T
#define T(x,y) Z((x-5)*8+y)
    // Last 8 entries: x counted from row 5. T keeps this definition after
    // endconst until the next undef.
    .byte T(5,5), T(5,6), T(5,7), T(6,3)
    .byte T(7,0), T(7,1), T(6,4), T(6,5)
endconst
968
969
// Byte permutation used by zigzag_sub_8x8_frame_neon.
// From here on the T macro expands to x*8+y, where y is its first argument
// and x its second. The function gathers the 8x8 block with 8 bytes per row,
// so the second argument selects the row and the first the column.
// Entry k names the block byte that becomes output element k. The entries
// are laid out 16 per group, one group per 16-byte index vector.
// This T definition stays in effect after the table; sub8x8_field relies on it.
#undef T
#define T(y,x) x*8+y
const sub8x8_frame, align=5
    // outputs 0-15
    .byte T(0,0), T(1,0), T(0,1), T(0,2), T(1,1), T(2,0), T(3,0), T(2,1)
    .byte T(1,2), T(0,3), T(0,4), T(1,3), T(2,2), T(3,1), T(4,0), T(5,0)
    // outputs 16-31
    .byte T(4,1), T(3,2), T(2,3), T(1,4), T(0,5), T(0,6), T(1,5), T(2,4)
    .byte T(3,3), T(4,2), T(5,1), T(6,0), T(7,0), T(6,1), T(5,2), T(4,3)
    // outputs 32-47
    .byte T(3,4), T(2,5), T(1,6), T(0,7), T(1,7), T(2,6), T(3,5), T(4,4)
    .byte T(5,3), T(6,2), T(7,1), T(7,2), T(6,3), T(5,4), T(4,5), T(3,6)
    // outputs 48-63
    .byte T(2,7), T(3,7), T(4,6), T(5,5), T(6,4), T(7,3), T(7,4), T(6,5)
    .byte T(5,6), T(4,7), T(5,7), T(6,6), T(7,5), T(7,6), T(6,7), T(7,7)
endconst
990
// Byte permutation used by zigzag_sub_8x8_field_neon.
// Uses the T macro as defined just before sub8x8_frame (x*8+y, with the
// second argument as the row and the first as the column of the gathered
// 8x8 byte block). Entry k names the block byte that becomes output
// element k. The entries are laid out 16 per group, one group per 16-byte
// index vector.
const sub8x8_field, align=5
    // outputs 0-15
    .byte T(0,0), T(0,1), T(0,2), T(1,0), T(1,1), T(0,3), T(0,4), T(1,2)
    .byte T(2,0), T(1,3), T(0,5), T(0,6), T(0,7), T(1,4), T(2,1), T(3,0)
    // outputs 16-31
    .byte T(2,2), T(1,5), T(1,6), T(1,7), T(2,3), T(3,1), T(4,0), T(3,2)
    .byte T(2,4), T(2,5), T(2,6), T(2,7), T(3,3), T(4,1), T(5,0), T(4,2)
    // outputs 32-47
    .byte T(3,4), T(3,5), T(3,6), T(3,7), T(4,3), T(5,1), T(6,0), T(5,2)
    .byte T(4,4), T(4,5), T(4,6), T(4,7), T(5,3), T(6,1), T(6,2), T(5,4)
    // outputs 48-63
    .byte T(5,5), T(5,6), T(5,7), T(6,3), T(7,0), T(7,1), T(6,4), T(6,5)
    .byte T(6,6), T(6,7), T(7,2), T(7,3), T(7,4), T(7,5), T(7,6), T(7,7)
endconst
1009